import json
import math

import nltk
import numpy as np

# ExtractSentences and StanfordNLP are project-local helpers defined elsewhere
# in this codebase (see the StanfordNLP wrapper in Code Example #2).


def isArticleRelatedToTopic(article, aliases, keyword):
    # step 0: split the article into sentences
    extractor = ExtractSentences()
    sent_text = np.array(extractor.split_into_sentences(article))

    # step 1: lowercase the text
    text = article.lower()
    keyword = keyword.lower()

    # step 2: replace aliases with the keyword
    for a in aliases:
        text = text.replace(' ' + a.lower() + ' ', ' ' + keyword + ' ')

    # accept articles whose keyword frequency exceeds freq_threshold
    freq_threshold = 2
    key_freq = 0
    for word in nltk.word_tokenize(text):
        if word == keyword:
            key_freq = key_freq + 1

    if key_freq > freq_threshold:
        print('\tKeyword frequency ', key_freq)
        return True

    # accept articles where the keyword appears in the top occ_threshold fraction of sentences
    occ_threshold = 0.5
    sent_text = np.atleast_1d(sent_text)
    top_sent = sent_text[:int(math.ceil(occ_threshold * len(sent_text)))]
    for ts in top_sent:
        ts = ts.lower()
        ts = ts.replace('.', ' ').replace(',', ' ').replace('-', ' ')
        for a in aliases:
            ts = ts.replace(' ' + a.lower() + ' ', ' ' + keyword + ' ')
        if keyword in nltk.word_tokenize(ts):
            print('\tKeyword found in top 50% of sentences')
            return True

    # accept if the keyword takes part in any of the selected dependency relations
    sNLP = StanfordNLP()
    try:
        pos_text = sNLP.pos(text)
        parse_text = sNLP.dependency_parse(text)
        selected_relation = [
            'amod', 'nmod', 'dobj', 'iobj', 'nsubj', 'nsubjpass'
        ]

        # skip index 0, which is the ROOT relation
        for i in range(1, len(parse_text)):
            rel = parse_text[i][0]                     # dependency relation label
            word1 = pos_text[parse_text[i][1] - 1][0]  # governor word
            word2 = pos_text[parse_text[i][2] - 1][0]  # dependent word
            if (word1 == keyword
                    or word2 == keyword) and (rel in selected_relation):
                print('\t passed NLP')
                return True
    except json.decoder.JSONDecodeError as e:
        print(e)
        print(text)
    # reject
    return False
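
# Usage sketch (not part of the original snippet): the article text, keyword
# and aliases below are hypothetical, chosen only to show how the function is
# called; a running StanfordCoreNLP server is assumed for the dependency-parse
# fallback.
sample_article = ("Acme Corp announced record profits today. "
                  "The company said growth came from its cloud unit. "
                  "Analysts asked Acme about next year's outlook.")
related = isArticleRelatedToTopic(sample_article,
                                  aliases=['Acme Corp'],
                                  keyword='acme')
print('Related to topic:', related)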
Code Example #2
#########################################

client = MongoClient(config.mongoConfigs['host'], config.mongoConfigs['port'])
db = client[config.mongoConfigs['db']]
collection = db[resolved_entity_table]  # collection having resolved entities
art_collection = db[article_table]  # collection having articles

entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list
fixed_keywords = [
    'says', 'said', 'asks', 'asked', 'told', 'announced', 'announce',
    'claimed', 'claim'
]

extractor = ExtractSentences()  # object for extracting sentences from text


class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
        self.props = {
            'annotators':
            'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)
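
    # Sketch (assumption, not shown in this snippet): Code Example #1 calls
    # sNLP.pos() and sNLP.dependency_parse(); with the stanfordcorenlp client
    # those would be thin wrappers roughly like the following.
    def pos(self, sentence):
        # list of (word, POS-tag) tuples
        return self.nlp.pos_tag(sentence)

    def dependency_parse(self, sentence):
        # list of (relation, governor_index, dependent_index) tuples
        return self.nlp.dependency_parse(sentence)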
Code Example #3
# globals
entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list

# get the list of all entities
entities = get_all_entities(collection, entity_types, args.N)
print('All resolved entities crawled from the database')

# parse entities list to get names, aliases and articles
e_names, e_aliases, e_articleIds = get_names_aliases_articles(entities)
print('e_names, e_aliases & e_articleIds parsed from entities list')

# object for extracting sentences from text
extractor = ExtractSentences()


def statements_per_entity(entity_name, aliases=None):

    # Output folder
    res_folder = './Outputs/' + folder + '/' + entity_name + '/'
    directory = os.path.dirname(res_folder)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # search for the entity name and get all occurrences
    entity_ind = findPowerEliteIndex(entity_name, e_names, e_aliases)
    print('Occurrences in entities set : {}'.format(entity_ind))

    # filter articles as per the article ids and source
Code Example #4
    by_about_table]  # collection we will construct to hold By/About statements
print(
    'Connection established with the server. Make sure that your StanfordCoreNLP is also running.'
)

# globals
entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list

# get the list of all entities
entities = get_all_entities(collection, entity_types, args.N)
print('All resolved entities crawled from the database')

# object for extracting sentences from text
extractor = ExtractSentences()

# prepare a dictionary mapping each entity to its type - person, company, etc.
pipeline = [{"$group": {"_id": "$stdName", "type": {"$first": '$type'}}}]
cursor = list(collection.aggregate(pipeline, allowDiskUse=True))

entity_type_dic = {}
for i in cursor:
    entity_type_dic[i['_id']] = i['type']
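
# Illustration (hypothetical values): each document produced by the $group
# stage above has the shape {'_id': 'Acme Corp', 'type': 'company'}, so
# entity_type_dic ends up mapping a standard entity name to its type, e.g.
#   entity_type_dic['Acme Corp'] -> 'company'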

# Get all entities from the entity table
e_names = []
e_aliases = []
e_articleIds = []

for type in entities.keys():