def isArticleRelatedToTopic(article, aliases, keyword):
    # step 0: split the article into sentences
    extractor = ExtractSentences()
    sent_text = np.array(extractor.split_into_sentences(article))

    # step 1: lowercase the text and the keyword
    text = article.lower()
    keyword = keyword.lower()

    # step 2: replace aliases with the keyword
    for a in aliases:
        text = text.replace(' ' + a.lower() + ' ', ' ' + keyword + ' ')

    # accept articles whose keyword frequency exceeds freq_threshold
    freq_threshold = 2
    key_freq = 0
    for word in nltk.word_tokenize(text):
        if word == keyword:
            key_freq = key_freq + 1
    if key_freq > freq_threshold:
        print('\tKeyword frequency ', key_freq)
        return True

    # accept articles where the keyword appears in the top occ_threshold fraction of sentences
    occ_threshold = 0.5
    sent_text = np.atleast_1d(sent_text)
    top_sent = sent_text[:int(math.ceil(occ_threshold * len(sent_text)))]
    for ts in top_sent:
        ts = ts.lower()
        ts = ts.replace('.', ' ').replace(',', ' ').replace('-', ' ')
        for a in aliases:
            ts = ts.replace(' ' + a.lower() + ' ', ' ' + keyword + ' ')
        if keyword in nltk.word_tokenize(ts):
            print('\t top 50% lines')
            return True

    # accept if the keyword takes part in any of the selected dependency relations
    sNLP = StanfordNLP()
    try:
        pos_text = sNLP.pos(text)
        parse_text = sNLP.dependency_parse(text)
        selected_relation = [
            'amod', 'nmod', 'dobj', 'iobj', 'nsubj', 'nsubjpass'
        ]
        for i in range(1, len(parse_text)):
            rel = parse_text[i][0]
            word1 = pos_text[parse_text[i][1] - 1][0]
            word2 = pos_text[parse_text[i][2] - 1][0]
            if (word1 == keyword or word2 == keyword) and (rel in selected_relation):
                print('\t passed NLP')
                return True
    except json.decoder.JSONDecodeError as e:
        print(e)
        print(text)

    # reject everything else
    return False
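# Hedged usage sketch (an addition, not part of the original script): shows how
# isArticleRelatedToTopic could be exercised on a made-up article. The article
# text, aliases and keyword are hypothetical; running it assumes nltk (with the
# punkt tokenizer data), numpy, math and ExtractSentences are available. With
# this particular input the keyword-frequency check already triggers, so the
# StanfordCoreNLP server is not contacted.
def _demo_isArticleRelatedToTopic():
    sample_article = (
        'Acme Corp announced record profits today. '
        'The company said growth was driven by overseas sales. '
        'Analysts expect Acme Corp to expand further next year.'
    )
    sample_aliases = ['Acme Corp', 'the company']   # hypothetical aliases
    sample_keyword = 'acme'                         # hypothetical keyword
    related = isArticleRelatedToTopic(sample_article, sample_aliases, sample_keyword)
    print('Related to topic?', related)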
#########################################

client = MongoClient(config.mongoConfigs['host'], config.mongoConfigs['port'])
db = client[config.mongoConfigs['db']]
collection = db[resolved_entity_table]  # collection having resolved entities
art_collection = db[article_table]  # collection having articles

entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list

fixed_keywords = [
    'says', 'said', 'asks', 'asked', 'told', 'announced', 'announce',
    'claimed', 'claim'
]

extractor = ExtractSentences()  # object for extracting sentences from text


class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)
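# Hedged sketch (an addition, not from the original source): the relevance filter
# above also calls sNLP.pos() and sNLP.dependency_parse(), which are not shown in
# this excerpt. Assuming the wrapped client is the `stanfordcorenlp` package, the
# missing methods would most likely delegate to its pos_tag() and
# dependency_parse() calls, roughly as in the standalone helpers below; the
# actual wrapper methods in the original class may differ.
def _pos(nlp, sentence):
    # pos_tag returns a list of (token, POS-tag) tuples
    return nlp.pos_tag(sentence)


def _dependency_parse(nlp, sentence):
    # dependency_parse returns (relation, head_index, dependent_index) tuples
    # with 1-based token indices, ROOT first
    return nlp.dependency_parse(sentence)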
# globals
entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list

# get the list of all entities
entities = get_all_entities(collection, entity_types, args.N)
print('All resolved entities crawled from the database')

# parse entities list to get names, aliases and articles
e_names, e_aliases, e_articleIds = get_names_aliases_articles(entities)
print('e_names, e_aliases & e_articleIds parsed from entities list')

# object for extracting sentences from text
extractor = ExtractSentences()


def statements_per_entity(entity_name, aliases=None):
    # output folder
    res_folder = './Outputs/' + folder + '/' + entity_name + '/'
    directory = os.path.dirname(res_folder)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # search for the entity name and get all occurrences
    entity_ind = findPowerEliteIndex(entity_name, e_names, e_aliases)
    print('Occurrences in entities set : {}'.format(entity_ind))

    # filter articles as per the article ids and source
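# Hedged sketch (an addition, not from the original source): findPowerEliteIndex
# is used above but not defined in this excerpt. Assuming e_names is a list of
# resolved entity names and e_aliases a parallel list of alias lists, a helper
# with the same intent - collecting the indices at which the entity matches a
# name or one of its aliases - could look like the hypothetical function below;
# the original implementation may differ.
def _find_entity_indices(entity_name, names, alias_lists):
    target = entity_name.lower()
    indices = []
    for idx, name in enumerate(names):
        aliases_lower = [a.lower() for a in (alias_lists[idx] or [])]
        if name.lower() == target or target in aliases_lower:
            indices.append(idx)
    return indices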
by_about_table]  # collection we will construct to hold By/About statements
print(
    'Connection established with the server. Make sure that your StanfordCoreNLP is also running.'
)

# globals
entity_types = config.entity_types
short_sources_list = config.short_sources_list
sources_list = config.sources_list

# get the list of all entities
# entities = get_all_entities(collection, entity_types, args.N)
print('All resolved entities crawled from the database')

# object for extracting sentences from text
extractor = ExtractSentences()

# prepare a dictionary that maps an entity to its type - person, company, etc.
pipeline = [{"$group": {"_id": "$stdName", "type": {"$first": '$type'}}}]
cursor = list(collection.aggregate(pipeline, allowDiskUse=True))
entity_type_dic = {}
for i in cursor:
    entity_type_dic[i['_id']] = i['type']

# get all entities from the entity table
e_names = []
e_aliases = []
e_articleIds = []
for type in entities.keys():