def ner_tagged_tokens(record):
    """Yield (key, text, mention) triples for every NER token in a record.

    Parameters
    ----------
    record : tuple
        A (id, payload) pair; only the payload (raw WARC/HTML content,
        presumably -- confirm against the caller) is used.

    Yields
    ------
    tuple
        (key, text, token[0]) where ``key`` identifies the record, ``text``
        is the extracted plain text, and ``token[0]`` is the mention string
        produced by ``nlp_preproc``.

    Notes
    -----
    Records without a key (``find_key`` returns ``""``) yield nothing.
    The original flat code initialized ``text``/``tagged_tokens`` to ``None``
    before the guard; iterating that ``None`` would raise ``TypeError`` for
    key-less records, so all work is kept strictly inside the guard here.
    """
    _, payload = record
    key = find_key(payload)
    if key != "":
        text = html2text(payload)
        # nlp_preproc tokens are indexable; token[0] is the mention surface
        # form (the remaining fields, e.g. the NER type, are not emitted).
        for token in nlp_preproc(text):
            yield key, text, token[0]
def Entities_Linking(record):
    """Link NER mentions in a record to their best candidate entities.

    For each unique token, every candidate entity from ``search_candidate``
    is scored by the TF-IDF cosine similarity between the document text and
    the candidate's abstract; the highest-scoring entity wins. One line per
    token occurrence is then yielded.

    Parameters
    ----------
    record : tuple
        A (id, payload) pair; only the payload is used.

    Yields
    ------
    str
        Tab-separated ``key \\t mention \\t entity`` lines, one per token
        occurrence whose mention was successfully linked.
    """
    _, payload = record
    key = find_key(payload)
    if key != "":
        text = html2text(payload)
        # Materialize: tagged_tokens is iterated twice below, and a one-shot
        # generator from nlp_preproc would be silently empty on the second
        # pass in the original code.
        tagged_tokens = list(nlp_preproc(text))

        # Deduplicate tokens while preserving first-seen order. A list is
        # used (not a set) because token hashability is not guaranteed here.
        mentions_types = []
        for token in tagged_tokens:
            if token not in mentions_types:
                mentions_types.append(token)

        # Best-scoring entity per mention string.
        entity_result_dict = {}
        for token in mentions_types:
            tfidf_score_max = 0
            entity_score_max = ""
            for entity, labels in search_candidate(token[0]):
                abstract = query_candidate_abstract(entity)
                if abstract is not None:
                    score = cosine_sim(text, abstract)
                    if score > tfidf_score_max:
                        tfidf_score_max = score
                        entity_score_max = entity
            # A zero max means no candidate had a usable abstract/score;
            # such mentions are left unlinked.
            if tfidf_score_max != 0:
                entity_result_dict[token[0]] = entity_score_max

        # Emit one line per original token occurrence (duplicates included),
        # skipping mentions that could not be linked.
        for token in tagged_tokens:
            if token[0] in entity_result_dict:
                yield key + '\t' + token[0] + '\t' + entity_result_dict[
                    token[0]]
if __name__ == '__main__': import sys try: _, INPUT, ELASTICSEARCH, SPARQL = sys.argv except Exception as e: print('Usage: python starter-code.py INPUT ELASTICSEARCH SPARQL') sys.exit(0) with open(INPUT, errors='ignore') as fo: for record in split_records(fo): key = find_key(record) if key != '': text = html2text(record) tagged_tokens = nlp_preproc(text) mentions_types = [] entity_result_dict = {} for token in tagged_tokens: if token not in mentions_types: mentions_types.append(token) for token in mentions_types: Tfidf_score_max = 0 entity_score_max = "" entities = search_candidate(token[0]) for entity, labels in entities: if labels['freebase_label'] == token[0]: score = math.inf Tfidf_score_max = score