import json
import re

import pandas as pd
import requests

# `nlp` and `util` are project-local helper modules; `entities`,
# `all_mentions`, `most_freq` and `disambiguate` are module-level globals
# defined elsewhere in the project.


def annotate(text, confidence, support):
    """Annotate `text` with DBpedia Spotlight entities."""
    annotations = pd.DataFrame(columns=['mention', 'entity', 'entity_id',
                                        'offset', 'sentence', 'the_sentence'],
                               dtype='unicode', index=None)

    # Map character offsets to sentence spans.
    sentences_spans = []
    tokenized_sents = nlp.get_sentences(text)
    for sentence in nlp.get_sentences_spans(text, tokenized_sents):
        sentences_spans.append(sentence)

    # Call the Spotlight REST endpoint directly instead of shelling out to
    # curl via os.popen, which broke on texts containing shell metacharacters.
    response = requests.post('http://model.dbpedia-spotlight.org/en/annotate',
                             data={'text': text,
                                   'confidence': confidence,
                                   'support': support},
                             headers={'Accept': 'application/json'})
    data = json.loads(response.text)
    if 'Resources' not in data:
        return annotations

    for annotation in data['Resources']:
        sentence_num, sentence_txt = nlp.get_sentence_number(
            sentences_spans, int(annotation['@offset']))
        # Strip the 'http://dbpedia.org/resource/' prefix (28 characters).
        entity = annotation['@URI'][28:].replace('_', ' ')
        annotations.loc[len(annotations.index)] = [
            annotation['@surfaceForm'], entity, nlp.get_entity_id(entity),
            annotation['@offset'], sentence_num, sentence_txt]
    return annotations
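
# A minimal usage sketch for the Spotlight annotator above. The sample text
# and the confidence/support values are illustrative only; it assumes the
# public model.dbpedia-spotlight.org endpoint is reachable.
def _demo_spotlight_annotate():
    demo = annotate('Barack Obama was born in Hawaii.',
                    confidence=0.5, support=20)
    print(demo[['mention', 'entity', 'offset']])
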
def search(article_name, text, article_entities):
    """Replace known mentions of `article_entities` in `text` with entity IDs."""
    article_body = text

    # Collect every known surface form of the article's entities;
    # `all_mentions` maps entity -> list of mentions.
    mentions = []
    for entity in article_entities:
        mentions.extend(all_mentions.get(entity) or [])

    # Substitute longest mentions first, so that e.g. 'New York City' is
    # matched before 'New York'.
    mentions = sorted(mentions, key=len, reverse=True)
    for mention in mentions:
        entity = disambiguate(None, mention)
        entity_id = nlp.get_entity_id(entity)
        article_body = re.sub(r'\b{}\b'.format(re.escape(mention)),
                              ' ' + entity_id + ' ', article_body)
    return article_body
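
# Why mentions are substituted longest-first: a self-contained toy sketch,
# with plain strings standing in for the project's `all_mentions` and
# `disambiguate` globals (the IDs are made up).
def _demo_longest_first():
    text = 'He moved to New York City from New York.'
    replacements = {'New York City': 'Q60', 'New York': 'Q1384'}
    for mention in sorted(replacements, key=len, reverse=True):
        text = re.sub(r'\b{}\b'.format(re.escape(mention)),
                      ' ' + replacements[mention] + ' ', text)
    # Shortest-first would have clobbered 'New York' inside 'New York City'.
    return text  # 'He moved to  Q60  from  Q1384 .'
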
def annotate(token, text, epsilon):
    """Annotate `text` with TagMe; `token` is the D4Science gcube API token."""
    annotations = pd.DataFrame(columns=['mention', 'entity', 'entity_id',
                                        'offset', 'sentence', 'the_sentence'],
                               dtype='unicode', index=None)

    # Map character offsets to sentence spans.
    sentences_spans = []
    tokenized_sents = nlp.get_sentences(text)
    for sentence in nlp.get_sentences_spans(text, tokenized_sents):
        sentences_spans.append(sentence)

    parameters = {'lang': 'en',
                  'gcube-token': token,
                  'text': text,
                  'epsilon': epsilon,  # precision/recall trade-off
                  'long_text': '0'}
    result = requests.post('https://tagme.d4science.org/tagme/tag',
                           data=parameters)
    data = json.loads(result.text)

    for annotation in data['annotations']:
        # Annotations without a 'title' could not be linked to a page.
        if 'title' not in annotation:
            continue
        sentence_num, sentence_txt = nlp.get_sentence_number(
            sentences_spans, int(annotation['start']))
        entity = annotation['title']
        annotations.loc[len(annotations.index)] = [
            annotation['spot'], entity, nlp.get_entity_id(entity),
            annotation['start'], sentence_num, sentence_txt]
    return annotations
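
# A hedged usage sketch for the TagMe annotator above. The token placeholder
# and the epsilon value are assumptions for illustration, not values from
# this project; a real gcube token comes from the D4Science portal.
def _demo_tagme_annotate():
    token = 'YOUR-GCUBE-TOKEN'  # placeholder, not a real credential
    demo = annotate(token, 'Diego Maradona won the World Cup.', epsilon=0.3)
    print(demo[['mention', 'entity', 'offset']])
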
def annotate(article):
    annotations = pd.DataFrame(columns=['article', 'level', 'mention',
                                        'used_entity', 'entity', 'entity_id',
                                        'offset', 'sentence', 'the_sentence'],
                               dtype='unicode', index=None)

    article_entities = entities.loc[entities.article == article.page_id]
    if article_entities.empty:
        annotations = annotations.drop(['level'], axis=1)
        return annotations, article.to_string()

    article_body = article.to_string()
    article_body = nlp.clean_article(article_body)
    article_body = nlp.get_paragraphs(article_body)

    # Remove the first paragraph.
    del article_body[0]

    # Trim long paragraphs.
    for i, p in enumerate(article_body):
        if len(p) > 5000:
            article_body[i] = article_body[i][:4999]
    article_body = '\n\n'.join([p.strip() for p in article_body])

    regex_input = article_body
    for index, entity_row in article_entities.iterrows():
        for pair in re.finditer(
                nlp.get_entity_pattern(entity_row['used_entity']),
                regex_input):
            # A link looks like '[mention](entity)': drop the leading '[' and
            # split on ']' to separate the two parts.
            mention, entity = pair.group()[1:].split(']')
            entity = entity[1:-1]
            if util.invalid_entity(entity):
                annotations.loc[len(annotations.index)] = [
                    article.page_name, util.Level(3).name, mention, entity,
                    entity, None, pair.start(), -1, None]
                # Mask the whole link with a placeholder run of the mention's
                # length so later offsets stay consistent.
                article_body = article_body.replace(pair.group(),
                                                    '☲' * len(mention))
            else:
                resolved = entity_row['entity']
                annotations.loc[len(annotations.index)] = [
                    article.page_name, util.Level(1).name, mention, entity,
                    resolved, nlp.get_entity_id(resolved), pair.start(), -1,
                    None]
                article_body = article_body.replace(pair.group(),
                                                    '☰' * len(mention))

    # Fix the other mentions' offsets: each replaced link shrinks the text by
    # len(used_entity) + 4 markup characters, so shift every annotation that
    # starts after it. Work on a copy of the annotation rows.
    rows = annotations[['used_entity', 'offset']].copy(deep=True)
    annotations['ori_offset'] = annotations['offset']
    for index, annotation in annotations.iterrows():
        for i, row in rows.iterrows():
            if row['offset'] < annotation['ori_offset']:
                annotations.loc[index, 'offset'] -= len(row['used_entity']) + 4

    # Reconstruct the article: put the mention text back over the placeholders.
    for row in annotations.itertuples():
        article_body = nlp.replace_part_of_text(article_body, row.mention,
                                                row.offset, len(row.mention))

    # Map offsets to sentences.
    sentences_spans = []
    tokenized_sents = nlp.get_sentences(article_body)
    for sentence in nlp.get_sentences_spans(article_body, tokenized_sents):
        sentences_spans.append(sentence)

    # Filter out invalid entities and drop the helper 'level' column.
    annotations = annotations.loc[annotations.level != util.Level(3).name]
    annotations = annotations.drop(['level'], axis=1)
    annotations = util.sorted_dataframe(annotations, annotations.offset, True)
    annotations[['sentence', 'the_sentence']] = pd.DataFrame(
        list(annotations['offset'].map(
            lambda x: nlp.get_sentence_number(sentences_spans, x))))
    return annotations, article_body
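
# A self-contained check of the offset correction above: replacing the markup
# '[mention](entity)' with a placeholder of the mention's length shrinks the
# text by len(entity) + 4 characters (the '[', ']', '(' and ')').
def _demo_offset_shift():
    link = '[Paris](Paris, France)'
    mention, entity = link[1:].split(']')
    entity = entity[1:-1]
    assert len(link) - len(mention) == len(entity) + 4
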
def get_annotations(gold_standard):
    entities = util.get_entities()
    all_mentions = util.get_mentions()
    global most_freq
    most_freq = util.get_most_freq_entities()

    annotations = pd.DataFrame(columns=['article', 'mention', 'entity',
                                        'entity_id', 'offset', 'sentence',
                                        'the_sentence'],
                               dtype='unicode', index=None)
    for article in gold_standard.articles:
        print(article.title)  # progress log
        anno = pd.DataFrame(columns=['article', 'mention', 'entity',
                                     'entity_id', 'offset', 'sentence',
                                     'the_sentence'],
                            dtype='unicode', index=None)

        # Search for mentions of the article's entities, longest first.
        article_entities = entities.loc[
            entities.article == article.title.replace(' ', '%20'), 'entity']
        mentions = []
        for entity in article_entities:
            mentions.extend(all_mentions.get(entity) or [])
        mentions = sorted(mentions, key=len, reverse=True)

        for mention in mentions:
            for match in re.finditer(r'\b{}\b'.format(re.escape(mention)),
                                     article.text):
                entity = disambiguate(None, match.group())
                anno.loc[len(anno.index)] = [
                    article.title, match.group(), entity,
                    nlp.get_entity_id(entity), match.start(), -1, None]

        # Map offsets to sentences.
        sentences_spans = []
        tokenized_sents = nlp.get_sentences(article.text)
        for sentence in nlp.get_sentences_spans(article.text, tokenized_sents):
            sentences_spans.append(sentence)
        anno = util.sorted_dataframe(anno, anno.offset, True)
        anno[['sentence', 'the_sentence']] = pd.DataFrame(
            list(anno['offset'].map(
                lambda x: nlp.get_sentence_number(sentences_spans, x))))
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        annotations = pd.concat([annotations, anno], ignore_index=True)

    return annotations
def annotate(article, and_search=True):
    annotations = pd.DataFrame(columns=['article', 'level', 'mention',
                                        'used_entity', 'entity', 'entity_id',
                                        'offset'],
                               dtype='unicode', index=None)

    # Get the entities linked within the article; bail out if we have no
    # entity table for it.
    try:
        article_entities = entities[article.page_id]
        article_body = article.to_string()
    except KeyError:
        return None, None

    # Invalid entities: unwrap the link and keep only the mention text.
    regex_input = article_body
    for entity in article_entities.loc[article_entities.valid == 'False',
                                       'used_entity']:
        for pair in re.finditer(nlp.get_entity_pattern(entity), regex_input):
            try:
                mention, target = pair.group()[1:].split(']')
                article_body = article_body.replace(pair.group(), mention)
            except ValueError:
                # Malformed link; leave it untouched.
                pass

    # Valid entities: replace the whole link with the resolved entity ID.
    regex_input = article_body
    for entity in article_entities.loc[article_entities.valid == 'True',
                                       'used_entity']:
        for pair in re.finditer(nlp.get_entity_pattern(entity), regex_input):
            try:
                values = pair.group()[1:].split(']')
                mention = values[0]
                entity = values[1][1:-1]
                # Resolve redirects to the canonical entity.
                resolved = article_entities.loc[
                    article_entities.used_entity == entity,
                    'entity'].values[0]
                entity_id = nlp.get_entity_id(resolved)
                annotations.loc[len(annotations.index)] = [
                    article.page_name, util.Level(1).name, mention, entity,
                    resolved, entity_id, pair.start()]
                article_body = article_body.replace(pair.group(), entity_id)
            except (ValueError, IndexError):
                pass

    if and_search:
        # Search the remaining text for unlinked mentions of the same entities.
        article_body = search(article.page_name, article_body,
                              annotations['entity'].drop_duplicates())

    article_body = nlp.clean_article(article_body)
    try:
        print(article.page_name)  # progress log
    except UnicodeEncodeError:
        pass

    # Return only the (entity, entity_id) pairs needed to build the IDs dict.
    return annotations[['entity', 'entity_id']], article_body
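
# A toy sketch of the two passes above, with literal '[mention](entity)'
# strings in place of nlp.get_entity_pattern matches and a made-up ID:
# invalid links are unwrapped to their mention, valid ones become entity IDs.
def _demo_link_passes():
    body = 'See [NYC](New York City) and [here](disambiguation).'
    body = body.replace('[here](disambiguation)', 'here')  # invalid pass
    body = body.replace('[NYC](New York City)', 'Q60')     # valid pass
    return body  # 'See Q60 and here.'
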
def advanced_search(article_name, text, article_entities):
    """Search `text` (expected to be already clean) for entity mentions."""
    annotations = pd.DataFrame(columns=['article', 'level', 'mention',
                                        'used_entity', 'entity', 'entity_id',
                                        'offset'],
                               dtype='unicode', index=None)
    article_body = text

    # Search for mentions of the article's entities: join against the global
    # mention table and sort longest-first.
    mentions = pd.merge(article_entities.to_frame(), all_mentions,
                        how='inner', on='entity')['mention']
    mentions = util.sorted_dataframe(mentions, mentions.str.len(), ASC=False)

    regex_input = article_body
    for mention in mentions:
        for match in re.finditer(re.escape(mention), regex_input):
            entity = disambiguate(None, match.group())
            entity_id = nlp.get_entity_id(entity)
            annotations.loc[len(annotations.index)] = [
                article_name, util.Level(2).name, match.group(), entity,
                entity, entity_id, match.start()]

    # Fix the other mentions' offsets: each mention will be replaced by a
    # fixed-width 32-character entity ID, so annotations that start after an
    # earlier match shift by the length difference. Work on a copy of the rows.
    rows = annotations[['mention', 'offset']].copy(deep=True)
    annotations['ori_offset'] = annotations['offset']
    for index, annotation in annotations.iterrows():
        for i, row in rows.iterrows():
            if row['offset'] < annotation['ori_offset']:
                annotations.loc[index, 'offset'] += 32 - len(row['mention'])

    # Sort by offset and reconstruct the article with entity IDs in place of
    # the mentions.
    annotations = util.sorted_dataframe(annotations, annotations['offset'],
                                        True)
    for row in annotations.itertuples():
        article_body = nlp.replace_part_of_text(article_body, row.entity_id,
                                                row.offset, len(row.mention))
    return annotations, article_body
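
# A self-contained check of the offset arithmetic above: when every mention is
# later replaced by a fixed-width 32-character entity ID, a match that starts
# after an earlier replacement shifts by 32 - len(earlier_mention).
def _demo_fixed_width_shift():
    text = 'Alan Turing met Alonzo Church.'
    original = text.index('Alonzo Church')
    rebuilt = text.replace('Alan Turing', 'a' * 32)  # 32-char stand-in ID
    assert rebuilt.index('Alonzo Church') == original + 32 - len('Alan Turing')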