def annotate(text, confidence, support):

    annotations = pd.DataFrame(columns=['mention', 'entity', 'entity_id',
                                        'offset', 'sentence', 'the_sentence'],
                               dtype='unicode', index=None)

    sentences_spans = []
    tokenized_sents = nlp.get_sentences(text)
    for sentence in nlp.get_sentences_spans(text, tokenized_sents):
        sentences_spans.append(sentence)

    # query the DBpedia Spotlight REST endpoint; posting the form data with
    # requests (used elsewhere in this module) avoids the shell-quoting issues
    # of interpolating arbitrary article text into a curl command
    response = requests.post(
        'http://model.dbpedia-spotlight.org/en/annotate',
        data={'text': text, 'confidence': confidence, 'support': support},
        headers={'Accept': 'application/json'})

    data = response.json()

    if 'Resources' not in data:
        return annotations

    for annotation in data['Resources']:
        sentence_num, sentence_txt = nlp.get_sentence_number(
                sentences_spans, int(annotation['@offset']))

        # strip the 'http://dbpedia.org/resource/' prefix (28 characters)
        entity = annotation['@URI'][28:].replace('_', ' ')

        annotations.loc[len(annotations.index)] = [annotation['@surfaceForm'],
                                                   entity,
                                                   nlp.get_entity_id(entity),
                                                   annotation['@offset'],
                                                   sentence_num,
                                                   sentence_txt]
    return annotations
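
# A minimal usage sketch for annotate() above (the sample text, confidence and
# support values are illustrative only; nlp.get_sentences, nlp.get_sentences_spans,
# nlp.get_sentence_number and nlp.get_entity_id are assumed to be this
# project's own helpers).
if __name__ == '__main__':
    demo = annotate('Barack Obama was born in Hawaii.', confidence=0.5, support=20)
    # one row per linked mention, with the sentence it was found in
    print(demo[['mention', 'entity', 'entity_id', 'offset', 'sentence']])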
Example 2
def search(article_name, text, article_entities):

    article_body = text

    #mentions = all_mentions.loc[all_mentions.entity.isin(article_entities), 'mention']
    #mentions = all_mentions.query('entity in @article_entities')['mention']

    #mentions = pd.merge(article_entities.to_frame(), all_mentions, how='inner', on=['entity'])['mention']
    #mentions = util.sorted_dataframe(mentions, mentions.str.len(), ASC=False)

    mentions = []

    try:
        # collect the mention lists of every article entity and flatten them
        mentions.extend(map(all_mentions.get, article_entities))
        mentions = filter(None, mentions)
        mentions = reduce(lambda x, y: x + y, mentions)  # functools.reduce in Python 3
    except TypeError:
        # none of the article entities has a known mention
        mentions = []

    #mentions = list(set(mentions))
    # longest mentions first, so longer surface forms are linked before their substrings
    mentions = sorted(mentions, key=len, reverse=True)

    for mention in mentions:
        entity = disambiguate(None, mention)
        entity_id = nlp.get_entity_id(entity)
        article_body = re.sub(r'\b{}\b'.format(re.escape(mention)),
                              ' ' + entity_id + ' ', article_body)

    return article_body
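
# Why the mentions are replaced longest-first: a shorter mention that is a
# substring of a longer one would otherwise clobber it. A self-contained toy
# illustration (the mention-to-id mapping below is made up, not the project's
# real entity IDs):
import re

def link_mentions(text, mention_to_id):
    # replace longer surface forms before their substrings, as search() does
    for mention in sorted(mention_to_id, key=len, reverse=True):
        text = re.sub(r'\b{}\b'.format(re.escape(mention)),
                      ' ' + mention_to_id[mention] + ' ', text)
    return text

# link_mentions('She moved to New York City in 2010.',
#               {'New York': 'ID_NY', 'New York City': 'ID_NYC'})
# -> 'She moved to  ID_NYC  in 2010.'  (reversing the order would leave ' ID_NY  City')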
def annotate(token, text, epsilon):

    annotations = pd.DataFrame(columns=[
        'mention', 'entity', 'entity_id', 'offset', 'sentence', 'the_sentence'
    ],
                               dtype='unicode',
                               index=None)

    sentences_spans = []
    tokenized_sents = nlp.get_sentences(text)
    for sentence in nlp.get_sentences_spans(text, tokenized_sents):
        sentences_spans.append(sentence)

    parameters = {
        'lang': 'en',
        'gcube-token': token,
        'text': text,
        'epsilon': epsilon,
        'long_text': '0'
    }
    query = r'https://tagme.d4science.org/tagme/tag'

    result = requests.post(query, data=parameters)

    data = json.loads(result.text)

    # guard against responses without an 'annotations' field (e.g. on errors)
    if 'annotations' not in data:
        return annotations

    for annotation in data['annotations']:

        if 'title' not in annotation:
            continue

        sentence_num, sentence_txt = nlp.get_sentence_number(
            sentences_spans, int(annotation['start']))

        entity = annotation['title']

        annotations.loc[len(annotations.index)] = [
            annotation['spot'], entity,
            nlp.get_entity_id(entity), annotation['start'], sentence_num,
            sentence_txt
        ]
    return annotations
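
# A minimal usage sketch for the TagMe-based annotate() above. The gcube token
# is a placeholder (a real D4Science token is required) and epsilon=0.3 is an
# illustrative value only.
if __name__ == '__main__':
    demo = annotate('YOUR_GCUBE_TOKEN',
                    'Diego Maradona won the World Cup with Argentina.',
                    epsilon=0.3)
    print(demo[['mention', 'entity', 'offset', 'sentence']])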
def annotate(article):

    annotations = pd.DataFrame(columns=[
        'article', 'level', 'mention', 'used_entity', 'entity', 'entity_id',
        'offset', 'sentence', 'the_sentence'
    ],
                               dtype='unicode',
                               index=None)

    article_entities = entities.loc[entities.article == article.page_id]

    if article_entities.empty:
        annotations = annotations.drop(['level'], axis=1)
        return annotations, article.to_string()

    article_body = article.to_string()

    article_body = nlp.clean_article(article_body)
    article_body = nlp.get_paragraphs(article_body)

    # remove first paragraph
    del article_body[0]

    # trim long paragraphs
    for i, p in enumerate(article_body):
        if len(p) > 5000:
            article_body[i] = article_body[i][:4999]

    article_body = '\n\n'.join([p.strip() for p in article_body])

    regex_input = article_body
    for index, entity_row in article_entities.iterrows():
        for pair in re.finditer(
                nlp.get_entity_pattern(entity_row['used_entity']),
                regex_input):
            mention, entity = pair.group()[1:].split(']')
            entity = entity[1:-1]
            if util.invalid_entity(entity):
                annotations.loc[len(annotations.index)] = [
                    article.page_name,
                    util.Level(3).name, mention, entity, entity, None,
                    pair.start(), -1, None
                ]
                article_body = article_body.replace(pair.group(),
                                                    '☲' * len(mention))
            else:
                resolved = entity_row['entity']
                annotations.loc[len(annotations.index)] = [
                    article.page_name,
                    util.Level(1).name, mention, entity, resolved,
                    nlp.get_entity_id(resolved),
                    pair.start(), -1, None
                ]
                article_body = article_body.replace(pair.group(),
                                                    '☰' * len(mention))

    # fix the other mentions' offsets: they were measured against text that
    # still contained the [mention](entity) markup, so subtract the width each
    # earlier link lost when it was collapsed (its entity plus 4 bracket chars)
    # work on a copy of annotations
    rows = annotations[['used_entity', 'offset']].copy(deep=True)
    annotations['ori_offset'] = annotations['offset']

    for index, annotation in annotations.iterrows():
        for i, row in rows.iterrows():
            if row['offset'] < annotation['ori_offset']:
                annotations.loc[index, 'offset'] -= len(row['used_entity']) + 4

    # reconstruct the article
    for row in annotations.itertuples():
        article_body = nlp.replace_part_of_text(article_body, row.mention,
                                                row.offset, len(row.mention))

    # map offsets to sentences
    sentences_spans = []
    tokenized_sents = nlp.get_sentences(article_body)
    for sentence in nlp.get_sentences_spans(article_body, tokenized_sents):
        sentences_spans.append(sentence)

    # filter out invalid entities
    # drop the level column
    annotations = annotations.loc[annotations.level != util.Level(3).name]
    annotations = annotations.drop(['level'], axis=1)

    annotations = util.sorted_dataframe(annotations, annotations.offset, True)
    annotations[['sentence', 'the_sentence']] = pd.DataFrame(
        list(annotations['offset'].map(
            lambda x: nlp.get_sentence_number(sentences_spans, x))))

    return annotations, article_body
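
# The offset fix above, in isolation: offsets are recorded against text that
# still contains [mention](entity) links, so once every link is collapsed to
# its bare mention each later offset moves left by len(entity) + 4 (the two
# brackets and two parentheses). A self-contained toy example using the same
# link syntax the loop above parses:
import re

if __name__ == '__main__':
    text = 'See [Rome](Rome,_Italy) and [Paris](Paris,_France) today.'
    link_re = r'\[([^\]]+)\]\(([^)]+)\)'
    links = [(m.start(), m.group(1), m.group(2))
             for m in re.finditer(link_re, text)]
    plain = re.sub(link_re, r'\1', text)

    for offset, mention, entity in links:
        shift = sum(len(e) + 4 for o, _, e in links if o < offset)
        # the corrected offset points at the mention in the collapsed text
        assert plain[offset - shift:offset - shift + len(mention)] == mention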
Example 5
def get_annotations(gold_standard):

    entities = util.get_entities()
    all_mentions = util.get_mentions()
    global most_freq
    most_freq = util.get_most_freq_entities()

    annotations = pd.DataFrame(columns=[
        'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
        'the_sentence'
    ],
                               dtype='unicode',
                               index=None)

    for article in gold_standard.articles:

        print(article.title)

        anno = pd.DataFrame(columns=[
            'article', 'mention', 'entity', 'entity_id', 'offset', 'sentence',
            'the_sentence'
        ],
                            dtype='unicode',
                            index=None)

        # search for mentions of the article entities

        article_entities = entities.loc[
            entities.article == article.title.replace(' ', '%20'), 'entity']

        mentions = []

        try:
            # collect the mention lists of every article entity and flatten them
            mentions.extend(map(all_mentions.get, article_entities))
            mentions = filter(None, mentions)
            mentions = reduce(lambda x, y: x + y, mentions)  # functools.reduce in Python 3
        except TypeError:
            # none of the article entities has a known mention
            mentions = []

        # longest mentions first, so longer surface forms are matched before their substrings
        mentions = sorted(mentions, key=len, reverse=True)

        for mention in mentions:
            for match in re.finditer(r'\b{}\b'.format(re.escape(mention)),
                                     article.text):
                entity = disambiguate(None, match.group())
                anno.loc[len(anno.index)] = [
                    article.title,
                    match.group(), entity,
                    nlp.get_entity_id(entity),
                    match.start(), -1, None
                ]

        # map offsets to sentences
        sentences_spans = []
        tokenized_sents = nlp.get_sentences(article.text)
        for sentence in nlp.get_sentences_spans(article.text, tokenized_sents):
            sentences_spans.append(sentence)

        anno = util.sorted_dataframe(anno, anno.offset, True)
        anno[['sentence', 'the_sentence']] = pd.DataFrame(
            list(anno['offset'].map(
                lambda x: nlp.get_sentence_number(sentences_spans, x))))

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        annotations = pd.concat([annotations, anno])

    return annotations
Example 6
def annotate(article, and_search=True):

    annotations = pd.DataFrame(columns=[
        'article', 'level', 'mention', 'used_entity', 'entity', 'entity_id',
        'offset'
    ],
                               dtype='unicode',
                               index=None)

    #global Original
    #Original = article.to_string()

    # find linked entities
    # get linked entities within the article
    try:

        article_entities = entities[article.page_id]

        article_body = article.to_string()

    except Exception:

        # the article has no recorded entities (or its body could not be read)
        #error_articles.append(article)
        return None, None

    # invalid entities
    regex_input = article_body
    for entity in article_entities.loc[article_entities.valid == 'False',
                                       'used_entity']:
        for pair in re.finditer(nlp.get_entity_pattern(entity), regex_input):

            try:
                mention, target = pair.group()[1:].split(']')
                article_body = article_body.replace(pair.group(), mention)

            except Exception:
                # skip link markup that does not split into mention and target
                pass

    # valid entities
    regex_input = article_body
    for entity in article_entities.loc[article_entities.valid == 'True',
                                       'used_entity']:
        for pair in re.finditer(nlp.get_entity_pattern(entity), regex_input):

            try:
                values = pair.group()[1:].split(']')
                mention = values[0]
                entity = values[1][1:-1]
                # resolve redirect
                resolved = article_entities.loc[article_entities.used_entity ==
                                                entity, 'entity'].values[0]
                entity_id = nlp.get_entity_id(resolved)
                annotations.loc[len(annotations.index)] = [
                    article.page_name,
                    util.Level(1).name, mention, entity, resolved, entity_id,
                    pair.start()
                ]
                article_body = article_body.replace(pair.group(), entity_id)

            except Exception:
                # skip link markup that does not match the expected [mention](entity) form
                pass

    if and_search:
        # search for more entities
        article_body = search(article.page_name, article_body,
                              annotations['entity'].drop_duplicates())
        #annotations = annotations.append(search_annotations)

    #global Final
    #Final = article_body

    article_body = nlp.clean_article(article_body)

    try:
        print(article.page_name)
    except Exception:
        # printing is only progress logging; ignore failures
        pass

    # to build the IDs dict
    return annotations[['entity', 'entity_id']], article_body
Example 7
def advanced_search(article_name, text, article_entities):  # expect clean text

    annotations = pd.DataFrame(columns=[
        'article', 'level', 'mention', 'used_entity', 'entity', 'entity_id',
        'offset'
    ],
                               dtype='unicode',
                               index=None)

    # clean article
    article_body = text

    #global Middle
    #Middle = article_body

    # search for mentions of the article entities

    mentions = pd.merge(article_entities.to_frame(),
                        all_mentions,
                        how='inner',
                        on='entity')['mention']
    mentions = util.sorted_dataframe(mentions, mentions.str.len(), ASC=False)
    ''' old approach
    mentions = []
    for entity in article_entities:
        try:
            mentions.extend(all_mentions[entity])
        except:
            continue

    #mentions.extend(annotations['mention'].values)

    #mentions = list(set(mentions))
    mentions = sorted(mentions, key=len)[::-1]
    '''

    regex_input = article_body
    for mention in mentions:
        for match in re.finditer(re.escape(mention), regex_input):
            entity = disambiguate(None, match.group())
            entity_id = nlp.get_entity_id(entity)
            annotations.loc[len(annotations.index)] = [
                article_name,
                util.Level(2).name,
                match.group(), entity, entity, entity_id,
                match.start()
            ]

    # fix the other mentions' offsets: every earlier replacement swaps a mention
    # for a fixed-width 32-character entity_id, shifting later text by
    # 32 - len(mention)
    # work on a copy of annotations
    rows = annotations[['mention', 'offset']].copy(deep=True)
    annotations['ori_offset'] = annotations['offset']

    for index, annotation in annotations.iterrows():
        for i, row in rows.iterrows():
            if row['offset'] < annotation['ori_offset']:
                annotations.loc[index, 'offset'] += 32 - len(row['mention'])

    # sort by offset
    annotations = util.sorted_dataframe(annotations, annotations['offset'],
                                        True)

    # reconstruct the article
    for row in annotations.itertuples():
        article_body = nlp.replace_part_of_text(article_body, row.entity_id,
                                                row.offset, len(row.mention))

    return annotations, article_body
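
# The offset adjustment above, in isolation: each replacement swaps a mention
# for a fixed-width 32-character entity_id, so every later offset moves by
# 32 - len(mention) per preceding replacement. Toy mentions and a fake ID
# generator stand in for the real nlp.get_entity_id:
if __name__ == '__main__':
    body = 'See Rome and Paris today.'
    found = [(4, 'Rome'), (13, 'Paris')]   # (offset, mention) pairs in body

    def fake_id(mention):
        # stand-in for nlp.get_entity_id(): a fixed-width 32-character token
        return mention.ljust(32, '_')

    for offset, mention in found:
        shift = sum(32 - len(m) for o, m in found if o < offset)
        start = offset + shift
        body = body[:start] + fake_id(mention) + body[start + len(mention):]

    # both IDs now sit at the corrected positions in the rebuilt text
    print(body)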