Example #1
def add_hyperlinks(naf, annotations, prefix, verbose=0):
    """
    :param lxml.etree._Element naf: the root element of the XML file    :param wiki_page:
    :param list annotations: list of annotations, e.g.,
    {"surface_form": "buco nero binario", "uri": "Buco_nero_binario", "offset": 20288}
    :param str prefix: the wikipedia prefix of the language, e.g.,
    https://nl.wikipedia.org/wiki/
    :param verbose:
    :return:
    """
    from_start2tid, from_end2tid = xml_utils.load_start_and_end_offset_to_tid(
        naf)
    start_end2info = load_annotations(annotations, prefix=prefix)

    start_offset2token = {
        int(w_el.get('offset')): w_el.text
        for w_el in naf.xpath('text/wf')
    }

    next_id = 1
    entities_layer = etree.SubElement(naf, "entities")
    for (start, end), (sf, uri) in start_end2info.items():

        if start not in from_start2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT {start} not mapped to tid')
            continue
        if end not in from_end2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT {end} not mapped to tid')
            continue

        # sanity check: the surface form starts with the token at this offset
        assert sf.startswith(start_offset2token[start])

        start_tid = from_start2tid[start]
        end_tid = from_end2tid[end]
        t_ids = xml_utils.get_range_of_tids(start_tid, end_tid)

        entity_data = spacy_to_naf.EntityElement(eid='e%d' % next_id,
                                                 entity_type='UNK',
                                                 text=sf,
                                                 targets=t_ids,
                                                 ext_refs=[{
                                                     'reference': uri
                                                 }])
        next_id += 1

        spacy_to_naf.add_entity_element(entities_layer,
                                        entity_data,
                                        add_comments=True)
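
A minimal call sketch for add_hyperlinks as defined above (the file path, language prefix, and annotation values are illustrative assumptions, not from the source):

from lxml import etree

naf = etree.parse('doc.naf').getroot()  # placeholder path
annotations = [{"surface_form": "buco nero binario",
                "uri": "Buco_nero_binario",
                "offset": 20288}]
add_hyperlinks(naf,
               annotations,
               prefix='https://it.wikipedia.org/wiki/',  # assumed language
               verbose=3)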
Example #2
import spacy
from datetime import datetime
from lxml import etree
# assuming these helpers come from spacy_to_naf, as used in Examples #1 and #4
from spacy_to_naf import (text_to_NAF, EntityElement,
                          add_entity_element, NAF_to_string)

nlp = spacy.load('en_core_web_sm')

tree = text_to_NAF('Tom Cruise is an actor.\n\n\nHe likes to act.',
                   nlp,
                   dct=datetime.now(),
                   layers={'raw', 'text', 'terms'},
                   replace_hidden_characters=True,
                   map_udpos2naf_pos=True)  # map UD pos to NAF pos

root = tree.getroot()

entities_layer = root.find('entities')
if entities_layer is None:
    entities_layer = etree.SubElement(root, "entities")

entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{
                                'reference':
                                'https://en.wikipedia.org/wiki/Tom_Cruise',
                                'resource': 'Wikipedia'
                            }])

add_entity_element(entities_layer, entity_data)

print(NAF_to_string(root))
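
The create-if-missing lookup for the entities layer recurs in Example #3 below; it could be factored into a small helper (a sketch, assuming lxml; get_or_create_layer is not part of spacy_to_naf):

def get_or_create_layer(root, name):
    """Return the child element <name>, creating it if absent."""
    layer = root.find(name)
    if layer is None:
        layer = etree.SubElement(root, name)
    return layer

entities_layer = get_or_create_layer(root, 'entities')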
Example #3
from datetime import datetime
# time_in_correct_format and add_linguisticProcessors_el are assumed to be
# imported from spacy_to_naf (cf. Example #4)

now = datetime.now()  # any datetime works; it is only formatted below
time_as_string = time_in_correct_format(now)

modelname = 'Wikipedia hyperlinks'
add_linguisticProcessors_el(naf_header,
                            layer='entities',
                            start_time=time_as_string,
                            end_time=time_as_string,
                            modelname=modelname)


entities_layer = root.find('entities')
if entities_layer is None:
    entities_layer = etree.SubElement(root, "entities")

entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{'reference': 'https://en.wikipedia.org/wiki/Tom_Cruise',
                                       'resource': 'https://www.wikipedia.org/',
                                       'source': 'Wikipedia hyperlinks',
                                       'timestamp': time_as_string}])

add_entity_element(entities_layer,
                   naf_version,  # e.g. 'v3.1', as passed in Example #4
                   entity_data)

print(NAF_to_string(root))
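
Example #3 assumes root and naf_header are already in scope; a sketch of obtaining them from an existing NAF file (the path is a placeholder):

from lxml import etree

tree = etree.parse('doc.naf')        # placeholder path
root = tree.getroot()
naf_header = root.find('nafHeader')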

Example #4
def add_hyperlinks(naf, annotations, prefix, language, dct, wiki_langlinks={}, verbose=0):
    """
    :param lxml.etree._Element naf: the root element of the XML file    :param wiki_page:
    :param list annotations: list of annotations, e.g.,
    {"surface_form": "buco nero binario", "uri": "Buco_nero_binario", "offset": 20288}
    :param str prefix: the wikipedia prefix of the language, e.g.,
    https://nl.wikipedia.org/wiki/
    :param verbose:
    :return:
    """
    from_start2tid, from_end2tid = xml_utils.load_start_and_end_offset_to_tid(naf)
    start_end2info = load_annotations(annotations,
                                      prefix=prefix)

    start_offset2token = {int(w_el.get('offset')): w_el.text
                          for w_el in naf.xpath('text/wf')}

    next_id = 1
    naf = naf.getroot()  # work with the root element from here on
    entities_layer = etree.SubElement(naf, "entities")

    # record provenance for the new layer in the NAF header
    naf_header = naf.find('nafHeader')
    ling_proc = etree.SubElement(naf_header, "linguisticProcessors")
    ling_proc.set("layer", 'entities')
    lp = etree.SubElement(ling_proc, "lp")
    the_time = spacy_to_naf.time_in_correct_format(dct)
    lp.set("beginTimestamp", the_time)
    lp.set('endTimestamp', the_time)
    lp.set('name', 'Wikipedia hyperlinks')
    lp.set('version', 'Wikipedia dump from 2019-07-20')  # TODO: change this if we move to other version of Wikipedia

    date = datetime(2019, 7, 20)
    date_as_string = time_in_correct_format(date)

    for (start, end), (sf, uri) in start_end2info.items():

        if start not in from_start2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT {start} not mapped to tid')
            continue
        if end not in from_end2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT {end} not mapped to tid')
            continue

        assert sf.startswith(start_offset2token[start])

        start_tid = from_start2tid[start]
        end_tid = from_end2tid[end]
        t_ids = xml_utils.get_range_of_tids(start_tid,
                                            end_tid)

        ext_refs = [{'resource': 'Wikipedia hyperlinks',
                     'reference': uri,
                     'source': 'https://www.wikipedia.org/',
                     'timestamp': date_as_string}]
        if wiki_langlinks:
            # avoid shadowing 'uri', which is still needed as the dict key
            for lang, lang_uri in wiki_langlinks[language][uri].items():
                ext_refs.append({'resource': 'Wikipedia hyperlinks',
                                 'reference': lang_uri,
                                 'source': 'https://www.wikipedia.org/',
                                 'timestamp': date_as_string})

        entity_data = spacy_to_naf.EntityElement(
            eid='e%d' % next_id,
            entity_type='UNK',
            text=sf,
            targets=t_ids,
            ext_refs=ext_refs)
        next_id += 1

        spacy_to_naf.add_entity_element(entities_layer,
                                        'v3.1',
                                        entity_data,
                                        add_comments=True)
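
wiki_langlinks is indexed as wiki_langlinks[language][uri], i.e. a nested mapping from source language to page title to per-language equivalents. A minimal sketch of that shape (titles are illustrative assumptions):

wiki_langlinks = {
    'it': {                             # source language
        'Buco_nero_binario': {          # page title in the source language
            'en': 'Binary_black_hole',  # illustrative equivalents
            'nl': 'Dubbel_zwart_gat',
        },
    },
}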
Example #5
                    ret_tokens, min_token_id = find_next_occurrence(
                        sfs, min_token_id, t_layer, doc)

                    if ret_tokens:
                        # map word ids to term ids for the matched span
                        t_ids = [wid2tid[wid] for wid in ret_tokens]
                        entity_data = EntityElement(eid='e%d' % next_id,
                                                    entity_type='UNK',
                                                    text=text,
                                                    targets=t_ids,
                                                    ext_refs=[{
                                                        'reference':
                                                        target
                                                    }])
                        spacy_to_naf.add_entity_element(entities_layer,
                                                        entity_data,
                                                        add_comments=True)
                        count_entities += 1
                        next_id += 1

                count_per_doc.append(next_id - 1)
                if naf_output_path is not None:
                    with open(naf_output_path, 'w') as outfile:
                        outfile.write(spacy_to_naf.NAF_to_string(NAF=root))
                        count_outfiles += 1

    print('Input NAFs', count_infiles)
    print('Output NAFs', count_outfiles)
    print('Count entities', count_entities)
    print('Count initial seed', initial_links)
    dist_links = Counter(count_per_doc)
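
Counter here is collections.Counter; the last line turns the per-document entity counts into a frequency distribution, e.g.:

from collections import Counter

count_per_doc = [0, 2, 2, 5]          # illustrative counts
dist_links = Counter(count_per_doc)   # Counter({2: 2, 0: 1, 5: 1})
print(dist_links.most_common())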