from lxml import etree

import spacy_to_naf
import xml_utils


def add_hyperlinks(naf, annotations, prefix, verbose=0):
    """
    Add an "entities" layer to a NAF document based on Wikipedia hyperlink
    annotations.

    :param lxml.etree._Element naf: the root element of the NAF XML file
    :param list annotations: list of annotations, e.g.,
        {"surface_form": "buco nero binario", "uri": "Buco_nero_binario", "offset": 20288}
    :param str prefix: the Wikipedia prefix of the language, e.g.,
        https://nl.wikipedia.org/wiki/
    :param int verbose: verbosity level; offset misalignments are reported at
        level 3 and up
    :return: None; the NAF tree is modified in place
    """
    from_start2tid, from_end2tid = xml_utils.load_start_and_end_offset_to_tid(naf)
    # load_annotations is a helper defined elsewhere in this module
    start_end2info = load_annotations(annotations, prefix=prefix)

    # map each token's start offset to its text, to sanity-check surface forms
    start_offset2token = {int(w_el.get('offset')): w_el.text
                          for w_el in naf.xpath('text/wf')}

    next_id = 1
    entities_layer = etree.SubElement(naf, "entities")

    for (start, end), (sf, uri) in start_end2info.items():
        # skip annotations whose character offsets do not align with a token
        if start not in from_start2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT: {start} not mapped to a tid')
            continue
        if end not in from_end2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT: {end} not mapped to a tid')
            continue

        assert sf.startswith(start_offset2token[start])

        start_tid = from_start2tid[start]
        end_tid = from_end2tid[end]
        t_ids = xml_utils.get_range_of_tids(start_tid, end_tid)

        entity_data = spacy_to_naf.EntityElement(eid='e%d' % next_id,
                                                 entity_type='UNK',
                                                 text=sf,
                                                 targets=t_ids,
                                                 ext_refs=[{'reference': uri}])
        next_id += 1

        spacy_to_naf.add_entity_element(entities_layer,
                                        entity_data,
                                        add_comments=True)
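# A minimal usage sketch for add_hyperlinks, assuming a NAF file on disk and a
# hand-written annotation list; 'article.naf' and the Italian prefix are
# illustrative assumptions, not part of the function.
from lxml import etree

naf = etree.parse('article.naf').getroot()
annotations = [{"surface_form": "buco nero binario",
                "uri": "Buco_nero_binario",
                "offset": 20288}]
add_hyperlinks(naf,
               annotations,
               prefix='https://it.wikipedia.org/wiki/',
               verbose=3)  # verbosity >= 3 prints offset misalignments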
from datetime import datetime

import spacy
from lxml import etree
from spacy_to_naf import (EntityElement, NAF_to_string, add_entity_element,
                          text_to_NAF)

nlp = spacy.load('en_core_web_sm')

tree = text_to_NAF('Tom Cruise is an actor.\n\n\nHe likes to act.',
                   nlp,
                   dct=datetime.now(),
                   layers={'raw', 'text', 'terms'},
                   replace_hidden_characters=True,
                   map_udpos2naf_pos=True)  # map UD POS tags to NAF POS tags
root = tree.getroot()

# create the entities layer if it does not exist yet
entities_layer = root.find('entities')
if entities_layer is None:
    entities_layer = etree.SubElement(root, "entities")

entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{'reference': 'https://en.wikipedia.org/wiki/Tom_Cruise',
                                       'resource': 'Wikipedia'}])
add_entity_element(entities_layer, entity_data)

print(NAF_to_string(root))
# Continuing from the example above: register the entities layer in the
# nafHeader and attach full provenance to the external reference.
now = datetime.now()
time_as_string = time_in_correct_format(now)
modelname = 'Wikipedia hyperlinks'

naf_header = root.find('nafHeader')
add_linguisticProcessors_el(naf_header,
                            layer='entities',
                            start_time=time_as_string,
                            end_time=time_as_string,
                            modelname=modelname)

entities_layer = root.find('entities')
if entities_layer is None:
    entities_layer = etree.SubElement(root, "entities")

naf_version = 'v3.1'  # NAF version expected by add_entity_element
entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{'reference': 'https://en.wikipedia.org/wiki/Tom_Cruise',
                                       'resource': 'https://www.wikipedia.org/',
                                       'source': 'Wikipedia hyperlinks',
                                       'timestamp': time_as_string}])
add_entity_element(entities_layer, naf_version, entity_data)

print(NAF_to_string(root))
from datetime import datetime

from lxml import etree

import spacy_to_naf
import xml_utils


def add_hyperlinks(naf, annotations, prefix, language, dct, wiki_langlinks={}, verbose=0):
    """
    Add an "entities" layer to a NAF tree based on Wikipedia hyperlink
    annotations, register the layer in the nafHeader, and optionally add
    cross-lingual external references.

    :param lxml.etree._ElementTree naf: the parsed NAF XML tree
        (getroot() is called on it below)
    :param list annotations: list of annotations, e.g.,
        {"surface_form": "buco nero binario", "uri": "Buco_nero_binario", "offset": 20288}
    :param str prefix: the Wikipedia prefix of the language, e.g.,
        https://nl.wikipedia.org/wiki/
    :param str language: the language of the Wikipedia dump, used as the key
        into wiki_langlinks
    :param datetime.datetime dct: document creation time, used for the
        linguisticProcessors timestamps
    :param dict wiki_langlinks: mapping of language -> uri -> {other language: uri};
        if non-empty, one external reference is added per linked language
    :param int verbose: verbosity level; offset misalignments are reported at
        level 3 and up
    :return: None; the NAF tree is modified in place
    """
    from_start2tid, from_end2tid = xml_utils.load_start_and_end_offset_to_tid(naf)
    start_end2info = load_annotations(annotations, prefix=prefix)
    start_offset2token = {int(w_el.get('offset')): w_el.text
                          for w_el in naf.xpath('text/wf')}

    next_id = 1
    naf = naf.getroot()
    entities_layer = etree.SubElement(naf, "entities")

    # register the new layer in the nafHeader
    naf_header = naf.find('nafHeader')
    ling_proc = etree.SubElement(naf_header, "linguisticProcessors")
    ling_proc.set("layer", 'entities')
    lp = etree.SubElement(ling_proc, "lp")
    the_time = spacy_to_naf.time_in_correct_format(dct)
    lp.set("beginTimestamp", the_time)
    lp.set("endTimestamp", the_time)
    lp.set("name", 'Wikipedia hyperlinks')
    lp.set("version", 'Wikipedia dump from 2019-07-20')

    # TODO: change this if we move to another version of Wikipedia
    date = datetime(2019, 7, 20)
    date_as_string = spacy_to_naf.time_in_correct_format(date)

    for (start, end), (sf, uri) in start_end2info.items():
        if start not in from_start2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT: {start} not mapped to a tid')
            continue
        if end not in from_end2tid:
            if verbose >= 3:
                print(f'MISALIGNMENT: {end} not mapped to a tid')
            continue

        assert sf.startswith(start_offset2token[start])

        start_tid = from_start2tid[start]
        end_tid = from_end2tid[end]
        t_ids = xml_utils.get_range_of_tids(start_tid, end_tid)

        ext_refs = [{'resource': 'Wikipedia hyperlinks',
                     'reference': uri,
                     'source': 'https://www.wikipedia.org/',
                     'timestamp': date_as_string}]
        if wiki_langlinks:
            # one external reference per cross-lingual link
            # (lang_uri avoids shadowing the original uri)
            for lang, lang_uri in wiki_langlinks[language][uri].items():
                ext_refs.append({'resource': 'Wikipedia hyperlinks',
                                 'reference': lang_uri,
                                 'source': 'https://www.wikipedia.org/',
                                 'timestamp': date_as_string})

        entity_data = spacy_to_naf.EntityElement(eid='e%d' % next_id,
                                                 entity_type='UNK',
                                                 text=sf,
                                                 targets=t_ids,
                                                 ext_refs=ext_refs)
        next_id += 1

        spacy_to_naf.add_entity_element(entities_layer,
                                        'v3.1',
                                        entity_data,
                                        add_comments=True)
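# A minimal usage sketch for the langlinks-aware variant, under the assumption
# that wiki_langlinks maps language -> uri -> {other language: uri}, mirroring
# the wiki_langlinks[language][uri] lookup in the function body. The file path
# and the langlinks entry are illustrative assumptions.
from datetime import datetime

from lxml import etree

tree = etree.parse('article.naf')  # the function calls getroot() itself
annotations = [{"surface_form": "buco nero binario",
                "uri": "Buco_nero_binario",
                "offset": 20288}]
wiki_langlinks = {'it': {'Buco_nero_binario': {'en': 'Binary_black_hole'}}}
add_hyperlinks(tree,
               annotations,
               prefix='https://it.wikipedia.org/wiki/',
               language='it',
               dct=datetime.now(),
               wiki_langlinks=wiki_langlinks)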
# Excerpt from the batch loop: link each surface form to term ids, write the
# enriched NAF back out, and keep corpus-level statistics.
# (Counter is collections.Counter, imported at module level.)
ret_tokens, min_token_id = find_next_occurrence(sfs, min_token_id, t_layer, doc)

if ret_tokens:
    # build the term-id targets only when a token match was found
    t_ids = [wid2tid[wid] for wid in ret_tokens]
    entity_data = EntityElement(eid='e%d' % next_id,
                                entity_type='UNK',
                                text=text,
                                targets=t_ids,
                                ext_refs=[{'reference': target}])
    spacy_to_naf.add_entity_element(entities_layer,
                                    entity_data,
                                    add_comments=True)
    count_entities += 1
    next_id += 1

count_per_doc.append(next_id - 1)

if naf_output_path is not None:
    with open(naf_output_path, 'w') as outfile:
        outfile.write(spacy_to_naf.NAF_to_string(NAF=root))
    count_outfiles += 1

print('Input NAFs', count_infiles)
print('Output NAFs', count_outfiles)
print('Count entities', count_entities)
print('Count initial seed', initial_links)

# distribution of entity counts per document
dist_links = Counter(count_per_doc)