def process_conll_doc(input_file_name, output_file_name):
    """Link entities in a CoNLL-style file via the NEURAL_EL_SERVER service.

    Reads documents separated by DOCSTART markers, where each token line has
    tab-separated columns: text, NER gold tag ('nero'), NME flag ('nme') and
    gold wiki link ('wiki').  Each document is POSTed to the entity-linking
    service; its character-offset predictions are mapped back onto tokens as
    a 'pnme' tag.  Writes a 5-column file: text, nero, nme, wiki, pnme.

    Args:
        input_file_name: path to the tab-separated CoNLL input file.
        output_file_name: path of the output file (created/truncated).
    """
    columns = {0: 'text', 1: 'nero', 2: 'nme', 3: 'wiki'}
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0  # running character offset within the current document
        for line in input_file:
            if "DOCSTART" in line:
                # A DOCSTART marker closes the previous document (if any)
                # and starts a new one.
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
            else:
                # Skip blank separator lines; they would otherwise become
                # empty tokens (robustness — CoNLL files commonly separate
                # sentences with blank lines).
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1  # +1 for the joining space
                doc.add_token(token)
        # BUG FIX: the last document was never appended before, so it was
        # silently dropped from the output.
        if doc is not None:
            docs.append(doc)
        for d in docs:
            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": [],
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()  # list of [start, length, url] predictions
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                for t in d.tokens:
                    # Match on the same attribute that was set above
                    # (start_pos), instead of the start_position alias.
                    if t.start_pos in entity_ran:
                        t.add_tag("pnme", i[2])
            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    """Link entities in a CoNLL-style file using flair NER spans as mentions.

    Reads DOCSTART-separated documents (columns: text, 'nero', 'nme',
    'wiki'), tags each document with the given flair SequenceTagger, sends
    the detected NER spans to the NEURAL_EL_SERVER service, and attaches the
    returned URLs to the span tokens as a 'pnme' tag.  Optionally falls back
    to a fuzzy search over "(disambiguation)" titles for spans the service
    did not link.  Writes: text, nero, nme, wiki, pnme.

    Args:
        input_file_name: path to the tab-separated CoNLL input file.
        output_file_name: path of the output file (created/truncated).
        ner_model: name/path of the flair SequenceTagger to load.
        with_disambiguation: enable the "(disambiguation)"-title fallback.
        sim_level_disambig: similarity threshold for the fallback search.
    """
    nertagger = SequenceTagger.load(ner_model)
    columns = {0: 'text', 1: 'nero', 2: 'nme', 3: 'wiki'}
    # PERF FIX: load the fallback searcher once, not once per document
    # (it was previously re-created inside the per-document loop).
    searcher = load_disambiguation() if with_disambiguation else None
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0  # running character offset within the current document
        for line in input_file:
            if "DOCSTART" in line:
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
            else:
                # Skip blank separator lines (would become empty tokens).
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1  # +1 for the joining space
                doc.add_token(token)
        # BUG FIX: the last document was never appended before, so it was
        # silently dropped from the output.
        if doc is not None:
            docs.append(doc)
        for d in docs:
            nertagger.predict(d)
            # Offer the NER spans to the linker as candidate mentions.
            spans = [{"start": s.start_pos,
                      "length": s.end_pos - s.start_pos}
                     for s in d.get_spans('ner')]
            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans,
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()  # list of [start, length, url] predictions
            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break
            if searcher is not None:
                for nerspan in d.get_spans('ner'):
                    # Only spans the linking service left untagged.
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)
            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    """Link entities in a CoNLL-style file via the `ag` disambiguator.

    Reads DOCSTART-separated documents (columns: text, 'nero', 'nme',
    'wiki'), tags them with the given flair SequenceTagger, rebuilds each
    document as a string with NER mentions wrapped in <entity>...</entity>
    markup, and passes it to `ag.disambiguate`.  The returned URLs are
    attached to the matching span tokens as a 'pnme' tag, with an optional
    "(disambiguation)"-title fallback for unlinked spans.  Writes: text,
    nero, nme, wiki, pnme.

    Args:
        input_file_name: path to the tab-separated CoNLL input file.
        output_file_name: path of the output file (created/truncated).
        ner_model: name/path of the flair SequenceTagger to load.
        with_disambiguation: enable the "(disambiguation)"-title fallback.
        sim_level_disambig: similarity threshold for the fallback search.
    """
    nertagger = SequenceTagger.load(ner_model)
    columns = {0: 'text', 1: 'nero', 2: 'nme', 3: 'wiki'}
    # PERF FIX: load the fallback searcher once, not once per document
    # (it was previously re-created inside the per-document loop).
    searcher = load_disambiguation() if with_disambiguation else None
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0  # running character offset within the current document
        for line in input_file:
            if "DOCSTART" in line:
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
            else:
                # Skip blank separator lines (would become empty tokens).
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0 and c < len(lsplit):
                        token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1  # +1 for the joining space
                doc.add_token(token)
        # BUG FIX: the last document was never appended before, so it was
        # silently dropped from the output.
        if doc is not None:
            docs.append(doc)
        for d in docs:
            nertagger.predict(d)
            # Rebuild the document text with each NER mention wrapped in
            # <entity> markup, using the BIOES/BIO tag prefixes to find
            # mention boundaries.
            centity = []  # tokens of the entity currently being collected
            newsent = []
            for token in d:
                nertag = token.get_tag("ner").value
                if nertag[0:2] in ['B-', 'S-']:
                    # A new mention starts: flush any open one first.
                    if len(centity) != 0:
                        newsent.append(
                            "<entity>" + " ".join(centity) + "</entity>")
                        centity = []
                    centity.append(token.text)
                if nertag[0:2] in ['E-', 'I-']:
                    centity.append(token.text)
                if nertag == "O":
                    if len(centity) != 0:
                        newsent.append(
                            "<entity>" + " ".join(centity) + "</entity>")
                        centity = []
                    newsent.append(token.text)
            # BUG FIX: a mention ending exactly at the end of the document
            # was never flushed and therefore never disambiguated.
            if len(centity) != 0:
                newsent.append("<entity>" + " ".join(centity) + "</entity>")
            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)
            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break
            if searcher is not None:
                for nerspan in d.get_spans('ner'):
                    # Only spans the disambiguator left untagged.
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)
            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")