import requests
import unidecode
from flair.data import Sentence, Token

# NEURAL_EL_SERVER (the URL of the entity-linking service) is expected to be
# defined at module level.


def process_conll_doc(input_file_name, output_file_name):

    # Column layout of the tab-separated input: index -> tag name.
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                # Skip the blank separator lines of the CoNLL format.
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # Flush the final document: the read loop only appends a finished
        # document when the next DOCSTART marker appears, so without this the
        # last document in the file would be dropped.
        if doc is not None:
            docs.append(doc)

        for d in docs:

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": []
            }
            # Send the whole document text to the entity-linking service.
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()
            # Each response item is read as a triple: [start offset, length, linked URL].
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                for t in d.tokens:
                    # Tag every token whose start offset falls inside the linked span.
                    if t.start_pos in entity_ran:
                        t.add_tag("pnme", i[2])

            # Emit the original columns plus the predicted "pnme" column.
            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example 2
import string

import requests
import unidecode
from flair.data import Sentence, Token
from flair.models import SequenceTagger

# NEURAL_EL_SERVER and the load_disambiguation() helper are expected to be
# defined at module level.


def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                # Skip the blank separator lines of the CoNLL format.
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # Flush the final document: the read loop only appends a finished
        # document when the next DOCSTART marker appears, so without this the
        # last document in the file would be dropped.
        if doc is not None:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            # Collect the character spans of the predicted NER mentions so the
            # linker only considers those candidates.
            spans = []
            for nerspan in d.get_spans('ner'):
                start = nerspan.start_pos
                length = nerspan.end_pos - nerspan.start_pos
                spans.append({"start": start, "length": length})

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans
            }

            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()

            # Attach each linked URL to every token of the span it starts at.
            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example 3
import string

import unidecode
from flair.data import Sentence, Token
from flair.models import SequenceTagger

# ag (the disambiguation client) and the load_disambiguation() helper are
# expected to be defined at module level.


def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                # Skip the blank separator lines of the CoNLL format.
                if not line.strip():
                    continue
                lsplit = line.split("\t")
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # Flush the final document: the read loop only appends a finished
        # document when the next DOCSTART marker appears, so without this the
        # last document in the file would be dropped.
        if doc is not None:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            # Rebuild the sentence with <entity>...</entity> markup around each
            # predicted NER mention, in the format the disambiguator expects.
            centity = []
            newsent = []
            for token in d:
                nertag = token.get_tag("ner").value
                if nertag[0:2] in ['B-', 'S-']:
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    centity.append(token.text)
                if nertag[0:2] in ['E-', 'I-']:
                    centity.append(token.text)
                if nertag == "O":
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    newsent.append(token.text)
            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)

            # Copy each disambiguated URL onto the tokens of the matching span.
            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")