Example #1
def __init__(self):
    self.entityNameIdMap = util.EntityNameIdMap()
    self.entityNameIdMap.init_gerbil_compatible_ent_id()
    # bookkeeping dicts ("gm" = gold mention)
    self.unknown_ent_name = dict()     # gold entity names missing from the map
    self.no_english_uri = dict()       # gold mentions lacking an English URI
    self.all_gm_cnt = dict()           # count of all gold mentions
    self.englishuri_gm_cnt = dict()    # count of gold mentions with an English URI
    self.valid_gms = dict()            # gold mentions kept after filtering
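The initializer above only builds the name-to-id map. A minimal usage sketch, assuming compatible_ent_id behaves as in the later examples (returning a canonical entity id, or None for unknown names); the entity name and id passed in are hypothetical:

resolver = util.EntityNameIdMap()
resolver.init_gerbil_compatible_ent_id()
# hypothetical lookup: resolve a (name, candidate id) pair to a canonical id
ent_id = resolver.compatible_ent_id("Athletics", "12345")
if ent_id is None:
    print("entity name not found in wiki_name_id.txt")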
Example #2
def process_aida(in_filepath, out_filepath):

    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    #_, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counts ground-truth entity ids missing from wiki_name_id.txt
    ent_id_changes = 0
    with open(in_filepath) as fin, open(out_filepath, "w") as fout:
        in_mention = False  # are we currently inside a mention span?
        first_document = True
        for line in fin:
            l = line.split('\t')
            if in_mention and not (len(l) == 7 and l[1] == 'I'):
                # the current line does not continue the previous mention,
                # so close it with MMEND and leave the mention state
                fout.write("MMEND\n")
                in_mention = False

            if line.startswith("-DOCSTART-"):
                if not first_document:
                    fout.write("DOCEND\n")
                # line = "-DOCSTART- (967testa ATHLETICS)\n"
                doc_title = line[len("-DOCSTART- ("):-2]
                fout.write("DOCSTART_" + doc_title.replace(' ', '_') + "\n")
                first_document = False
            elif line == "\n":
                fout.write("*NL*\n")
            elif len(l) == 7 and l[1] == 'B':  # start of a new mention
                wiki_title = l[4][len("http://en.wikipedia.org/wiki/"):].replace('_', ' ')
                new_ent_id = entityNameIdMap.compatible_ent_id(wiki_title, l[5])
                if new_ent_id is not None:
                    if new_ent_id != l[5]:
                        ent_id_changes += 1
                        #print(line, "old ent_id: " + l[5], " new_ent_id: ", new_ent_id)
                    # TODO: check whether the entity id exists in the wikidump;
                    # if not, omit this mention entirely.
                    fout.write("MMSTART_" + new_ent_id + "\n")
                    fout.write(l[0] + "\n")  # write the word
                    in_mention = True
                else:
                    unknown_gt_ids += 1
                    fout.write(l[0] + "\n")  # write the word
                    print(line)
            else:
                # token that continues a mention (len(l) == 7 and l[1] == 'I'),
                # a normal token outside any mention, or a mention token
                # without disambiguation (len(l) == 4)
                fout.write(l[0].rstrip() + "\n")
        fout.write("DOCEND\n")  # for the last document
    print("process_aida     unknown_gt_ids: ", unknown_gt_ids)
    print("process_aida     ent_id_changes: ", ent_id_changes)
Example #3
def process_hipe(in_filepath, out_filepath):

    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    #_, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counts ground-truth entity ids missing from wiki_name_id.txt
    ent_id_changes = 0
    with open(in_filepath) as fin, open(out_filepath, "w") as fout:
        in_mention = False  # are we currently inside a mention span?
        first_document = True
        for line in fin:
            l = line.split('\t')
            if len(l) == 10:
                if in_mention and 'I' not in l[1]:
                    # the current line does not continue the previous mention,
                    # so close it with MMEND before handling this line
                    fout.write("MMEND\n")
                    in_mention = False

                if "EndOfLine" in l[9]:
                    # new line
                    fout.write("*NL*\n")
                elif 'B' in l[1]:  # this is a new mention
                    wikidata_id = l[7]
                    # TODO: check whether the entity id exists in the wikidump;
                    # if not, omit this mention entirely.
                    fout.write("MMSTART_" + wikidata_id + "\n")
                    fout.write(l[0] + "\n")  # write the word
                    in_mention = True
                elif l[1] == "NE-COARSE-LIT":
                    continue
                else:
                    # token that continues a mention (len(l) == 10 and 'I' in l[1]),
                    # or a normal token outside any mention
                    fout.write(l[0].rstrip() + "\n")

            elif "# document_id" in line:
                if not first_document:
                    fout.write("DOCEND\n")
                # line = "# document_id = NZZ-1798-01-20-a-p0001\n"
                doc_title = line[len("# document_id = "):-1]
                fout.write("DOCSTART_" + doc_title.replace(' ', '_') + "\n")
                first_document = False
            else:
                continue
        fout.write("DOCEND\n")  # until the last document
Example #4
def process_wimcor(in_filepath, out_filepath):
    with open(in_filepath) as fin:
        content = fin.read()
    soup = BeautifulSoup(content, 'lxml')

    spacy_tokenizer = English(parser=False)

    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counts ground-truth entity ids missing from wiki_name_id.txt

    with open(out_filepath, "w") as fout:
        for idx, item in enumerate(soup.find_all('sample')):
            fout.write('DOCSTART_{}\n'.format(idx))

            lcontext = str(item.find('pmw').previous_sibling) if item.find('pmw').previous_sibling else ""
            pmw = item.find('pmw').text
            loc_pmw = len(spacy_tokenizer(lcontext))
            len_pmw = len(spacy_tokenizer(pmw))
            sample = '{} {} {}'.format(lcontext, pmw, str(item.find('pmw').next_sibling) if item.find('pmw').next_sibling else "")

            ctr = 0
            in_pmw = False
            for token_idx, token in enumerate(spacy_tokenizer(sample)):
                if token_idx == loc_pmw:  # first token of the PMW span
                    wiki_title = item.find('pmw')['fine']
                    ent_id = entityNameIdMap.compatible_ent_id(wiki_title)
                    if ent_id is not None:
                        fout.write('MMSTART_{}\n'.format(ent_id))

                        in_pmw = True
                        ctr = len_pmw
                    else:
                        unknown_gt_ids += 1
                    fout.write('{}\n'.format(token))
                elif in_pmw and ctr == 0:
                    in_pmw = False

                    fout.write('MMEND\n')
                    fout.write('{}\n'.format(token))
                else:
                    fout.write('{}\n'.format(token))

                ctr -= 1

            if in_pmw:
                # close the mention when the PMW span runs to the end of the sample
                fout.write('MMEND\n')
            fout.write('DOCEND\n')
    print("process_wimcor    unknown_gt_ids: ", unknown_gt_ids)
Example #5
def wikidump_to_new_format():
    doc_cnt = 0
    hyperlink2EntityId = util.EntityNameIdMap()
    hyperlink2EntityId.init_hyperlink2id()
    if args.debug:
        infilepath = config.base_folder + "data/mydata/tokenized_toy_wiki_dump2.txt"
        outfilepath = args.out_folder + "toy_wikidump.txt"
    else:
        infilepath = config.base_folder + "data/basic_data/tokenizedWiki.txt"
        outfilepath = args.out_folder + "wikidump.txt"
    with open(infilepath) as fin,\
         open(outfilepath, "w") as fout:
        in_mention = False
        for line in fin:
            line = line.rstrip()  # omit the '\n' character
            if line.startswith('<doc\xa0id="'):
                docid = line[9:line.find('"', 9)]
                doctitle = line[line.rfind('="') + 2:-2]
                fout.write("DOCSTART_" + docid + "_" +
                           doctitle.replace(' ', '_') + "\n")
            elif line.startswith('<a\xa0href="'):
                ent_id = hyperlink2EntityId.hyperlink2id(line)
                if ent_id != config.unk_ent_id:
                    in_mention = True
                    fout.write("MMSTART_" + ent_id + "\n")
            elif line == '</doc>':
                fout.write("DOCEND\n")
                doc_cnt += 1
                if doc_cnt % 5000 == 0:
                    print("document counter: ", doc_cnt)
            elif line == '</a>':
                if in_mention:
                    fout.write("MMEND\n")
                    in_mention = False
            else:
                fout.write(line + "\n")
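The dump reader keys off four line shapes in the tokenized dump; spaces inside tag tokens are non-breaking (\xa0). Schematically (the id, title, and anchor below are hypothetical):

# schematic input lines, one per physical line in the dump:
#   <doc\xa0id="12"\xa0title="Anarchism">  -> DOCSTART_12_Anarchism
#   <a\xa0href="political\xa0philosophy">  -> MMSTART_<resolved entity id>
#   </a>                                   -> MMEND
#   </doc>                                 -> DOCEND
wikidump_to_new_format()  # paths come from the config/args set up above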
Example #6
def process_aida(in_filepath, out_filepath):

    # _, wiki_id_name_map = util.load_wiki_name_id_map(lowercase=False)
    #_, wiki_id_name_map = util.entity_name_id_map_from_dump()
    entityNameIdMap = util.EntityNameIdMap()
    entityNameIdMap.init_compatible_ent_id()
    unknown_gt_ids = 0  # counts ground-truth entity ids missing from wiki_name_id.txt
    ent_id_changes = 0
    text_acc = []
    with open(in_filepath) as fin, open(
            args.output_folder + "tokenize_" + out_filepath, "w") as fout:
        in_mention = False  # are we currently inside a mention span?
        first_document = True
        for line in fin:
            l = line.strip().split('\t')
            if in_mention and not (len(l) == 5 and l[1] == 'I'):
                # the current line does not continue the previous mention,
                # so close it with MMEND and leave the mention state
                #fout.write("MMEND\n")
                text_acc.append("MMEND")
                in_mention = False

            if line.startswith("-DOCSTART-"):
                if not first_document:
                    #fout.write("DOCEND\n")
                    text_acc.append("DOCEND")
                # line = "-DOCSTART- (967testa ATHLETICS)\n"
                doc_title = line[len("-DOCSTART- ("):-2]
                #fout.write("DOCSTART_"+doc_title.replace(' ', '_')+"\n")
                text_acc.append("DOCSTART_" + doc_title.replace(' ', '_'))
                first_document = False
            elif line == "\n":
                #fout.write("*NL*\n")
                text_acc.append("\n")
            elif len(l) == 5 and l[1] == 'B':  # start of a new mention
                wikidataid = l[4][len("https://www.wikidata.org/wiki/"):]
                if entityNameIdMap.is_valid_entity_id(wikidataid):
                    text_acc.append("MMSTART_" + wikidataid)
                    # if not then omit this mention
                    #fout.write(l[0]+"\n")  # write the word
                    text_acc.append(l[0])  # write the word
                    in_mention = True
                else:
                    unknown_gt_ids += 1
                    #fout.write(l[0]+"\n")  # write the word
                    text_acc.append(l[0])  # write the word
                    print(line)
            else:
                # token that continues a mention (len(l) == 5 and l[1] == 'I'),
                # a normal token outside any mention, or a mention token
                # without disambiguation
                #fout.write(l[0].rstrip()+"\n")
                text_acc.append(l[0].rstrip())
        #fout.write("DOCEND\n")  # for the last document
        text_acc.append("DOCEND")  # for the last document
        fout.write(' '.join(text_acc))
    print("process_aida     unknown_gt_ids: ", unknown_gt_ids)
    print("now tokenize with stanford tokenizer")
    tokenize_command = 'cd {}; java -cp "*" ' \
                       'edu.stanford.nlp.process.PTBTokenizer -options "tokenizeNLs=True" < {} > {}'.format(
        args.stanford_tokenizer_folder, args.output_folder+"tokenize_"+out_filepath, args.output_folder+out_filepath)
    print(tokenize_command)
    call(tokenize_command, shell=True)
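This variant accumulates the marker-annotated text in memory, writes it space-joined to args.output_folder + "tokenize_" + out_filepath, and then shells out to the Stanford PTBTokenizer (tokenizeNLs=True keeps blank lines as *NL* tokens) to produce the final file. A sketch with a hypothetical file name, assuming args.output_folder and args.stanford_tokenizer_folder are configured:

process_aida("aida_train.tsv", "aida_train_markers.txt")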
Example #7
def __init__(self):
    self.entityNameIdMap = util.EntityNameIdMap()
    self.entityNameIdMap.init_compatible_ent_id()