Example no. 1
def count_datasets_vocabulary(self):
    # Method of the project's vocabulary-building class; relies on the
    # module-level os/config/util imports.
    new_dataset_folder = config.base_folder + "data/new_datasets/"
    # The dataset list used to be hard-coded; it is now discovered from
    # the folder:
    # datasets = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
    #             'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
    for dataset in util.get_immediate_files(new_dataset_folder):
        dataset = os.path.basename(os.path.normpath(dataset))
        print("Processing dataset: ", dataset)
        self.add(new_dataset_folder + dataset)
    self.print_statistics()
    self.serialize(folder=config.base_folder + "data/vocabulary/")
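
For context, a plausible sketch of the util.get_immediate_files helper both examples rely on. This is an assumption rather than the project's actual code; the callers' use of os.path.basename suggests it returns full paths:

import os

def get_immediate_files(folder):
    # Non-recursive listing: keep only regular files directly under
    # `folder`, returned as full paths (callers reduce them to basenames).
    return [os.path.join(folder, f) for f in sorted(os.listdir(folder))
            if os.path.isfile(os.path.join(folder, f))]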
Example no. 2
import os

from termcolor import colored

# config, util, TFRecordsGenerator and SamplesGenerator are project-local
# and assumed to be imported at module level.

def create_tfrecords():
    new_dataset_folder = config.base_folder + "data/new_datasets/"
    datasets = [os.path.basename(os.path.normpath(d))
                for d in util.get_immediate_files(new_dataset_folder)]
    print("datasets: ", datasets)

    tfrecords_generator = TFRecordsGenerator()
    # First pass: tfrecords built from gold mentions only.
    tfrecords_generator.set_gmonly_mode()
    for file in datasets:
        tfrecords_generator.process(filepath=new_dataset_folder + file)
    # Second pass: tfrecords built from all candidate spans.
    tfrecords_generator.set_allspans_mode()
    for file in datasets:
        tfrecords_generator.process(filepath=new_dataset_folder + file)
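
Both passes above run the identical per-file loop and differ only in the generator mode; a deduplicated equivalent, purely a style alternative, would be:

for set_mode in (tfrecords_generator.set_gmonly_mode,
                 tfrecords_generator.set_allspans_mode):
    set_mode()  # toggle between gold-mention-only and all-spans output
    for file in datasets:
        tfrecords_generator.process(filepath=new_dataset_folder + file)
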
def create_entity_universe(language,
                           gmonly_files=None,
                           allspans_files=None,
                           printSamples=None):
    new_dataset_folder = config.base_folder + "data/new_datasets/" + language + "/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        #allspans_files = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
        #                  'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
        allspans_files = []
        for dataset in util.get_immediate_files(new_dataset_folder):
            if language in dataset:
                allspans_files.append(os.path.basename(os.path.normpath(dataset)))
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

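    # Helper: run the generator over the given datasets, pooling candidate
    # and ground-truth entity ids into a single set, and report statistics.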
    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder + dataset):
                # cand_entities is a list of per-span candidate lists, so it
                # is unpacked; ground_truth is a flat list of entity ids.
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)

        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall %     : ",
              (1 - (generator.all_gm_misses + generator.all_gt_misses) /
               (generator.all_gm + 1.0)) * 100, " %")
        print("len(entities_universe):\t\t\t",
              colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator,
                                                     gmonly_files)
    if allspans_files:
        print("Test files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator,
                                                       allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))

    # Write each entity id in our universe to a file, together with its
    # Wikipedia name.
    out_path = (config.base_folder + "data/entities/" + language +
                "/entities_universe.txt")
    with open(out_path, "w") as fout:
        _, wiki_id_name_map = util.load_wiki_name_id_map()
        for ent_id in all_entities:
            fout.write(ent_id + "\t" +
                       wiki_id_name_map[ent_id].replace(' ', '_') + "\n")

    return all_entities
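
A minimal invocation sketch, assuming the data/new_datasets/<language>/ layout above; the "en" tag is hypothetical and must appear in the dataset filenames for the auto-discovery branch to pick them up:

all_entities = create_entity_universe("en")
print("Universe size:", len(all_entities))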