Example 1
    def flush_entity_universe(self):
        print("len(self.entities_universe) =", len(self.entities_universe))
        entities_folder = config.base_folder + "data/entities/extension_entities/"
        _, wiki_id_name_map = load_wiki_name_id_map()
        if not os.path.exists(entities_folder):
            os.makedirs(entities_folder)

        def dump_entities(entity_set, name):
            with open(entities_folder + name + ".pickle", 'wb') as handle:
                pickle.dump(entity_set, handle)
            with open(entities_folder + name + ".txt", "w") as fout:
                for ent_id in entity_set:
                    fout.write(ent_id + "\t" +
                               wiki_id_name_map[ent_id].replace(' ', '_') +
                               "\n")

        dump_entities(self.entities_universe, "entities_universe")
        # now compute the extension set, i.e. the entities of this universe that the trained model does not already cover
        extension_entity_set = set()
        wikiid2nnid = load_wikiid2nnid()
        for wikiid in self.entities_universe:
            if wikiid not in wikiid2nnid:
                extension_entity_set.add(wikiid)

        print("len(extension_entity_set) =", len(extension_entity_set))
        dump_entities(extension_entity_set, "extension_entities")

    def __init__(self, output_folder, predictions_folder, entity_extension=None):
        self.thr = None
        self.output_folder = output_folder
        self.predictions_folder = predictions_folder
        with open(output_folder+"word_char_maps.pickle", 'rb') as handle:
            _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)

        self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()
        self.extra_info = ""
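
The two files that dump_entities writes per entity set are a pickled Python set of wiki ids and a tab-separated "wiki_id<TAB>Entity_Name" text file. A minimal sketch for reading them back, assuming the same entities_folder layout as above (the helper name load_dumped_entities is hypothetical, not part of the original code):

import pickle

def load_dumped_entities(entities_folder, name="extension_entities"):
    # the .pickle file holds the raw set of wiki ids
    with open(entities_folder + name + ".pickle", 'rb') as handle:
        entity_set = pickle.load(handle)
    # the .txt file holds one "wiki_id<TAB>Entity_Name" line per entity
    id2name = {}
    with open(entities_folder + name + ".txt") as fin:
        for line in fin:
            ent_id, ent_name = line.rstrip("\n").split("\t")
            id2name[ent_id] = ent_name
    return entity_set, id2name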
Example 3
def create_entity_universe(gmonly_files=None, allspans_files=None, printSamples=None):
    new_dataset_folder = config.base_folder+"data/hipe_new/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        allspans_files = ['HIPE-data-v1.0-train-de.txt', 'HIPE-data-v1.0-dev-de.txt', 'HIPE-data-v1.0-test-de.txt']
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder+dataset):
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)

        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall %     : ", (1 - (generator.all_gm_misses+generator.all_gt_misses)/generator.all_gm)*100, " %")
        print("len(entities_universe):\t\t\t", colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator, gmonly_files)
    if allspans_files:
        print("Test files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator, allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))

    # write the entities of our universe to a file, together with their names
    with open(config.base_folder+"data/entities/entities_universe.txt", "w") as fout:
        wiki_name_map_path = config.base_folder + "data/basic_data/wiki_name_map.txt"
        _, wiki_id_name_map = util.load_wiki_name_id_map(filepath=wiki_name_map_path)
        for ent_id in all_entities:
            if ent_id in wiki_id_name_map:
                fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")

    return all_entities
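
Assuming config.base_folder points at the project's data root and the HIPE files listed above exist under data/hipe_new/, the function above can be driven directly; this is only an illustrative sketch, not code from the original repository:

if __name__ == "__main__":
    # restrict the universe to the German dev split; pass an empty gmonly list
    universe = create_entity_universe(
        gmonly_files=[],
        allspans_files=['HIPE-data-v1.0-dev-de.txt'],
        printSamples=None)
    print("entity universe size:", len(universe))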
Example 4
def print_p_e_m_dictionary_to_file(p_e_m, full_filepath):
    _, wiki_id_name_map = util.load_wiki_name_id_map()
    with open(full_filepath, "w") as fout:
        for mention, entities in p_e_m.items():
            out_acc = []
            # entities is a defaultdict(int), so entities.items() yields pairs
            # such as (ent2, 10), (ent54, 20), (ent3, 2)
            sorted_ = sorted(entities.items(), key=operator.itemgetter(1), reverse=True)
            # now a list of tuples sorted by frequency: [(ent54, 20), (ent2, 10), (ent3, 2)]
            total_freq = 0
            for ent_id, prob in sorted_:
                if len(out_acc) >= 100:    # keep at most 100 candidate entities
                    break
                total_freq += prob
                out_acc.append(','.join([ent_id, str(prob),
                                         wiki_id_name_map[ent_id].replace(' ', '_')]))
            fout.write(mention + "\t" + str(total_freq) + "\t" + "\t".join(out_acc) + "\n")
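
The p_e_m argument is expected to map each mention string to a defaultdict(int) of wiki ids and their raw frequencies; each id must also exist in the map returned by util.load_wiki_name_id_map(), since the function looks up its name there. A tiny, hypothetical sketch (the mention and the ids below are placeholders):

from collections import defaultdict

p_e_m = defaultdict(lambda: defaultdict(int))
p_e_m["Berlin"]["111"] += 20   # placeholder wiki id seen 20 times for this mention
p_e_m["Berlin"]["222"] += 3    # a rarer placeholder candidate for the same mention
# writes: mention <TAB> total_freq <TAB> id,freq,Entity_Name ...
print_p_e_m_dictionary_to_file(p_e_m, "/tmp/p_e_m_sample.txt")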
Example 5
    def __init__(self,
                 output_folder,
                 predictions_folder,
                 entity_extension=None,
                 gm_bucketing_pempos=None,
                 print_global_voters=False,
                 print_global_pairwise_scores=False):
        self.thr = None
        self.output_folder = output_folder
        self.predictions_folder = predictions_folder
        with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
            _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)

        self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension),
                                        unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()
        self.extra_info = ""
        self.gm_bucketing = GMBucketingResults(
            gm_bucketing_pempos) if gm_bucketing_pempos else None
        self.print_global_pairwise_scores = print_global_pairwise_scores
        self.print_global_voters = print_global_voters
Example 6
    def __init__(self, only_misses=True):
        _, self.wiki_id_name_map = util.load_wiki_name_id_map()
        self.only_misses = only_misses
Example 7
    def __init__(self, train_args, args):
        self.args = args
        # input pipeline
        self.streaming_samples = StreamingSamples()
        ds = tf.data.Dataset.from_generator(
            self.streaming_samples.gen,
            (tf.int64, tf.int64, tf.int64, tf.int64,   # words, words_len, chars, chars_len
             tf.int64, tf.int64, tf.int64,             # begin_span, end_span, span_len
             tf.int64, tf.float32, tf.int64),          # cand_entities, cand_entities_scores, cand_entities_len
            (tf.TensorShape([None]), tf.TensorShape([]),                          # words, words_len
             tf.TensorShape([None, None]), tf.TensorShape([None]),                # chars, chars_len
             tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([]),  # begin_span, end_span, span_len
             tf.TensorShape([None, None]), tf.TensorShape([None, None]),          # cand_entities, cand_entities_scores
             tf.TensorShape([None])))                                             # cand_entities_len
        next_element = ds.make_one_shot_iterator().get_next()
        # batch size = 1; expand the dims now so the shapes match training, which has a batch dimension
        next_element = [tf.expand_dims(t, 0) for t in next_element]
        next_element = [
            None, *next_element[:-1], None, next_element[-1], None, None, None,
            None
        ]

        # restore model
        print("loading Model:", train_args.output_folder)
        model = Model(train_args, next_element)
        model.build()
        checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
        self.model = model
        if args.hardcoded_thr:
            self.thr = args.hardcoded_thr
            print("threshold used:", self.thr)
        else:
            # recover the optimal threshold from the log files:
            # based on the selected checkpoint, look up the threshold in the log file (otherwise recompute it)
            self.thr = retrieve_optimal_threshold_from_logfile(
                train_args.output_folder, checkpoint_path, args.el_mode)
            print("optimal threshold selected = ", self.thr)

        if args.running_mode == "el_mode":
            args.el_mode = True
        elif args.running_mode == "ed_mode":
            args.el_mode = False

        # convert text to tensors for the NN
        with open(args.experiment_folder + "word_char_maps.pickle",
                  'rb') as handle:
            self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)

        self.wikiid2nnid = load_wikiid2nnid(
            extension_name=args.entity_extension)
        self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()

        with open(args.experiment_folder + "prepro_args.pickle",
                  'rb') as handle:
            self.prepro_args = pickle.load(handle)
            if args.lowercase_spans_pem:
                self.prepro_args.lowercase_p_e_m = True
                self.prepro_args.lowercase_spans = True
        print("prepro_args:", self.prepro_args)
        self.prepro_args.persons_coreference = args.persons_coreference
        self.prepro_args.persons_coreference_merge = args.persons_coreference_merge
        self.fetchFilteredCoreferencedCandEntities = FetchFilteredCoreferencedCandEntities(
            self.prepro_args)
        prepro_util.args = self.prepro_args

        self.special_tokenized_words = {"``", '"', "''"}
        self.special_words_assertion_errors = 0
        self.gm_idx_errors = 0
        if self.args.el_with_stanfordner_and_our_ed:
            from nltk.tag import StanfordNERTagger
            self.st = StanfordNERTagger(
                '../data/stanford_core_nlp/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                '../data/stanford_core_nlp/stanford-ner-2018-02-27/stanford-ner.jar',
                encoding='utf-8')
        self.from_myspans_to_given_spans_map_errors = 0
Example 8
def create_entity_universe(language,
                           gmonly_files=None,
                           allspans_files=None,
                           printSamples=None):
    new_dataset_folder = config.base_folder + "data/new_datasets/" + language + "/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        #allspans_files = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
        #                  'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
        allspans_files = []
        for dataset in util.get_immediate_files(new_dataset_folder):
            if language in dataset:
                allspans_files.append(
                    os.path.basename(os.path.normpath(dataset)))
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder +
                                            dataset):
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)

        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall %     : ",
              (1 - (generator.all_gm_misses + generator.all_gt_misses) /
               (generator.all_gm + 1.0)) * 100, " %")
        print("len(entities_universe):\t\t\t",
              colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator,
                                                     gmonly_files)
    if allspans_files:
        print("Test files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator,
                                                       allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))

    # write the entities of our universe to a file, together with their names
    with open(
            config.base_folder + "data/entities/" + language +
            "/entities_universe.txt", "w") as fout:
        _, wiki_id_name_map = util.load_wiki_name_id_map()
        for ent_id in all_entities:
            fout.write(ent_id + "\t" +
                       wiki_id_name_map[ent_id].replace(' ', '_') + "\n")

    return all_entities
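
Compared to Example 3, this variant takes a language argument, discovers its default allspans_files from data/new_datasets/<language>/, and writes the universe into a per-language entities folder. A hypothetical driver, assuming those per-language folders exist (the language codes below are placeholders):

for lang in ("de", "fr", "en"):
    # defaults pick up every file under data/new_datasets/<lang>/ whose name contains <lang>
    entities = create_entity_universe(lang)
    print(lang, "->", len(entities), "entities")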
Example 9
    def __init__(self, only_misses=True):
        wiki_name_map_path = config.base_folder + "data/basic_data/wiki_name_map.txt"
        _, self.wiki_id_name_map = util.load_wiki_name_id_map(filepath=wiki_name_map_path)
        self.only_misses = only_misses
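
Every example above keeps only the second value returned by util.load_wiki_name_id_map(), the id -> name dictionary; the first return value is presumably the inverse name -> id map, which these snippets discard. A hedged sketch of using both, with "Berlin" as a placeholder title:

wiki_name_id_map, wiki_id_name_map = util.load_wiki_name_id_map()
ent_id = wiki_name_id_map.get("Berlin")            # title -> wiki id (assumed direction)
if ent_id is not None:
    print(ent_id, "->", wiki_id_name_map[ent_id])  # id -> title, as used throughout the examples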