Example 1
    def __init__(self, config, vocabloader):
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

        (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()

        # Initializing with zero count
        self.typeCount = {}
        for t in self.label2idx.keys():
            self.typeCount[t] = 0

        self.makeLabelCount()
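
A minimal sketch of the zero-count pattern above, using collections.Counter instead of a plain dict; the label2idx contents here are made-up illustrative values, not the actual label vocab:

    # Hypothetical label vocab standing in for vocabloader.getLabelVocab()
    from collections import Counter

    label2idx = {'person': 0, 'location': 1, 'organization': 2}
    typeCount = Counter({t: 0 for t in label2idx})

    # Later tallying (e.g., inside makeLabelCount) only touches known keys
    for label in ['person', 'person', 'location']:
        if label in typeCount:
            typeCount[label] += 1
    print(typeCount)  # Counter({'person': 2, 'location': 1, 'organization': 0})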
Example 2
    def __init__(self,
                 config,
                 vocabloader,
                 val_file,
                 num_cands,
                 batch_size,
                 strict_context=True,
                 pretrain_wordembed=True,
                 wordDropoutKeep=1.0,
                 cohDropoutKeep=1.0):
        '''
        Reader primarily for training data, but it can also be used for test
        data by passing validation and test files as inputs. The requirement
        is that the mention candidates must already be added to the
        TrValCandidateDict using readers.train.crosswikis_vocab

        DataType 0/1 corresponds to train/val_file
        '''
        self.config = config
        self.batch_size = batch_size
        print("[#] Initializing Training Reader Batch Size: {}".format(
            self.batch_size))
        stime = time.time()
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with glove
        self.unk_wid = "<unk_wid>"
        self.pretrain_wordembed = pretrain_wordembed
        assert 0.0 < wordDropoutKeep <= 1.0
        self.wordDropoutKeep = wordDropoutKeep
        assert 0.0 < cohDropoutKeep <= 1.0
        self.cohDropoutKeep = cohDropoutKeep
        self.num_cands = num_cands
        self.strict_context = strict_context

        # Coherence String Vocab
        (self.cohG92idx,
         self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
        self.num_cohstr = len(self.idx2cohG9)
        print("[#] Coherence Loaded. Num Coherence Strings: {}".format(
            self.num_cohstr))

        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print("[#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))

        etime = time.time()
        ttime = etime - stime
        print("[#] TRAINING READER LOADING COMPLETE. "
              "Time Taken: {} secs\n".format(ttime))
Example 3
    def __init__(self, config, vocabloader):
        ''' Used to make pruned crosswikis dict and candidate dictionary
        for training and validation data

        train_val_cwikis_pkl : Slice of crosswikis for surfaces in train/val (NOT USED)

        train_val_cwikis_cands_pkl: Train/Val data only contain known entities.
        This dict acts as a pre-cache of mention candidates.
        Key   : (LNRM(surface), WID)
        Value : ([Candidate_Idxs], [CProbs])
        Candidate_Idxs : The first idx is the true wid_idx; the rest are
        candidates, padded with Unk_Wid_Idx(=0) if fewer candidates exist
        than needed.
        '''
        self.config = config
        train_mentions_dir = config.train_mentions_dir
        val_mentions_file = config.val_mentions_file
        test_mentions_file = config.test_mentions_file

        tr_mens_files = utils.get_mention_files(train_mentions_dir)
        self.numc = 30
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        if not os.path.exists(config.trval_kwnidx_cands_pkl):
            self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
            print("Crosswikis Loaded. Size: {}".format(
                len(self.crosswikis_dict)))

            print("Size of known entities: {}".format(len(self.knwid2idx)))
            print("Making Train/Validation/Test CWiki Candidates.\n"
                  "{Key:(surface, wid), V: ([CandWids], [PriorProbs])")
            train_val_candidates_dict = self.make_train_val_candidatesDict(
                train_mentions_dir, tr_mens_files, val_mentions_file,
                test_mentions_file)
            utils.save(config.trval_kwnidx_cands_pkl,
                       train_val_candidates_dict)
            print("Train/Val Candidates Dict Saved")
            sys.exit(0)
        else:
            print("Train/Val CWiki Candidates already exists")
            trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
            print("Loaded dict")
            key = ('barackobama', '534366')
            (candidates, probs) = (trval_cand_dict[key][0],
                                   trval_cand_dict[key][1])
            candidates = [self.idx2knwid[wididx] for wididx in candidates]
            candidates = [self.wid2WikiTitle[wid] for wid in candidates]

            print((key, candidates, probs))
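
The docstring above describes candidate lists padded to a fixed length with Unk_Wid_Idx(=0). A sketch of that padding with made-up candidate indices and priors (numc=30 follows the snippet; the key and values are illustrative only):

    numc = 30
    unk_wid_idx = 0  # Unk_Wid_Idx per the docstring

    cand_idxs = [57, 912, 4, 88]           # first idx is the true wid_idx
    cand_probs = [0.82, 0.09, 0.05, 0.01]  # crosswikis prior probabilities

    # Pad both lists out to numc entries
    cand_idxs += [unk_wid_idx] * (numc - len(cand_idxs))
    cand_probs += [0.0] * (numc - len(cand_probs))

    trval_cands_dict = {('barackobama', '534366'): (cand_idxs, cand_probs)}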
Example 4
    def __init__(self, config, vocabloader, testfile):
        self.test_mentions = utils.make_mentions_from_file(testfile)
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()

        # key:wiktitle Value:set(context_words)
        self.train_entities = {}

        # key:wiktitle Value: List of set(context_words) for each mention
        self.test_entities = {}

        self.encontext_pr = {}

        self.getTestEntitiesContextWords()
        self.getTrainEntitiesContextWords()
        self.computeEntityContextPR(self.train_entities, self.test_entities)
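
computeEntityContextPR compares each entity's training context words against its test mentions' context words. One plausible reading, sketched here under the assumption that it is a set-overlap precision/recall (the exact metric is not visible in this snippet):

    def context_pr(train_words, mention_words):
        # Precision: fraction of the mention's context seen in training;
        # recall: fraction of the training context covered by the mention.
        overlap = len(train_words & mention_words)
        precision = overlap / len(mention_words) if mention_words else 0.0
        recall = overlap / len(train_words) if train_words else 0.0
        return precision, recall

    print(context_pr({'president', 'senate', 'usa'}, {'president', 'election'}))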
Example 5
    def __init__(self, config, vocabloader):
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

        print("Loading Coherence String Dicts")
        (coh2idx, idx2coh) = utils.load(config.cohstring_vocab_pkl)
        (cohG92idx, idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)

        print("Coherence Stirng set Size : {}, cnt >= 10 size : {}".format(
            len(idx2coh), len(idx2cohG9)))

        self.testDataCountCohLessMens(config.val_mentions_file, cohG92idx)
        self.testDataCountCohLessMens(config.test_mentions_file, cohG92idx)

        self.testDataCountCohLessMens(config.ace_mentions_file, cohG92idx)

        self.testDataCountCohLessMens(config.aida_inkb_dev_file, cohG92idx)
        self.testDataCountCohLessMens(config.aida_inkb_test_file, cohG92idx)
        self.testDataCountCohLessMens(config.aida_inkb_train_file, cohG92idx)
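
testDataCountCohLessMens presumably counts mentions whose coherence strings all fall outside the cnt >= 10 vocabulary (cohG92idx). A sketch under that assumption; the per-mention list-of-strings format is hypothetical:

    def count_cohless_mentions(mention_coh_strings, cohG92idx):
        # A mention is "coherence-less" if none of its strings are in vocab.
        return sum(1 for coh_strs in mention_coh_strings
                   if not any(c in cohG92idx for c in coh_strs))

    vocab = {'obama_page': 0, 'whitehouse': 1}
    mentions = [['obama_page', 'whitehouse'], ['some_rare_string']]
    print(count_cohless_mentions(mentions, vocab))  # 1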
Example 6
    def __init__(self, config, vocabloader, num_cands):
        self.unk_wid = "<unk_wid>"

        self.g_num_wids = 0
        self.g_num_elements = 0

        # Known WID Vocab
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.num_kwn_entities = len(self.idx2knwid)
        print(" [#] Loaded. Num of known wids : {}".format(
            self.num_kwn_entities))

        # Candidates Dict
        print("[#] Loading training/val crosswikis candidate dict ... ")
        self.trval_cands_dict = vocabloader.getTrainValCandidateDict()

        print("[#] Training Mentions Dir : {}".format(
            config.train_mentions_dir))
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print(" [#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))

        print("[#] Validation Mentions File : {}".format(
            config.val_mentions_file))
        print("[#] Test Mentions File : {}".format(config.test_mentions_file))

        print("[#] Pre-loading validation mentions ... ")
        self.val_mentions = utils.make_mentions_from_file(
            config.val_mentions_file)
        self.test_mentions = utils.make_mentions_from_file(
            config.test_mentions_file)
        self.num_val_mens = len(self.val_mentions)
        self.num_test_mens = len(self.test_mentions)
        print("[#] Validation Mentions : {}, Test Mentions : {}".format(
            self.num_val_mens, self.num_test_mens))

        self.num_cands = num_cands

        print("\n[#] LOADING COMPLETE:")
Example 7
    def __init__(self, config, vocabloader, testfile):
        self.test_mentions = utils.make_mentions_from_file(testfile)
        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()

        # Wid2Wikititle Map
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()
        print(" [#] Size of Wid2Wikititle: {}".format(len(self.wid2WikiTitle)))

        # {wiktitle: set(coherence_words)}
        self.train_entities = {}

        # {wiktitle: List of set(coherence_words) for each mention}
        self.test_entities = {}

        self.encontext_pr = {}

        self.getTestEntitiesCoherenceWords()
        self.getTrainEntitiesCoherenceWords()
        self.computeEntityCoherencePR(self.train_entities, self.test_entities)
Example 8
    def __init__(self,
                 config,
                 widWikititle_file,
                 widLabel_file,
                 word_threshold=1):
        '''Given training data, makes word vocab, glove word vocab,
           doc_mentions vocab, type labels vocab, known_wid vocab,
           wid2Wikititle
        '''
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"

        self.tr_mens_dir = config.train_mentions_dir
        self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print("[#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))

        print("[#] Validation Mentions File : {}".format(
            config.val_mentions_file))

        tr_data_vocabs_exist = self.check_train_data_vocabs_exist(
            config.word_vocab_pkl, config.label_vocab_pkl,
            config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
            config.cohstringG1_vocab_pkl)

        if not tr_data_vocabs_exist:
            print("[#] Loading pretrained word2vec embeddings .. ")
            self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
                config.word2vec_bin_gz, binary=True)
            self.word2vec.init_sims(replace=True)

            print("All/Some Training Vocabs do not exist. Making ... ")
            self.make_training_data_vocabs(
                self.tr_mens_dir, self.tr_mens_files, config.word_vocab_pkl,
                config.label_vocab_pkl, config.kwnwid_vocab_pkl,
                config.cohstring_vocab_pkl, config.cohstringG1_vocab_pkl,
                config.cohstringG9_vocab_pkl, word_threshold)

        if not os.path.exists(config.widWiktitle_pkl):
            print(" [#] Making wid2Wikititle Map")
            wid2Wikititle = self.make_widWikititleMap(widWikititle_file)
            utils.save(config.widWiktitle_pkl, wid2Wikititle)
            print(" [#] Done. Size : {}".format(len(wid2Wikititle)))

        if not os.path.exists(config.wid2typelabels_vocab_pkl):
            print(" [#] Making wid2Types Map")
            wid2types = self.make_wid2TypesMap(widLabel_file)
            utils.save(config.wid2typelabels_vocab_pkl, wid2types)
            print(" [#] Done. Size : {}".format(len(wid2types)))

        if not os.path.exists(config.glove_word_vocab_pkl):
            print(" [#] Makign GloVe Word Vocabs")
            glove2vec = utils.load(config.glove_pkl)
            print("   [#] Glove embeddings loaded. Size: {}".format(
                len(glove2vec)))
            (glove_word2idx,
             glove_idx2word) = self.make_glovewordvocab(glove2vec)
            utils.save(config.glove_word_vocab_pkl,
                       (glove_word2idx, glove_idx2word))
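
make_training_data_vocabs receives a word_threshold, suggesting a frequency-thresholded word vocab. A hedged sketch of that construction; the special tokens '<s>'/'</s>' stand in for the start_word/end_word globals, whose values are not shown here:

    from collections import Counter

    def make_word_vocab(tokens, threshold, specials=('<s>', '</s>', 'unk')):
        # Keep words occurring at least `threshold` times, after the specials.
        counts = Counter(tokens)
        words = list(specials) + sorted(
            w for w, c in counts.items() if c >= threshold and w not in specials)
        word2idx = {w: i for i, w in enumerate(words)}
        return word2idx, words

    word2idx, idx2word = make_word_vocab(['the', 'the', 'cat'], threshold=2)
    print(word2idx)  # {'<s>': 0, '</s>': 1, 'unk': 2, 'the': 3}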
Example 9
                    outf.write(m)
                    outf.write("\n")

        print("Priocessing : {}".format(self.cold_val1))
        with open(self.cold_val2, 'r') as f:
            lines = f.readlines()
            for line in lines:
                m = line.strip()
                wid = m.split("\t")[1]
                if wid in self.coldWIDS:
                    outf.write(m)
                    outf.write("\n")

        outf.close()


if __name__ == '__main__':
    config = Config("configs/all_mentions_config.ini")
    vocabloader = VocabLoader(config)
    a = TrainColdValEntityIntersection(config, vocabloader)

    a.findColdEntitiesInTest(config.aida_inkb_dev_file)
    a.findColdEntitiesInTest(config.aida_inkb_test_file)
    a.findColdEntitiesInTest(config.ace_mentions_file)
    a.findColdEntitiesInTest(config.wikidata_inkb_test_file)
    a.findColdEntitiesInTest(config.msnbc_inkb_test_file)

    print(utils.get_mention_files(config.train_mentions_dir))

    #a.updateKnwWidVocab()