def updateTrValCandDict(self, trValCandDict_pkl, crosswikis_pkl,
                        knwn_wid_vocab_pkl, *args):
        '''Extend an existing train/val candidates dict with more mentions.

        Loads the candidates dict stored at ``trValCandDict_pkl``, loads the
        crosswikis dict and the known-wid vocab onto ``self``, then for every
        mentions file given in ``args`` calls
        ``self._addCandidatesForAdditionalMentions`` on its mentions. The
        updated dict is written back to ``trValCandDict_pkl``.

        Args:
            trValCandDict_pkl: Path of the existing candidates dict pickle.
                It must already exist; it is overwritten at the end.
            crosswikis_pkl: Path passed to ``utils.load_crosswikis``.
            knwn_wid_vocab_pkl: Path of the pickled
                ``(knwid2idx, idx2knwid)`` pair.
            *args: Paths of the mention files whose mentions should be added.

        Raises:
            SystemExit: With status 1 if ``trValCandDict_pkl`` does not exist.
        '''
        if not os.path.exists(trValCandDict_pkl):
            print("Train/Val CWiki Candidates Dict doesn't exist")
            # Exit with a non-zero status so that the failure is visible to
            # whatever launched the process (a bare sys.exit() reports 0).
            sys.exit(1)

        print("Updating TrValKwnCandDict for : ")

        print("Loading trvalCandsDict ... ")
        candsDict = utils.load(trValCandDict_pkl)
        print("TrValCandDictSize : {}".format(len(candsDict)))
        self.crosswikis_dict = utils.load_crosswikis(crosswikis_pkl)
        print("Loading known wid2idx dict")
        (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)
        print("Adding candidates for additional mentions")

        for mens_file in args:
            print(mens_file)
            mentions = utils.make_mentions_from_file(mens_file=mens_file)
            # candsDict is passed in to be extended; its size is reported
            # after every file.
            self._addCandidatesForAdditionalMentions(mentions, candsDict)
            print("Size now : {}".format(len(candsDict)))

        utils.save(trValCandDict_pkl, candsDict)
        print("TrValCandDictSize : {}".format(len(candsDict)))
Beispiel #2
0
    def makeCWKnown(self, cwOutPath, max_cands=30):
        '''Prune ``self.crosswikis`` to known entities and save the result.

        For every surface in ``self.crosswikis`` the candidates are scanned in
        their stored order and only those whose wid is in ``self.knwid2idx``
        are kept, up to ``max_cands`` per surface. Surfaces of length <= 1,
        surfaces with no candidates, and surfaces left with no known
        candidate are dropped.

        Args:
            cwOutPath: Path the pruned dict is written to via ``utils.save``.
            max_cands: Maximum number of candidates kept per surface
                (default 30, the previously hard-coded limit).

        The saved dict maps surface -> ([wids], [probs]).
        '''
        cw = {}
        for surfacesProcessed, (surface, c_cprobs) in enumerate(
                self.crosswikis.items(), start=1):
            if surfacesProcessed % 1000000 == 0:
                print("Surfaces Processed : {}".format(surfacesProcessed))

            if len(c_cprobs) == 0 or len(surface) <= 1:
                continue

            wids, probs = [], []
            for (wid, prob) in c_cprobs:
                if len(wids) >= max_cands:
                    break
                if wid in self.knwid2idx:
                    wids.append(wid)
                    probs.append(prob)
            # Only keep surfaces that retain at least one known candidate.
            if wids:
                cw[surface] = (wids, probs)
        print("Processed")
        print("Size of CW : {}".format(len(cw)))
        utils.save(cwOutPath, cw)
        print("Saved pruned CW")
Beispiel #3
0
    def updateKnwWidVocab(self):
        '''Add the wids of ``self.new_mentions`` to the known-wid vocab.

        Every wid not yet present in ``self.knwid2idx`` is appended to
        ``self.idx2knwid`` and mapped to its new position. The vocab sizes
        are printed before and after, and the updated pair is saved to
        ``self.new_knw_wid_vocab``.
        '''
        wid2idx = self.knwid2idx
        idx2wid = self.idx2knwid
        print("Old : {} Old : {}".format(len(wid2idx), len(idx2wid)))

        for mention in self.new_mentions:
            wid = mention.wid
            if wid in wid2idx:
                continue
            # The next free index is the current length of the list.
            wid2idx[wid] = len(idx2wid)
            idx2wid.append(wid)

        print("new : {} new : {}".format(len(wid2idx), len(idx2wid)))
        utils.save(self.new_knw_wid_vocab, (wid2idx, idx2wid))
    def convertGloveToNumpy(self):
        '''Save the word vectors as one numpy array, unless already saved.

        If ``self.config.glove_numpy_pkl`` exists nothing is written.
        Otherwise the vectors ``self.word2vec[word]`` are collected in the
        order of ``self.idx2word``, turned into a numpy array and saved to
        that path.
        '''
        out_path = self.config.glove_numpy_pkl
        if os.path.exists(out_path):
            print("Glove numpy already exists. ")
            return

        print("Making glove numpy")
        # Row i of the array is the vector of self.idx2word[i].
        vectors = [self.word2vec[word] for word in self.idx2word]
        utils.save(out_path, np.array(vectors))
        print("done")
    def updateTestCandsDict(self, test_file):
        '''Merge the candidates of ``test_file`` into the test candidates dict.

        Builds a candidates dict for ``test_file`` with
        ``self.make_test_candidates``, merges it into
        ``self.test_kwnen_cands_dict`` (entries with the same key are
        overwritten by the new ones) and saves the merged dict to
        ``self.config.test_kwnen_cands_pkl``.

        Args:
            test_file: Mentions file passed to ``self.make_test_candidates``.
        '''
        print("Updating Test Candidates Dict. Size:{}\n"
              "Key:(surface, wid), V: ([CandWids], [PriorProbs])".format(
                  len(self.test_kwnen_cands_dict)))
        print("Test File: {}".format(test_file))
        test_cands_dict = self.make_test_candidates(test_file)

        self.test_kwnen_cands_dict.update(test_cands_dict)

        utils.save(self.config.test_kwnen_cands_pkl,
                   self.test_kwnen_cands_dict)
        # It is the test dict that was just saved, so the message says so
        # (it used to say "Train/Val").
        print("Test Candidates Dict Saved. Size:{}".format(
            len(self.test_kwnen_cands_dict)))
    def __init__(self, config, vocabloader):
        ''' Used to make pruned crosswikis dict and candidate dictionary
        for training and validation data

        train_val_cwikis_pkl : Slice of crosswikis for surfaces in train/val (NOT USED)

        config.trval_kwnidx_cands_pkl: Train/Val data only contain known
        entities. This dict acts as pre-cache of mention candidates.
        key   : (LNRM(surface), WID)
        Value : ([Candidate_IDXs], [CProbs])
        Candidate_Idxs : The first idx is the true wid_idx, rest are candidates
        Padded with Unk_Wid_Idx(=0) if less than number of candidates needed.

        If the pickle does not exist yet it is built and saved, and the
        process then exits with status 0. If it exists it is loaded and the
        candidates of one sample key are printed as a sanity check.

        Args:
            config: Object providing the mention paths and
                ``trval_kwnidx_cands_pkl``.
            vocabloader: Object providing ``getKnwnWidVocab()``,
                ``getWID2Wikititle()`` and ``loadPrunedCrosswikis()``.
        '''
        self.config = config
        train_mentions_dir = config.train_mentions_dir
        val_mentions_file = config.val_mentions_file
        test_mentions_file = config.test_mentions_file

        tr_mens_files = utils.get_mention_files(train_mentions_dir)
        self.numc = 30
        (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
        self.wid2WikiTitle = vocabloader.getWID2Wikititle()

        if not os.path.exists(config.trval_kwnidx_cands_pkl):
            self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
            print("Crosswikis Loaded. Size: {}".format(
                len(self.crosswikis_dict)))

            print("Size of known entities: {}".format(len(self.knwid2idx)))
            print("Making Train/Validation/Test CWiki Candidates.\n"
                  "{Key:(surface, wid), V: ([CandWids], [PriorProbs])")
            train_val_candidates_dict = self.make_train_val_candidatesDict(
                train_mentions_dir, tr_mens_files, val_mentions_file,
                test_mentions_file)
            utils.save(config.trval_kwnidx_cands_pkl,
                       train_val_candidates_dict)
            print("Train/Val Candidates Dict Saved")
            # Building the cache is the whole job in this mode.
            sys.exit(0)
        else:
            print("Train/Val CWiki Candidates already exists")
            # Load from the same path that was checked/written above; the
            # previous code referenced an undefined local name here.
            trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
            print("Loaded dict")
            # Sanity check on one hard-coded sample key: map candidate
            # indices back to wids and then to Wikipedia titles.
            key = ('barackobama', '534366')
            (candidates, probs) = (trval_cand_dict[key][0],
                                   trval_cand_dict[key][1])
            candidates = [self.idx2knwid[wididx] for wididx in candidates]
            candidates = [self.wid2WikiTitle[wid] for wid in candidates]

            print((key, candidates, probs))
Beispiel #7
0
    def __init__(self,
                 config,
                 widWikititle_file,
                 widLabel_file,
                 word_threshold=1):
        '''Build whichever vocab pickles are still missing.

        Covers the training-data vocabs (word, label, known wid, coherence
        strings), the wid -> Wikipedia title map, the wid -> types map and
        the GloVe word vocab. Each one is only (re)built when its pickle is
        absent.

        Args:
            config: Object providing the mention paths and all pickle paths.
            widWikititle_file: Input passed to ``make_widWikititleMap``.
            widLabel_file: Input passed to ``make_wid2TypesMap``.
            word_threshold: Forwarded to ``make_training_data_vocabs``.
        '''
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = 'unk'  # In tune with word2vec
        self.unk_wid = "<unk_wid>"

        mens_dir = config.train_mentions_dir
        self.tr_mens_dir = mens_dir
        self.tr_mens_files = utils.get_mention_files(mens_dir)
        self.num_tr_mens_files = len(self.tr_mens_files)
        print("[#] Training Mention Files : {} files".format(
            self.num_tr_mens_files))
        print("[#] Validation Mentions File : {}".format(
            config.val_mentions_file))

        # --- Vocabs derived from the training mentions ---
        word_pkl = config.word_vocab_pkl
        label_pkl = config.label_vocab_pkl
        kwnwid_pkl = config.kwnwid_vocab_pkl
        coh_pkl = config.cohstring_vocab_pkl
        cohG1_pkl = config.cohstringG1_vocab_pkl
        if not self.check_train_data_vocabs_exist(
                word_pkl, label_pkl, kwnwid_pkl, coh_pkl, cohG1_pkl):
            print("[#] Loading pretrained word2vec embeddings .. ")
            self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
                config.word2vec_bin_gz, binary=True)
            self.word2vec.init_sims(replace=True)

            print("All/Some Training Vocabs do not exist. Making ... ")
            self.make_training_data_vocabs(
                self.tr_mens_dir, self.tr_mens_files, word_pkl, label_pkl,
                kwnwid_pkl, coh_pkl, cohG1_pkl,
                config.cohstringG9_vocab_pkl, word_threshold)

        # --- wid -> Wikipedia title ---
        title_map_pkl = config.widWiktitle_pkl
        if not os.path.exists(title_map_pkl):
            print(" [#] Making wid2Wikititle Map")
            title_map = self.make_widWikititleMap(widWikititle_file)
            utils.save(title_map_pkl, title_map)
            print(" [#] Done. Size : {}".format(len(title_map)))

        # --- wid -> type labels ---
        types_map_pkl = config.wid2typelabels_vocab_pkl
        if not os.path.exists(types_map_pkl):
            print(" [#] Making wid2Types Map")
            types_map = self.make_wid2TypesMap(widLabel_file)
            utils.save(types_map_pkl, types_map)
            print(" [#] Done. Size : {}".format(len(types_map)))

        # --- GloVe word vocab ---
        glove_vocab_pkl = config.glove_word_vocab_pkl
        if not os.path.exists(glove_vocab_pkl):
            print(" [#] Makign GloVe Word Vocabs")
            glove_vectors = utils.load(config.glove_pkl)
            print("   [#] Glove embeddings loaded. Size: {}".format(
                len(glove_vectors)))
            g_word2idx, g_idx2word = self.make_glovewordvocab(glove_vectors)
            utils.save(glove_vocab_pkl, (g_word2idx, g_idx2word))
Beispiel #8
0
    def make_training_data_vocabs(self, tr_mens_dir, tr_mens_files,
                                  word_vocab_pkl, label_vocab_pkl,
                                  knwn_wid_vocab_pkl, coh_vocab_pkl,
                                  cohG1_vocab_pkl, cohG2_vocab_pkl, threshold):
        '''Build and save the vocabs derived from the training mentions.

        Reads every mentions file once and produces (element2idx, idx2element)
        pairs for:
          * type labels (from ``mention.types``),
          * known wids (from ``mention.wid``, index 0 is ``self.unk_wid``),
          * words (every word in ``self.word2vec.vocab``, index 0 is
            ``self.unk_word``),
          * coherence strings: all of them, those seen more than once, and
            those seen more than 9 times (index 0 is ``self.unk_word``).

        Args:
            tr_mens_dir: Directory containing the training mention files.
            tr_mens_files: File names inside ``tr_mens_dir``.
            word_vocab_pkl: Output path of the word vocab.
            label_vocab_pkl: Output path of the label vocab.
            knwn_wid_vocab_pkl: Output path of the known-wid vocab.
            coh_vocab_pkl: Output path of the full coherence vocab.
            cohG1_vocab_pkl: Output path of the coherence vocab with cnt > 1.
            cohG2_vocab_pkl: Output path of the coherence vocab with cnt > 9.
            threshold: Currently unused; the word vocab is taken from
                ``self.word2vec.vocab`` rather than thresholded word counts.
        '''
        print("Building training vocabs : ")
        word_count_dict = {}
        coh_count_dict = {}
        idx2word = [self.unk_word]
        word2idx = {self.unk_word: 0}
        idx2label = []
        label2idx = {}
        idx2knwid = [self.unk_wid]
        knwid2idx = {self.unk_wid: 0}
        idx2coh = [self.unk_word]
        coh2idx = {self.unk_word: 0}
        idx2cohG1 = [self.unk_word]
        cohG12idx = {self.unk_word: 0}
        idx2cohG2 = [self.unk_word]
        cohG22idx = {self.unk_word: 0}

        for files_done, fname in enumerate(tr_mens_files, start=1):
            mens_fpath = os.path.join(tr_mens_dir, fname)
            mentions = utils.make_mentions_from_file(mens_file=mens_fpath)
            for mention in mentions:
                for typel in mention.types:
                    self.add_to_vocab(element2idx=label2idx,
                                      idx2element=idx2label,
                                      element=typel)
                for token in mention.sent_tokens:
                    word_count_dict[token] = word_count_dict.get(token, 0) + 1

                for cohstring in mention.coherence:
                    coh_count_dict[cohstring] = (
                        coh_count_dict.get(cohstring, 0) + 1)

                self.add_to_vocab(element2idx=knwid2idx,
                                  idx2element=idx2knwid,
                                  element=mention.wid)
            print("Files done : {}".format(files_done))

        # WORD VOCAB: taken from the word2vec vocabulary; word_count_dict is
        # only used for the total-words report below.
        for word in self.word2vec.vocab:
            self.add_to_vocab(element2idx=word2idx,
                              idx2element=idx2word,
                              element=word)

        # COHERENCE VOCABS: full set, plus the cnt > 1 and cnt > 9 subsets.
        for (cstr, cnt) in coh_count_dict.items():
            self.add_to_vocab(element2idx=coh2idx,
                              idx2element=idx2coh,
                              element=cstr)
            if cnt > 1:
                self.add_to_vocab(element2idx=cohG12idx,
                                  idx2element=idx2cohG1,
                                  element=cstr)
            if cnt > 9:
                self.add_to_vocab(element2idx=cohG22idx,
                                  idx2element=idx2cohG2,
                                  element=cstr)

        print(" [#] Total Words : : {}".format(len(word_count_dict)))
        print(" [#] Threhsolded word vocab. Word Vocab Size: {}".format(
            len(idx2word)))
        utils.save(word_vocab_pkl, (word2idx, idx2word))
        print(" [#] Label Vocab Size: {}".format(len(idx2label)))
        utils.save(label_vocab_pkl, (label2idx, idx2label))
        print(" [#] Known Wiki Titles Size: {}".format(len(idx2knwid)))
        utils.save(knwn_wid_vocab_pkl, (knwid2idx, idx2knwid))
        print(" [#] Coherence String Set Size: {}".format(len(idx2coh)))
        utils.save(coh_vocab_pkl, (coh2idx, idx2coh))
        print(" [#] Coherence String (cnt > 1) Size: {}".format(
            len(idx2cohG1)))
        utils.save(cohG1_vocab_pkl, (cohG12idx, idx2cohG1))
        # The filter above is cnt > 9, so report that threshold.
        print(" [#] Coherence String (cnt > 9) Size: {}".format(
            len(idx2cohG2)))
        utils.save(cohG2_vocab_pkl, (cohG22idx, idx2cohG2))
Beispiel #9
0
    b.incrementCrossWikis(test_mentions)
    print("After increments ... ")
    print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
    print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))

    print(config.ace_mentions_file)
    test_mentions = utils.make_mentions_from_file(config.ace_mentions_file)
    b.incrementCrossWikis(test_mentions)
    print("After increments ... ")
    print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
    print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))

    print(config.msnbc_inkb_test_file)
    test_mentions = utils.make_mentions_from_file(config.msnbc_inkb_test_file)
    b.incrementCrossWikis(test_mentions)
    print("After increments ... ")
    print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
    print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))

    print(config.wikidata_inkb_test_file)
    test_mentions = utils.make_mentions_from_file(config.wikidata_inkb_test_file)
    b.incrementCrossWikis(test_mentions)
    print("After increments ... ")
    print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
    print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))
    '''

    utils.save(config.test_kwnen_cwikis_pkl, b.test_kwn_cwiki)
    utils.save(config.test_allen_cwikis_pkl, b.test_all_cwiki)
    print("DONE")