def updateTrValCandDict(self, trValCandDict_pkl, crosswikis_pkl,
                        knwn_wid_vocab_pkl, *args):
    '''Add candidates for mentions from additional datasets to an
    existing train/val candidate dict and save it back in place.'''
    if not os.path.exists(trValCandDict_pkl):
        print("Train/Val CWiki Candidates Dict doesn't exist")
        sys.exit()
    print("Updating TrValKwnCandDict for: {}".format(args))
    print("Loading trvalCandsDict ... ")
    candsDict = utils.load(trValCandDict_pkl)
    print("TrValCandDictSize : {}".format(len(candsDict)))
    self.crosswikis_dict = utils.load_crosswikis(crosswikis_pkl)
    print("Loading known wid2idx dict")
    (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)
    print("Adding candidates for additional mentions")
    for test_file in args:
        print(test_file)
        mentions = utils.make_mentions_from_file(mens_file=test_file)
        self._addCandidatesForAdditionalMentions(mentions, candsDict)
        print("Size now : {}".format(len(candsDict)))
    utils.save(trValCandDict_pkl, candsDict)
    print("TrValCandDictSize : {}".format(len(candsDict)))
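# Hedged sketch (not the author's code): `_addCandidatesForAdditionalMentions`
# is called above but not defined in this section. Based on the candidate-dict
# layout documented in the candidate-dict __init__ below (Key: (surface, wid),
# Value: ([Candidate_IDXs], [CProbs]), true wid idx first, padded with the unk
# idx 0), a plausible implementation looks like this. The `utils.getLnrm`
# helper and the `m.surface` attribute are assumptions.
def _addCandidatesForAdditionalMentions_sketch(self, mentions, candsDict):
    for m in mentions:
        lnrm = utils.getLnrm(m.surface)  # assumed surface normalizer
        key = (lnrm, m.wid)
        if key in candsDict or m.wid not in self.knwid2idx:
            continue
        cand_idxs = [self.knwid2idx[m.wid]]  # true entity idx goes first
        cprobs = [0.0]
        for (wid, prob) in self.crosswikis_dict.get(lnrm, []):
            if len(cand_idxs) == self.numc:
                break
            if wid == m.wid:
                cprobs[0] = prob  # prior prob for the true entity
            elif wid in self.knwid2idx:
                cand_idxs.append(self.knwid2idx[wid])
                cprobs.append(prob)
        # Pad so every value holds exactly self.numc entries.
        cand_idxs += [0] * (self.numc - len(cand_idxs))
        cprobs += [0.0] * (self.numc - len(cprobs))
        candsDict[key] = (cand_idxs, cprobs)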
def makeCWKnown(self, cwOutPath):
    '''Prune the full crosswikis dict to surfaces that have at least one
    known-entity candidate, keeping at most MAXCAND candidates each.'''
    cw = {}
    MAXCAND = 30
    surfacesProcessed = 0
    for surface, c_cprobs in self.crosswikis.items():
        surfacesProcessed += 1
        if surfacesProcessed % 1000000 == 0:
            print("Surfaces Processed : {}".format(surfacesProcessed))
        if len(c_cprobs) == 0:
            continue
        if len(surface) <= 1:
            continue
        candsAdded = 0
        c_probs = ([], [])  # parallel lists: ([cand wids], [prior probs])
        for (wid, prob) in c_cprobs:
            if candsAdded == MAXCAND:
                break
            if wid in self.knwid2idx:
                c_probs[0].append(wid)
                c_probs[1].append(prob)
                candsAdded += 1
        if candsAdded != 0:
            cw[surface] = c_probs
    print("Processed")
    print("Size of CW : {}".format(len(cw)))
    utils.save(cwOutPath, cw)
    print("Saved pruned CW")
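# Illustrative shape of the pruned dict written above (values are a pair of
# parallel lists; the probability here is a made-up number for the example):
#   cw["barackobama"] == (["534366", ...], [0.97, ...])
# i.e. surface -> ([candidate wids], [prior probs]), with at most MAXCAND
# entries and every wid restricted to the known-entity vocabulary.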
def updateKnwWidVocab(self):
    print("Old sizes: knwid2idx: {}, idx2knwid: {}".format(
        len(self.knwid2idx), len(self.idx2knwid)))
    for m in self.new_mentions:
        if m.wid not in self.knwid2idx:
            self.idx2knwid.append(m.wid)
            self.knwid2idx[m.wid] = len(self.idx2knwid) - 1
    print("New sizes: knwid2idx: {}, idx2knwid: {}".format(
        len(self.knwid2idx), len(self.idx2knwid)))
    utils.save(self.new_knw_wid_vocab, (self.knwid2idx, self.idx2knwid))
def convertGloveToNumpy(self):
    if os.path.exists(self.config.glove_numpy_pkl):
        print("Glove numpy already exists.")
    else:
        print("Making glove numpy")
        # Row i of the resulting matrix is the embedding for
        # self.idx2word[i].
        wordvecs = []
        for word in self.idx2word:
            wordvecs.append(self.word2vec[word])
        glovenumpy = np.array(wordvecs)
        utils.save(self.config.glove_numpy_pkl, glovenumpy)
        print("done")
def updateTestCandsDict(self, test_file):
    print("Updating Test Candidates Dict. Size:{}\n"
          "Key:(surface, wid), V: ([CandWids], [PriorProbs])".format(
              len(self.test_kwnen_cands_dict)))
    print("Test File: {}".format(test_file))
    test_cands_dict = self.make_test_candidates(test_file)
    self.test_kwnen_cands_dict.update(test_cands_dict)
    utils.save(self.config.test_kwnen_cands_pkl,
               self.test_kwnen_cands_dict)
    print("Test Candidates Dict Saved. Size:{}".format(
        len(self.test_kwnen_cands_dict)))
def __init__(self, config, vocabloader):
    '''Makes the pruned crosswikis dict and the candidate dictionary
    for training and validation data.

    train_val_cwikis_pkl: Slice of crosswikis for surfaces in train/val
        (NOT USED)
    train_val_cwikis_cands_pkl: Train/Val data only contain known
        entities. This dict acts as a pre-cache of mention candidates.
        Key: (LNRM(surface), WID)
        Value: ([Candidate_IDXs], [CProbs])
        Candidate_IDXs: The first idx is the true wid_idx; the rest are
        candidates, padded with Unk_Wid_Idx(=0) if fewer than the number
        of candidates needed.
    '''
    self.config = config
    train_mentions_dir = config.train_mentions_dir
    val_mentions_file = config.val_mentions_file
    test_mentions_file = config.test_mentions_file
    tr_mens_files = utils.get_mention_files(train_mentions_dir)
    self.numc = 30
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    if not os.path.exists(config.trval_kwnidx_cands_pkl):
        self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
        print("Crosswikis Loaded. Size: {}".format(
            len(self.crosswikis_dict)))
        print("Size of known entities: {}".format(len(self.knwid2idx)))
        print("Making Train/Validation/Test CWiki Candidates.\n"
              "Key: (surface, wid), V: ([CandWids], [PriorProbs])")
        train_val_candidates_dict = self.make_train_val_candidatesDict(
            train_mentions_dir, tr_mens_files, val_mentions_file,
            test_mentions_file)
        utils.save(config.trval_kwnidx_cands_pkl,
                   train_val_candidates_dict)
        print("Train/Val Candidates Dict Saved")
        sys.exit(0)
    else:
        print("Train/Val CWiki Candidates Dict already exists")
        trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
        print("Loaded dict")
        # Sanity check: inspect the candidates cached for one known key.
        key = ('barackobama', '534366')
        (candidates, probs) = (trval_cand_dict[key][0],
                               trval_cand_dict[key][1])
        candidates = [self.idx2knwid[wididx] for wididx in candidates]
        candidates = [self.wid2WikiTitle[wid] for wid in candidates]
        print((key, candidates, probs))
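# Hedged sketch of the LNRM(surface) normalization referenced in the
# docstring above. The real helper lives elsewhere (likely in utils) and may
# differ; this follows the usual crosswikis recipe of stripping diacritics,
# lowercasing, and dropping non-alphanumerics, which is consistent with the
# 'barackobama' key probed in the sanity check.
import unicodedata

def get_lnrm(surface):
    # e.g. "Barack Obama" -> "barackobama"
    norm = unicodedata.normalize('NFKD', surface)
    norm = norm.encode('ascii', 'ignore').decode('ascii')
    return ''.join(ch for ch in norm.lower() if ch.isalnum())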
def __init__(self, config, widWikititle_file, widLabel_file,
             word_threshold=1):
    '''Given training data, makes the word vocab, glove word vocab,
    doc_mentions vocab, type labels vocab, known_wid vocab, and the
    wid2Wikititle map.
    '''
    # start_word / end_word are module-level sentence-boundary constants.
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"

    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print("[#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))
    print("[#] Validation Mentions File : {}".format(
        config.val_mentions_file))

    tr_data_vocabs_exist = self.check_train_data_vocabs_exist(
        config.word_vocab_pkl, config.label_vocab_pkl,
        config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
        config.cohstringG1_vocab_pkl)

    if not tr_data_vocabs_exist:
        print("[#] Loading pretrained word2vec embeddings .. ")
        # gensim's pre-1.0 API; newer gensim exposes this as
        # KeyedVectors.load_word2vec_format.
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
            config.word2vec_bin_gz, binary=True)
        self.word2vec.init_sims(replace=True)
        print("All/Some Training Vocabs do not exist. Making ... ")
        self.make_training_data_vocabs(
            self.tr_mens_dir, self.tr_mens_files, config.word_vocab_pkl,
            config.label_vocab_pkl, config.kwnwid_vocab_pkl,
            config.cohstring_vocab_pkl, config.cohstringG1_vocab_pkl,
            config.cohstringG9_vocab_pkl, word_threshold)

    if not os.path.exists(config.widWiktitle_pkl):
        print(" [#] Making wid2Wikititle Map")
        wid2Wikititle = self.make_widWikititleMap(widWikititle_file)
        utils.save(config.widWiktitle_pkl, wid2Wikititle)
        print(" [#] Done. Size : {}".format(len(wid2Wikititle)))

    if not os.path.exists(config.wid2typelabels_vocab_pkl):
        print(" [#] Making wid2Types Map")
        wid2types = self.make_wid2TypesMap(widLabel_file)
        utils.save(config.wid2typelabels_vocab_pkl, wid2types)
        print(" [#] Done. Size : {}".format(len(wid2types)))

    if not os.path.exists(config.glove_word_vocab_pkl):
        print(" [#] Making GloVe Word Vocabs")
        glove2vec = utils.load(config.glove_pkl)
        print(" [#] Glove embeddings loaded. Size: {}".format(
            len(glove2vec)))
        (glove_word2idx, glove_idx2word) = self.make_glovewordvocab(
            glove2vec)
        utils.save(config.glove_word_vocab_pkl,
                   (glove_word2idx, glove_idx2word))
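# Hedged sketch of `make_widWikititleMap` used above; neither its body nor
# the input file format is shown in this section. This assumes one
# "wid<TAB>WikiTitle" pair per line, which is a guess, not the confirmed
# format.
def make_widWikititleMap_sketch(self, widWikititle_file):
    wid2Wikititle = {}
    with open(widWikititle_file) as f:
        for line in f:
            wid, title = line.rstrip('\n').split('\t')
            wid2Wikititle[wid] = title
    return wid2Wikititle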
def make_training_data_vocabs(self, tr_mens_dir, tr_mens_files,
                              word_vocab_pkl, label_vocab_pkl,
                              knwn_wid_vocab_pkl, coh_vocab_pkl,
                              cohG1_vocab_pkl, cohG2_vocab_pkl,
                              threshold):
    print("Building training vocabs : ")
    word_count_dict = {}
    coh_count_dict = {}
    idx2word = [self.unk_word]
    word2idx = {self.unk_word: 0}
    idx2label = []
    label2idx = {}
    idx2knwid = [self.unk_wid]
    knwid2idx = {self.unk_wid: 0}
    idx2coh = [self.unk_word]
    coh2idx = {self.unk_word: 0}
    idx2cohG1 = [self.unk_word]
    cohG12idx = {self.unk_word: 0}
    # Despite the G2 name, this vocab keeps strings with count > 9
    # (the caller saves it to config.cohstringG9_vocab_pkl).
    idx2cohG2 = [self.unk_word]
    cohG22idx = {self.unk_word: 0}

    files_done = 0
    for file in tr_mens_files:
        mens_fpath = os.path.join(tr_mens_dir, file)
        mentions = utils.make_mentions_from_file(mens_file=mens_fpath)
        for mention in mentions:
            for typel in mention.types:
                self.add_to_vocab(element2idx=label2idx,
                                  idx2element=idx2label,
                                  element=typel)
            for token in mention.sent_tokens:
                if token not in word_count_dict:
                    word_count_dict[token] = 0
                word_count_dict[token] += 1
            for cohstring in mention.coherence:
                if cohstring not in coh_count_dict:
                    coh_count_dict[cohstring] = 0
                coh_count_dict[cohstring] += 1
            self.add_to_vocab(element2idx=knwid2idx,
                              idx2element=idx2knwid,
                              element=mention.wid)
        files_done += 1
        print("Files done : {}".format(files_done))
    # all-files-processed

    # WORD VOCAB
    # Count-based thresholding (the `threshold` argument) is disabled;
    # the word vocab is taken directly from the word2vec vocabulary.
    # for word, count in word_count_dict.items():
    #     if count > threshold:
    #         self.add_to_vocab(element2idx=word2idx,
    #                           idx2element=idx2word, element=word)
    for word in self.word2vec.vocab:
        self.add_to_vocab(element2idx=word2idx, idx2element=idx2word,
                          element=word)

    # Coherence vocab, plus thresholded variants (count > 1, count > 9)
    for (cstr, cnt) in coh_count_dict.items():
        self.add_to_vocab(element2idx=coh2idx, idx2element=idx2coh,
                          element=cstr)
        if cnt > 1:
            self.add_to_vocab(element2idx=cohG12idx,
                              idx2element=idx2cohG1, element=cstr)
        if cnt > 9:
            self.add_to_vocab(element2idx=cohG22idx,
                              idx2element=idx2cohG2, element=cstr)

    print(" [#] Total Words : {}".format(len(word_count_dict)))
    print(" [#] Thresholded word vocab. Word Vocab Size: {}".format(
        len(idx2word)))
    utils.save(word_vocab_pkl, (word2idx, idx2word))
    print(" [#] Label Vocab Size: {}".format(len(idx2label)))
    utils.save(label_vocab_pkl, (label2idx, idx2label))
    print(" [#] Known Wiki Titles Size: {}".format(len(idx2knwid)))
    utils.save(knwn_wid_vocab_pkl, (knwid2idx, idx2knwid))
    print(" [#] Coherence String Set Size: {}".format(len(idx2coh)))
    utils.save(coh_vocab_pkl, (coh2idx, idx2coh))
    print(" [#] Coherence String (cnt > 1) Size: {}".format(
        len(idx2cohG1)))
    utils.save(cohG1_vocab_pkl, (cohG12idx, idx2cohG1))
    print(" [#] Coherence String (cnt > 9) Size: {}".format(
        len(idx2cohG2)))
    utils.save(cohG2_vocab_pkl, (cohG22idx, idx2cohG2))
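# Hedged sketch of the `add_to_vocab` helper called throughout the method
# above. Its body is not shown in this section, but the call sites (and the
# identical inline pattern in updateKnwWidVocab) imply it appends an unseen
# element to the idx2element list and records its index in element2idx.
def add_to_vocab_sketch(self, element2idx, idx2element, element):
    if element not in element2idx:
        idx2element.append(element)
        element2idx[element] = len(idx2element) - 1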
b.incrementCrossWikis(test_mentions)
print("After increments ... ")
print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))

# Repeat the same increment-and-report step for each remaining test set.
for mens_file in [config.ace_mentions_file,
                  config.msnbc_inkb_test_file,
                  config.wikidata_inkb_test_file]:
    print(mens_file)
    test_mentions = utils.make_mentions_from_file(mens_file)
    b.incrementCrossWikis(test_mentions)
    print("After increments ... ")
    print(" Size of test known en cwiki : {}".format(len(b.test_kwn_cwiki)))
    print(" Size of test all en cwiki : {}".format(len(b.test_all_cwiki)))

utils.save(config.test_kwnen_cwikis_pkl, b.test_kwn_cwiki)
utils.save(config.test_allen_cwikis_pkl, b.test_all_cwiki)
print("DONE")