def __init__(self, config, vocabloader):
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()

    # Initialize every type label with a zero count
    self.typeCount = {}
    for t in self.label2idx.keys():
        self.typeCount[t] = 0
    self.makeLabelCount()
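# A minimal sketch of the tally that makeLabelCount presumably performs (the
# real method is defined elsewhere on this class). Assumptions: each mention
# object returned by utils.make_mentions_from_file exposes a `types` list of
# type labels; the helper name below is illustrative, not from the repo.
import os

def count_type_labels(mens_dir, mens_files, label2idx):
    # Start every known label at zero, mirroring the loop above
    typeCount = {t: 0 for t in label2idx}
    for mens_file in mens_files:
        mens_fpath = os.path.join(mens_dir, mens_file)
        for mention in utils.make_mentions_from_file(mens_fpath):
            for t in mention.types:
                if t in typeCount:
                    typeCount[t] += 1
    return typeCount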
def __init__(self, config, vocabloader, val_file, num_cands, batch_size,
             strict_context=True, pretrain_wordembed=True,
             wordDropoutKeep=1.0, cohDropoutKeep=1.0):
    '''Reader intended primarily for training data; it can also serve
    validation and test files as input.

    Mention candidates must already have been added to the
    TrValCandidateDict using readers.train.crosswikis_vocab.
    DataType 0/1 corresponds to train/val_file.
    '''
    self.config = config
    self.batch_size = batch_size
    print("[#] Initializing Training Reader Batch Size: {}".format(
        self.batch_size))
    stime = time.time()

    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with GloVe
    self.unk_wid = "<unk_wid>"

    self.pretrain_wordembed = pretrain_wordembed
    assert 0.0 < wordDropoutKeep <= 1.0
    self.wordDropoutKeep = wordDropoutKeep
    assert 0.0 < cohDropoutKeep <= 1.0
    self.cohDropoutKeep = cohDropoutKeep
    self.num_cands = num_cands
    self.strict_context = strict_context

    # Coherence string vocab
    (self.cohG92idx, self.idx2cohG9) = utils.load(
        config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)
    print("[#] Coherence Loaded. Num Coherence Strings: {}".format(
        self.num_cohstr))

    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print("[#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))

    etime = time.time()
    ttime = etime - stime
    print("[#] TRAINING READER LOADING COMPLETE. "
          "Time Taken: {} secs\n".format(ttime))
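# A usage sketch. Assumptions: the enclosing class is named
# TrainingDataReader, and the hyperparameter values below (batch size,
# dropout-keep rates) are illustrative, not prescribed by this file;
# num_cands=30 matches the candidate count used elsewhere in these readers.
#
#   config = Config("configs/all_mentions_config.ini")
#   vocabloader = VocabLoader(config)
#   reader = TrainingDataReader(
#       config, vocabloader, val_file=config.val_mentions_file,
#       num_cands=30, batch_size=64, strict_context=True,
#       pretrain_wordembed=True, wordDropoutKeep=0.6, cohDropoutKeep=0.6)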
def __init__(self, config, vocabloader):
    '''Makes the pruned crosswikis dict and the candidate dictionary for
    training and validation data.

    train_val_cwikis_pkl: Slice of crosswikis for surfaces in train/val
        (NOT USED)
    train_val_cwikis_cands_pkl: Train/Val data only contain known entities.
        This dict acts as a pre-cache of mention candidates.
        Key: (LNRM(surface), WID)
        Value: ([Candidate_IDXs], [CProbs])
        Candidate_IDXs: The first idx is the true wid_idx; the rest are
        candidates, padded with Unk_Wid_Idx(=0) if fewer than the number
        of candidates needed.
    '''
    self.config = config
    train_mentions_dir = config.train_mentions_dir
    val_mentions_file = config.val_mentions_file
    test_mentions_file = config.test_mentions_file
    tr_mens_files = utils.get_mention_files(train_mentions_dir)
    self.numc = 30

    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    if not os.path.exists(config.trval_kwnidx_cands_pkl):
        self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
        print("Crosswikis Loaded. Size: {}".format(
            len(self.crosswikis_dict)))
        print("Size of known entities: {}".format(len(self.knwid2idx)))
        print("Making Train/Validation/Test CWiki Candidates.\n"
              "{Key: (surface, wid), V: ([CandWids], [PriorProbs])}")
        train_val_candidates_dict = self.make_train_val_candidatesDict(
            train_mentions_dir, tr_mens_files,
            val_mentions_file, test_mentions_file)
        utils.save(config.trval_kwnidx_cands_pkl, train_val_candidates_dict)
        print("Train/Val Candidates Dict Saved")
        sys.exit(0)
    else:
        print("Train/Val CWiki Candidates already exist")
        trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
        print("Loaded dict")
        # Sanity check against a known key
        key = ('barackobama', '534366')
        (candidates, probs) = (trval_cand_dict[key][0],
                               trval_cand_dict[key][1])
        candidates = [self.idx2knwid[wididx] for wididx in candidates]
        candidates = [self.wid2WikiTitle[wid] for wid in candidates]
        print((key, candidates, probs))
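# The (surface, wid) keys above use LNRM(surface). A sketch of that
# normalization, consistent with the 'barackobama' sanity-check key above;
# the exact Unicode handling is an assumption and the helper name `lnrm`
# is illustrative.
import unicodedata

def lnrm(surface):
    """Lower-case the surface, drop diacritics, keep only alphanumerics."""
    s = unicodedata.normalize('NFKD', surface)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    return ''.join(c for c in s.lower() if c.isalnum())

# lnrm("Barack Obama") -> 'barackobama'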
def __init__(self, config, vocabloader, testfile):
    self.test_mentions = utils.make_mentions_from_file(testfile)
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()

    # {wikititle: set(context_words)}
    self.train_entities = {}
    # {wikititle: [set(context_words) per test mention]}
    self.test_entities = {}
    self.encontext_pr = {}

    self.getTestEntitiesContextWords()
    self.getTrainEntitiesContextWords()
    self.computeEntityContextPR(self.train_entities, self.test_entities)
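# A sketch of the set-overlap precision/recall this class appears to set up
# (assumed semantics: compare one test mention's context-word set against the
# entity's aggregate training-context set; the helper is illustrative and not
# the repo's computeEntityContextPR):
def context_pr(train_context_words, mention_context_words):
    overlap = len(train_context_words & mention_context_words)
    p = overlap / len(mention_context_words) if mention_context_words else 0.0
    r = overlap / len(train_context_words) if train_context_words else 0.0
    return (p, r)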
def __init__(self, config, vocabloader):
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

    print("Loading Coherence String Dicts")
    (coh2idx, idx2coh) = utils.load(config.cohstring_vocab_pkl)
    (cohG92idx, idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
    print("Coherence String set size : {}, cnt >= 10 size : {}".format(
        len(idx2coh), len(idx2cohG9)))

    self.testDataCountCohLessMens(config.val_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.test_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.ace_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_dev_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_test_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_train_file, cohG92idx)
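# Sketch of how a count-thresholded vocab like cohstringG9 (strings with
# cnt >= 10, per the print above) could be derived from raw string counts.
# Illustrative helper only; the repo builds these vocab pickles elsewhere.
def threshold_vocab(counts, min_count=10):
    idx2str = [s for (s, c) in counts.items() if c >= min_count]
    str2idx = {s: i for (i, s) in enumerate(idx2str)}
    return (str2idx, idx2str)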
def __init__(self, config, vocabloader, num_cands):
    self.unk_wid = "<unk_wid>"
    self.g_num_wids = 0
    self.g_num_elements = 0

    # Known WID vocab
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_kwn_entities = len(self.idx2knwid)
    print(" [#] Loaded. Num of known wids : {}".format(
        self.num_kwn_entities))

    # Candidates dict
    print("[#] Loading training/val crosswikis candidate dict ... ")
    self.trval_cands_dict = vocabloader.getTrainValCandidateDict()

    print("[#] Training Mentions Dir : {}".format(
        config.train_mentions_dir))
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print(" [#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))

    print("[#] Validation Mentions File : {}".format(
        config.val_mentions_file))
    print("[#] Test Mentions File : {}".format(config.test_mentions_file))
    print("[#] Pre-loading validation mentions ... ")
    self.val_mentions = utils.make_mentions_from_file(
        config.val_mentions_file)
    self.test_mentions = utils.make_mentions_from_file(
        config.test_mentions_file)
    self.num_val_mens = len(self.val_mentions)
    self.num_test_mens = len(self.test_mentions)
    print("[#] Validation Mentions : {}, Test Mentions : {}".format(
        self.num_val_mens, self.num_test_mens))

    self.num_cands = num_cands
    print("\n[#] LOADING COMPLETE:")
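# Sketch of the candidate-padding convention the dict above follows: the
# first idx is the true wid_idx and short lists are padded with the
# <unk_wid> index 0 (per the candidate-dict docstring earlier). The helper
# name and the 0.0 padding probability are illustrative assumptions.
def pad_candidates(cand_idxs, cprobs, num_cands, unk_wid_idx=0):
    cand_idxs = cand_idxs[:num_cands]
    cprobs = cprobs[:num_cands]
    pad = num_cands - len(cand_idxs)
    return (cand_idxs + [unk_wid_idx] * pad, cprobs + [0.0] * pad)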
def __init__(self, config, vocabloader, testfile):
    self.test_mentions = utils.make_mentions_from_file(testfile)
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()

    # wid2Wikititle map
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()
    print(" [#] Size of Wid2Wikititle: {}".format(len(self.wid2WikiTitle)))

    # {wikititle: set(coherence_words)}
    self.train_entities = {}
    # {wikititle: [set(coherence_words) per test mention]}
    self.test_entities = {}
    self.encontext_pr = {}

    self.getTestEntitiesCoherenceWords()
    self.getTrainEntitiesCoherenceWords()
    self.computeEntityCoherencePR(self.train_entities, self.test_entities)
def __init__(self, config, widWikititle_file, widLabel_file,
             word_threshold=1):
    '''Given training data, makes the word vocab, glove word vocab,
    doc_mentions vocab, type labels vocab, known_wid vocab, and the
    wid2Wikititle map.
    '''
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"

    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print("[#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))
    print("[#] Validation Mentions File : {}".format(
        config.val_mentions_file))

    tr_data_vocabs_exist = self.check_train_data_vocabs_exist(
        config.word_vocab_pkl, config.label_vocab_pkl,
        config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
        config.cohstringG1_vocab_pkl)

    if not tr_data_vocabs_exist:
        print("[#] Loading pretrained word2vec embeddings .. ")
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
            config.word2vec_bin_gz, binary=True)
        self.word2vec.init_sims(replace=True)
        print("All/Some Training Vocabs do not exist. Making ... ")
        self.make_training_data_vocabs(
            self.tr_mens_dir, self.tr_mens_files,
            config.word_vocab_pkl, config.label_vocab_pkl,
            config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
            config.cohstringG1_vocab_pkl, config.cohstringG9_vocab_pkl,
            word_threshold)

    if not os.path.exists(config.widWiktitle_pkl):
        print(" [#] Making wid2Wikititle Map")
        wid2Wikititle = self.make_widWikititleMap(widWikititle_file)
        utils.save(config.widWiktitle_pkl, wid2Wikititle)
        print(" [#] Done. Size : {}".format(len(wid2Wikititle)))

    if not os.path.exists(config.wid2typelabels_vocab_pkl):
        print(" [#] Making wid2Types Map")
        wid2types = self.make_wid2TypesMap(widLabel_file)
        utils.save(config.wid2typelabels_vocab_pkl, wid2types)
        print(" [#] Done. Size : {}".format(len(wid2types)))

    if not os.path.exists(config.glove_word_vocab_pkl):
        print(" [#] Making GloVe Word Vocab")
        glove2vec = utils.load(config.glove_pkl)
        print(" [#] Glove embeddings loaded. Size: {}".format(
            len(glove2vec)))
        (glove_word2idx, glove_idx2word) = self.make_glovewordvocab(
            glove2vec)
        utils.save(config.glove_word_vocab_pkl,
                   (glove_word2idx, glove_idx2word))
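# A hypothetical sketch of make_widWikititleMap, assuming widWikititle_file
# holds tab-separated "wid<TAB>WikiTitle" lines. The file format is inferred
# from the wid -> title lookups elsewhere in these readers, not confirmed
# here, and the standalone helper name is illustrative.
def make_widWikititle_map(widWikititle_file):
    wid2Wikititle = {}
    with open(widWikititle_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                wid2Wikititle[parts[0]] = parts[1]
    return wid2Wikititle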
            outf.write(m)
            outf.write("\n")

    print("Processing : {}".format(self.cold_val2))
    with open(self.cold_val2, 'r') as f:
        lines = f.readlines()
    for line in lines:
        m = line.strip()
        wid = m.split("\t")[1]
        if wid in self.coldWIDS:
            outf.write(m)
            outf.write("\n")
    outf.close()


if __name__ == '__main__':
    config = Config("configs/all_mentions_config.ini")
    vocabloader = VocabLoader(config)
    a = TrainColdValEntityIntersection(config, vocabloader)
    a.findColdEntitiesInTest(config.aida_inkb_dev_file)
    a.findColdEntitiesInTest(config.aida_inkb_test_file)
    a.findColdEntitiesInTest(config.ace_mentions_file)
    a.findColdEntitiesInTest(config.wikidata_inkb_test_file)
    a.findColdEntitiesInTest(config.msnbc_inkb_test_file)
    print(utils.get_mention_files(config.train_mentions_dir))
    # a.updateKnwWidVocab()