def updateTrValCandDict(self, trValCandDict_pkl, crosswikis_pkl,
                        knwn_wid_vocab_pkl, *args):
    if not os.path.exists(trValCandDict_pkl):
        print("Train/Val CWiki Candidates Dict doesn't exist")
        sys.exit()
    print("Updating TrValKwnCandDict for : ")
    print("Loading trvalCandsDict ... ")
    candsDict = utils.load(trValCandDict_pkl)
    print("TrValCandDictSize : {}".format(len(candsDict)))
    self.crosswikis_dict = utils.load_crosswikis(crosswikis_pkl)
    print("Loading known wid2idx dict")
    (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)

    print("Adding candidates for additional mentions")
    datasetsToUpdate = args
    for dataset in datasetsToUpdate:
        test_file = dataset
        print(test_file)
        mentions = utils.make_mentions_from_file(mens_file=test_file)
        self._addCandidatesForAdditionalMentions(mentions, candsDict)
        print("Size now : {}".format(len(candsDict)))

    utils.save(trValCandDict_pkl, candsDict)
    print("TrValCandDictSize : {}".format(len(candsDict)))
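# Hypothetical usage sketch (not from the repository): how updateTrValCandDict
# might be called to extend an existing candidate dict with mentions from extra
# evaluation files. The object name and the pickle/mention-file paths below are
# placeholders; any number of mention files can be passed through *args.
def _example_update_trval_cands(cand_builder):
    cand_builder.updateTrValCandDict(
        "vocab/trval_cands.pkl",          # existing Train/Val candidate dict
        "crosswikis/crosswikis.pkl",      # full crosswikis dictionary
        "vocab/knwn_wid_vocab.pkl",       # known-entity WID vocab
        "mentions/ace_mentions.txt",      # extra mention files (*args)
        "mentions/aida_dev_mentions.txt")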
def __init__(self, config, vocabloader, test_mentions_file):
    '''Updates a test crosswikis, i.e. the original crosswikis pruned down to
    only the surfaces that occur in the test data.

    Two dictionaries are maintained:
      test_kwn_cwiki : only has candidates that are in the known-entity set
      test_all_cwiki : all entities from the KB can be candidates
                       (i.e. the full crosswikis)
    '''
    if not os.path.exists(config.test_kwnen_cwikis_pkl):
        print("Test Known Entity CWiki does not exist ... ")
        self.test_kwn_cwiki = {}
    else:
        self.test_kwn_cwiki = utils.load(config.test_kwnen_cwikis_pkl)
        print("Size of test known en cwiki : {}".format(
            len(self.test_kwn_cwiki)))

    if not os.path.exists(config.test_allen_cwikis_pkl):
        print("Test Data All Entity CWiki does not exist ... ")
        self.test_all_cwiki = {}
    else:
        self.test_all_cwiki = utils.load(config.test_allen_cwikis_pkl)
        print("Size of test all en cwiki : {}".format(
            len(self.test_all_cwiki)))

    # Known WID Vocab
    print("[#] Loading Known Entities Vocab : ")
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_knwn_entities = len(self.idx2knwid)
    print(" [#] Loaded. Num of known wids : {}".format(
        self.num_knwn_entities))

    self.crosswikis_dict = utils.load_crosswikis(config.crosswikis_pkl)
    '''
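# Hedged sketch (an assumption about the data layout, not code from the repo):
# crosswikis maps a surface string to (wid, prob) candidate pairs, and the
# "known entity" variant keeps only candidates whose WID is in knwid2idx. The
# helper name below is hypothetical.
def _filter_candidates_to_known(candidates, knwid2idx):
    # candidates: list of (wid, prob) pairs for a single surface
    return [(wid, prob) for (wid, prob) in candidates if wid in knwid2idx]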
def __init__(self, test_mentions_file, word_vocab_pkl, label_vocab_pkl,
             knwn_wid_vocab_pkl):
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"
    self.tr_sup = 'tr_sup'
    self.tr_unsup = 'tr_unsup'

    if not (os.path.exists(word_vocab_pkl) and
            os.path.exists(label_vocab_pkl) and
            os.path.exists(knwn_wid_vocab_pkl)):
        print("At least one vocab not found. Run vocabs.py before running model.")
        sys.exit()

    # Word VOCAB
    print("[#] Loading word vocab ... ")
    (self.word2idx, self.idx2word) = utils.load(word_vocab_pkl)
    self.num_words = len(self.idx2word)
    print(" [#] Word vocab loaded. Size of vocab : {}".format(
        self.num_words))

    # Label Vocab
    print("[#] Loading label vocab ... ")
    (self.label2idx, self.idx2label) = utils.load(label_vocab_pkl)
    self.num_labels = len(self.idx2label)
    print(" [#] Label vocab loaded. Number of labels : {}".format(
        self.num_labels))

    # Known WID Vocab
    print("[#] Loading Known Entities Vocab : ")
    (self.knwid2idx, self.idx2knwid) = utils.load(knwn_wid_vocab_pkl)
    self.num_knwn_entities = len(self.idx2knwid)
    print(" [#] Loaded. Num of known wids : {}".format(
        self.num_knwn_entities))

    # Crosswikis
    # print("[#] Loading training/val crosswikis dictionary ... ")
    # self.crosswikis_dict = utils.load_crosswikis(trval_crosswikis_pkl)

    print("[#] Test Mentions File : {}".format(test_mentions_file))
    print("[#] Loading test mentions ... ")
    self.test_mentions = self._make_mentions_from_file(test_mentions_file)
    self.num_test_mentions = len(self.test_mentions)
    print("[#] Test Mentions : {}".format(self.num_test_mentions))

    print("\n[#] LOADING COMPLETE")
def getTrainValCandidateDict(self):
    if self.trval_cands_dict is None:
        if not os.path.exists(self.config.trval_kwnidx_cands_pkl):
            print("Train Validation Candidate Dict missing")
            sys.exit()
        self.trval_cands_dict = utils.load(self.config.trval_kwnidx_cands_pkl)
    return self.trval_cands_dict
def getLabelVocab(self):
    if self.label2idx is None or self.idx2label is None:
        if not os.path.exists(self.config.label_vocab_pkl):
            print("Label Vocab PKL missing")
            sys.exit()
        (self.label2idx, self.idx2label) = utils.load(self.config.label_vocab_pkl)
    return (self.label2idx, self.idx2label)
def getTestAllEnCwiki(self):
    if self.test_allen_cwikis is None:
        if not os.path.exists(self.config.test_allen_cwikis_pkl):
            print("Test All Entity CWikis Dict missing")
            sys.exit()
        self.test_allen_cwikis = utils.load(self.config.test_allen_cwikis_pkl)
    return self.test_allen_cwikis
def loadCrosswikis(self):
    if self.crosswikis is None:
        if not os.path.exists(self.config.crosswikis_pkl):
            print("Crosswikis pkl missing")
            sys.exit()
        self.crosswikis = utils.load(self.config.crosswikis_pkl)
    return self.crosswikis
def getWID2TypeLabels(self):
    if self.wid2TypeLabels is None:
        if not os.path.exists(self.config.wid2typelabels_vocab_pkl):
            print("wid2TypeLabels pkl missing")
            sys.exit()
        self.wid2TypeLabels = utils.load(self.config.wid2typelabels_vocab_pkl)
    return self.wid2TypeLabels
def loadKnownWIDDescVecs(self):
    if self.knownwid2descvecs is None:
        if not os.path.exists(self.config.knownwid2descvectors):
            print("Known WIDS Description Vectors PKL missing")
            sys.exit()
        self.knownwid2descvecs = utils.load(self.config.knownwid2descvectors)
    return self.knownwid2descvecs
def getWordVocab(self):
    if self.word2idx is None or self.idx2word is None:
        if not os.path.exists(self.config.word_vocab_pkl):
            print("Word Vocab PKL missing")
            sys.exit()
        (self.word2idx, self.idx2word) = utils.load(self.config.word_vocab_pkl)
    return (self.word2idx, self.idx2word)
def loadGloveNumpy(self):
    if self.glovenumpy is None:
        if not os.path.exists(self.config.glove_numpy_pkl):
            print("Glove_Numpy does not exist")
            sys.exit()
        self.glovenumpy = utils.load(self.config.glove_numpy_pkl)
    return self.glovenumpy
def loadGloveVectors(self):
    if self.glove2vec is None:
        if not os.path.exists(self.config.glove_pkl):
            print("Glove_Vectors_PKL does not exist")
            sys.exit()
        self.glove2vec = utils.load(self.config.glove_pkl)
    return self.glove2vec
def getWID2Wikititle(self):
    if self.wid2Wikititle is None:
        if not os.path.exists(self.config.widWiktitle_pkl):
            print("wid2Wikititle pkl missing")
            sys.exit()
        self.wid2Wikititle = utils.load(self.config.widWiktitle_pkl)
    return self.wid2Wikititle
def getKnwnWidVocab(self):
    if self.knwid2idx is None or self.idx2knwid is None:
        if not os.path.exists(self.config.kwnwid_vocab_pkl):
            print("Known Entities Vocab PKL missing")
            sys.exit()
        (self.knwid2idx, self.idx2knwid) = utils.load(self.config.kwnwid_vocab_pkl)
    return (self.knwid2idx, self.idx2knwid)
def getTestCandidateDict(self):
    if self.test_kwnen_cands_dict is None:
        if not os.path.exists(self.config.test_kwnen_cands_pkl):
            print("Test Known Entity Candidate Dict missing")
            sys.exit()
        self.test_kwnen_cands_dict = utils.load(
            self.config.test_kwnen_cands_pkl)
    return self.test_kwnen_cands_dict
def getCrosswikisSlice(self):
    if self.cwikis_slice is None:
        if not os.path.exists(self.config.crosswikis_slice):
            print("CWikis Slice Dict missing")
            sys.exit()
        print("Loading CWIKI Slice")
        self.cwikis_slice = utils.load(self.config.crosswikis_slice)
    return self.cwikis_slice
def loadPrunedCrosswikis(self):
    if self.crosswikis_pruned is None:
        if not os.path.exists(self.config.crosswikis_pruned_pkl):
            print("Crosswikis Pruned Does not exist.")
            sys.exit()
        self.crosswikis_pruned = utils.load(
            self.config.crosswikis_pruned_pkl)
    return self.crosswikis_pruned
def getGloveWordVocab(self):
    if self.gword2idx is None or self.gidx2word is None:
        if not os.path.exists(self.config.glove_word_vocab_pkl):
            print("Glove Word Vocab PKL missing")
            sys.exit()
        print("Loading Glove Word Vocabulary")
        (self.gword2idx, self.gidx2word) = utils.load(
            self.config.glove_word_vocab_pkl)
    return (self.gword2idx, self.gidx2word)
def getTestKnwEnCwiki(self):
    if self.test_knwen_cwikis is None:
        if not os.path.exists(self.config.test_kwnen_cwikis_pkl):
            print("Test Known Entity CWikis Dict missing")
            sys.exit()
        print("Loading Test Data Known Entity CWIKI")
        self.test_knwen_cwikis = utils.load(
            self.config.test_kwnen_cwikis_pkl)
    return self.test_knwen_cwikis
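# Usage sketch (hypothetical driver code, assuming a Config instance exists):
# the getters above lazy-load their pickle on first access and then serve the
# cached object, so repeated calls are cheap.
def _example_vocab_access(config):
    vocabloader = VocabLoader(config)
    knwid2idx, idx2knwid = vocabloader.getKnwnWidVocab()   # loads the pickle
    knwid2idx, idx2knwid = vocabloader.getKnwnWidVocab()   # served from cache
    return len(knwid2idx) == len(idx2knwid)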
def __init__(self, config, vocabloader):
    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)

    print("Loading Coherence String Dicts")
    (coh2idx, idx2coh) = utils.load(config.cohstring_vocab_pkl)
    (cohG92idx, idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
    print("Coherence String set Size : {}, cnt >= 10 size : {}".format(
        len(idx2coh), len(idx2cohG9)))

    self.testDataCountCohLessMens(config.val_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.test_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.ace_mentions_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_dev_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_test_file, cohG92idx)
    self.testDataCountCohLessMens(config.aida_inkb_train_file, cohG92idx)
def __init__(self, config, vocabloader):
    self.new_knw_wid_vocab = ("/save/ngupta19/wikipedia/wiki_mentions/wcoh/"
                              "vocab/new/new_knwn_wid_vocab.pkl")
    (self.knwid2idx, self.idx2knwid) = utils.load(self.new_knw_wid_vocab)
    newfile = "/save/ngupta19/wikipedia/wiki_mentions/wcoh/newmentions.txt"
    self.new_mentions = utils.make_mentions_from_file(newfile)
    self.coldWIDS = set()
def __init__(self, config, vocabloader):
    print("Loading Crosswikis")
    # self.crosswikis = vocabloader.loadCrosswikis()
    stime = time.time()
    self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
    ttime = time.time() - stime
    print("Crosswikis Loaded. Size : {}".format(len(self.crosswikis)))
    print("Time taken : {} secs".format(ttime))

    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    print("Size of known wids : {}".format(len(self.knwid2idx)))
def __init__(self, figermodel):
    print("###### ENTERED THE COLD WORLD OF THE UNKNOWN ##############")
    # Object of the WikiELModel Class
    self.fm = figermodel
    self.coldDir = self.fm.reader.coldDir
    coldWid2DescVecs_pkl = os.path.join(self.coldDir, "coldwid2descvecs.pkl")
    self.coldWid2DescVecs = utils.load(coldWid2DescVecs_pkl)
    self.num_cold_entities = self.fm.reader.num_cold_entities
    self.batch_size = self.fm.batch_size
    (self.coldwid2idx, self.idx2coldwid) = (self.fm.reader.coldwid2idx,
                                            self.fm.reader.idx2coldwid)
def __init__(self, config, vocabloader, val_file, num_cands, batch_size,
             strict_context=True, pretrain_wordembed=True,
             wordDropoutKeep=1.0, cohDropoutKeep=1.0):
    '''Reader intended for training data, but it can also read test data when
    validation and test files are passed as input. The requirement is that the
    mention candidates have already been added to the TrValCandidateDict using
    readers.train.crosswikis_vocab.

    DataType 0/1 corresponds to train/val_file.
    '''
    self.config = config
    self.batch_size = batch_size
    print("[#] Initializing Training Reader Batch Size: {}".format(
        self.batch_size))
    stime = time.time()

    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with glove
    self.unk_wid = "<unk_wid>"

    self.pretrain_wordembed = pretrain_wordembed
    assert 0.0 < wordDropoutKeep <= 1.0
    self.wordDropoutKeep = wordDropoutKeep
    assert 0.0 < cohDropoutKeep <= 1.0
    self.cohDropoutKeep = cohDropoutKeep
    self.num_cands = num_cands
    self.strict_context = strict_context

    # Coherence String Vocab
    (self.cohG92idx, self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)
    print("[#] Coherence Loaded. Num Coherence Strings: {}".format(
        self.num_cohstr))

    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print("[#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))

    etime = time.time()
    ttime = etime - stime
    print("[#] TRAINING READER LOADING COMPLETE. "
          "Time Taken: {} secs\n".format(ttime))
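# Hedged sketch (an assumption, not the repository's batching code): one common
# way a wordDropoutKeep probability is applied when building a training batch,
# replacing tokens with the unk token with probability (1 - keep). The helper
# name is hypothetical.
def _word_dropout_sketch(tokens, keep_prob, unk_word='unk'):
    import random  # local import so the sketch stays self-contained
    if keep_prob >= 1.0:
        return list(tokens)
    return [t if random.random() < keep_prob else unk_word for t in tokens]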
def __init__(self, config, vocabloader):
    '''Used to make the pruned crosswikis dict and the candidate dictionary
    for the training and validation data.

    train_val_cwikis_pkl : Slice of crosswikis for surfaces in train/val
        (NOT USED)
    train_val_cwikis_cands_pkl : Train/Val data only contain known entities.
        This dict acts as a pre-cache of mention candidates.
        Key   : (LNRM(surface), WID)
        Value : ([Candidate_IDXs], [CProbs])
        Candidate_IDXs : The first idx is the true wid_idx; the rest are
        candidates, padded with Unk_Wid_Idx(=0) if fewer than the number of
        candidates needed.
    '''
    self.config = config
    train_mentions_dir = config.train_mentions_dir
    val_mentions_file = config.val_mentions_file
    test_mentions_file = config.test_mentions_file

    tr_mens_files = utils.get_mention_files(train_mentions_dir)
    self.numc = 30

    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    if not os.path.exists(config.trval_kwnidx_cands_pkl):
        self.crosswikis_dict = vocabloader.loadPrunedCrosswikis()
        print("Crosswikis Loaded. Size: {}".format(
            len(self.crosswikis_dict)))
        print("Size of known entities: {}".format(len(self.knwid2idx)))

        print("Making Train/Validation/Test CWiki Candidates.\n"
              "{Key: (surface, wid), V: ([CandWids], [PriorProbs])}")
        train_val_candidates_dict = self.make_train_val_candidatesDict(
            train_mentions_dir, tr_mens_files, val_mentions_file,
            test_mentions_file)
        utils.save(config.trval_kwnidx_cands_pkl, train_val_candidates_dict)
        print("Train/Val Candidates Dict Saved")
        sys.exit(0)
    else:
        print("Train/Val CWiki Candidates already exists")
        trval_cand_dict = utils.load(config.trval_kwnidx_cands_pkl)
        print("Loaded dict")
        key = ('barackobama', '534366')
        (candidates, probs) = (trval_cand_dict[key][0],
                               trval_cand_dict[key][1])
        candidates = [self.idx2knwid[wididx] for wididx in candidates]
        candidates = [self.wid2WikiTitle[wid] for wid in candidates]
        print((key, candidates, probs))
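# Illustrative sketch (values are made up): the shape of the candidate dict
# described in the docstring above. Keys pair an LNRM-normalized surface with
# the gold WID; values hold candidate wid indices (gold first, padded with the
# unknown-wid index 0) and their prior probabilities.
def _example_candidate_entry(numc=30):
    unk_wid_idx = 0
    key = ('barackobama', '534366')   # (LNRM(surface), gold WID)
    cand_idxs = [17, 342, 9]          # gold wid_idx first, then other candidates
    cand_probs = [0.92, 0.05, 0.01]
    cand_idxs += [unk_wid_idx] * (numc - len(cand_idxs))
    cand_probs += [0.0] * (numc - len(cand_probs))
    return {key: (cand_idxs, cand_probs)}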
def __init__(self, config, vocabloader, test_mens_file, num_cands,
             batch_size, strict_context=True, pretrain_wordembed=True,
             coherence=True, glove=True):
    print("Loading Test Reader: {}".format(test_mens_file))
    self.typeOfReader = "test"
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"
    # self.useKnownEntitesOnly = True
    self.pretrain_wordembed = pretrain_wordembed
    self.coherence = coherence

    # Word Vocab
    (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
    self.num_words = len(self.idx2word)
    print(" [#] Word vocab loaded. Size of vocab : {}".format(
        self.num_words))

    # Label Vocab
    (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
    self.num_labels = len(self.idx2label)
    print(" [#] Label vocab loaded. Number of labels : {}".format(
        self.num_labels))

    # Known WID Vocab
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_knwn_entities = len(self.idx2knwid)
    print(" [#] Loaded. Num of known wids : {}".format(
        self.num_knwn_entities))

    # Wid2Wikititle Map
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()
    print(" [#] Size of Wid2Wikititle: {}".format(len(
        self.wid2WikiTitle)))

    # # Wid2TypeLabels Map
    # self.wid2TypeLabels = vocabloader.getWID2TypeLabels()
    # print(" [#] Total number of Wids : {}".format(len(
    #     self.wid2TypeLabels)))

    # Coherence String Vocab
    print("Loading Coherence Strings Dicts ... ")
    (self.cohG92idx, self.idx2cohG9) = utils.load(
        config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)
    print(" [#] Number of Coherence Strings in Vocab : {}".format(
        self.num_cohstr))

    # Known WID Description Vectors
    # self.kwnwid2descvecs = vocabloader.loadKnownWIDDescVecs()
    # print(" [#] Size of kwn wid desc vecs dict : {}".format(
    #     len(self.kwnwid2descvecs)))

    # # Crosswikis
    # print("[#] Loading training/val crosswikis dictionary ... ")
    # self.test_kwnen_cwikis = vocabloader.getTestKnwEnCwiki()
    # self.test_allen_cwikis = vocabloader.getTestAllEnCwiki()

    # Crosswikis
    print("Loading Crosswikis dict. (takes ~2 mins to load)")
    self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
    # self.crosswikis = {}
    print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

    if self.pretrain_wordembed:
        stime = time.time()
        self.word2vec = vocabloader.loadGloveVectors()
        print("[#] Glove Vectors loaded!")
        ttime = (time.time() - stime) / float(60)
        print("[#] Time to load vectors : {} mins".format(ttime))

    print("[#] Test Mentions File : {}".format(test_mens_file))
    print("[#] Pre-loading test mentions ... ")
    self.mentions = utils.make_mentions_from_file(test_mens_file)
    self.men_idx = 0
    self.num_mens = len(self.mentions)
    self.epochs = 0
    print("[#] Test Mentions : {}".format(self.num_mens))

    self.batch_size = batch_size
    print("[#] Batch Size: %d" % self.batch_size)
    self.num_cands = num_cands
    self.strict_context = strict_context

    print("\n[#]LOADING COMPLETE")
def __init__(self, config, vocabloader, test_mens_file, num_cands,
             batch_size, strict_context=True, pretrain_wordembed=True,
             coherence=True):
    self.pipeline = remote_pipeline.RemotePipeline(
        server_api='http://austen.cs.illinois.edu:5800')
    self.typeOfReader = "inference"
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"
    self.tr_sup = 'tr_sup'
    self.tr_unsup = 'tr_unsup'
    self.pretrain_wordembed = pretrain_wordembed
    self.coherence = coherence

    # Word Vocab
    (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
    self.num_words = len(self.idx2word)

    # Label Vocab
    (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
    self.num_labels = len(self.idx2label)

    # Known WID Vocab
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_knwn_entities = len(self.idx2knwid)

    # Wid2Wikititle Map
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    # Coherence String Vocab
    print("Loading Coherence Strings Dicts ... ")
    (self.cohG92idx, self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)

    # Crosswikis
    print("Loading Crosswikis dict. (takes ~2 mins to load)")
    self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
    print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

    if self.pretrain_wordembed:
        stime = time.time()
        self.word2vec = vocabloader.loadGloveVectors()
        print("[#] Glove Vectors loaded!")
        ttime = (time.time() - stime) / float(60)
        print("[#] Time to load vectors : {} mins".format(ttime))

    print("[#] Test Mentions File : {}".format(test_mens_file))
    print("[#] Loading test file and preprocessing ... ")
    self.processTestDoc(test_mens_file)
    self.mention_lines = self.convertSent2NerToMentionLines()
    self.mentions = []
    for line in self.mention_lines:
        m = Mention(line)
        self.mentions.append(m)

    self.men_idx = 0
    self.num_mens = len(self.mentions)
    self.epochs = 0
    print("[#] Test Mentions : {}".format(self.num_mens))

    self.batch_size = batch_size
    print("[#] Batch Size: %d" % self.batch_size)
    self.num_cands = num_cands
    self.strict_context = strict_context

    print("\n[#]LOADING COMPLETE")
def __init__(self, config, widWikititle_file, widLabel_file,
             word_threshold=1):
    '''Given training data, makes the word vocab, glove word vocab,
    doc_mentions vocab, type labels vocab, known_wid vocab, and the
    wid2Wikititle map.
    '''
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"

    self.tr_mens_dir = config.train_mentions_dir
    self.tr_mens_files = utils.get_mention_files(self.tr_mens_dir)
    self.num_tr_mens_files = len(self.tr_mens_files)
    print("[#] Training Mention Files : {} files".format(
        self.num_tr_mens_files))
    print("[#] Validation Mentions File : {}".format(
        config.val_mentions_file))

    tr_data_vocabs_exist = self.check_train_data_vocabs_exist(
        config.word_vocab_pkl, config.label_vocab_pkl,
        config.kwnwid_vocab_pkl, config.cohstring_vocab_pkl,
        config.cohstringG1_vocab_pkl)

    if not tr_data_vocabs_exist:
        print("[#] Loading pretrained word2vec embeddings .. ")
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
            config.word2vec_bin_gz, binary=True)
        self.word2vec.init_sims(replace=True)

        print("All/Some Training Vocabs do not exist. Making ... ")
        self.make_training_data_vocabs(
            self.tr_mens_dir, self.tr_mens_files, config.word_vocab_pkl,
            config.label_vocab_pkl, config.kwnwid_vocab_pkl,
            config.cohstring_vocab_pkl, config.cohstringG1_vocab_pkl,
            config.cohstringG9_vocab_pkl, word_threshold)

    if not os.path.exists(config.widWiktitle_pkl):
        print(" [#] Making wid2Wikititle Map")
        wid2Wikititle = self.make_widWikititleMap(widWikititle_file)
        utils.save(config.widWiktitle_pkl, wid2Wikititle)
        print(" [#] Done. Size : {}".format(len(wid2Wikititle)))

    if not os.path.exists(config.wid2typelabels_vocab_pkl):
        print(" [#] Making wid2Types Map")
        wid2types = self.make_wid2TypesMap(widLabel_file)
        utils.save(config.wid2typelabels_vocab_pkl, wid2types)
        print(" [#] Done. Size : {}".format(len(wid2types)))

    if not os.path.exists(config.glove_word_vocab_pkl):
        print(" [#] Making GloVe Word Vocabs")
        glove2vec = utils.load(config.glove_pkl)
        print(" [#] Glove embeddings loaded. Size: {}".format(
            len(glove2vec)))
        (glove_word2idx, glove_idx2word) = self.make_glovewordvocab(glove2vec)
        utils.save(config.glove_word_vocab_pkl,
                   (glove_word2idx, glove_idx2word))
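# Hedged sketch (an assumption, not the repository's implementation): building
# a word vocabulary from tokenized training mentions with a minimum-frequency
# threshold, in the spirit of make_training_data_vocabs above. The helper name
# is hypothetical.
def _build_word_vocab_sketch(token_lists, word_threshold=1, unk_word='unk'):
    from collections import Counter  # local import so the sketch is self-contained
    counts = Counter(tok for tokens in token_lists for tok in tokens)
    idx2word = [unk_word] + sorted(
        w for w, c in counts.items() if c >= word_threshold and w != unk_word)
    word2idx = {w: i for i, w in enumerate(idx2word)}
    return word2idx, idx2word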
def __init__(self, config, vocabloader, num_cands, batch_size,
             strict_context=True, pretrain_wordembed=True, coherence=True):
    self.typeOfReader = "inference"
    self.start_word = start_word
    self.end_word = end_word
    self.unk_word = 'unk'  # In tune with word2vec
    self.unk_wid = "<unk_wid>"
    self.tr_sup = 'tr_sup'
    self.tr_unsup = 'tr_unsup'
    self.pretrain_wordembed = pretrain_wordembed
    self.coherence = coherence

    # Word Vocab
    (self.word2idx, self.idx2word) = vocabloader.getGloveWordVocab()
    self.num_words = len(self.idx2word)

    # Label Vocab
    (self.label2idx, self.idx2label) = vocabloader.getLabelVocab()
    self.num_labels = len(self.idx2label)

    # Known WID Vocab
    (self.knwid2idx, self.idx2knwid) = vocabloader.getKnwnWidVocab()
    self.num_knwn_entities = len(self.idx2knwid)

    # Wid2Wikititle Map
    self.wid2WikiTitle = vocabloader.getWID2Wikititle()

    # Coherence String Vocab
    print("Loading Coherence Strings Dicts ... ")
    (self.cohG92idx, self.idx2cohG9) = utils.load(config.cohstringG9_vocab_pkl)
    self.num_cohstr = len(self.idx2cohG9)

    # Crosswikis
    print("Loading Crosswikis dict. (takes ~2 mins to load)")
    self.crosswikis = utils.load(config.crosswikis_pruned_pkl)
    print("Crosswikis loaded. Size: {}".format(len(self.crosswikis)))

    if self.pretrain_wordembed:
        stime = time.time()
        self.word2vec = vocabloader.loadGloveVectors()
        print("[#] Glove Vectors loaded!")
        ttime = (time.time() - stime) / float(60)

    # print("[#] Test Mentions File : {}".format(test_mens_file))
    # print("[#] Loading test file and preprocessing ... ")
    # with open(test_mens_file, 'r') as f:
    #     tajsonstr = f.read()
    # ta = TextAnnotation(json_str=tajsonstr)
    #
    # (sentences_tokenized, modified_ner_cons_list) = self.processTestDoc(ta)
    #
    # self.mention_lines = self.convertSent2NerToMentionLines(
    #     sentences_tokenized, modified_ner_cons_list)
    #
    # self.mentions = []
    # for line in self.mention_lines:
    #     m = Mention(line)
    #     self.mentions.append(m)

    self.men_idx = 0
    # self.num_mens = len(self.mentions)
    self.epochs = 0
    # print("[#] Test Mentions : {}".format(self.num_mens))

    self.batch_size = batch_size
    print("[#] Batch Size: %d" % self.batch_size)
    self.num_cands = num_cands
    self.strict_context = strict_context

    print("\n[#]LOADING COMPLETE")
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    output_file = "data/output.json"  # sys.argv[2]
    range_start = 0  # int(sys.argv[3])
    range_end = 10  # int(sys.argv[4])
    file_name = "data/qanta.train.2018.04.18.json"  # sys.argv[1]
    question_list = json.loads(open(file_name).read())["questions"]
    sentences = question_list[range_start:min(range_end, len(question_list))]

    FLAGS_check(FLAGS)
    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    print("Loading in variables!")
    word2idx, idx2word = vocabloader.getGloveWordVocab()
    wid2WikiTitle = vocabloader.getWID2Wikititle()
    crosswikis = utils.load(config.crosswikis_pruned_pkl)
    word2vec = vocabloader.loadGloveVectors()
    print("DONE LOADING IN VARIABLES!!!")

    all_entities = []
    for sent in sentences:
        tf.reset_default_graph()

        # Write the current question text to a per-run test file that the
        # InferenceReader consumes.
        loc = config.test_file.replace(
            "sampletest.txt", "{}_{}.txt".format(range_start, range_end))
        w = open(loc, "w")
        config.test_file = loc
        sent["text"] = decrypt(sent["text"].replace("\xa0", " "))
        w.write(sent["text"].encode("ascii", "ignore").decode("ascii"))
        print(sent["text"].encode("ascii", "ignore").decode("ascii"))
        w.close()

        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        start = time.time()
        print("Test file {} ".format(config.test_file))
        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 word2idx=word2idx,
                                 idx2word=idx2word,
                                 wid2WikiTitle=wid2WikiTitle,
                                 crosswikis=crosswikis,
                                 word2vec=word2vec,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        print("Took {} time to create inference reader".format(
            time.time() - start))

        docta = reader.ccgdoc
        model_mode = 'inference'

        config_proto = tf.ConfigProto()
        config_proto.allow_soft_placement = True
        config_proto.gpu_options.allow_growth = True
        sess = tf.Session(config=config_proto)
        print("COHSTR", reader.num_cohstr)

        # The model construction and inference block below is disabled
        # (kept as a triple-quoted string).
        """with sess.as_default():
            start = time.time()
            model = ELModel(
                sess=sess, reader=reader, dataset=FLAGS.dataset,
                max_steps=FLAGS.max_steps,
                pretrain_max_steps=FLAGS.pretraining_steps,
                word_embed_dim=FLAGS.word_embed_dim,
                context_encoded_dim=FLAGS.context_encoded_dim,
                context_encoder_num_layers=FLAGS.context_encoder_num_layers,
                context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
                coherence_numlayers=FLAGS.coherence_numlayers,
                jointff_numlayers=FLAGS.jointff_numlayers,
                learning_rate=FLAGS.learning_rate,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                reg_constant=FLAGS.reg_constant,
                checkpoint_dir=FLAGS.checkpoint_dir,
                optimizer=FLAGS.optimizer,
                mode=model_mode,
                strict=FLAGS.strict_context,
                pretrain_word_embed=FLAGS.pretrain_wordembed,
                typing=FLAGS.typing,
                el=FLAGS.el,
                coherence=FLAGS.coherence,
                textcontext=FLAGS.textcontext,
                useCNN=FLAGS.useCNN,
                WDLength=FLAGS.WDLength,
                Fsize=FLAGS.Fsize,
                entyping=FLAGS.entyping)
            print("Loading EL Model took {} time".format(time.time() - start))

            print("Doing inference")
            try:
                start = time.time()
                (predTypScNPmat_list, widIdxs_list, priorProbs_list,
                 textProbs_list, jointProbs_list, evWTs_list,
                 pred_TypeSetsList) = model.inference(
                    ckptpath=FLAGS.model_path)
                print("Inference took {} time".format(time.time() - start))
            except:
                entity_list = {'qanta_id': sent['qanta_id'], 'mentions': []}
                all_entities.append(entity_list)
                print("No entities")
                continue

            start = time.time()
            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            print("Tokenized sentences {}".format(reader.sentences_tokenized))
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]
                    entityTitleList.append(evWTs[2])
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]
            docta.view_dictionary['ENG_NEURAL_EL'] = elview
            print("Processing took {} time".format(time.time() - start))

            print("List of entities")
            # print(elview.cons_list)
            print("\n")

            s = sent["text"]
            print("New S is {}".format(s))
            e = elview.cons_list
            t = reader.sentences_tokenized
            c = []
            f = []
            print(s)
            # print("E {}".format(e))
            print("T {}".format(t))

            # Flatten the tokenized sentences and align each token to its
            # (start, end) character span in the original text.
            for i in t:
                for j in i:
                    f.append(j)
            i = 0
            token_pointer = 0
            while token_pointer < len(f) and i < len(s):
                token_len = len(f[token_pointer])
                while i + token_len < len(s) and \
                        s[i:i + token_len] != f[token_pointer]:
                    i += 1
                c.append((i, token_len + i))
                i += 1
                token_pointer += 1
            if len(c) != len(f):
                print("ERROR in C and F")

            # Regroup the flat character spans per sentence.
            unflattened_c = []
            c_pointer = 0
            for i in range(len(t)):
                l = c[c_pointer:c_pointer + len(t[i])]
                c_pointer += len(t[i])
                unflattened_c.append(l)
            # print("C {}".format(c))
            # print("F {}".format(f))
            # print("Unflattened C {}".format(unflattened_c))

            entity_list = {'qanta_id': sent['qanta_id'], 'mentions': []}
            sentence_num = 0
            UNK = "<unk_wid>"
            for i in range(len(e)):
                if e[i]["label"] != UNK:
                    all_words = False
                    while not all_words and sentence_num < len(t):
                        all_words = True
                        # print(e[i])
                        for word in range(e[i]["start"], e[i]["end"] + 1):
                            if len(t[sentence_num]) <= word or \
                                    t[sentence_num][word] not in e[i]["tokens"]:
                                all_words = False
                        if not all_words:
                            sentence_num += 1
                    if sentence_num == len(t):
                        print("Error with sentence_num")
                    else:
                        entity_list['mentions'].append(
                            {'entity': e[i]["label"],
                             'span': [unflattened_c[sentence_num][e[i]['start']][0],
                                      unflattened_c[sentence_num][e[i]['end']][1]]})
            # print("Entity list is {}".format(entity_list))
            all_entities.append(entity_list)

            local_vars = list(locals().items())
            del reader
            del predTypScNPmat_list
            del widIdxs_list
            del priorProbs_list
            del textProbs_list
            del jointProbs_list
            del evWTs_list
            del model
            del pred_TypeSetsList
            print("Memory usage {}".format(getCurrentMemoryUsage()))
            # print("All entities are {}".format(all_entities))
            del sess"""

        gc.collect()
        tf.reset_default_graph()

    w = open(output_file, "w")
    w.write(json.dumps(all_entities))
    w.close()
    print("Dumped JSON, all done")
    print("Took {} time".format(time.time() - prog_start))
    return
    sys.exit()
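# Hedged sketch (a simplified rewrite of the token/character alignment done in
# the disabled block above, not code from the repository): map a flat token
# list back to (start, end) character offsets in the original text by scanning
# forward with str.find.
def _char_spans_sketch(text, tokens):
    spans, cursor = [], 0
    for tok in tokens:
        start = text.find(tok, cursor)
        if start == -1:        # token not found verbatim; skip it
            continue
        spans.append((start, start + len(tok)))
        cursor = start + len(tok)
    return spans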