            WikiTitles.append(wits)
        return WikiTitles

    def widIdx2WikiTitle(self, widIdx):
        wid = self.idx2knwid[widIdx]
        wikiTitle = self.wid2WikiTitle[wid]
        return wikiTitle


if __name__ == '__main__':
    sttime = time.time()
    batch_size = 1
    num_batch = 1000
    configpath = "configs/config.ini"
    config = Config(configpath, verbose=False)
    vocabloader = VocabLoader(config)
    b = TestDataReader(config=config, vocabloader=vocabloader,
                       test_mens_file=config.test_file,
                       num_cands=30, batch_size=batch_size,
                       strict_context=False, pretrain_wordembed=False,
                       coherence=False)

    stime = time.time()
    i = 0
    kwn = 0
    total_instances = 0
    def addToTypeCount(self, mens):
        for m in mens:
            types = m.types
            for t in types:
                self.typeCount[t] += 1

    def convertTypeCountToFraction(self, decSorted, numMens):
        typeCount = []
        for (t, c) in decSorted:
            typeCount.append((t, float(c) / numMens))
        return typeCount

    def makeLabelCount(self):
        totalMentions = 0
        for (i, mens_file) in enumerate(self.tr_mens_files):
            print("File Num : {}".format(i))
            file = os.path.join(self.tr_mens_dir, mens_file)
            mens = utils.make_mentions_from_file(file)
            self.addToTypeCount(mens)
            totalMentions += len(mens)
        decSorted = utils.decrSortedDict(self.typeCount)
        decSorted = self.convertTypeCountToFraction(decSorted, totalMentions)
        print("Total Mentions : {}".format(totalMentions))
        pp.pprint(decSorted)


if __name__ == '__main__':
    config = Config("configs/vocab_config.ini")
    vocabloader = VocabLoader(config)
    a = typeCountDistribution(config, vocabloader)
import os

import readers.utils as utils
from readers.Mention import Mention
from readers.config import Config
from readers.vocabloader import VocabLoader

config = Config("configs/all_mentions_config.ini")
vocabloader = VocabLoader(config)
(knwid2idx, idx2knwid) = vocabloader.getKnwnWidVocab()
wid2WikiTitle = vocabloader.getWID2Wikititle()
print("Known {} total {}".format(len(knwid2idx), len(wid2WikiTitle)))

widswithtext = set()
with open("/save/ngupta19/wikipedia/wiki_kb/widswithtext", 'r') as f:
    docswithtext = f.readlines()
for l in docswithtext:
    widswithtext.add(l.strip())
print("Total docs with text : {}".format(len(widswithtext)))

missing = 0
total = 0
for wid in knwid2idx:
    total += 1
    if str(wid) not in widswithtext:
        print(wid)
        missing += 1
print("Known Total : {} Missing {} ".format(total, missing))
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    (input_ta_files, output_ta_files) = getAllTAFilePaths(FLAGS)
    print("TOTAL NUMBER OF TAS : {}".format(len(input_ta_files)))

    reader = TextAnnoTestReader(
        config=config,
        vocabloader=vocabloader,
        # test_mens_file=config.test_file,
        num_cands=30,
        batch_size=FLAGS.batch_size,
        strict_context=FLAGS.strict_context,
        pretrain_wordembed=FLAGS.pretrain_wordembed,
        coherence=FLAGS.coherence)
    model_mode = 'test'

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess, reader=reader, dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        print("Total files: {}".format(len(output_ta_files)))
        erroneous_files = 0
        for in_ta_path, out_ta_path in zip(input_ta_files, output_ta_files):
            # print("Running the inference for : {}".format(in_ta_path))
            try:
                reader.new_test_file(in_ta_path)
            except:
                print("Error reading : {}".format(in_ta_path))
                erroneous_files += 1
                continue

            (predTypScNPmat_list,
             widIdxs_list,
             priorProbs_list,
             textProbs_list,
             jointProbs_list,
             evWTs_list,
             pred_TypeSetsList) = model.inference_run()
            # model.inference(ckptpath=FLAGS.model_path)

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_LORELEI"))
            # wiki_view = copy.deepcopy(reader.textanno.get_view("MENTION"))
            # wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            # wiki_view = copy.deepcopy(reader.textanno.get_view("English_NERVIEW"))
            # wiki_view_json = copy.deepcopy(reader.textanno.get_view("NER").as_json)
            docta = reader.textanno

            # el_cons_list = [con for con in wiki_view.cons_list
            #                 if "NAM" in con['label']]
            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)
            # print("Number of mentions in model: {}".format(len(widIdxs_list)))
            # print("Number of NER mentions: {}".format(len(el_cons_list)))
            assert len(el_cons_list) == numMentionsInference

            mentionnum = 0
            for ner_cons in el_cons_list:
                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps, jps) = (widIdxs_list[mentionnum],
                                            priorProbs_list[mentionnum],
                                            textProbs_list[mentionnum],
                                            jointProbs_list[mentionnum])

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp
                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT

                # add label-to-score map here
                # ner_cons["jointScoreMap"] = jointScoreMap
                # ner_cons["contextScoreMap"] = contextScoreMap
                # ner_cons["priorScoreMap"] = priorScoreMap
                ner_cons["labelScoreMap"] = jointScoreMap
                # add max scoring entity as label
                ner_cons["label"] = maxJointEntity
                ner_cons["score"] = maxJointProb

                mentionnum += 1

            wiki_view.view_name = "NEUREL"
            docta.view_dictionary["NEUREL"] = wiki_view
            docta_json = docta.as_json
            json.dump(docta_json, open(out_ta_path, "w"), indent=True)

        print("Number of erroneous files: {}".format(erroneous_files))
    print("Annotation completed. Program can be exited safely.")
    sys.exit()
                sys.exit()
            print("Loading wid2TypeLabels")
            self.wid2TypeLabels = utils.load(
                self.config.wid2typelabels_vocab_pkl)
        return self.wid2TypeLabels

    def loadGloveVectors(self):
        if self.glove2vec is None:
            if not os.path.exists(self.config.glove_pkl):
                print("Glove_Vectors_PKL does not exist")
                sys.exit()
            print("Loading Glove Word Vectors")
            self.glove2vec = utils.load(self.config.glove_pkl)
        return self.glove2vec

    def getGloveWordVocab(self):
        if self.gword2idx is None or self.gidx2word is None:
            if not os.path.exists(self.config.glove_word_vocab_pkl):
                print("Glove Word Vocab PKL missing")
                sys.exit()
            print("Loading Glove Word Vocabulary")
            (self.gword2idx, self.gidx2word) = utils.load(
                self.config.glove_word_vocab_pkl)
        return (self.gword2idx, self.gidx2word)


if __name__ == '__main__':
    config = Config("configs/wcoh_config.ini")
    a = VocabLoader(config)
    a.loadWord2Vec()
    def __init__(self):
        print("[#] Launching Training Job. DeviceId : {}".format(device_id))
        self.config = Config(configpath, verbose=False)
        self.vocabloader = VocabLoader(self.config)

        test_file = self.config.aida_kwn_dev_file
        valbs = 2
        self.tr_reader = TrainingDataReader(
            config=self.config,
            vocabloader=self.vocabloader,
            val_file=self.config.aida_kwn_dev_file,
            num_cands=30,
            batch_size=bs,
            wordDropoutKeep=worddropout,
            cohDropoutKeep=cohdropout)
        self.test_reader = TestDataReader(config=self.config,
                                          vocabloader=self.vocabloader,
                                          val_file=test_file,
                                          num_cands=30,
                                          batch_size=valbs,
                                          wordDropoutKeep=1.0,
                                          cohDropoutKeep=1.0)

        self.wvocab_size = len(self.tr_reader.word2idx)
        self.envocab_size = len(self.tr_reader.knwid2idx)
        self.typevocab_size = len(self.tr_reader.label2idx)
        self.cohvocab_size = len(self.tr_reader.cohG92idx)
        print("[#] Word Vocab : {}, Entity Vocab: {}, Type Vocab: {} "
              "CohString Vocab : {}".format(self.wvocab_size,
                                            self.envocab_size,
                                            self.typevocab_size,
                                            self.cohvocab_size))

        if modeltype == 'ELModel':
            print("[#] MODEL : ELModel")
            self.model = ELModel(
                device_id=device_id,
                wordvocab=(self.tr_reader.word2idx, self.tr_reader.idx2word),
                cohvocab=(self.tr_reader.cohG92idx, self.tr_reader.idx2cohG9),
                envocab=(self.tr_reader.knwid2idx, self.tr_reader.idx2knwid),
                typevocab=(self.tr_reader.label2idx, self.tr_reader.idx2label),
                wdim=wdim, edim=endim, num_cands=30, hsize=endim,
                mlp_nlayers=1, dropout=dropout, init_range=init_range,
                mentyping=mentype, entyping=entype, descencoding=endesc)

        if modeltype not in MODELTYPES:
            print("Invalid modeltype : {}".format(modeltype))
            sys.exit()

        if device_id is not None:
            self.model.cuda(device_id)

        if optim == 'adam':
            print("[#] OPTIM : ADAM")
            self.optimizer = torch.optim.Adam(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                lr=lr)
        elif optim == 'sgd':
            print("[#] OPTIM : SGD LR:{} Momentum:{} Nesterov:{}".format(
                lr, momentum, nestrov))
            self.optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                lr=lr, momentum=momentum, nesterov=nestrov)
        else:
            print("Wrong Optimizer")
            sys.exit(0)

        timeout = 5
        print("Press any key to run (or wait {} seconds) ... ".format(timeout))
        rlist, wlist, xlist = select.select([sys.stdin], [], [], timeout)
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    FLAGS_check(FLAGS)

    # pipeline = LocalPipeline()
    pipeline = RemotePipeline(server_api="http://macniece.seas.upenn.edu:4001")

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    input_jsonl = FLAGS.input_jsonl
    output_jsonl = FLAGS.output_jsonl

    reader = TextAnnoTestReader(
        config=config,
        vocabloader=vocabloader,
        num_cands=30,
        batch_size=FLAGS.batch_size,
        strict_context=FLAGS.strict_context,
        pretrain_wordembed=FLAGS.pretrain_wordembed,
        coherence=FLAGS.coherence,
    )
    model_mode = "test"

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping,
        )

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        outf = open(output_jsonl, "w")
        inpf = list(open(input_jsonl, "r"))
        for line in tqdm(inpf):
            example_json = json.loads(line)
            doctext = ftfy.fix_text(example_json['text'])
            out_dict = {
                'text': doctext,
                'qanta_id': example_json['qanta_id'],
                'sent_id': example_json['sent_id']
            }
            try:
                ta = pipeline.doc(doctext)
            except MisalignedCharError as e:
                out_dict['error'] = str(e)
                outf.write(json.dumps(out_dict))
                outf.write("\n")
                continue
            _ = ta.get_ner_conll

            # Make instances for this document
            reader.new_ta(ta)

            (
                predTypScNPmat_list,
                widIdxs_list,
                priorProbs_list,
                textProbs_list,
                jointProbs_list,
                evWTs_list,
                pred_TypeSetsList,
            ) = model.inference_run()
            if pred_TypeSetsList is None:
                continue

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            docta = reader.textanno

            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)
            assert len(el_cons_list) == numMentionsInference

            el_mentions = []
            mentionnum = 0
            for ner_cons in el_cons_list:
                # ner_cons is a dict
                mentiondict = {}
                mentiondict["tokens"] = ner_cons["tokens"]
                mentiondict["end"] = ner_cons["end"]
                mentiondict["start"] = ner_cons["start"]

                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps, jps) = (
                    widIdxs_list[mentionnum],
                    priorProbs_list[mentionnum],
                    textProbs_list[mentionnum],
                    jointProbs_list[mentionnum],
                )

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp
                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT

                mentiondict["jointScoreMap"] = jointScoreMap
                mentiondict["contextScoreMap"] = contextScoreMap
                mentiondict["priorScoreMap"] = priorScoreMap
                # add max scoring entity as label
                mentiondict["label"] = maxJointEntity
                mentiondict["score"] = maxJointProb

                mentionnum += 1
                el_mentions.append(mentiondict)

            out_dict["nel"] = el_mentions
            outf.write(json.dumps(out_dict))
            outf.write("\n")

        outf.close()
    print("Annotation completed. Program can be exited safely.")
    sys.exit()
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    output_file = "data/output.json"  # sys.argv[2]
    range_start = 0  # int(sys.argv[3])
    range_end = 10  # int(sys.argv[4])
    file_name = "data/qanta.train.2018.04.18.json"  # sys.argv[1]
    question_list = json.loads(open(file_name).read())["questions"]
    sentences = question_list[range_start:min(range_end, len(question_list))]

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    print("Loading in variables!")
    word2idx, idx2word = vocabloader.getGloveWordVocab()
    wid2WikiTitle = vocabloader.getWID2Wikititle()
    crosswikis = utils.load(config.crosswikis_pruned_pkl)
    word2vec = vocabloader.loadGloveVectors()
    print("DONE LOADING IN VARIABLES!!!")

    all_entities = []
    for sent in sentences:
        tf.reset_default_graph()
        loc = config.test_file.replace(
            "sampletest.txt", "{}_{}.txt".format(range_start, range_end))
        w = open(loc, "w")
        config.test_file = loc
        sent["text"] = decrypt(sent["text"].replace("\xa0", " "))
        w.write(sent["text"].encode("ascii", "ignore").decode("ascii"))
        print(sent["text"].encode("ascii", "ignore").decode("ascii"))
        w.close()

        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        start = time.time()
        print("Test file {} ".format(config.test_file))
        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 word2idx=word2idx, idx2word=idx2word,
                                 wid2WikiTitle=wid2WikiTitle,
                                 crosswikis=crosswikis,
                                 word2vec=word2vec,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        print("Took {} time to create inference reader".format(
            time.time() - start))
        docta = reader.ccgdoc
        model_mode = 'inference'

        config_proto = tf.ConfigProto()
        config_proto.allow_soft_placement = True
        config_proto.gpu_options.allow_growth = True
        sess = tf.Session(config=config_proto)
        print("COHSTR", reader.num_cohstr)

        """with sess.as_default():
            start = time.time()
            model = ELModel(
                sess=sess, reader=reader, dataset=FLAGS.dataset,
                max_steps=FLAGS.max_steps,
                pretrain_max_steps=FLAGS.pretraining_steps,
                word_embed_dim=FLAGS.word_embed_dim,
                context_encoded_dim=FLAGS.context_encoded_dim,
                context_encoder_num_layers=FLAGS.context_encoder_num_layers,
                context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
                coherence_numlayers=FLAGS.coherence_numlayers,
                jointff_numlayers=FLAGS.jointff_numlayers,
                learning_rate=FLAGS.learning_rate,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                reg_constant=FLAGS.reg_constant,
                checkpoint_dir=FLAGS.checkpoint_dir,
                optimizer=FLAGS.optimizer,
                mode=model_mode,
                strict=FLAGS.strict_context,
                pretrain_word_embed=FLAGS.pretrain_wordembed,
                typing=FLAGS.typing,
                el=FLAGS.el,
                coherence=FLAGS.coherence,
                textcontext=FLAGS.textcontext,
                useCNN=FLAGS.useCNN,
                WDLength=FLAGS.WDLength,
                Fsize=FLAGS.Fsize,
                entyping=FLAGS.entyping)
            print("Loading EL Model took {} time".format(time.time() - start))

            print("Doing inference")
            try:
                start = time.time()
                (predTypScNPmat_list,
                 widIdxs_list,
                 priorProbs_list,
                 textProbs_list,
                 jointProbs_list,
                 evWTs_list,
                 pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)
                print("Inference took {} time".format(time.time() - start))
            except:
                entity_list = {'qanta_id': sent['qanta_id'], 'mentions': []}
                all_entities.append(entity_list)
                print("No entities")
                continue

            start = time.time()
            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            print("Tokenized sentences {}".format(reader.sentences_tokenized))
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]
                    entityTitleList.append(evWTs[2])
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]
            docta.view_dictionary['ENG_NEURAL_EL'] = elview
            print("Processing took {} time".format(time.time() - start))

            print("List of entities")
            # print(elview.cons_list)
            print("\n")

            s = sent["text"]
            print("New S is {}".format(s))
            e = elview.cons_list
            t = reader.sentences_tokenized

            c = []
            f = []
            print(s)
            # print("E {}".format(e))
            print("T {}".format(t))
            for i in t:
                for j in i:
                    f.append(j)

            # Align each token in the flattened token list f back to a
            # character span (start, end) in the original string s.
            i = 0
            token_pointer = 0
            while token_pointer < len(f) and i < len(s):
                token_len = len(f[token_pointer])
                while i + token_len < len(s) and \
                        s[i:i + token_len] != f[token_pointer]:
                    i += 1
                c.append((i, token_len + i))
                i += 1
                token_pointer += 1
            if len(c) != len(f):
                print("ERROR in C and F")

            # Regroup the character spans per sentence.
            unflattened_c = []
            c_pointer = 0
            for i in range(len(t)):
                l = c[c_pointer:c_pointer + len(t[i])]
                c_pointer += len(t[i])
                unflattened_c.append(l)
            # print("C {}".format(c))
            # print("F {}".format(f))
            # print("Unflattened C {}".format(unflattened_c))

            entity_list = {'qanta_id': sent['qanta_id'], 'mentions': []}
            sentence_num = 0
            UNK = "<unk_wid>"
            for i in range(len(e)):
                if e[i]["label"] != UNK:
                    all_words = False
                    while not all_words and sentence_num < len(t):
                        all_words = True
                        # print(e[i])
                        for word in range(e[i]["start"], e[i]["end"] + 1):
                            if len(t[sentence_num]) <= word or \
                                    t[sentence_num][word] not in e[i]["tokens"]:
                                all_words = False
                        if not all_words:
                            sentence_num += 1
                    if sentence_num == len(t):
                        print("Error with sentence_num")
                    else:
                        entity_list['mentions'].append(
                            {'entity': e[i]["label"],
                             'span': [unflattened_c[sentence_num][e[i]['start']][0],
                                      unflattened_c[sentence_num][e[i]['end']][1]]})
            # print("Entity list is {}".format(entity_list))
            all_entities.append(entity_list)

            local_vars = list(locals().items())
            del reader
            del predTypScNPmat_list
            del widIdxs_list
            del priorProbs_list
            del textProbs_list
            del jointProbs_list
            del evWTs_list
            del model
            del pred_TypeSetsList
            print("Memory usage {}".format(getCurrentMemoryUsage()))
            # print("All entities are {}".format(all_entities))
            del sess"""

        gc.collect()
        tf.reset_default_graph()

    w = open(output_file, "w")
    w.write(json.dumps(all_entities))
    w.close()
    print("Dumped JSON, all done")
    print("Took {} time".format(time.time() - prog_start))
    return
    sys.exit()
        numCands = numCands / float(numMens)

        print("Training Known Candidates Stats : ")
        print("Number of mentions: {}".format(numMens))
        print("Recall @ 1 : {}".format(recallAt1))
        print("Recall @ 30 : {}".format(recallAt30))
        print("No Cands : {}".format(noCands))
        print("Correct WID not in Cands : {}".format(candsButNotCorr))
        print("Num of Cands : {}".format(numCands))
    #enddef


if __name__ == '__main__':
    sttime = time.time()
    batch_size = 1000
    num_batch = 1000
    configpath = "configs/wcoh_config.ini"
    config = Config(configpath)
    vocabloader = VocabLoader(config)
    b = CandidateStats(config=config, vocabloader=vocabloader, num_cands=30)

    stime = time.time()
    # b.allCandidateStats()
    # b.knwnCandsDictStats()
    b.validationKnwnCandsStats()
    b.testKnwnCandsStats()
    # b.trainKnwnCandsStats()
    sys.exit()
        mentions = utils.make_mentions_from_file(mens_file=test_file)
        self._addCandidatesForMentions(mentions, cands_dict)
        return cands_dict

    def updateTestCandsDict(self, test_file):
        print("Updating Test Candidates Dict. Size:{}\n"
              "Key:(surface, wid), V: ([CandWids], [PriorProbs])".format(
                  len(self.test_kwnen_cands_dict)))
        print("Test File: {}".format(test_file))

        test_cands_dict = self.make_test_candidates(test_file)
        self.test_kwnen_cands_dict.update(test_cands_dict)

        utils.save(self.config.test_kwnen_cands_pkl,
                   self.test_kwnen_cands_dict)
        print("Test Candidates Dict Saved. Size:{}".format(
            len(self.test_kwnen_cands_dict)))


if __name__ == '__main__':
    config = Config("configs/config.ini")
    vocabloader = VocabLoader(config)
    b = TestCandidateDictionary(config=config, vocabloader=vocabloader)

    b.updateTestCandsDict(test_file=config.aida_kwn_dev_file)
    b.updateTestCandsDict(test_file=config.aida_kwn_test_file)

    sys.exit(0)
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    if FLAGS.mode == 'inference':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        docta = reader.ccgdoc
        model_mode = 'inference'

    elif FLAGS.mode == 'test':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = TestDataReader(config=config,
                                vocabloader=vocabloader,
                                test_mens_file=config.test_file,
                                num_cands=30,
                                batch_size=FLAGS.batch_size,
                                strict_context=FLAGS.strict_context,
                                pretrain_wordembed=FLAGS.pretrain_wordembed,
                                coherence=FLAGS.coherence)
        model_mode = 'test'

    else:
        print("MODE in FLAGS is incorrect : {}".format(FLAGS.mode))
        sys.exit()

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess, reader=reader, dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        if FLAGS.mode == 'inference':
            print("Doing inference")
            (predTypScNPmat_list,
             widIdxs_list,
             priorProbs_list,
             textProbs_list,
             jointProbs_list,
             evWTs_list,
             pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)

            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]

                    print(reader.bracketMentionInSentence(sentence, ner))
                    print("Prior: {} {}, Context: {} {}, Joint: {} {}".format(
                        evWTs[0], evProbs[0], evWTs[1], evProbs[1],
                        evWTs[2], evProbs[2]))
                    entityTitleList.append(evWTs[2])
                    print("Predicted Entity Types : {}".format(predTypes))
                    print("\n")
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]
            docta.view_dictionary['ENG_NEURAL_EL'] = elview

            print("elview.cons_list")
            print(elview.cons_list)
            print("\n")

            for v in docta.as_json['views']:
                print(v)
                print("\n")

        elif FLAGS.mode == 'test':
            print("Testing on Data ")
            (widIdxs_list, condProbs_list, contextProbs_list,
             condContextJointProbs_list, evWTs,
             sortedContextWTs) = model.dataset_test(ckptpath=FLAGS.model_path)

            print(len(widIdxs_list))
            print(len(condProbs_list))
            print(len(contextProbs_list))
            print(len(condContextJointProbs_list))
            print(len(reader.mentions))

            print("Writing Test Predictions: {}".format(FLAGS.test_out_fp))
            with open(FLAGS.test_out_fp, 'w') as f:
                for (wididxs, pps, mps, jps) in zip(widIdxs_list,
                                                    condProbs_list,
                                                    contextProbs_list,
                                                    condContextJointProbs_list):
                    mentionPred = ""
                    for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                        wit = reader.widIdx2WikiTitle(wididx)
                        mentionPred += wit + " " + str(prp) + " " + \
                            str(mp) + " " + str(jp)
                        mentionPred += "\t"
                    mentionPred = mentionPred.strip() + "\n"
                    f.write(mentionPred)
            print("Done writing. Can Exit.")

        else:
            print("WRONG MODE!")
            sys.exit(0)

    sys.exit()
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    if FLAGS.mode == 'inference':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        model_mode = 'test'
    else:
        print("MODE in FLAGS is incorrect : {}".format(FLAGS.mode))
        sys.exit()

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess, reader=reader, dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        if FLAGS.mode == 'inference':
            print("Doing inference")
            (predTypScNPmat_list,
             widIdxs_list,
             priorProbs_list,
             textProbs_list,
             evWTs_list,
             pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)

            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]

                    print(reader.bracketMentionInSentence(sentence, ner))
                    print("Prior: {} {}, Context: {} {}, Joint: {} {}".format(
                        evWTs[0], evProbs[0], evWTs[1], evProbs[1],
                        evWTs[2], evProbs[2]))
                    print("Predicted Entity Types : {}".format(predTypes))
                    print("\n")
                    mentionnum += 1
        else:
            print("WRONG MODE!")
            sys.exit(0)

    sys.exit()
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    input_jsonl = FLAGS.input_jsonl
    output_jsonl = FLAGS.output_jsonl
    doc_key = FLAGS.doc_key

    reader = TextAnnoTestReader(config=config,
                                vocabloader=vocabloader,
                                num_cands=30,
                                batch_size=FLAGS.batch_size,
                                strict_context=FLAGS.strict_context,
                                pretrain_wordembed=FLAGS.pretrain_wordembed,
                                coherence=FLAGS.coherence)
    model_mode = 'test'

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess, reader=reader, dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        erroneous_files = 0

        outf = open(output_jsonl, 'w')
        inpf = open(input_jsonl, 'r')
        for line in inpf:
            jsonobj = json.loads(line)
            doctext = jsonobj[doc_key]
            ta = localpipeline.doc(doctext, pretokenized=FLAGS.pretokenized)
            _ = ta.get_ner_conll

            # Make instances for this document
            reader.new_ta(ta)

            (predTypScNPmat_list,
             widIdxs_list,
             priorProbs_list,
             textProbs_list,
             jointProbs_list,
             evWTs_list,
             pred_TypeSetsList) = model.inference_run()

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            docta = reader.textanno

            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)
            assert len(el_cons_list) == numMentionsInference

            out_dict = {doc_key: doctext}
            el_mentions = []

            mentionnum = 0
            for ner_cons in el_cons_list:
                # ner_cons is a dict
                mentiondict = {}
                mentiondict['tokens'] = ner_cons['tokens']
                mentiondict['end'] = ner_cons['end']
                mentiondict['start'] = ner_cons['start']

                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps, jps) = (widIdxs_list[mentionnum],
                                            priorProbs_list[mentionnum],
                                            textProbs_list[mentionnum],
                                            jointProbs_list[mentionnum])

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp
                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT

                mentiondict["jointScoreMap"] = jointScoreMap
                mentiondict["contextScoreMap"] = contextScoreMap
                mentiondict["priorScoreMap"] = priorScoreMap
                # add max scoring entity as label
                mentiondict["label"] = maxJointEntity
                mentiondict["score"] = maxJointProb

                mentionnum += 1
                el_mentions.append(mentiondict)

            out_dict['nel'] = el_mentions
            outstr = json.dumps(out_dict)
            outf.write(outstr)
            outf.write("\n")

        outf.close()
        inpf.close()
        print("Number of erroneous files: {}".format(erroneous_files))
    print("Annotation completed. Program can be exited safely.")
    sys.exit()