def get_stats(treebanks): traindata = list(utils.read_conll_dir(treebanks, "train")) #do countings here d = {} d['rh'] = 0 d['lh'] = 0 tot = 0 for sentence in traindata: conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for item in conll_sentence: if item.relation.split(":")[0] in CONTENT_DEPRELS: tot += 1 if item.id < item.parent_id: d['rh'] += 1 else: d['lh'] += 1 d['rh'] /= float(tot) d['lh'] /= float(tot) d['rh'] *= 100 d['lh'] *= 100 print treebanks[0].iso_id print "Right-headed rels: " + str(d['rh']) print "Left-headed rels: " + str(d['lh'])
def get_stats_c(treebanks): traindata = list(utils.read_conll_dir(treebanks, "train")) n_dep = 0. depth = 0. for sentence in traindata: conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] for item in conll_sentence: if item.id != 0: depth_tok = get_dep_len(item, sentence, 1) depth += depth_tok n_dep += 1 av_dep = depth / n_dep print treebanks[0].iso_id print "Average distance to root:" + str(av_dep)
def get_stats_b(treebanks): traindata = list(utils.read_conll_dir(treebanks, "train")) n_sen = 0. sen_len = 0. n_dep = 0. dep_len = 0. for sentence in traindata: conll_sentence = [ entry for entry in sentence if isinstance(entry, utils.ConllEntry) ] sen_len += len(conll_sentence) n_sen += 1 for item in conll_sentence: dep_len += abs(item.id - item.parent_id) n_dep += 1 av_len = sen_len / n_sen av_dep = dep_len / n_dep print treebanks[0].iso_id print "Average sentence length:" + str(av_len) print "Average dependency length:" + str(av_dep)
def Predict(self, treebanks, datasplit, options): char_map = {} if options.char_map_file: char_map_fh = codecs.open(options.char_map_file,encoding='utf-8') char_map = json.loads(char_map_fh.read()) # should probably use a namedtuple in get_vocab to make this prettier _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map) # get external embeddings for the set of words and chars in the # test vocab but not in the training vocab test_embeddings = defaultdict(lambda: {}) if options.word_emb_size > 0 and options.ext_word_emb_file: new_test_words = \ set(test_words) - self.feature_extractor.words.viewkeys() print "Number of OOV word types at test time: %i (out of %i)" % ( len(new_test_words), len(test_words)) if len(new_test_words) > 0: # no point loading embeddings if there are no words to look for for lang in test_langs: embeddings = utils.get_external_embeddings( options, emb_file=options.ext_word_emb_file, lang=lang, words=new_test_words ) test_embeddings["words"].update(embeddings) if len(test_langs) > 1 and test_embeddings["words"]: print "External embeddings found for %i words "\ "(out of %i)" % \ (len(test_embeddings["words"]), len(new_test_words)) if options.char_emb_size > 0: new_test_chars = \ set(test_chars) - self.feature_extractor.chars.viewkeys() print "Number of OOV char types at test time: %i (out of %i)" % ( len(new_test_chars), len(test_chars)) if len(new_test_chars) > 0: for lang in test_langs: embeddings = utils.get_external_embeddings( options, emb_file=options.ext_char_emb_file, lang=lang, words=new_test_chars, chars=True ) test_embeddings["chars"].update(embeddings) if len(test_langs) > 1 and test_embeddings["chars"]: print "External embeddings found for %i chars "\ "(out of %i)" % \ (len(test_embeddings["chars"]), len(new_test_chars)) data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map) for iSentence, osentence in enumerate(data,1): sentence = deepcopy(osentence) 
self.feature_extractor.Init(options) conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)] self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings) scores, exprs = self.__evaluate(conll_sentence, True) if self.proj: heads = decoder.parse_proj(scores) #LATTICE solution to multiple roots # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py ## ADD for handling multi-roots problem rootHead = [head for head in heads if head==0] if len(rootHead) != 1: print "it has multi-root, changing it for heading first root for other roots" rootHead = [seq for seq, head in enumerate(heads) if head == 0] for seq in rootHead[1:]:heads[seq] = rootHead[0] ## finish to multi-roots else: heads = chuliu_edmonds_one_root(scores.T) for entry, head in zip(conll_sentence, heads): entry.pred_parent_id = head entry.pred_relation = '_' if self.labelsFlag: for modifier, head in enumerate(heads[1:]): scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1) conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]] dy.renew_cg() #keep in memory the information we need, not all the vectors oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)] for tok_o, tok in zip(oconll_sentence, conll_sentence): tok_o.pred_relation = tok.pred_relation tok_o.pred_parent_id = tok.pred_parent_id yield osentence
def Predict(self, treebanks, datasplit, options):
    """Generator: yield each sentence of `datasplit` with predicted heads
    and relation labels filled in on its ConllEntry tokens.

    Transition-based decoding with the SWAP transition for non-projective
    trees; the number of SWAPs per sentence is capped at 2 * len(sentence).
    """
    reached_max_swap = 0
    char_map = {}
    if options.char_map_file:
        # NOTE(review): this handle is never closed — leaked until GC
        char_map_fh = codecs.open(options.char_map_file, encoding='utf-8')
        char_map = json.loads(char_map_fh.read())
    # should probably use a namedtuple in get_vocab to make this prettier
    print "Collecting test data vocab"
    _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

    # get external embeddings for the set of words and chars in the test vocab but not in the training vocab
    test_embeddings = defaultdict(lambda:{})
    if options.word_emb_size > 0:
        new_test_words = set(test_words) - self.feature_extractor.words.viewkeys()
        print "Number of OOV word types at test time: %i (out of %i)"%(len(new_test_words),len(test_words))
        if len(new_test_words) > 0:
            # no point loading embeddings if there are no words to look for
            for lang in test_langs:
                test_embeddings["words"].update(utils.get_external_embeddings(options,lang,new_test_words))
            if len(test_langs) > 1 and test_embeddings["words"]:
                print "External embeddings found for %i words (out of %i)"%(len(test_embeddings["words"]),len(new_test_words))
    if options.char_emb_size > 0:
        new_test_chars = set(test_chars) - self.feature_extractor.chars.viewkeys()
        print "Number of OOV char types at test time: %i (out of %i)"%(len(new_test_chars),len(test_chars))
        if len(new_test_chars) > 0:
            for lang in test_langs:
                test_embeddings["chars"].update(utils.get_external_embeddings(options,lang,new_test_chars,chars=True))
            if len(test_langs) > 1 and test_embeddings["chars"]:
                print "External embeddings found for %i chars (out of %i)"%(len(test_embeddings["chars"]),len(new_test_chars))

    ts = time()
    data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
    for iSentence, osentence in enumerate(data,1):
        # parse a deep copy so the original sentence keeps its annotation
        sentence = deepcopy(osentence)
        reached_swap_for_i_sentence = False
        # cap on SWAP transitions for this sentence
        max_swap = 2*len(sentence)
        iSwap = 0
        self.feature_extractor.Init(options)
        conll_sentence = [entry for entry in sentence if
                          isinstance(entry, utils.ConllEntry)]
        # move the artificial root token from the front to the end
        conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
        self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)
        stack = ParseForest([])
        buf = ParseForest(conll_sentence)
        hoffset = 1 if self.headFlag else 0

        # initialise per-token vector lists used by the scorer
        for root in conll_sentence:
            root.lstms = [root.vec] if self.headFlag else []
            if not self.recursive_composition:
                root.lstms += [self.feature_extractor.paddingVec for _ in range(self.nnvecs - hoffset)]
            else:
                root.lstms += [root.vec]
                root.lstm = None #only necessary for treeLSTM case
                root.composed_rep = root.vec.value()

        # parse until only the root remains in the buffer
        while not (len(buf) == 1 and len(stack) == 0):
            scores = self.__evaluate(stack, buf, False)
            # once the swap budget is spent, exclude SWAP (scores[3]) from
            # the candidate transitions
            best = max(chain(*(scores if iSwap < max_swap else scores[:3] )), key = itemgetter(2) )
            if iSwap == max_swap and not reached_swap_for_i_sentence:
                reached_max_swap += 1
                reached_swap_for_i_sentence = True
                print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)
            self.apply_transition(best,stack,buf,hoffset)
            if best[1] == SWAP:
                iSwap += 1

        #keep in memory the information we need, not all the vectors
        oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
        oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
        for tok_o, tok in zip(oconll_sentence, conll_sentence):
            tok_o.pred_relation = tok.pred_relation
            tok_o.pred_parent_id = tok.pred_parent_id
            if self.recursive_composition:
                tok_o.composed_rep = tok.composed_rep
        yield osentence
        dy.renew_cg()

    print "Total prediction time: %.2fs"%(time()-ts)
def run(om, options, i): if options.multiling: outdir = options.outdir else: cur_treebank = om.languages[i] outdir = cur_treebank.outdir if options.shared_task: outdir = options.shared_task_outdir if not options.predict: # training print 'Preparing vocab' if options.multiling: path_is_dir = True, words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,\ path_is_dir, options.shareWordLookup,\ options.shareCharLookup) else: words, w2i, pos, cpos, rels, langs, ch = utils.vocab( cur_treebank.trainfile) paramsfile = os.path.join(outdir, options.params) with open(paramsfile, 'w') as paramsfp: print 'Saving params to ' + paramsfile pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp) print 'Finished collecting vocab' print 'Initializing blstm arc hybrid:' parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options) if options.continueModel is not None: parser.Load(options.continueModel) for epoch in xrange(options.first_epoch, options.first_epoch + options.epochs): print 'Starting epoch ' + str(epoch) if options.multiling: traindata = list( utils.read_conll_dir(om.languages, "train", options.max_sentences)) else: traindata = list( utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id, options.max_sentences)) parser.Train(traindata) print 'Finished epoch ' + str(epoch) model_file = os.path.join(outdir, options.model + str(epoch)) parser.Save(model_file) if options.pred_dev: # use the model to predict on dev data if options.multiling: pred_langs = [ lang for lang in om.languages if lang.pred_dev ] # languages which have dev data on which to predict for lang in pred_langs: lang.outfilename = os.path.join( lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu') print "Predicting on dev data for " + lang.name devdata = utils.read_conll_dir(pred_langs, "dev") pred = list(parser.Predict(devdata)) if len(pred) > 0: utils.write_conll_multiling(pred, pred_langs) else: print "Warning: prediction empty" if options.pred_eval: for lang in 
pred_langs: print "Evaluating dev prediction for " + lang.name utils.evaluate(lang.dev_gold, lang.outfilename, om.conllu) else: # monolingual case if cur_treebank.pred_dev: print "Predicting on dev data for " + cur_treebank.name devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id) cur_treebank.outfilename = os.path.join( outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu')) pred = list(parser.Predict(devdata)) utils.write_conll(cur_treebank.outfilename, pred) if options.pred_eval: print "Evaluating dev prediction for " + cur_treebank.name score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu) if options.model_selection: if score > cur_treebank.dev_best[1]: cur_treebank.dev_best = [epoch, score] if epoch == options.epochs: # at the last epoch choose which model to copy to barchybrid.model if not options.model_selection: best_epoch = options.epochs # take the final epoch if model selection off completely (for example multilingual case) else: best_epoch = cur_treebank.dev_best[ 0] # will be final epoch by default if model selection not on for this treebank if cur_treebank.model_selection: print "Best dev score of " + str( cur_treebank.dev_best[1] ) + " found at epoch " + str(cur_treebank.dev_best[0]) bestmodel_file = os.path.join( outdir, "barchybrid.model" + str(best_epoch)) model_file = os.path.join(outdir, "barchybrid.model") print "Copying " + bestmodel_file + " to " + model_file copyfile(bestmodel_file, model_file) else: #if predict - so if options.multiling: modeldir = options.modeldir else: modeldir = om.languages[i].modeldir params = os.path.join(modeldir, options.params) print 'Reading params from ' + params with open(params, 'r') as paramsfp: words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load( paramsfp) parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt) model = os.path.join(modeldir, options.model) parser.Load(model) if options.multiling: testdata = 
utils.read_conll_dir(om.languages, "test") else: testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id) ts = time.time() if options.multiling: for l in om.languages: l.outfilename = os.path.join(outdir, l.outfilename) pred = list(parser.Predict(testdata)) utils.write_conll_multiling(pred, om.languages) else: if cur_treebank.outfilename: cur_treebank.outfilename = os.path.join( outdir, cur_treebank.outfilename) else: cur_treebank.outfilename = os.path.join( outdir, 'out' + ('.conll' if not om.conllu else '.conllu')) utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata)) te = time.time() if options.pred_eval: if options.multiling: for l in om.languages: print "Evaluating on " + l.name score = utils.evaluate(l.test_gold, l.outfilename, om.conllu) print "Obtained LAS F1 score of %.2f on %s" % (score, l.name) else: print "Evaluating on " + cur_treebank.name score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu) print "Obtained LAS F1 score of %.2f on %s" % ( score, cur_treebank.name) print 'Finished predicting'
def run(om,options,i):
    """Train a parser (options.predict false) or run prediction/evaluation
    with a stored model (options.predict true).

    Extended variant supporting continued training and fine-tuning via
    options.continueModel / options.fineTune, and writing per-epoch LAS
    scores to a train.stats file.

    om      -- options manager holding the list of languages/treebanks
    options -- parsed command-line options (NOTE: mutated in the
               continue-training and fine-tune paths)
    i       -- index of the current treebank (monolingual case only)
    """
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict: # training
        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1
        if not continueTraining:
            print 'Preparing vocab'
            if options.multiling:
                # NOTE(review): the trailing comma makes path_is_dir the
                # 1-tuple (True,), not the boolean True — likely a typo that
                # only works because a non-empty tuple is truthy; confirm
                # how utils.vocab uses this argument
                path_is_dir=True,
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,\
                                                                     path_is_dir, options.shareWordLookup,\
                                                                     options.shareCharLookup)
            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'w') as paramsfp:
                print 'Saving params to ' + paramsfile
                pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp)
                print 'Finished collecting vocab'
        else:
            # reload the vocab/options saved by the original training run;
            # note this overwrites the current `options` binding
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print 'Load params from ' + paramsfile
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(paramsfp)
                print 'Finished loading vocab'

        max_epochs = options.first_epoch + options.epochs
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if continueTraining:
            if not fineTune:
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print 'Fine tune model for another', max_epochs - options.first_epoch, 'epochs'
            parser.Load(trainedModel)

        # best summed dev LAS across languages (multilingual model selection)
        best_multi_las = -1
        best_multi_epoch = 0
        # append to train.stats when continuing, otherwise start fresh
        if continueTraining:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'a', encoding='utf-8')
        else:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'w', encoding='utf-8')

        for epoch in xrange(options.first_epoch, max_epochs + 1):
            print 'Starting epoch ' + str(epoch)
            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id,options.max_sentences))
            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print 'Finished epoch ' + str(epoch)
            # a single .tmp model is overwritten every epoch
            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev] # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs,"dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred)>0:
                        utils.write_conll_multiling(pred,pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        # model selection uses the LAS sum over languages
                        total_las = 0
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            las_score = utils.evaluate(lang.dev_gold, lang.outfilename,om.conllu)
                            total_las += las_score
                            train_stats.write(unicode('Dev LAS ' + lang.name + ': ' + str(las_score) + '\n'))
                        if options.model_selection:
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch
                else: # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            las_score = utils.evaluate(cur_treebank.dev_gold,
                                                       cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]
                            train_stats.write(unicode('Dev LAS ' + cur_treebank.name + ': ' + str(las_score) + '\n'))

            if epoch == max_epochs:
                # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        best_epoch = cur_treebank.dev_best[0] # will be final epoch by default if model selection not on for this treebank
                        if cur_treebank.model_selection:
                            print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])
                bestmodel_file = os.path.join(outdir,"barchybrid.model.tmp")
                model_file = os.path.join(outdir,"barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir,"barchybrid.tuned.model")
                print "Best epoch: " + str(best_epoch)
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file,model_file)
                train_stats.close()
    else: #if predict - so
        # import pdb;pdb.set_trace()
        # eval_type selects which split to predict on: train, dev, or test
        eval_type = options.evaltype
        print "Eval type: ", eval_type
        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile
        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile

        # `prefix` names the files used when extracting internal vectors;
        # it is disabled below unless options.extract_vectors is set
        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold) + '-tuned') for l in om.languages]
            else:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold)) for l in om.languages]
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile))
        if not options.extract_vectors:
            prefix = None

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)
        # rebuild the parser with the options stored at training time
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt)
        if options.fineTune:
            options.model = options.model.replace('.model', '.tuned.model')
        model = os.path.join(modeldir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, eval_type)
        else:
            testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, eval_type + "-" + l.outfilename)
            pred = list(parser.Predict(testdata, prefix))
            utils.write_conll_multiling(pred,om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(outdir, eval_type + "-" + cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata, prefix))
        te = time.time()

        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" %(score, l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" %(score,cur_treebank.name)
        print 'Finished predicting'
def Predict(self, treebanks, datasplit, options):
    """Generator: yield each sentence of `datasplit` with predicted heads
    and relation labels filled in on its ConllEntry tokens.

    Transition-based decoding with the SWAP transition for non-projective
    trees; the number of SWAPs per sentence is capped at 2 * len(sentence).
    External embeddings are loaded only for word/char types unseen in
    training.
    """
    reached_max_swap = 0
    char_map = {}
    if options.char_map_file:
        # FIX: use 'with' so the mapping file is closed deterministically
        # (the handle was previously never closed)
        with open(options.char_map_file, encoding='utf-8') as char_map_fh:
            char_map = json.loads(char_map_fh.read())
    # should probably use a namedtuple in get_vocab to make this prettier
    _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(
        treebanks, datasplit, char_map)

    # get external embeddings for the set of words and chars in the
    # test vocab but not in the training vocab
    test_embeddings = defaultdict(lambda: {})
    if options.word_emb_size > 0 and options.ext_word_emb_file:
        new_test_words = \
            set(test_words) - self.feature_extractor.words.keys()
        print("Number of OOV word types at test time: %i (out of %i)" %
              (len(new_test_words), len(test_words)))
        if len(new_test_words) > 0:
            # no point loading embeddings if there are no words to look for
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_word_emb_file,
                    lang=lang,
                    words=new_test_words)
                test_embeddings["words"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["words"]:
                print("External embeddings found for %i words "\
                      "(out of %i)" % \
                      (len(test_embeddings["words"]), len(new_test_words)))

    if options.char_emb_size > 0:
        new_test_chars = \
            set(test_chars) - self.feature_extractor.chars.keys()
        print("Number of OOV char types at test time: %i (out of %i)" %
              (len(new_test_chars), len(test_chars)))
        if len(new_test_chars) > 0:
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_char_emb_file,
                    lang=lang,
                    words=new_test_chars,
                    chars=True)
                test_embeddings["chars"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["chars"]:
                print("External embeddings found for %i chars "\
                      "(out of %i)" % \
                      (len(test_embeddings["chars"]), len(new_test_chars)))

    data = utils.read_conll_dir(treebanks, datasplit, char_map=char_map)
    for iSentence, osentence in enumerate(data, 1):
        # parse a deep copy so the original sentence keeps its annotation
        sentence = deepcopy(osentence)
        reached_swap_for_i_sentence = False
        # cap on SWAP transitions for this sentence
        max_swap = 2 * len(sentence)
        iSwap = 0
        self.feature_extractor.Init(options)
        conll_sentence = [
            entry for entry in sentence
            if isinstance(entry, utils.ConllEntry)
        ]
        # move the artificial root token from the front to the end
        conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
        self.feature_extractor.getWordEmbeddings(conll_sentence, False,
                                                 options, test_embeddings)
        stack = ParseForest([])
        buf = ParseForest(conll_sentence)
        hoffset = 1 if self.headFlag else 0

        # initialise per-token vector lists used by the scorer
        for root in conll_sentence:
            #empty = dy.zeros(2*options.lstm_output_size)
            root.lstms = [root.vec] if self.headFlag else []
            root.lstms += [root.vec for _ in range(self.nnvecs - hoffset)]
            # map relations unseen in training to the 'runk' placeholder
            root.relation = root.relation if root.relation in self.irels else 'runk'

        # parse until only the root remains in the buffer
        while not (len(buf) == 1 and len(stack) == 0):
            scores = self.__evaluate(stack, buf, False)
            # once the swap budget is spent, exclude SWAP (scores[3]) from
            # the candidate transitions
            best = max(
                chain(*(scores if iSwap < max_swap else scores[:3])),
                key=itemgetter(2))
            if iSwap == max_swap and not reached_swap_for_i_sentence:
                reached_max_swap += 1
                reached_swap_for_i_sentence = True
                print("reached max swap in %d out of %d sentences" %
                      (reached_max_swap, iSentence))
            self.apply_transition(best, stack, buf, hoffset)
            if best[1] == SWAP:
                iSwap += 1
        dy.renew_cg()

        #keep in memory the information we need, not all the vectors
        oconll_sentence = [
            entry for entry in osentence
            if isinstance(entry, utils.ConllEntry)
        ]
        oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
        for tok_o, tok in zip(oconll_sentence, conll_sentence):
            tok_o.pred_relation = tok.pred_relation
            tok_o.pred_parent_id = tok.pred_parent_id
        yield osentence
def run(om, options, i):
    """Train a parser (options.predict false) or run prediction/evaluation
    with a stored model (options.predict true).

    Variant with optional in-place model overwriting
    (options.overwrite_model) and a wall-clock training deadline
    (options.deadline): training stops early when the estimated time of the
    next epoch would exceed the deadline.

    om      -- options manager holding the list of languages/treebanks
    options -- parsed command-line options
    i       -- index of the current treebank (monolingual case only)
    """
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training
        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)
        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        # per-epoch wall-clock durations, used for the deadline estimate
        durations = []
        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):
            print 'Starting epoch ' + str(epoch)
            start_time = time.time()
            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))
            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)
            # with overwrite_model the model is only saved on dev improvement
            if not options.overwrite_model:
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                    if options.overwrite_model:
                                        print "Overwriting model due to higher dev score"
                                        model_file = os.path.join(
                                            cur_treebank.outdir, options.model)
                                        parser.Save(model_file)

            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
                print 'Deadline in %.1f seconds' % (options.deadline - now)
                print 'ETA of next epoch in %.1f seconds' % (eta - now)
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:
                # no deadline
                exceeds_deadline = False

            if exceeds_deadline or epoch == options.epochs:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])
                if not options.overwrite_model:
                    bestmodel_file = os.path.join(
                        outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print "Copying " + bestmodel_file + " to " + model_file
                    copyfile(bestmodel_file, model_file)
                if exceeds_deadline and epoch < options.epochs:
                    print 'Leaving epoch loop early to avoid exceeding deadline'
                    break
            # NOTE(review): this guard duplicates the one just above; the
            # first is only reached when the best-model copy block ran —
            # confirm the duplication is intentional
            if exceeds_deadline and epoch < options.epochs:
                print 'Leaving epoch loop early to avoid exceeding deadline'
                break
    else:  # if predict - so
        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir
        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)
        # rebuild the parser with the options stored at training time
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)
        model = os.path.join(modeldir, options.model)
        parser.Load(model)
        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, "test")
        else:
            testdata = utils.read_conll(cur_treebank.testfile,
                                        cur_treebank.iso_id)
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir,
                    'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata))
        te = time.time()
        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename,
                                           om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                   l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold,
                                       cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (
                    score, cur_treebank.name)
        print 'Finished predicting'
def run(experiment,options):
    """Train a parser, or run an already-trained parser in prediction mode.

    Args:
        experiment: project experiment object; provides `treebanks`,
            `outdir` and (for prediction) `modeldir`.
        options: parsed command-line options controlling parser type,
            training schedule, prediction and evaluation.

    Side effects: writes params/model files and CoNLL output under the
    experiment directories and prints progress to stdout.
    """
    # Choose the parser implementation; import lazily so only the
    # selected backend's dependencies are loaded.
    if options.graph_based:
        from mstlstm import MSTParserLSTM as Parser
        print('Working with a graph-based parser')
    else:
        from arc_hybrid import ArcHybridLSTM as Parser
        print('Working with a transition-based parser')

    if not options.predict: # training
        paramsfile = os.path.join(experiment.outdir, options.params)
        if not options.continueTraining:
            # Fresh training run: collect the vocab and persist it together
            # with the options so prediction can later rebuild the model.
            print('Preparing vocab')
            vocab = utils.get_vocab(experiment.treebanks,"train")
            print('Finished collecting vocab')
            with open(paramsfile, 'wb') as paramsfp:
                print('Saving params to ' + paramsfile)
                pickle.dump((vocab, options), paramsfp)
            print('Initializing the model')
            parser = Parser(vocab, options)
        else: #continue
            if options.continueParams:
                paramsfile = options.continueParams
            # BUGFIX: open the pickle in binary mode ('rb', not 'r') —
            # consistent with the predict branch below; text mode breaks
            # pickle.load under Python 3.
            with open(paramsfile, 'rb') as paramsfp:
                stored_vocab, stored_options = pickle.load(paramsfp)
            print('Initializing the model:')
            parser = Parser(stored_vocab, stored_options)
            parser.Load(options.continueModel)

        dev_best = [options.epochs,-1.0] # best epoch, best score

        for epoch in range(options.first_epoch, options.epochs+1):
            print('Starting epoch ' + str(epoch))
            traindata = list(utils.read_conll_dir(experiment.treebanks, "train", options.max_sentences))
            parser.Train(traindata,options)
            print('Finished epoch ' + str(epoch))

            # Save a snapshot of the model after every epoch.
            model_file = os.path.join(experiment.outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data
                # not all treebanks necessarily have dev data
                pred_treebanks = [treebank for treebank in experiment.treebanks if treebank.pred_dev]
                if pred_treebanks:
                    for treebank in pred_treebanks:
                        treebank.outfilename = os.path.join(treebank.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print("Predicting on dev data for " + treebank.name)
                    pred = list(parser.Predict(pred_treebanks,"dev",options))
                    utils.write_conll_multiling(pred,pred_treebanks)

                    if options.pred_eval: # evaluate the prediction against gold data
                        mean_score = 0.0
                        for treebank in pred_treebanks:
                            score = utils.evaluate(treebank.dev_gold,treebank.outfilename,options.conllu)
                            print("Dev score %.2f at epoch %i for %s"%(score,epoch,treebank.name))
                            mean_score += score
                        if len(pred_treebanks) > 1: # multiling case
                            mean_score = mean_score/len(pred_treebanks)
                            print("Mean dev score %.2f at epoch %i"%(mean_score,epoch))
                        if options.model_selection:
                            if mean_score > dev_best[1]:
                                dev_best = [epoch,mean_score] # update best dev score
                            # hack to print the word "mean" if the dev score is an average
                            mean_string = "mean " if len(pred_treebanks) > 1 else ""
                            print("Best %sdev score %.2f at epoch %i"%(mean_string,dev_best[1],dev_best[0]))

            # at the last epoch choose which model to copy to barchybrid.model
            if epoch == options.epochs:
                bestmodel_file = os.path.join(experiment.outdir,"barchybrid.model" + str(dev_best[0]))
                model_file = os.path.join(experiment.outdir,"barchybrid.model")
                print("Copying " + bestmodel_file + " to " + model_file)
                copyfile(bestmodel_file,model_file)
                best_dev_file = os.path.join(experiment.outdir,"best_dev_epoch.txt")
                with open(best_dev_file, 'w') as fh:
                    print("Writing best scores to: " + best_dev_file)
                    if len(experiment.treebanks) == 1:
                        fh.write("Best dev score %s at epoch %i\n"%(dev_best[1],dev_best[0]))
                    else:
                        fh.write("Best mean dev score %s at epoch %i\n"%(dev_best[1],dev_best[0]))

    else: #if predict - so
        params = os.path.join(experiment.modeldir,options.params)
        print('Reading params from ' + params)
        with open(params, 'rb') as paramsfp:
            stored_vocab, stored_opt = pickle.load(paramsfp)
        # we need to update/add certain options based on new user input
        utils.fix_stored_options(stored_opt,options)

        parser = Parser(stored_vocab, stored_opt)
        model = os.path.join(experiment.modeldir, options.model)
        parser.Load(model)

        ts = time.time()

        for treebank in experiment.treebanks:
            if options.predict_all_epochs: # name outfile after epoch number in model file
                try:
                    # raw string avoids the py3 invalid-escape warning for \d
                    m = re.search(r'(\d+)$',options.model)
                    # m is None if no trailing digits -> AttributeError below
                    epoch = m.group(1)
                    treebank.outfilename = 'dev_epoch_%s.conllu'%epoch
                except AttributeError:
                    raise Exception("No epoch number found in model file (e.g. barchybrid.model22)")
            if not treebank.outfilename:
                treebank.outfilename = 'out' + ('.conll' if not options.conllu else '.conllu')
            treebank.outfilename = os.path.join(treebank.outdir, treebank.outfilename)

        pred = list(parser.Predict(experiment.treebanks,"test",stored_opt))
        utils.write_conll_multiling(pred,experiment.treebanks)

        te = time.time()

        if options.pred_eval:
            for treebank in experiment.treebanks:
                print("Evaluating on " + treebank.name)
                score = utils.evaluate(treebank.test_gold,treebank.outfilename,options.conllu)
                print("Obtained LAS F1 score of %.2f on %s" %(score,treebank.name))

        print('Finished predicting')
def run(om, options, i):
    """Legacy (Python 2) train/predict driver for the arc-hybrid parser.

    Args:
        om: options-manager object holding `languages`, `treebank` and
            `conllu` — presumably one entry per treebank; verify against caller.
        options: parsed command-line options.
        i: index into om.languages selecting the current treebank
            (used in the multi-monolingual case).
    """
    outdir = options.output
    if options.multi_monoling:
        # One model per language: output/model dirs come from the treebank.
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir
    if options.shared_task:
        outdir = options.shared_task_outdir
    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:
        # ---------------- training branch ----------------
        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)
        # Persist vocab + options so prediction can rebuild the model later.
        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               options)

        for epoch in xrange(options.first_epoch - 1,
                            options.first_epoch - 1 + options.epochs):
            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.drop_proj,
                                         options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))
            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(
                    utils.read_conll(conllFP, options.drop_proj,
                                     cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(
                        utils.read_conll(conllFP, False,
                                         cur_treebank.iso_id))
                else:
                    # No dev file: carve a dev split out of the train data.
                    tot_sen = len(traindata)
                    #take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        #gen object * 2 -- tee so one copy can be written out as gold
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir,
                                                  'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None

            print 'Starting epoch', epoch
            parser.Train(traindata)

            # Predict on dev data with the freshly trained weights.
            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(
                        l.outdir,
                        'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, 'dev_epoch_' + str(epoch + 1) +
                    ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)

            # Evaluate dev predictions against gold.
            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold,
                               cur_treebank.outfilename, om.conllu)
            print 'Finished predicting dev'

            # Save a model snapshot after every epoch (1-based suffix).
            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else: #if predict - so
        # ---------------- prediction branch ----------------
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                               stored_opt)
        model = os.path.join(modelDir, options.model)
        parser.Load(model)

        if options.multiling:
            testdata = enumerate(utils.read_conll_dir(
                om.languages, "test"))
        if not options.multiling:
            conllFP = open(cur_treebank.testfile, 'r')
            testdata = enumerate(
                utils.read_conll(conllFP, False, cur_treebank.iso_id))

        ts = time.time()

        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            cur_treebank.outfilename = os.path.join(
                outdir, cur_treebank.outfilename)
            utils.write_conll(cur_treebank.outfilename,
                              parser.Predict(testdata))

        te = time.time()

        if options.predEval:
            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.test_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.test_gold,
                               cur_treebank.outfilename, om.conllu)

        # te - ts = wall-clock seconds spent on prediction
        print 'Finished predicting test', te - ts