def createDebugData(self, treebank, options):
    ext = '.conllu' if options.conllu else '.conll'
    print 'Creating smaller data sets for debugging'
    if not options.predict:
        train_data = list(utils.read_conll(treebank.trainfile, maxSize=options.debug_train_sents, hard_lim=True))
        train_file = os.path.join(treebank.outdir, 'train-debug' + ext)  # location for the new train file
        utils.write_conll(train_file, train_data)  # write the new train data to file
        treebank.trainfile = train_file
        if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
            dev_data = list(utils.read_conll(treebank.devfile, maxSize=options.debug_dev_sents, hard_lim=True))
            dev_file = os.path.join(treebank.outdir, 'dev-debug' + ext)  # location for the new dev file
            utils.write_conll(dev_file, dev_data)  # write the new dev data to file
            # have to create a separate debug gold file if not the same as input file
            if treebank.dev_gold != treebank.devfile:
                dev_gold_data = list(utils.read_conll(treebank.dev_gold, maxSize=options.debug_dev_sents, hard_lim=True))
                dev_gold_file = os.path.join(treebank.outdir, 'dev-gold-debug' + ext)  # location for the new dev gold file
                utils.write_conll(dev_gold_file, dev_gold_data)  # write the new dev gold data to file
                treebank.dev_gold = dev_gold_file
            else:
                treebank.dev_gold = dev_file
            treebank.devfile = dev_file  # important to do this last
    else:
        test_data = list(utils.read_conll(treebank.testfile, maxSize=options.debug_test_sents, hard_lim=True))
        test_file = os.path.join(treebank.outdir, 'test-debug' + ext)  # location for the new test file
        utils.write_conll(test_file, test_data)  # write the new test data to file
        if treebank.test_gold != treebank.testfile:
            test_gold_data = list(utils.read_conll(treebank.test_gold, maxSize=options.debug_test_sents, hard_lim=True))
            test_gold_file = os.path.join(treebank.outdir, 'test-gold-debug' + ext)  # location for the new test gold file
            utils.write_conll(test_gold_file, test_gold_data)  # write the new test gold data to file
            treebank.test_gold = test_gold_file
        else:
            treebank.test_gold = test_file
        treebank.testfile = test_file
def Train(self, mini_batches, epoch, best_f_score, options):
    print 'Start time', time.ctime()
    start = time.time()
    errs, loss, iters, sen_num = [], 0, 0, 0
    dev_path = options.conll_dev
    part_size = len(mini_batches) / 5
    part = 0
    best_part = 0
    for b, mini_batch in enumerate(mini_batches):
        e = self.buildGraph(mini_batch, True)
        errs += e
        sum_errs = esum(errs) / len(errs)
        loss += sum_errs.scalar_value()
        sum_errs.backward()
        self.trainer.update()
        renew_cg()
        self.x_le.init_row(self.NO_LEMMA, [0] * self.d_l)
        renew_cg()
        print 'loss:', loss / (b + 1), 'time:', time.time() - start, 'progress', round(100 * float(b + 1) / len(mini_batches), 2), '%'
        loss, start = 0, time.time()
        errs, sen_num = [], 0
        iters += 1
        if (b + 1) % part_size == 0:
            part += 1
            if dev_path != '':
                start = time.time()
                write_conll(os.path.join(options.outdir, options.model) + str(epoch + 1) + "_" + str(part) + '.txt', self.Predict(dev_path))
                os.system('perl src/utils/eval.pl -g ' + dev_path + ' -s '
                          + os.path.join(options.outdir, options.model) + str(epoch + 1) + "_" + str(part) + '.txt'
                          + ' > ' + os.path.join(options.outdir, options.model) + str(epoch + 1) + "_" + str(part) + '.eval')
                print 'Finished predicting dev on part ' + str(part) + '; time:', time.time() - start
                labeled_f, unlabeled_f = get_scores(os.path.join(options.outdir, options.model) + str(epoch + 1) + "_" + str(part) + '.eval')
                print 'epoch: ' + str(epoch) + ' part: ' + str(part) + ' -- labeled F1: ' + str(labeled_f) + ' unlabeled F1: ' + str(unlabeled_f)
                if float(labeled_f) > best_f_score:
                    self.Save(os.path.join(options.outdir, options.model))
                    best_f_score = float(labeled_f)
                    best_part = part
    print 'best part on this epoch: ' + str(best_part)
    return best_f_score
def prepareDev(self, treebank, options):
    treebank.pred_dev = options.pred_dev  # even if options.pred_dev is True, might change treebank.pred_dev to False later if no dev data available
    if not treebank.devfile or not os.path.exists(treebank.devfile):
        if options.create_dev:  # create some dev data from the training data
            train_data = list(utils.read_conll(treebank.trainfile))
            tot_sen = len(train_data)
            if tot_sen > options.min_train_sents:  # need to have at least min_train_sents to move forward
                dev_file = os.path.join(treebank.outdir, 'dev-split' + '.conllu')  # location for the new dev file
                train_file = os.path.join(treebank.outdir, 'train-split' + '.conllu')  # location for the new train file
                dev_len = int(0.01 * options.dev_percent * tot_sen)
                print ("Taking " + str(dev_len) + " of " + str(tot_sen) + " sentences from training data as new dev data for " + treebank.name)
                random.shuffle(train_data)
                dev_data = train_data[:dev_len]
                utils.write_conll(dev_file, dev_data)  # write the new dev data to file
                train_data = train_data[dev_len:]  # put the rest of the training data in a new file too
                utils.write_conll(train_file, train_data)
                # update some variables with the new file locations
                treebank.dev_gold = dev_file
                treebank.devfile = dev_file
                treebank.trainfile = train_file
            else:  # not enough sentences
                print ("Warning: not enough sentences in training data to create dev set for " + treebank.name
                       + " (minimum required --min-train-size: " + str(options.min_train_sents) + ")")
                treebank.pred_dev = False
        else:  # option --create-dev not set
            print ("Warning: No dev data for " + treebank.name + ", consider adding option --create-dev to create dev data from training set")
            treebank.pred_dev = False
    if options.model_selection and not treebank.pred_dev:
        print "Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off"
def ensemble(files, outfile):
    """
    Takes conllu files as input
    """
    conllu_files = []
    for f in files:
        cf = utils.read_conll(f)
        conllu_files.append(cf)
    zipped_sentences = zip(*conllu_files)
    decoder = DependencyDecoder()
    sentences_out = []
    for zipped_sentence in zipped_sentences:
        conll_sentence = [entry for entry in zipped_sentence[0] if isinstance(entry, utils.ConllEntry)]
        n_words = len(conll_sentence)
        m = np.zeros((n_words, n_words))
        for i_sentence in zipped_sentence:
            conll_sen = [entry for entry in i_sentence if isinstance(entry, utils.ConllEntry)]
            for item in conll_sen:
                head = item.parent_id
                dep = item.id
                m[head, dep] += 1
        # NOTE: this takes the label of the first!
        heads = decoder.parse_nonproj(m)
        for entry in zipped_sentence[0]:
            if isinstance(entry, utils.ConllEntry):
                entry.pred_parent_id = heads[entry.id]
        sentences_out.append(zipped_sentence[0])
    utils.write_conll(outfile, sentences_out)
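# Hedged usage sketch (not part of the original function above): ensemble() expects
# several CoNLL-U files containing parses of the *same* sentences, accumulates one
# head vote per parser in the score matrix m, and lets the decoder pick the
# highest-voted (possibly non-projective) tree; labels are taken from the first file.
# The file names below are illustrative placeholders only.
#
# parser_outputs = ['model_a.conllu', 'model_b.conllu', 'model_c.conllu']
# ensemble(parser_outputs, 'ensemble.conllu')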
def CRF_eval(data, test_index, y_pred, path, self_eval):
    test_char = [data[i] for i in test_index]
    if self_eval:
        datawpred = [[[data[0], data[-1]] + [pred] for data, pred in zip(test_char[j], y_pred[j])] for j in range(len(y_pred))]
    else:
        datawpred = [[[data, pred] for data, pred in zip(test_char[j], y_pred[j])] for j in range(len(y_pred))]
    with open(path + "pred{}.conll".format(self_eval != True), 'w', encoding='utf-8') as f:
        write_conll(f, input_data_transform(datawpred))
    if self_eval:
        test_ner(path)
def evaluate_model():
    conllu = (os.path.splitext(dev_file.lower())[1] == '.conllu')
    devpath = os.path.join(output_file, 'dev_epoch_' + str(epoch + 1) + ('.conll' if not conllu else '.conllu'))
    utils.write_conll(devpath, parser.predict(dev_file))
    if not conllu:
        perl_command = 'perl ' + utils_path + '/eval.pl -g ' + dev_file + ' -s ' + devpath + ' > ' + devpath + '.txt'
        print(perl_command)
        os.system(perl_command)
        with open(devpath + '.txt', 'r') as f:
            for i in range(0, 3):
                print(f.readline())
    else:
        python_command = 'python3 ' + utils_path + 'evaluation_script/conll17_ud_eval.py -v -w ' + \
                         utils_path + 'evaluation_script/weights.clas ' + dev_file + ' ' + devpath + ' > ' + devpath + '.txt'
        print(python_command)
        os.system(python_command)
def createDebugData(self, treebank, options):
    ext = '.conllu' if self.conllu else '.conll'
    print 'Creating smaller data sets for debugging'
    if not options.predict:
        traindata = list(utils.read_conll(treebank.trainfile, treebank.iso_id, maxSize=options.debug_train_sents, hard_lim=True))
        train_file = os.path.join(treebank.outdir, 'train-debug' + ext)  # location for the new train file
        utils.write_conll(train_file, traindata)  # write the new train data to file
        treebank.trainfile = train_file
        if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
            devdata = list(utils.read_conll(treebank.devfile, treebank.iso_id, maxSize=options.debug_dev_sents, hard_lim=True))
            dev_file = os.path.join(treebank.outdir, 'dev-debug' + ext)  # location for the new dev file
            utils.write_conll(dev_file, devdata)  # write the new dev data to file
            treebank.dev_gold = dev_file
            treebank.devfile = dev_file
    else:
        testdata = list(utils.read_conll(treebank.testfile, treebank.iso_id, maxSize=options.debug_test_sents, hard_lim=True))
        test_file = os.path.join(treebank.outdir, 'test-debug' + ext)  # location for the new test file
        utils.write_conll(test_file, testdata)  # write the new test data to file
        treebank.test_gold = test_file
        treebank.testfile = test_file
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training
        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1
        if not continueTraining:
            print 'Preparing vocab'
            if options.multiling:
                path_is_dir = True
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages, path_is_dir,
                                                                     options.shareWordLookup, options.shareCharLookup)
            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'w') as paramsfp:
                print 'Saving params to ' + paramsfile
                pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp)
                print 'Finished collecting vocab'
        else:
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print 'Load params from ' + paramsfile
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(paramsfp)
                print 'Finished loading vocab'
        max_epochs = options.first_epoch + options.epochs
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if continueTraining:
            if not fineTune:
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print 'Fine tune model for another', max_epochs - options.first_epoch, 'epochs'
            parser.Load(trainedModel)
        best_multi_las = -1
        best_multi_epoch = 0
        if continueTraining:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'a', encoding='utf-8')
        else:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'w', encoding='utf-8')
        for epoch in xrange(options.first_epoch, max_epochs + 1):
            print 'Starting epoch ' + str(epoch)
            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id, options.max_sentences))
            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print 'Finished epoch ' + str(epoch)
            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)
            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        total_las = 0
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            las_score = utils.evaluate(lang.dev_gold, lang.outfilename, om.conllu)
                            total_las += las_score
                            train_stats.write(unicode('Dev LAS ' + lang.name + ': ' + str(las_score) + '\n'))
                        if options.model_selection:
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            las_score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]
                            train_stats.write(unicode('Dev LAS ' + cur_treebank.name + ': ' + str(las_score) + '\n'))
            if epoch == max_epochs:  # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs  # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        best_epoch = cur_treebank.dev_best[0]  # will be final epoch by default if model selection not on for this treebank
                        if cur_treebank.model_selection:
                            print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])
                bestmodel_file = os.path.join(outdir, "barchybrid.model.tmp")
                model_file = os.path.join(outdir, "barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir, "barchybrid.tuned.model")
                print "Best epoch: " + str(best_epoch)
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)
        train_stats.close()

    else:  # if predict - so
        # import pdb; pdb.set_trace()
        eval_type = options.evaltype
        print "Eval type: ", eval_type
        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile
        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile
        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold) + '-tuned') for l in om.languages]
            else:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold)) for l in om.languages]
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile))
        if not options.extract_vectors:
            prefix = None
        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt)
        if options.fineTune:
            options.model = options.model.replace('.model', '.tuned.model')
        model = os.path.join(modeldir, options.model)
        parser.Load(model)
        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, eval_type)
        else:
            testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, eval_type + "-" + l.outfilename)
            pred = list(parser.Predict(testdata, prefix))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(outdir, eval_type + "-" + cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata, prefix))
        te = time.time()
        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score, l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (score, cur_treebank.name)
        print 'Finished predicting'
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training
        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)
        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp)
            print 'Finished collecting vocab'
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        durations = []
        for epoch in xrange(options.first_epoch, options.first_epoch + options.epochs):
            print 'Starting epoch ' + str(epoch)
            start_time = time.time()
            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id, options.max_sentences))
            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)
            if not options.overwrite_model:
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)
            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename, om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                    if options.overwrite_model:
                                        print "Overwriting model due to higher dev score"
                                        model_file = os.path.join(cur_treebank.outdir, options.model)
                                        parser.Save(model_file)
            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
                print 'Deadline in %.1f seconds' % (options.deadline - now)
                print 'ETA of next epoch in %.1f seconds' % (eta - now)
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:
                # no deadline
                exceeds_deadline = False
            if exceeds_deadline or epoch == options.epochs:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    best_epoch = cur_treebank.dev_best[0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])
                if not options.overwrite_model:
                    bestmodel_file = os.path.join(outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print "Copying " + bestmodel_file + " to " + model_file
                    copyfile(bestmodel_file, model_file)
            if exceeds_deadline and epoch < options.epochs:
                print 'Leaving epoch loop early to avoid exceeding deadline'
                break

    else:  # if predict - so
        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir
        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt)
        model = os.path.join(modeldir, options.model)
        parser.Load(model)
        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, "test")
        else:
            testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(outdir, cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata))
        te = time.time()
        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score, l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (score, cur_treebank.name)
        print 'Finished predicting'
        paramsfp)
print('Initializing lstm mstparser:')
parser = mstlstm.MSTParserLSTM(words, pos, rels, enum_word, stored_opt, onto, cpos)
parser.load(model_path)
conllu = (os.path.splitext(test_file.lower())[1] == '.conllu')
testpath = os.path.join(output_file, 'test_pred.conll' if not conllu else 'test_pred.conllu')
ts = time.time()
test_res = list(parser.predict(test_file))
te = time.time()
print('Finished predicting test.', te - ts, 'seconds.')
utils.write_conll(testpath, test_res)
if not conllu:
    os.system('perl ' + utils_path + 'eval.pl -g ' + test_file + ' -s ' + testpath + ' > ' + testpath + '.txt')
else:
    python_command = 'python3 ' + utils_path + 'evaluation_script/conll17_ud_eval.py -v -w ' + utils_path + \
                     'evaluation_script/weights.clas ' + test_file + ' ' + testpath + ' > ' + testpath + '.txt'
    print(python_command)
    os.system(python_command)
with open(testpath + '.txt', 'r') as f:
    for l in f:
        if l.startswith('UAS'):
            print('UAS:%s' % l.strip().split()[-1])
        elif l.startswith('LAS'):
            print('LAS:%s' % l.strip().split()[-1])
              'w') as paramsfp:
        pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print 'Finished collecting vocab'
    print 'Initializing blstm arc hybrid:'
    parser = ArcHybridLSTM(words, pos, rels, w2i, options)
    for epoch in xrange(options.epochs):
        print 'Starting epoch', epoch
        parser.Train(options.conll_train)
        conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
        devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch + 1) + ('.conll' if not conllu else '.conllu'))
        utils.write_conll(devpath, parser.Predict(options.conll_dev))
        if not conllu:
            os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt')
        else:
            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas '
                      + options.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt')
        print 'Finished predicting dev'
        parser.Save(os.path.join(options.output, options.model + str(epoch + 1)))
else:
    with open(options.params, 'r') as paramsfp:
words, w2i, pos, rels = utils.vocab(options.conll_train)
with open(os.path.join(options.output, options.params), 'w') as paramsfp:
    pickle.dump((words, w2i, pos, rels, options), paramsfp)
print("Finished collecting vocab")
print("Initializing LSTM mstparser")
parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)
for epoch in xrange(options.epochs):
    print('Starting epoch: ', epoch)
    parser.train(options.conll_train)
    conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
    devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch + 1) + ('.conll' if not conllu else '.conllu'))
    utils.write_conll(devpath, parser.predict(options.conll_dev))
    parser.save(os.path.join(options.output, os.path.basename(options.model) + str(epoch + 1)))
    if not conllu:
        os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt')
    else:
        os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas '
                  + options.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt')
    with open(devpath + '.txt', 'rb') as f:
        for l in f:
            if l.startswith('UAS'):
                print('UAS:%s' % l.strip().split()[-1])
            elif l.startswith('LAS'):
                print('LAS:%s' % l.strip().split()[-1])
if options.conll_dev == None:
    parser.Save(os.path.join(options.outdir, options.model))
if options.input and options.output:
    with open(os.path.join(options.outdir, options.params), 'r') as paramsfp:
        words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    parser = SRLLSTM(words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt)
    parser.Load(os.path.join(options.outdir, options.model))
    ts = time.time()
    pred = list(parser.Predict(options.input, sen_cut, use_default))
    te = time.time()
    utils.write_conll(options.output, pred)
    print 'Finished predicting test', te - ts
if options.inputdir and options.outputdir:
    with open(os.path.join(options.outdir, options.params), 'r') as paramsfp:
        words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    parser = SRLLSTM(words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt)
    parser.Load(os.path.join(options.outdir, options.model))
    ts = time.time()
    for dir, subdir, files in os.walk(options.inputdir):
        for f in files:
            print 'predicting ' + os.path.join(dir, f)
max_len = max([len(d) for d in train_data])
min_len = min([len(d) for d in train_data])
buckets = [list() for i in range(min_len, max_len)]
for d in train_data:
    buckets[len(d) - min_len - 1].append(d)
buckets = [x for x in buckets if x != []]
for epoch in xrange(options.epochs):
    print 'Starting epoch', epoch
    print 'best F-score before starting the epoch: ' + str(best_f_score)
    best_f_score = parser.Train(utils.get_batches(buckets, parser, True), epoch, best_f_score, options)
    print 'best F-score after finishing the epoch: ' + str(best_f_score)
if options.input and options.output:
    with open(os.path.join(options.outdir, options.params), 'r') as paramsfp:
        words, lemmas, pos, roles, chars, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    parser = SRLLSTM(words, lemmas, pos, roles, chars, stored_opt)
    parser.Load(os.path.join(options.outdir, options.model))
    print 'loaded the model'
    ts = time.time()
    pred = list(parser.Predict(options.input))
    te = time.time()
    utils.write_conll(options.output, pred)
    print 'Finished predicting test', te - ts
buckets = [list() for i in range(min_len, max_len)]
for d in train_data:
    buckets[len(d) - min_len - 1].append(d)
buckets = [x for x in buckets if x != []]
for epoch in xrange(options.epochs):
    print 'Starting epoch', epoch
    parser.Train(utils.get_batches(buckets, parser, True))
    if options.save_epoch:
        parser.Save(os.path.join(options.outdir, options.model + str(epoch + 1)))
    if options.conll_dev != '':
        start = time.time()
        utils.write_conll(os.path.join(options.outdir, options.model) + str(epoch + 1) + '.txt', parser.Predict(options.conll_dev))
        os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s '
                  + os.path.join(options.outdir, options.model) + str(epoch + 1) + '.txt'
                  + ' > ' + os.path.join(options.outdir, options.model) + str(epoch + 1) + '.eval &')
        print 'Finished predicting dev; time:', time.time() - start
parser.Save(os.path.join(options.outdir, options.model))
if options.input and options.output:
    with open(options.outdir + '/' + options.params, 'r') as paramsfp:
        words, lemmas, pos, roles, chars, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
def run(om, options, i):
    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training
        print 'Preparing vocab'
        if options.multiling:
            path_is_dir = True
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages, path_is_dir,
                                                                 options.shareWordLookup, options.shareCharLookup)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)
        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp)
            print 'Finished collecting vocab'
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if options.continueModel is not None:
            parser.Load(options.continueModel)
        for epoch in xrange(options.first_epoch, options.first_epoch + options.epochs):
            print 'Starting epoch ' + str(epoch)
            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id, options.max_sentences))
            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)
            model_file = os.path.join(outdir, options.model + str(epoch))
            parser.Save(model_file)
            if options.pred_dev:  # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename, om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
            if epoch == options.epochs:  # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs  # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    best_epoch = cur_treebank.dev_best[0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])
                bestmodel_file = os.path.join(outdir, "barchybrid.model" + str(best_epoch))
                model_file = os.path.join(outdir, "barchybrid.model")
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

    else:  # if predict - so
        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir
        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt)
        model = os.path.join(modeldir, options.model)
        parser.Load(model)
        if options.multiling:
            testdata = utils.read_conll_dir(om.languages, "test")
        else:
            testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            if cur_treebank.outfilename:
                cur_treebank.outfilename = os.path.join(outdir, cur_treebank.outfilename)
            else:
                cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
            utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata))
        te = time.time()
        if options.pred_eval:
            if options.multiling:
                for l in om.languages:
                    print "Evaluating on " + l.name
                    score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (score, l.name)
            else:
                print "Evaluating on " + cur_treebank.name
                score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                print "Obtained LAS F1 score of %.2f on %s" % (score, cur_treebank.name)
        print 'Finished predicting'
    print 'Preparing vocab'
    words, w2i, pos, rels = utils.vocab(options.conll_train)
    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
        pickle.dump((words, w2i, pos, rels, options), paramsfp)
    print 'Finished collecting vocab'
    print 'Initializing blstm arc hybrid:'
    parser = ArcHybridLSTM(words, pos, rels, w2i, options)
    for epoch in xrange(options.epochs):
        print 'Starting epoch', epoch
        parser.Train(options.conll_train)
        conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
        devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch + 1) + ('.conll' if not conllu else '.conllu'))
        utils.write_conll(devpath, parser.Predict(options.conll_dev))
        if not conllu:
            os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt')
        else:
            os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas '
                      + options.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt')
        print 'Finished predicting dev'
        parser.Save(os.path.join(options.output, options.model + str(epoch + 1)))
else:
    with open(options.params, 'r') as paramsfp:
        words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
    stored_opt.external_embedding = options.external_embedding
    print 'Initializing lstm mstparser:'
    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, stored_opt)
    parser.Load(options.model)
    conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
    tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
    ts = time.time()
    test_res = list(parser.Predict(options.conll_test))
    te = time.time()
    print 'Finished predicting test.', te - ts, 'seconds.'
    utils.write_conll(tespath, test_res)
    if not conllu:
        os.system('perl conll/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt')
    else:
        os.system('python conll/evaluation_script/conll17_ud_eval.py -v -w conll/evaluation_script/weights.clas '
                  + options.conll_test + ' ' + tespath + ' > ' + tespath + '.txt')
else:
    print 'Preparing vocab'
    words, w2i, pos, rels = utils.vocab(options.conll_train)
    with open(os.path.join(options.output, options.params), 'w') as paramsfp:
    # print 'Finished predicting dev'
    print "Total time:", total_time, "words/sec:", nwords / total_time, "sents/sec:", nsents / total_time
else:
    with open(options.params, 'r') as paramsfp:
        words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
    stored_opt.external_embedding = options.external_embedding
    parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
    parser.Load(options.model)
    conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
    tespath = os.path.join(options.output, 'test_pred.conll' if not conllu else 'test_pred.conllu')
    ts = time.time()
    pred = list(parser.Predict(options.conll_test, options.batch_size))
    te = time.time()
    pred_time = te - ts
    nsents = len(pred)
    nwords = sum([len(s) for s in pred])
    print pred_time, "sents/sec:", nsents / pred_time, "words/sec:", nwords / pred_time
    print nsents, nwords
    utils.write_conll(tespath, pred)
    # if not conllu:
    #     os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt')
    # else:
    #     os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + tespath + '.txt')
    print 'Finished predicting test', te - ts
def main(train_file, test_file, output, model, num_epochs, embeddings_init=None, pos_d=0, seed=0):
    vocab = utils.Vocabulary(train_file)
    print('reading train...')
    train = list(utils.read_conll(train_file))
    print('read {} examples'.format(len(train)))
    print('reading test...')
    test = list(utils.read_conll(test_file))
    print('read {} examples'.format(len(test)))
    print 'Initializing lstm parser:'
    parser = graphParser.GraphParser(vocab.num_words, vocab.num_rel, pos_d=pos_d,
                                     pos_V=vocab.num_pos if pos_d else None,
                                     embeddings_init=embeddings_init, seed=seed, verbose=True)
    print('formatting test data...')
    test_indices, test_pos_indices, test_arcs, test_labels = vocab.process(test, deterministic=True)
    i = 1
    epochs = []
    las_scores = []
    uas_scores = []
    for epoch in range(num_epochs):
        print 'Starting epoch', epoch
        loss = 0
        # shuffle the training data
        random.shuffle(train)
        # convert to indices, sample, etc
        indices, pos_indices, gold_arcs, gold_labels = vocab.process(train)
        # train and return loss
        loss = parser.train(indices, gold_arcs, gold_labels, pos_indices if pos_d else None)
        # get predicted labels for test set
        predicted_arcs, predicted_labels = parser.predict(test_indices, test_pos_indices if pos_d else None)
        # write the predictions to a CONLL formatted file
        devpath = os.path.join(output, 'dev_tmp.conll')
        utils.write_conll(devpath, vocab.entry(test_indices, test_pos_indices, predicted_arcs, predicted_labels))
        # call the CONLL evaluation script and extract the LAS and UAS
        p = subprocess.Popen(['perl', 'src/utils/eval.pl', '-g', test_file, '-s', devpath], stdout=subprocess.PIPE)
        out, err = p.communicate()
        las = float(out.splitlines()[0].split()[-2])
        uas = float(out.splitlines()[1].split()[-2])
        las_scores.append(las)
        uas_scores.append(uas)
        epochs.append(i)
        i = i + 1
        # do whatever metrics
        utils.metrics(loss, uas, las)
        # save the current model
        parser.save(os.path.join(output, os.path.basename(model)), vocab.idx2word, vocab.idx2pos if pos_d else None)
    print 'epochs', epochs
    print 'las', las_scores
    print 'uas', uas_scores
    fig = plt.figure()
    plt.plot(epochs, las_scores)
    plt.plot(epochs, uas_scores)
    plt.legend(['LAS', 'UAS'])
    plt.show()
    fig.savefig('pos_accuracy.png')
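# Hedged usage sketch (not part of the original script above): main() is typically
# driven from a small entry point. The file paths, epoch count and POS embedding
# size below are illustrative placeholders only, not values from the original repo.
#
# if __name__ == '__main__':
#     main(train_file='data/train.conll', test_file='data/test.conll',
#          output='output/', model='graph.model', num_epochs=30, pos_d=25, seed=0)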
def run(om, options, i):
    outdir = options.output
    if options.multi_monoling:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir
    if options.shared_task:
        outdir = options.shared_task_outdir
    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:
        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages, path_is_dir=True)
        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)
        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch), paramsfp)
        print 'Finished collecting vocab'
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        for epoch in xrange(options.first_epoch - 1, options.first_epoch - 1 + options.epochs):
            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.drop_proj, options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))
            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(utils.read_conll(conllFP, options.drop_proj, cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(utils.read_conll(conllFP, False, cur_treebank.iso_id))
                else:
                    tot_sen = len(traindata)
                    # take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        # gen object * 2
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir, 'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None
            print 'Starting epoch', epoch
            parser.Train(traindata)
            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(l.outdir, 'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch + 1) + ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)
            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
            print 'Finished predicting dev'
            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else:  # if predict - so
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, stored_opt)
        model = os.path.join(modelDir, options.model)
        parser.Load(model)
        if options.multiling:
            testdata = enumerate(utils.read_conll_dir(om.languages, "test"))
        if not options.multiling:
            conllFP = open(cur_treebank.testfile, 'r')
            testdata = enumerate(utils.read_conll(conllFP, False, cur_treebank.iso_id))
        ts = time.time()
        if options.multiling:
            for l in om.languages:
                l.outfilename = os.path.join(outdir, l.outfilename)
            pred = list(parser.Predict(testdata))
            utils.write_conll_multiling(pred, om.languages)
        else:
            cur_treebank.outfilename = os.path.join(outdir, cur_treebank.outfilename)
            utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata))
        te = time.time()
        if options.predEval:
            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.test_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
        print 'Finished predicting test', te - ts