コード例 #1
0
 def createDebugData(self,treebank,options):
     """Point the treebank at size-limited copies of its data files.

     When ``options.predict`` is false, the training file is re-read with
     ``maxSize=options.debug_train_sents`` and written to ``train-debug``;
     if a dev file exists and ``options.pred_dev`` is set, the dev file
     (and a separate gold file, when it differs from the dev file) gets
     the same treatment with ``options.debug_dev_sents``.  When
     ``options.predict`` is true, the test file and its gold file are
     handled with ``options.debug_test_sents``.  All copies are written
     to ``treebank.outdir`` and the corresponding ``treebank`` attributes
     are updated to the new paths.
     """
     suffix = '.conllu' if options.conllu else '.conll'
     print('Creating smaller data sets for debugging')

     def shrink(source, stem, limit):
         # Re-read `source` with the given size limit, write the result to
         # <outdir>/<stem><suffix> and hand back the new path.
         sentences = list(utils.read_conll(source,maxSize=limit,hard_lim=True))
         target = os.path.join(treebank.outdir,stem + suffix)
         utils.write_conll(target,sentences)
         return target

     if options.predict:
         small_test = shrink(treebank.testfile,'test-debug',options.debug_test_sents)
         if treebank.test_gold != treebank.testfile:
             # gold annotations live in their own file: give it its own debug copy
             treebank.test_gold = shrink(treebank.test_gold,'test-gold-debug',options.debug_test_sents)
         else:
             treebank.test_gold = small_test
         treebank.testfile = small_test
         return

     treebank.trainfile = shrink(treebank.trainfile,'train-debug',options.debug_train_sents)
     if not (treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev):
         return

     small_dev = shrink(treebank.devfile,'dev-debug',options.debug_dev_sents)
     if treebank.dev_gold != treebank.devfile:
         # a separate debug gold file is needed when gold differs from the input file
         treebank.dev_gold = shrink(treebank.dev_gold,'dev-gold-debug',options.debug_dev_sents)
     else:
         treebank.dev_gold = small_dev
     # Must be assigned last: the comparison above relies on the old dev path.
     treebank.devfile = small_dev
コード例 #2
0
ファイル: srl.py プロジェクト: rasoolims/Neural_SRL
    def Train(self, mini_batches, epoch, best_f_score, options):
        """Run one pass over ``mini_batches`` with periodic dev evaluation.

        After every ``len(mini_batches) // 5`` batches (at least every
        batch when there are fewer than five) and if ``options.conll_dev``
        is non-empty, the dev file is predicted, scored with
        ``src/utils/eval.pl`` and the model is saved whenever the labeled
        F score beats ``best_f_score``.

        Args:
            mini_batches: sequence of mini-batches passed to ``self.buildGraph``.
            epoch: zero-based epoch number; ``epoch + 1`` is used in output file names.
            best_f_score: best labeled F score seen before this call.
            options: object providing ``conll_dev``, ``outdir`` and ``model``.

        Returns:
            The (possibly improved) best labeled F score.
        """
        print('Start time ' + time.ctime())
        start = time.time()
        dev_path = options.conll_dev
        # Shared prefix of the per-part prediction and evaluation files.
        out_prefix = os.path.join(options.outdir, options.model) + str(epoch + 1) + "_"

        # Floor division, clamped to 1: with fewer than five mini-batches the
        # original value was 0 and the modulo below raised ZeroDivisionError.
        part_size = max(1, len(mini_batches) // 5)
        part = 0
        best_part = 0

        for b, mini_batch in enumerate(mini_batches):
            errs = list(self.buildGraph(mini_batch, True))
            sum_errs = esum(errs) / len(errs)
            batch_loss = sum_errs.scalar_value()
            sum_errs.backward()
            self.trainer.update()
            renew_cg()
            self.x_le.init_row(self.NO_LEMMA, [0] * self.d_l)
            renew_cg()
            # NOTE(review): the loss is per batch yet still divided by b + 1,
            # as in the original; looks like a leftover from a running
            # average -- confirm before relying on the printed value.
            print('loss: %s time: %s progress %s %%' % (
                batch_loss / (b + 1), time.time() - start,
                round(100 * float(b + 1) / len(mini_batches), 2)))
            start = time.time()

            if (b + 1) % part_size != 0:
                continue
            part += 1
            if dev_path == '':
                continue

            start = time.time()
            pred_file = out_prefix + str(part) + '.txt'
            eval_file = out_prefix + str(part) + '.eval'
            write_conll(pred_file, self.Predict(dev_path))
            os.system('perl src/utils/eval.pl -g ' + dev_path +
                      ' -s ' + pred_file + ' > ' + eval_file)
            print('Finished predicting dev on part ' + str(part) +
                  '; time: ' + str(time.time() - start))

            labeled_f, unlabeled_f = get_scores(eval_file)
            print('epoch: ' + str(epoch) + ' part: ' + str(part) +
                  '-- labeled F1: ' + str(labeled_f) +
                  ' Unlabaled F: ' + str(unlabeled_f))

            if float(labeled_f) > best_f_score:
                self.Save(os.path.join(options.outdir, options.model))
                best_f_score = float(labeled_f)
                best_part = part

        print('best part on this epoch: ' + str(best_part))
        return best_f_score
コード例 #3
0
 def prepareDev(self,treebank,options):
     """Ensure the treebank has dev data, or turn dev prediction off for it.

     ``treebank.pred_dev`` starts as ``options.pred_dev``.  If the treebank
     has no existing dev file and ``options.create_dev`` is set, the
     training sentences are shuffled and ``options.dev_percent`` percent of
     them are written to ``dev-split.conllu``, the rest to
     ``train-split.conllu`` (both in ``treebank.outdir``), provided there
     are more than ``options.min_train_sents`` training sentences.  In the
     remaining no-dev-file cases a warning is printed and
     ``treebank.pred_dev`` is set to False.  A final warning is printed if
     model selection was requested but dev prediction ended up off.
     """
     # Even if options.pred_dev is True this may be reset to False below
     # when no dev data is available.
     treebank.pred_dev = options.pred_dev
     has_dev = treebank.devfile and os.path.exists(treebank.devfile)
     if not has_dev:
         if not options.create_dev: # option --create-dev not set
             print("Warning: No dev data for " + treebank.name
                     + ", consider adding option --create-dev to create dev data from training set")
             treebank.pred_dev = False
         else: # carve some dev data out of the training data
             sentences = list(utils.read_conll(treebank.trainfile))
             n_total = len(sentences)
             if n_total > options.min_train_sents: # enough sentences to split
                 dev_path = os.path.join(treebank.outdir,'dev-split' + '.conllu')
                 train_path = os.path.join(treebank.outdir,'train-split' + '.conllu')
                 n_dev = int(0.01*options.dev_percent*n_total)
                 print("Taking " + str(n_dev) + " of " + str(n_total)
                         + " sentences from training data as new dev data for " + treebank.name)
                 random.shuffle(sentences)
                 # first n_dev shuffled sentences become dev, the rest stay train
                 utils.write_conll(dev_path,sentences[:n_dev])
                 utils.write_conll(train_path,sentences[n_dev:])
                 # repoint the treebank at the freshly written files
                 treebank.dev_gold = dev_path
                 treebank.devfile = dev_path
                 treebank.trainfile = train_path
             else: # not enough sentences
                 print("Warning: not enough sentences in training data to create dev set for "
                     + treebank.name + " (minimum required --min-train-size: " + str(options.min_train_sents) + ")")
                 treebank.pred_dev = False
     if options.model_selection and not treebank.pred_dev:
         print("Warning: can't do model selection for " + treebank.name + " as prediction on dev data is off")
コード例 #4
0
def ensemble(files, outfile):
    """Combine the head predictions of several CoNLL-U files by voting.

    Each file in ``files`` is read with ``utils.read_conll`` and the
    sentences are aligned with ``zip``.  For every aligned group a square
    matrix counts how often each (head, dependent) pair occurs across the
    inputs; ``DependencyDecoder.parse_nonproj`` turns the counts into one
    head per token.  The heads are stored in ``pred_parent_id`` on the
    entries of the FIRST file's sentence, which is what gets written to
    ``outfile`` (so all other fields, labels included, come from the
    first file).
    """
    def tokens_of(sentence):
        # Keep only ConllEntry items; anything else in a sentence is skipped.
        return [tok for tok in sentence if isinstance(tok, utils.ConllEntry)]

    readers = [utils.read_conll(path) for path in files]
    aligned = zip(*readers)
    decoder = DependencyDecoder()
    merged = []
    for group in aligned:
        reference = group[0]
        size = len(tokens_of(reference))
        votes = np.zeros((size, size))
        for candidate in group:
            for tok in tokens_of(candidate):
                votes[tok.parent_id, tok.id] += 1

        #NOTE: this takes the label of the first!
        heads = decoder.parse_nonproj(votes)
        for tok in tokens_of(reference):
            tok.pred_parent_id = heads[tok.id]
        merged.append(reference)
    utils.write_conll(outfile, merged)
コード例 #5
0
ファイル: crf_func.py プロジェクト: dgeng7/CNER
def CRF_eval(data, test_index, y_pred, path, self_eval):
    """Pair test items with predictions and write them out as a CoNLL file.

    The sequences ``data[i] for i in test_index`` are zipped item by item
    with ``y_pred``.  With ``self_eval`` set, each output row is the first
    and last element of the item followed by the prediction; otherwise it
    is the item itself followed by the prediction.  The rows go through
    ``input_data_transform`` and ``write_conll`` into
    ``path + "pred<flag>.conll"``, where ``<flag>`` is the formatted value
    of ``self_eval != True``.  With ``self_eval`` set, ``test_ner(path)``
    is called afterwards.
    """
    test_char = [data[i] for i in test_index]

    if self_eval:
        def make_row(item, label):
            return [item[0], item[-1]] + [label]
    else:
        def make_row(item, label):
            return [item, label]

    datawpred = [
        [make_row(item, label) for item, label in zip(test_char[j], labels)]
        for j, labels in enumerate(y_pred)
    ]

    out_name = path + "pred{}.conll".format(self_eval != True)
    with open(out_name, 'w', encoding='utf-8') as f:
        write_conll(f, input_data_transform(datawpred))

    if self_eval:
        test_ner(path)
コード例 #6
0
ファイル: parser.py プロジェクト: danilojsl/bist-parser
def evaluate_model():
    """Predict on the dev file and score the prediction with an external tool.

    Reads ``dev_file``, ``output_file``, ``epoch``, ``parser`` and
    ``utils_path`` from the enclosing scope (none are defined in this
    block).  The prediction is written to
    ``<output_file>/dev_epoch_<epoch + 1>.conll[u]``.  For ``.conllu``
    input the CoNLL 2017 UD evaluation script is run; otherwise
    ``eval.pl`` is run and the first three lines of its report are
    printed.  In both cases the tool output is redirected to
    ``<prediction path>.txt``.
    """
    conllu = (os.path.splitext(dev_file.lower())[1] == '.conllu')
    devpath = os.path.join(
        output_file, 'dev_epoch_' + str(epoch + 1) +
        ('.conllu' if conllu else '.conll'))
    utils.write_conll(devpath, parser.predict(dev_file))
    report = devpath + '.txt'

    # Tool paths are built with os.path.join so they are valid whether or
    # not utils_path ends with a separator (the original concatenated
    # '/eval.pl' in one branch and 'evaluation_script/...' in the other).
    if not conllu:
        perl_command = ('perl ' + os.path.join(utils_path, 'eval.pl') +
                        ' -g ' + dev_file + ' -s ' + devpath +
                        ' > ' + report)
        print(perl_command)
        os.system(perl_command)
        with open(report, 'r') as f:
            # echo the first three lines of the evaluation report
            for _ in range(3):
                print(f.readline())
    else:
        script_dir = os.path.join(utils_path, 'evaluation_script')
        python_command = ('python3 ' +
                          os.path.join(script_dir, 'conll17_ud_eval.py') +
                          ' -v -w ' +
                          os.path.join(script_dir, 'weights.clas') +
                          ' ' + dev_file + ' ' + devpath +
                          ' > ' + report)
        print(python_command)
        os.system(python_command)
コード例 #7
0
 def createDebugData(self,treebank,options):
     """Point the treebank at size-limited copies of its data files.

     When ``options.predict`` is false, the training file is re-read with
     ``maxSize=options.debug_train_sents`` and written to ``train-debug``;
     if a dev file exists and ``options.pred_dev`` is set, the dev file is
     handled likewise with ``options.debug_dev_sents`` and becomes both the
     dev input and the dev gold file.  When ``options.predict`` is true,
     the test file is handled with ``options.debug_test_sents`` and becomes
     both the test input and the test gold file.  The file extension
     follows ``self.conllu``; copies are written to ``treebank.outdir``.
     """
     suffix = '.conllu' if self.conllu else '.conll'
     print('Creating smaller data sets for debugging')

     def shrink(source, stem, limit):
         # Re-read `source` with the given size limit, write the result to
         # <outdir>/<stem><suffix> and hand back the new path.
         sentences = list(utils.read_conll(source,treebank.iso_id,maxSize=limit,hard_lim=True))
         target = os.path.join(treebank.outdir,stem + suffix)
         utils.write_conll(target,sentences)
         return target

     if options.predict:
         small_test = shrink(treebank.testfile,'test-debug',options.debug_test_sents)
         treebank.test_gold = small_test
         treebank.testfile = small_test
         return

     treebank.trainfile = shrink(treebank.trainfile,'train-debug',options.debug_train_sents)
     if treebank.devfile and os.path.exists(treebank.devfile) and options.pred_dev:
         small_dev = shrink(treebank.devfile,'dev-debug',options.debug_dev_sents)
         treebank.dev_gold = small_dev
         treebank.devfile = small_dev
コード例 #8
0
def run(om,options,i):
    """Train an ArcHybridLSTM parser, or load a stored one and predict.

    Args:
        om: object providing ``languages`` (the treebanks) and ``conllu``.
        options: parsed options; ``options.predict`` selects prediction,
            ``options.multiling`` selects the multilingual code paths, and
            ``options.continueModel`` / ``options.fineTune`` select
            continued training or fine-tuning.
        i: index into ``om.languages`` of the treebank to use when
            ``options.multiling`` is off.

    Training writes the vocabulary/params pickle, a ``train.stats`` log,
    per-epoch dev predictions and, at the last epoch, copies the saved
    model to ``barchybrid.model`` (``barchybrid.tuned.model`` when
    fine-tuning).  Prediction writes the parsed output and optionally
    prints LAS scores.
    """

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict: # training

        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1

        if not continueTraining:
            print('Preparing vocab')
            if options.multiling:
                # Fix: the original read `path_is_dir=True,` -- the stray
                # trailing comma bound the 1-tuple (True,) instead of True.
                path_is_dir = True
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,
                                                                     path_is_dir,
                                                                     options.shareWordLookup,
                                                                     options.shareCharLookup)

            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)

            paramsfile = os.path.join(outdir, options.params)
            # Fix: pickle streams are bytes, so the file is opened in binary mode.
            with open(paramsfile, 'wb') as paramsfp:
                print('Saving params to ' + paramsfile)
                pickle.dump((words, w2i, pos, rels, cpos, langs,
                             options, ch), paramsfp)
                print('Finished collecting vocab')
        else:
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print('Load params from ' + paramsfile)
                # NOTE: this rebinds `options` to the pickled options object;
                # everything below reads the stored values, not the current ones.
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(paramsfp)
                print('Finished loading vocab')

        max_epochs = options.first_epoch + options.epochs
        print('Initializing blstm arc hybrid:')
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, options)

        if continueTraining:
            if not fineTune:
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print('Fine tune model for another %s epochs' % (max_epochs - options.first_epoch))

            parser.Load(trainedModel)

        best_multi_las = -1
        best_multi_epoch = 0

        # Append to the existing log when continuing, start a new one otherwise.
        stats_mode = 'a' if continueTraining else 'w'
        train_stats = codecs.open(os.path.join(outdir, 'train.stats'), stats_mode, encoding='utf-8')

        # NOTE(review): the upper bound is inclusive (max_epochs + 1), so a
        # fresh run does options.epochs + 1 passes -- confirm this is intended.
        for epoch in xrange(options.first_epoch, max_epochs + 1):

            print('Starting epoch ' + str(epoch))

            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id,options.max_sentences))

            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print('Finished epoch ' + str(epoch))

            # The same .tmp file is overwritten after every epoch.
            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev] # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print("Predicting on dev data for " + lang.name)
                    devdata = utils.read_conll_dir(pred_langs,"dev")
                    pred = list(parser.Predict(devdata))

                    if len(pred)>0:
                        utils.write_conll_multiling(pred,pred_langs)
                    else:
                        print("Warning: prediction empty")

                    if options.pred_eval:
                        total_las = 0
                        for lang in pred_langs:
                            print("Evaluating dev prediction for " + lang.name)
                            las_score = utils.evaluate(lang.dev_gold, lang.outfilename,om.conllu)
                            total_las += las_score
                            train_stats.write(unicode('Dev LAS ' + lang.name + ': ' + str(las_score) + '\n'))
                        if options.model_selection:
                            # summed LAS over all languages decides the best epoch
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch

                else: # monolingual case
                    if cur_treebank.pred_dev:
                        print("Predicting on dev data for " + cur_treebank.name)
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print("Evaluating dev prediction for " + cur_treebank.name)
                            las_score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]
                                    # NOTE(review): unlike the multilingual branch, the
                                    # score is only logged when it improves -- confirm.
                                    train_stats.write(unicode('Dev LAS ' + cur_treebank.name + ': ' + str(las_score) + '\n'))

            if epoch == max_epochs: # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        best_epoch = cur_treebank.dev_best[0] # will be final epoch by default if model selection not on for this treebank
                        if cur_treebank.model_selection:
                            print("Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0]))

                # NOTE(review): best_epoch is only reported; the file copied is
                # always barchybrid.model.tmp (hard-coded name, last epoch's
                # weights if options.model is 'barchybrid.model') -- confirm.
                bestmodel_file = os.path.join(outdir,"barchybrid.model.tmp")
                model_file = os.path.join(outdir,"barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir,"barchybrid.tuned.model")
                print("Best epoch: " + str(best_epoch))
                print("Copying " + bestmodel_file + " to " + model_file)
                copyfile(bestmodel_file,model_file)

        train_stats.close()

    else: #if predict - so

        eval_type = options.evaltype
        print("Eval type:  " + str(eval_type))
        # For "train"/"dev" evaluation the test slots are redirected to the
        # corresponding data set.
        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile

        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile

        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold) + '-tuned') for l in om.languages]
            else:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold)) for l in om.languages]
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile))

        # The prefix is only passed on to Predict when vector extraction is requested.
        if not options.extract_vectors:
            prefix = None

        params = os.path.join(modeldir, options.params)
        print('Reading params from ' + params)
        # Fix: binary mode for reading the pickle (was text mode 'r').
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # load params files from trusted locations.
        with open(params, 'rb') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, stored_opt)

            if options.fineTune:
                options.model = options.model.replace('.model', '.tuned.model')
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, eval_type)
            else:
                testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, eval_type + "-" + l.outfilename)
                pred = list(parser.Predict(testdata, prefix))
                utils.write_conll_multiling(pred,om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(outdir, eval_type + "-" + cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata, prefix))

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print("Evaluating on " + l.name)
                        score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                        print("Obtained LAS F1 score of %.2f on %s" % (score, l.name))
                else:
                    print("Evaluating on " + cur_treebank.name)
                    score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                    print("Obtained LAS F1 score of %.2f on %s" % (score, cur_treebank.name))

            print('Finished predicting')
コード例 #9
0
def run(om, options, i):
    """Train an ArcHybridLSTM parser, or load a stored one and predict.

    Args:
        om: object providing ``languages`` (the treebanks) and ``conllu``.
        options: parsed options; ``options.predict`` selects prediction and
            ``options.multiling`` selects the multilingual code paths.
        i: index into ``om.languages`` of the treebank to use when
            ``options.multiling`` is off.

    Training runs ``options.epochs`` epochs starting at
    ``options.first_epoch``, optionally predicting/evaluating on dev data
    after each one, and stops early when ``options.deadline`` (compared
    against ``time.time()``) would be exceeded by another epoch.
    Prediction writes the parsed output and optionally prints LAS scores.
    """

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print('Preparing vocab')
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        # Fix: pickle streams are bytes, so the file is opened in binary mode.
        with open(paramsfile, 'wb') as paramsfp:
            print('Saving params to ' + paramsfile)
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print('Finished collecting vocab')

        print('Initializing blstm arc hybrid:')
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        # Fix: the loop's final epoch number. The original compared against
        # options.epochs, which is only the last epoch when first_epoch == 1.
        last_epoch = options.first_epoch + options.epochs - 1

        durations = []
        for epoch in xrange(options.first_epoch, last_epoch + 1):

            print('Starting epoch ' + str(epoch))
            start_time = time.time()

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print('Finished epoch ' + str(epoch))

            if not options.overwrite_model:
                # keep one model file per epoch
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print("Predicting on dev data for " + lang.name)
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print("Warning: prediction empty")
                    if options.pred_eval:
                        for lang in pred_langs:
                            print("Evaluating dev prediction for " + lang.name)
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print("Predicting on dev data for " + cur_treebank.name)
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print("Evaluating dev prediction for " + cur_treebank.name)
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                # NOTE(review): this save runs whether or not
                                # the score improved, despite the message; it
                                # may belong inside the `if` above -- left
                                # unchanged, confirm intent.
                                if options.overwrite_model:
                                    print("Overwriting model due to higher dev score")
                                    model_file = os.path.join(
                                        cur_treebank.outdir, options.model)
                                    parser.Save(model_file)

            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish, pessimistically using
                # the slowest of the last five epochs
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
                print('Deadline in %.1f seconds' % (options.deadline - now))
                print('ETA of next epoch in %.1f seconds' % (eta - now))
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:
                # no deadline
                exceeds_deadline = False

            if exceeds_deadline or epoch == last_epoch:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    # NOTE(review): cur_treebank is only bound in the
                    # monolingual case; model selection with multiling on
                    # would fail here -- confirm it cannot happen.
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print("Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0]))

                if not options.overwrite_model:
                    bestmodel_file = os.path.join(
                        outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print("Copying " + bestmodel_file + " to " + model_file)
                    copyfile(bestmodel_file, model_file)

            # Fix: the original repeated this check-and-break twice in a row;
            # the second copy could never fire.
            if exceeds_deadline and epoch < last_epoch:
                print('Leaving epoch loop early to avoid exceeding deadline')
                break

    else:  #if predict - so

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print('Reading params from ' + params)
        # Fix: binary mode for reading the pickle (was text mode 'r').
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # load params files from trusted locations.
        with open(params, 'rb') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print("Evaluating on " + l.name)
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print("Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name))
                else:
                    print("Evaluating on " + cur_treebank.name)
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print("Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name))

            print('Finished predicting')
コード例 #10
0
ファイル: parser.py プロジェクト: danilojsl/bist-parser
                paramsfp)

        print('Initializing lstm mstparser:')
        parser = mstlstm.MSTParserLSTM(words, pos, rels, enum_word, stored_opt,
                                       onto, cpos)
        parser.load(model_path)
        conllu = (os.path.splitext(test_file.lower())[1] == '.conllu')
        testpath = os.path.join(
            output_file,
            'test_pred.conll' if not conllu else 'test_pred.conllu')

        ts = time.time()
        test_res = list(parser.predict(test_file))
        te = time.time()
        print('Finished predicting test.', te - ts, 'seconds.')
        utils.write_conll(testpath, test_res)

        if not conllu:
            os.system('perl ' + utils_path + 'eval.pl -g ' + test_file +
                      ' -s ' + testpath + ' > ' + testpath + '.txt')
        else:
            python_command = 'python3 ' + utils_path + 'evaluation_script/conll17_ud_eval.py -v -w ' + utils_path + \
                             'evaluation_script/weights.clas ' + test_file + ' ' + testpath + ' > ' + testpath + '.txt'
            print(python_command)
            os.system(python_command)
            with open(testpath + '.txt', 'r') as f:
                for l in f:
                    if l.startswith('UAS'):
                        print('UAS:%s' % l.strip().split()[-1])
                    elif l.startswith('LAS'):
                        print('LAS:%s' % l.strip().split()[-1])
コード例 #11
0
ファイル: parser.py プロジェクト: liisaratsep/bist-parser
                  'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, w2i, options)

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            parser.Train(options.conll_train)
            conllu = (os.path.splitext(
                options.conll_dev.lower())[1] == '.conllu')
            devpath = os.path.join(
                options.output, 'dev_epoch_' + str(epoch + 1) +
                ('.conll' if not conllu else '.conllu'))
            utils.write_conll(devpath, parser.Predict(options.conll_dev))

            if not conllu:
                os.system('perl src/utils/eval.pl -g ' + options.conll_dev +
                          ' -s ' + devpath + ' > ' + devpath + '.txt')
            else:
                os.system(
                    'python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas '
                    + options.conll_dev + ' ' + devpath + ' > ' + devpath +
                    '.txt')

            print 'Finished predicting dev'
            parser.Save(
                os.path.join(options.output, options.model + str(epoch + 1)))
    else:
        with open(options.params, 'r') as paramsfp:
コード例 #12
0
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print("Finished collecting vocab")

        print("Initializing LSTM mstparser")
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)

        for epoch in xrange(options.epochs):
            print('Starting epoch: ', epoch)
            parser.train(options.conll_train)
            conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
            devpath = os.path.join(options.output,
                                   'dev_epoch_' + str(epoch + 1) + ('.conll' if not conllu else '.conllu'))
            utils.write_conll(devpath, parser.predict(options.conll_dev))
            parser.save(os.path.join(options.output, os.path.basename(options.model) + str(epoch + 1)))

        if not conllu:
                os.system(
                    'perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt')
            else:
                os.system(
                    'python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt')
                with open(devpath + '.txt', 'rb') as f:
                    for l in f:
                        if l.startswith('UAS'):
                            print('UAS:%s' % l.strip().split()[-1])
                        elif l.startswith('LAS'):
                            print ('LAS:%s' % l.strip().split()[-1])
コード例 #13
0
        if options.conll_dev == None:
            parser.Save(os.path.join(options.outdir, options.model))

    if options.input and options.output:
        with open(os.path.join(options.outdir, options.params),
                  'r') as paramsfp:
            words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt = pickle.load(
                paramsfp)
        stored_opt.external_embedding = options.external_embedding
        parser = SRLLSTM(words, pWords, plemmas, pos, roles, chars, sense_mask,
                         stored_opt)
        parser.Load(os.path.join(options.outdir, options.model))
        ts = time.time()
        pred = list(parser.Predict(options.input, sen_cut, use_default))
        te = time.time()
        utils.write_conll(options.output, pred)
        print 'Finished predicting test', te - ts

    if options.inputdir and options.outputdir:
        with open(os.path.join(options.outdir, options.params),
                  'r') as paramsfp:
            words, pWords, plemmas, pos, roles, chars, sense_mask, stored_opt = pickle.load(
                paramsfp)
        stored_opt.external_embedding = options.external_embedding
        parser = SRLLSTM(words, pWords, plemmas, pos, roles, chars, sense_mask,
                         stored_opt)
        parser.Load(os.path.join(options.outdir, options.model))
        ts = time.time()
        for dir, subdir, files in os.walk(options.inputdir):
            for f in files:
                print 'predicting ' + os.path.join(dir, f)
コード例 #14
0
        max_len = max([len(d) for d in train_data])
        min_len = min([len(d) for d in train_data])
        buckets = [list() for i in range(min_len, max_len)]
        for d in train_data:
            buckets[len(d) - min_len - 1].append(d)
        buckets = [x for x in buckets if x != []]

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            print 'best F-score before starting the epoch: ' + str(
                best_f_score)
            best_f_score = parser.Train(
                utils.get_batches(buckets, parser, True), epoch, best_f_score,
                options)
            print 'best F-score after finishing the epoch: ' + str(
                best_f_score)

    if options.input and options.output:
        with open(os.path.join(options.outdir, options.params),
                  'r') as paramsfp:
            words, lemmas, pos, roles, chars, stored_opt = pickle.load(
                paramsfp)
        stored_opt.external_embedding = options.external_embedding
        parser = SRLLSTM(words, lemmas, pos, roles, chars, stored_opt)
        parser.Load(os.path.join(options.outdir, options.model))
        print 'loaded the model'
        ts = time.time()
        pred = list(parser.Predict(options.input))
        te = time.time()
        utils.write_conll(options.output, pred)
        print 'Finished predicting test', te - ts
コード例 #15
0
        buckets = [list() for i in range(min_len, max_len)]
        for d in train_data:
            buckets[len(d) - min_len - 1].append(d)
        buckets = [x for x in buckets if x != []]

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            parser.Train(utils.get_batches(buckets, parser, True))
            if options.save_epoch:
                parser.Save(
                    os.path.join(options.outdir,
                                 options.model + str(epoch + 1)))
            if options.conll_dev != '':
                start = time.time()
                utils.write_conll(
                    os.path.join(options.outdir, options.model) +
                    str(epoch + 1) + '.txt', parser.Predict(options.conll_dev))
                os.system('perl src/utils/eval.pl -g ' + options.conll_dev +
                          ' -s ' +
                          os.path.join(options.outdir, options.model) +
                          str(epoch + 1) + '.txt' + ' > ' +
                          os.path.join(options.outdir, options.model) +
                          str(epoch + 1) + '.eval &')
                print 'Finished predicting dev; time:', time.time() - start
        parser.Save(os.path.join(options.outdir, options.model))

    if options.input and options.output:
        with open(options.outdir + '/' + options.params, 'r') as paramsfp:
            words, lemmas, pos, roles, chars, stored_opt = pickle.load(
                paramsfp)
        stored_opt.external_embedding = options.external_embedding
コード例 #16
0
ファイル: parser.py プロジェクト: vin-ivar/uuparser
def run(om, options, i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            path_is_dir = True,
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,\
                                                                 path_is_dir,
                                                                 options.shareWordLookup,\
                                                                 options.shareCharLookup)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if options.continueModel is not None:
            parser.Load(options.continueModel)

        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):

            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]

            if epoch == options.epochs:  # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs  # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(
                    outdir, "barchybrid.model" + str(best_epoch))
                model_file = os.path.join(outdir, "barchybrid.model")
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

    else:  #if predict - so

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name)

            print 'Finished predicting'
コード例 #17
0
ファイル: parser.py プロジェクト: elikip/bist-parser
        print 'Preparing vocab'
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, w2i, options)

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            parser.Train(options.conll_train)
            conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
            devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + ('.conll' if not conllu else '.conllu'))
            utils.write_conll(devpath, parser.Predict(options.conll_dev))

            if not conllu:
                os.system('perl src/utils/eval.pl -g ' + options.conll_dev  + ' -s ' + devpath  + ' > ' + devpath + '.txt')
            else:
                os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt')
            
            print 'Finished predicting dev'
            parser.Save(os.path.join(options.output, options.model + str(epoch+1)))
    else:
        with open(options.params, 'r') as paramsfp:
            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)

        stored_opt.external_embedding = options.external_embedding

        parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
コード例 #18
0
        stored_opt.external_embedding = options.external_embedding

        print 'Initializing lstm mstparser:'
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, stored_opt)

        parser.Load(options.model)
        conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
        tespath = os.path.join(
            options.output,
            'test_pred.conll' if not conllu else 'test_pred.conllu')

        ts = time.time()
        test_res = list(parser.Predict(options.conll_test))
        te = time.time()
        print 'Finished predicting test.', te - ts, 'seconds.'
        utils.write_conll(tespath, test_res)

        if not conllu:
            os.system('perl conll/eval.pl -g ' + options.conll_test + ' -s ' +
                      tespath + ' > ' + tespath + '.txt')
        else:
            os.system(
                'python conll/evaluation_script/conll17_ud_eval.py -v -w conll/evaluation_script/weights.clas '
                + options.conll_test + ' ' + tespath + ' > ' + tespath +
                '.txt')
    else:
        print 'Preparing vocab'
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params),
                  'w') as paramsfp:
コード例 #19
0
            #print 'Finished predicting dev'
        print "Total time:", total_time, "words/sec:", nwords / total_time, "sents/sec:", nsents / total_time
    else:
        with open(options.params, 'r') as paramsfp:
            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)

        stored_opt.external_embedding = options.external_embedding

        parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
        parser.Load(options.model)
        conllu = (os.path.splitext(options.conll_test.lower())[1] == '.conllu')
        tespath = os.path.join(
            options.output,
            'test_pred.conll' if not conllu else 'test_pred.conllu')
        ts = time.time()
        pred = list(parser.Predict(options.conll_test, options.batch_size))
        te = time.time()
        pred_time = te - ts
        nsents = len(pred)
        nwords = sum([len(s) for s in pred])
        print pred_time, "sents/sec:", nsents / pred_time, "words/sec:", nwords / pred_time
        print nsents, nwords
        utils.write_conll(tespath, pred)

        #if not conllu:
        #    os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath  + ' > ' + tespath + '.txt')
        #else:
        #    os.system('python src/utils/evaluation_script/conll17_ud_eval.py -v -w src/utils/evaluation_script/weights.clas ' + options.conll_test + ' ' + tespath + ' > ' + testpath + '.txt')

        print 'Finished predicting test', te - ts
コード例 #20
0
def main(train_file,
         test_file,
         output,
         model,
         num_epochs,
         embeddings_init=None,
         pos_d=0,
         seed=0):
    vocab = utils.Vocabulary(train_file)
    print('reading train...')
    train = list(utils.read_conll(train_file))
    print('read {} examples'.format(len(train)))
    print('reading test...')
    test = list(utils.read_conll(test_file))
    print('read {} examples'.format(len(test)))

    print 'Initializing lstm parser:'
    parser = graphParser.GraphParser(vocab.num_words,
                                     vocab.num_rel,
                                     pos_d=pos_d,
                                     pos_V=vocab.num_pos if pos_d else None,
                                     embeddings_init=embeddings_init,
                                     seed=seed,
                                     verbose=True)

    print('formatting test data...')
    test_indices, test_pos_indices, test_arcs, test_labels = vocab.process(
        test, deterministic=True)
    i = 1
    epochs = []
    las_scores = []
    uas_scores = []
    for epoch in range(num_epochs):
        print 'Starting epoch', epoch
        loss = 0

        #shuffle the training data
        random.shuffle(train)

        #convert to indices, sample, etc
        indices, pos_indices, gold_arcs, gold_labels = vocab.process(train)
        #train and return loss
        loss = parser.train(indices, gold_arcs, gold_labels,
                            pos_indices if pos_d else None)

        #get predicted labels for test set
        predicted_arcs, predicted_labels = parser.predict(
            test_indices, test_pos_indices if pos_d else None)

        #write the predictions to a CONLL formatted file
        devpath = os.path.join(output, 'dev_tmp.conll')
        utils.write_conll(
            devpath,
            vocab.entry(test_indices, test_pos_indices, predicted_arcs,
                        predicted_labels))

        #call the CONLL evaluation script and extract the LAS and UAS
        p = subprocess.Popen(
            ['perl', 'src/utils/eval.pl', '-g', test_file, '-s', devpath],
            stdout=subprocess.PIPE)
        out, err = p.communicate()
        las = float(out.splitlines()[0].split()[-2])
        uas = float(out.splitlines()[1].split()[-2])
        las_scores.append(las)
        uas_scores.append(uas)
        epochs.append(i)
        i = i + 1
        #do whatever metrics
        utils.metrics(loss, uas, las)

        #save the current model
        parser.save(os.path.join(output, os.path.basename(model)),
                    vocab.idx2word, vocab.idx2pos if pos_d else None)

    print 'epochs', epochs
    print 'las', las_scores
    print 'uas', uas_scores
    fig = plt.figure()
    plt.plot(epochs, las_scores)
    plt.plot(epochs, uas_scores)
    plt.legend(['LAS', 'UAS'])
    plt.show()
    fig.savefig('pos_accuracy.png')
コード例 #21
0
ファイル: parser.py プロジェクト: ZhuJiahui/uuparser
def run(om, options, i):
    outdir = options.output
    if options.multi_monoling:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        for epoch in xrange(options.first_epoch - 1,
                            options.first_epoch - 1 + options.epochs):
            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.drop_proj, options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))

            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(
                    utils.read_conll(conllFP, options.drop_proj,
                                     cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(
                        utils.read_conll(conllFP, False, cur_treebank.iso_id))
                else:
                    tot_sen = len(traindata)
                    #take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        #gen object * 2
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir,
                                                  'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None

            print 'Starting epoch', epoch
            parser.Train(traindata)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(
                        l.outdir, 'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, 'dev_epoch_' + str(epoch + 1) +
                    ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)

            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename,
                               om.conllu)

            print 'Finished predicting dev'
            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else:  #if predict - so
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modelDir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = enumerate(utils.read_conll_dir(
                    om.languages, "test"))

            if not options.multiling:
                conllFP = open(cur_treebank.testfile, 'r')
                testdata = enumerate(
                    utils.read_conll(conllFP, False, cur_treebank.iso_id))

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.predEval:
                if options.multiling:
                    for l in om.languages:
                        utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                else:
                    utils.evaluate(cur_treebank.test_gold,
                                   cur_treebank.outfilename, om.conllu)

            print 'Finished predicting test', te - ts