def get_stats(treebanks):
    traindata = list(utils.read_conll_dir(treebanks, "train"))
    # count head directions for content dependency relations
    d = {}
    d['rh'] = 0
    d['lh'] = 0
    tot = 0
    for sentence in traindata:
        conll_sentence = [
            entry for entry in sentence if isinstance(entry, utils.ConllEntry)
        ]
        for item in conll_sentence:
            if item.relation.split(":")[0] in CONTENT_DEPRELS:
                tot += 1
                if item.id < item.parent_id:
                    d['rh'] += 1
                else:
                    d['lh'] += 1
    d['rh'] /= float(tot)
    d['lh'] /= float(tot)
    d['rh'] *= 100
    d['lh'] *= 100
    print treebanks[0].iso_id
    print "Right-headed rels: " + str(d['rh'])
    print "Left-headed rels: " + str(d['lh'])
def get_stats_c(treebanks):
    traindata = list(utils.read_conll_dir(treebanks, "train"))
    n_dep = 0.
    depth = 0.
    for sentence in traindata:
        conll_sentence = [
            entry for entry in sentence if isinstance(entry, utils.ConllEntry)
        ]
        for item in conll_sentence:
            if item.id != 0:
                depth_tok = get_dep_len(item, sentence, 1)
                depth += depth_tok
                n_dep += 1

    av_dep = depth / n_dep

    print treebanks[0].iso_id
    print "Average distance to root:" + str(av_dep)
def get_stats_b(treebanks):
    traindata = list(utils.read_conll_dir(treebanks, "train"))
    n_sen = 0.
    sen_len = 0.
    n_dep = 0.
    dep_len = 0.
    for sentence in traindata:
        conll_sentence = [
            entry for entry in sentence if isinstance(entry, utils.ConllEntry)
        ]
        sen_len += len(conll_sentence)
        n_sen += 1
        for item in conll_sentence:
            dep_len += abs(item.id - item.parent_id)
            n_dep += 1

    av_len = sen_len / n_sen
    av_dep = dep_len / n_dep

    print treebanks[0].iso_id
    print "Average sentence length:" + str(av_len)
    print "Average dependency length:" + str(av_dep)
Example #4
    def Predict(self, treebanks, datasplit, options):
        char_map = {}
        if options.char_map_file:
            char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

        # get external embeddings for the set of words and chars in the
        # test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda: {})
        if options.word_emb_size > 0 and options.ext_word_emb_file:
            new_test_words = \
                    set(test_words) - self.feature_extractor.words.viewkeys()

            print "Number of OOV word types at test time: %i (out of %i)" % (
                len(new_test_words), len(test_words))

            if len(new_test_words) > 0:
                # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=new_test_words
                    )
                    test_embeddings["words"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["words"]:
                    print "External embeddings found for %i words "\
                            "(out of %i)" % \
                            (len(test_embeddings["words"]), len(new_test_words))

        if options.char_emb_size > 0:
            new_test_chars = \
                    set(test_chars) - self.feature_extractor.chars.viewkeys()
            print "Number of OOV char types at test time: %i (out of %i)" % (
                len(new_test_chars), len(test_chars))

            if len(new_test_chars) > 0:
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=new_test_chars,
                        chars=True
                    )
                    test_embeddings["chars"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["chars"]:
                    print "External embeddings found for %i chars "\
                            "(out of %i)" % \
                            (len(test_embeddings["chars"]), len(new_test_chars))

        data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
        for iSentence, osentence in enumerate(data,1):
            sentence = deepcopy(osentence)
            self.feature_extractor.Init(options)
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)

            scores, exprs = self.__evaluate(conll_sentence, True)
            if self.proj:
                heads = decoder.parse_proj(scores)
                # LATTICE solution to multiple roots, see
                # https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
                ## handle the multiple-roots problem
                rootHead = [head for head in heads if head == 0]
                if len(rootHead) != 1:
                    print "sentence has multiple roots; attaching the other roots to the first one"
                    rootHead = [seq for seq, head in enumerate(heads) if head == 0]
                    for seq in rootHead[1:]:
                        heads[seq] = rootHead[0]
                ## end of multi-root handling

            else:
                heads = chuliu_edmonds_one_root(scores.T)

            for entry, head in zip(conll_sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'

            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                    conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence
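The multi-root repair above can be exercised on its own. A minimal sketch, assuming (as the projective decoder's output suggests) that position 0 is the artificial ROOT with head -1 and that a head value of 0 marks attachment to ROOT:

heads = [-1, 0, 1, 0, 3]    # tokens 1 and 3 both claim ROOT as head
roots = [seq for seq, head in enumerate(heads) if head == 0]
for seq in roots[1:]:       # keep the first root, reattach the others to it
    heads[seq] = roots[0]
print(heads)                # [-1, 0, 1, 1, 3]: a single root remains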
Example #5
    def Predict(self, treebanks, datasplit, options):
        reached_max_swap = 0
        char_map = {}
        if options.char_map_file:
            char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        print "Collecting test data vocab"
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)
        # get external embeddings for the set of words and chars in the test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda:{})
        if options.word_emb_size > 0:
            new_test_words = set(test_words) - self.feature_extractor.words.viewkeys()
            print "Number of OOV word types at test time: %i (out of %i)"%(len(new_test_words),len(test_words))
            if len(new_test_words) > 0: # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    test_embeddings["words"].update(utils.get_external_embeddings(options,lang,new_test_words))
                if len(test_langs) > 1 and test_embeddings["words"]:
                    print "External embeddings found for %i words (out of %i)"%(len(test_embeddings["words"]),len(new_test_words))
        if options.char_emb_size > 0:
            new_test_chars = set(test_chars) - self.feature_extractor.chars.viewkeys()
            print "Number of OOV char types at test time: %i (out of %i)"%(len(new_test_chars),len(test_chars))
            if len(new_test_chars) > 0:
                for lang in test_langs:
                    test_embeddings["chars"].update(utils.get_external_embeddings(options,lang,new_test_chars,chars=True))
                if len(test_langs) > 1 and test_embeddings["chars"]:
                    print "External embeddings found for %i chars (out of %i)"%(len(test_embeddings["chars"]),len(new_test_chars))

        ts = time()
        data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
        for iSentence, osentence in enumerate(data,1):
            sentence = deepcopy(osentence)
            reached_swap_for_i_sentence = False
            max_swap = 2*len(sentence)
            iSwap = 0
            self.feature_extractor.Init(options)
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                if not self.recursive_composition:
                    root.lstms += [self.feature_extractor.paddingVec for _ in range(self.nnvecs - hoffset)]
                else:
                    root.lstms += [root.vec]
                    root.lstm = None #only necessary for treeLSTM case
                    root.composed_rep = root.vec.value()

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(chain(*(scores if iSwap < max_swap else scores[:3])), key=itemgetter(2))
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print "reached max swap in %d out of %d sentences"%(reached_max_swap, iSentence)
                self.apply_transition(best,stack,buf,hoffset)
                if best[1] == SWAP:
                    iSwap += 1

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
            oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
                if self.recursive_composition:
                    tok_o.composed_rep = tok.composed_rep
            yield osentence

            dy.renew_cg()

        print "Total prediction time: %.2fs"%(time()-ts)
Example #6
def run(om, options, i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            path_is_dir = True
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir, options.shareWordLookup,
                options.shareCharLookup)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if options.continueModel is not None:
            parser.Load(options.continueModel)

        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):

            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]

            if epoch == options.epochs:  # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs  # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(
                    outdir, "barchybrid.model" + str(best_epoch))
                model_file = os.path.join(outdir, "barchybrid.model")
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

    else:  # prediction mode

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name)

            print 'Finished predicting'
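The model selection driving the final copy to barchybrid.model reduces to tracking the best (epoch, score) pair seen on dev. A standalone sketch with illustrative scores:

dev_best = [0, -1.0]        # [best epoch, best dev LAS]
for epoch, score in [(1, 71.3), (2, 74.0), (3, 73.1)]:
    if score > dev_best[1]:
        dev_best = [epoch, score]
print(dev_best)             # [2, 74.0]: barchybrid.model2 would be copied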
Example #7
def run(om,options,i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict: # training

        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1

        if not continueTraining:
            print 'Preparing vocab'
            if options.multiling:
                path_is_dir = True
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                    om.languages, path_is_dir, options.shareWordLookup,
                    options.shareCharLookup)

            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)

            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'w') as paramsfp:
                print 'Saving params to ' + paramsfile
                pickle.dump((words, w2i, pos, rels, cpos, langs,
                             options, ch), paramsfp)
                print 'Finished collecting vocab'
        else:
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print 'Loading params from ' + paramsfile
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(paramsfp)
                print 'Finished loading vocab'

        max_epochs = options.first_epoch + options.epochs
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, options)

        if continueTraining:
            if not fineTune: 
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print 'Fine-tuning model for another', max_epochs - options.first_epoch + 1, 'epochs'

            parser.Load(trainedModel)
            

        best_multi_las = -1
        best_multi_epoch = 0
        
        if continueTraining:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'a', encoding='utf-8')
        else:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'w', encoding='utf-8')
                
        for epoch in xrange(options.first_epoch, max_epochs + 1):

            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id,options.max_sentences))

            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev] # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs,"dev")
                    pred = list(parser.Predict(devdata))

                    if len(pred)>0:
                        utils.write_conll_multiling(pred,pred_langs)
                    else:
                        print "Warning: prediction empty"
                    
                    if options.pred_eval:
                        total_las = 0
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            las_score = utils.evaluate(lang.dev_gold, lang.outfilename,om.conllu)
                            total_las += las_score
                            train_stats.write(unicode('Dev LAS ' + lang.name + ': ' + str(las_score) + '\n'))
                        if options.model_selection:
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch 

                else: # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            las_score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            train_stats.write(unicode('Dev LAS ' + cur_treebank.name + ': ' + str(las_score) + '\n'))
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]

            if epoch == max_epochs: # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        best_epoch = cur_treebank.dev_best[0] # will be final epoch by default if model selection not on for this treebank
                        if cur_treebank.model_selection:
                            print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(outdir,"barchybrid.model.tmp")
                model_file = os.path.join(outdir,"barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir,"barchybrid.tuned.model")
                print "Best epoch: " + str(best_epoch)
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file,model_file)

        train_stats.close()

    else:  # prediction mode

        # import pdb;pdb.set_trace()
        eval_type = options.evaltype
        print "Eval type: ", eval_type
        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile

        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile

        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold) + '-tuned') for l in om.languages] 
            else:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold)) for l in om.languages] 
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile))

        if not options.extract_vectors:
            prefix = None


        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, stored_opt)

            if options.fineTune:
                options.model = options.model.replace('.model', '.tuned.model')
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, eval_type)
            else:
                testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, eval_type + "-" + l.outfilename)
                pred = list(parser.Predict(testdata, prefix))
                utils.write_conll_multiling(pred,om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(outdir, eval_type + "-" + cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata, prefix))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" %(score, l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" %(score,cur_treebank.name)

            print 'Finished predicting'
Example #8
    def Predict(self, treebanks, datasplit, options):
        reached_max_swap = 0
        char_map = {}
        if options.char_map_file:
            char_map_fh = open(options.char_map_file, encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(
            treebanks, datasplit, char_map)

        # get external embeddings for the set of words and chars in the
        # test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda: {})
        if options.word_emb_size > 0 and options.ext_word_emb_file:
            new_test_words = \
                set(test_words) - self.feature_extractor.words.keys()

            print("Number of OOV word types at test time: %i (out of %i)" %
                  (len(new_test_words), len(test_words)))

            if len(new_test_words) > 0:
                # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=new_test_words)
                    test_embeddings["words"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["words"]:
                    print("External embeddings found for %i words "\
                          "(out of %i)" % \
                          (len(test_embeddings["words"]), len(new_test_words)))

        if options.char_emb_size > 0:
            new_test_chars = \
                set(test_chars) - self.feature_extractor.chars.keys()
            print("Number of OOV char types at test time: %i (out of %i)" %
                  (len(new_test_chars), len(test_chars)))

            if len(new_test_chars) > 0:
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=new_test_chars,
                        chars=True)
                    test_embeddings["chars"].update(embeddings)
                if len(test_langs) > 1 and test_embeddings["chars"]:
                    print("External embeddings found for %i chars "\
                          "(out of %i)" % \
                          (len(test_embeddings["chars"]), len(new_test_chars)))

        data = utils.read_conll_dir(treebanks, datasplit, char_map=char_map)
        for iSentence, osentence in enumerate(data, 1):
            sentence = deepcopy(osentence)
            reached_swap_for_i_sentence = False
            max_swap = 2 * len(sentence)
            iSwap = 0
            self.feature_extractor.Init(options)
            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False,
                                                     options, test_embeddings)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)

            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                #empty = dy.zeros(2*options.lstm_output_size)
                root.lstms = [root.vec] if self.headFlag else []
                root.lstms += [root.vec for _ in range(self.nnvecs - hoffset)]
                root.relation = root.relation if root.relation in self.irels else 'runk'

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, False)
                best = max(
                    chain(*(scores if iSwap < max_swap else scores[:3])),
                    key=itemgetter(2))
                if iSwap == max_swap and not reached_swap_for_i_sentence:
                    reached_max_swap += 1
                    reached_swap_for_i_sentence = True
                    print("reached max swap in %d out of %d sentences" %
                          (reached_max_swap, iSentence))
                self.apply_transition(best, stack, buf, hoffset)
                if best[1] == SWAP:
                    iSwap += 1

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [
                entry for entry in osentence
                if isinstance(entry, utils.ConllEntry)
            ]
            oconll_sentence = oconll_sentence[1:] + [oconll_sentence[0]]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence
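The OOV bookkeeping at the top of each Predict variant is a plain set difference: only word types seen at test time but absent from the training vocabulary need to be looked up in the external embedding file. A minimal sketch, with train_vocab standing in for self.feature_extractor.words:

train_vocab = {'the': 0, 'cat': 1, 'sat': 2}   # word -> index from training
test_words = ['the', 'cat', 'purred']
new_test_words = set(test_words) - set(train_vocab)
print(new_test_words)                          # {'purred'}: the only OOV type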
Example #9
def run(om, options, i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        durations = []
        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):

            print 'Starting epoch ' + str(epoch)
            start_time = time.time()

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            if not options.overwrite_model:
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                if options.overwrite_model:
                                    print "Overwriting model due to higher dev score"
                                    model_file = os.path.join(
                                        cur_treebank.outdir, options.model)
                                    parser.Save(model_file)

            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
                print 'Deadline in %.1f seconds' % (options.deadline - now)
                print 'ETA of next epoch in %.1f seconds' % (eta - now)
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:
                # no deadline
                exceeds_deadline = False

            if exceeds_deadline or epoch == options.epochs:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                if not options.overwrite_model:
                    bestmodel_file = os.path.join(
                        outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print "Copying " + bestmodel_file + " to " + model_file
                    copyfile(bestmodel_file, model_file)

            if exceeds_deadline and epoch < options.epochs:
                print 'Leaving epoch loop early to avoid exceeding deadline'
                break

    else:  # prediction mode

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name)

            print 'Finished predicting'
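The deadline handling in this variant estimates the next epoch's finish time pessimistically, from the slowest of the last five epochs, and stops early rather than overrun. A standalone sketch with illustrative durations (options.deadline is a wall-clock timestamp):

import time

durations = [130.0, 128.5, 140.2, 126.9, 133.3]   # seconds per past epoch
deadline = time.time() + 120.0                    # e.g. two minutes from now
eta = time.time() + max(durations[-5:])           # pessimistic next-epoch ETA
exceeds_deadline = eta > deadline
print(exceeds_deadline)                           # True: leave the loop early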
Example #10
def run(experiment,options):
    if options.graph_based:
        from mstlstm import MSTParserLSTM as Parser
        print('Working with a graph-based parser')
    else:
        from arc_hybrid import ArcHybridLSTM as Parser
        print('Working with a transition-based parser')

    if not options.predict: # training

        paramsfile = os.path.join(experiment.outdir, options.params)

        if not options.continueTraining:
            print('Preparing vocab')
            vocab = utils.get_vocab(experiment.treebanks,"train")
            print('Finished collecting vocab')

            with open(paramsfile, 'wb') as paramsfp:
                print('Saving params to ' + paramsfile)
                pickle.dump((vocab, options), paramsfp)

                print('Initializing the model')
                parser = Parser(vocab, options)
        else:  #continue
            if options.continueParams:
                paramsfile = options.continueParams
            with open(paramsfile, 'rb') as paramsfp:
                stored_vocab, stored_options = pickle.load(paramsfp)
                print('Initializing the model:')
                parser = Parser(stored_vocab, stored_options)

            parser.Load(options.continueModel)

        dev_best = [options.epochs,-1.0] # best epoch, best score

        for epoch in range(options.first_epoch, options.epochs+1):

            print('Starting epoch ' + str(epoch))
            traindata = list(utils.read_conll_dir(experiment.treebanks, "train", options.max_sentences))
            parser.Train(traindata,options)
            print('Finished epoch ' + str(epoch))

            model_file = os.path.join(experiment.outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data

                # not all treebanks necessarily have dev data
                pred_treebanks = [treebank for treebank in experiment.treebanks if treebank.pred_dev]
                if pred_treebanks:
                    for treebank in pred_treebanks:
                        treebank.outfilename = os.path.join(treebank.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print("Predicting on dev data for " + treebank.name)
                    pred = list(parser.Predict(pred_treebanks,"dev",options))
                    utils.write_conll_multiling(pred,pred_treebanks)

                    if options.pred_eval: # evaluate the prediction against gold data
                        mean_score = 0.0
                        for treebank in pred_treebanks:
                            score = utils.evaluate(treebank.dev_gold,treebank.outfilename,options.conllu)
                            print("Dev score %.2f at epoch %i for %s"%(score,epoch,treebank.name))
                            mean_score += score
                        if len(pred_treebanks) > 1: # multiling case
                            mean_score = mean_score/len(pred_treebanks)
                            print("Mean dev score %.2f at epoch %i"%(mean_score,epoch))
                        if options.model_selection:
                            if mean_score > dev_best[1]:
                                dev_best = [epoch,mean_score] # update best dev score
                            # hack to print the word "mean" if the dev score is an average
                            mean_string = "mean " if len(pred_treebanks) > 1 else ""
                            print("Best %sdev score %.2f at epoch %i"%(mean_string,dev_best[1],dev_best[0]))


            # at the last epoch choose which model to copy to barchybrid.model
            if epoch == options.epochs:
                bestmodel_file = os.path.join(experiment.outdir,"barchybrid.model" + str(dev_best[0]))
                model_file = os.path.join(experiment.outdir,"barchybrid.model")
                print("Copying " + bestmodel_file + " to " + model_file)
                copyfile(bestmodel_file,model_file)
                best_dev_file = os.path.join(experiment.outdir,"best_dev_epoch.txt")
                with open(best_dev_file, 'w') as fh:
                    print("Writing best scores to: " + best_dev_file)
                    if len(experiment.treebanks) == 1:
                        fh.write("Best dev score %s at epoch %i\n"%(dev_best[1],dev_best[0]))
                    else:
                        fh.write("Best mean dev score %s at epoch %i\n"%(dev_best[1],dev_best[0]))

    else:  # prediction mode

        params = os.path.join(experiment.modeldir,options.params)
        print('Reading params from ' + params)
        with open(params, 'rb') as paramsfp:
            stored_vocab, stored_opt = pickle.load(paramsfp)

            # we need to update/add certain options based on new user input
            utils.fix_stored_options(stored_opt,options)

            parser = Parser(stored_vocab, stored_opt)
            model = os.path.join(experiment.modeldir, options.model)
            parser.Load(model)

            ts = time.time()

            for treebank in experiment.treebanks:
                if options.predict_all_epochs: # name outfile after epoch number in model file
                    try:
                        m = re.search(r'(\d+)$', options.model)
                        epoch = m.group(1)
                        treebank.outfilename = 'dev_epoch_%s.conllu'%epoch
                    except AttributeError:
                        raise Exception("No epoch number found in model file (e.g. barchybrid.model22)")
                if not treebank.outfilename:
                    treebank.outfilename = 'out' + ('.conll' if not options.conllu else '.conllu')
                treebank.outfilename = os.path.join(treebank.outdir, treebank.outfilename)

            pred = list(parser.Predict(experiment.treebanks,"test",stored_opt))
            utils.write_conll_multiling(pred,experiment.treebanks)

            te = time.time()

            if options.pred_eval:
                for treebank in experiment.treebanks:
                    print("Evaluating on " + treebank.name)
                    score = utils.evaluate(treebank.test_gold,treebank.outfilename,options.conllu)
                    print("Obtained LAS F1 score of %.2f on %s" %(score,treebank.name))

            print('Finished predicting')
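With predict_all_epochs set, Example #10 names each output file after the epoch number read from the trailing digits of the model filename; the AttributeError branch fires when the search finds none. A standalone sketch:

import re

model = 'barchybrid.model22'                # illustrative model filename
m = re.search(r'(\d+)$', model)             # None if no trailing digits
print('dev_epoch_%s.conllu' % m.group(1))   # dev_epoch_22.conllu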
Example #11
def run(om, options, i):
    if options.multi_monoling:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        for epoch in xrange(options.first_epoch - 1,
                            options.first_epoch - 1 + options.epochs):
            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.drop_proj, options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))

            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(
                    utils.read_conll(conllFP, options.drop_proj,
                                     cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(
                        utils.read_conll(conllFP, False, cur_treebank.iso_id))
                else:
                    tot_sen = len(traindata)
                    #take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        # tee the iterator: one copy to predict on, one to write out as gold
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir,
                                                  'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None

            print 'Starting epoch', epoch
            parser.Train(traindata)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(
                        l.outdir, 'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, 'dev_epoch_' + str(epoch + 1) +
                    ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)

            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename,
                               om.conllu)

            print 'Finished predicting dev'
            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else:  # prediction mode
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modelDir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = enumerate(utils.read_conll_dir(
                    om.languages, "test"))
            else:
                conllFP = open(cur_treebank.testfile, 'r')
                testdata = enumerate(
                    utils.read_conll(conllFP, False, cur_treebank.iso_id))

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.predEval:
                if options.multiling:
                    for l in om.languages:
                        utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                else:
                    utils.evaluate(cur_treebank.test_gold,
                                   cur_treebank.outfilename, om.conllu)

            print 'Finished predicting test in %.2fs' % (te - ts)
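The fallback dev split in Example #11 (used when a treebank has no dev file but more than 1000 training sentences) shuffles the training data, peels off roughly 5%, and tees the resulting iterator so the same sentences can both be predicted on and written out as gold. A standalone sketch on stand-in data:

import itertools
import random

traindata = ['sent%d' % n for n in range(2000)]    # stand-in sentences
random.shuffle(traindata)
dev_len = int(0.05 * len(traindata))               # 100 dev sentences
devdata, dev_gold = itertools.tee(traindata[:dev_len])
traindata = traindata[dev_len:]
print((len(traindata), dev_len))                   # (1900, 100)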