Example #1
    def _create_vocab(self):
        assert self.split == 'train', "Vocabulary can ONLY be built from the train set"

        tokenizer = TweetTokenizer(preserve_case=False)

        c = Counter()
        w2i = OrderedDict()
        i2w = OrderedDict()
        tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for i, st in enumerate(tokens):
            i2w[i] = st
            w2i[st] = i

        with open(self.get_path('src-' + self.split + '.txt'), 'r') as file:
            for line in file:
                c.update(tokenizer.tokenize(line))

        if self.mt:
            with open(self.get_path('tgt-' + self.split + '.txt'),
                      'r') as file:
                for line in file:
                    c.update(tokenizer.tokenize(line))

        # Collection of the vocabulary and its counts
        vocab_counts = utils.vocab(c)

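        # Add words occurring more than min_occ times; the special tokens above already have fixed indices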
        for i, (word, counts) in enumerate(vocab_counts):
            if counts > self.min_occ and word not in tokens:
                i2w[len(w2i)] = word
                w2i[word] = len(w2i)

        assert len(w2i) == len(i2w)
        vocab = dict(w2i=w2i, i2w=i2w)

        with io.open(self.get_path('vocab.json'), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self.load_vocab()
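Example #2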
                score = float(poslasCount) * 100 / count
                if score >= highestScore:
                    parser.Save(
                        os.path.join(options.output,
                                     os.path.basename(options.model)))
                    highestScore = score

                print "POS&LAS of the previous saved model: %.2f" % (
                    highestScore)

        else:
            ner_epoch = 1
            dep_epoch = 1
            print 'Extracting vocabulary'
            wordsdep, w2idep, c2idep, posdep, relsdep, capsdep = utils.vocab(
                options.conll_train)
            wordsner, w2iner, c2iner, posner, relsner, capsner = utils.vocab_ner(
                options.conll_trainner)
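            # Merge the dependency and NER vocabularies: word counters and character maps are combined,
            # while POS tags come from the NER data and relations/capitalisation info from the dependency data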
            words, c2i, pos, rels, caps = merge_counters(
                wordsdep,
                wordsner), merge_c2i_dicts(c2idep,
                                           c2iner), posner, relsdep, capsdep
            w2i = {w: i for i, w in enumerate(words.keys())}
            with open(os.path.join(options.output, options.params),
                      'w') as paramsfp:
                pickle.dump((words, w2i, c2i, pos, rels, options), paramsfp)

            #print 'Initializing joint model'
            parser = learner.jPosDepLearner(words, pos, rels, w2i, c2i, caps,
                                            options)
Example #3
def run(om, options, i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        durations = []
        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):

            print 'Starting epoch ' + str(epoch)
            start_time = time.time()

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            if not options.overwrite_model:
                model_file = os.path.join(outdir, options.model + str(epoch))
                parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]
                                if options.overwrite_model:
                                    print "Overwriting model due to higher dev score"
                                    model_file = os.path.join(
                                        cur_treebank.outdir, options.model)
                                    parser.Save(model_file)

            if options.deadline:
                # keep track of duration of training+eval
                now = time.time()
                duration = now - start_time
                durations.append(duration)
                # estimate when next epoch will finish
                last_five_durations = durations[-5:]
                eta = time.time() + max(last_five_durations)
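                # pessimistic estimate: assume the next epoch takes as long as the slowest of the last five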
                print 'Deadline in %.1f seconds' % (options.deadline - now)
                print 'ETA of next epoch in %.1f seconds' % (eta - now)
                # does it exceed the deadline?
                exceeds_deadline = eta > options.deadline
            else:
                # no deadline
                exceeds_deadline = False

            if exceeds_deadline or epoch == options.epochs:
                # at the last epoch copy the best model to barchybrid.model
                if not options.model_selection:
                    # model selection off completely (for example multilingual case)
                    # --> take the final epoch, i.e. the current epoch
                    best_epoch = epoch
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                if not options.overwrite_model:
                    bestmodel_file = os.path.join(
                        outdir, "barchybrid.model" + str(best_epoch))
                    model_file = os.path.join(outdir, "barchybrid.model")
                    print "Copying " + bestmodel_file + " to " + model_file
                    copyfile(bestmodel_file, model_file)

            if exceeds_deadline and epoch < options.epochs:
                print 'Leaving epoch loop early to avoid exceeding deadline'
                break


    else:  # predicting

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name)

            print 'Finished predicting'
Example #4
import os
import sys
sys.path.insert(0, 'src')
import utils
import jamo as jpack

jamos_train, j2i_train, chars_train, c2i_train, words_train, w2i_train, pos_train, rels_train = utils.vocab(
    sys.argv[1])
jamos_dev, j2i_dev, chars_dev, c2i_dev, words_dev, w2i_dev, pos_dev, rels_dev = utils.vocab(
    sys.argv[2])

oov_word = 0
for word in words_dev:
    if not word in words_train:
        oov_word += 1
print 'OOV word: ', oov_word, ' / ', len(
    words_dev), ' ', float(oov_word) / len(words_dev) * 100

hangul_chars_train = {}
for char in chars_train:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_train[char] = True
hangul_chars_dev = {}
for char in chars_dev:
    if len(jpack.decompose(char)) > 1:
        hangul_chars_dev[char] = True

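# Count Hangul characters (those that decompose into more than one jamo) seen in dev but never in train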
oov_char = 0
for char in hangul_chars_dev:
    if not char in hangul_chars_train:
        oov_char += 1
Example #5
        test_res = list(parser.Predict(options.conll_test))
        te = time.time()
        print 'Finished predicting test.', te - ts, 'seconds.'
        utils.write_conll(tespath, test_res)

        if not conllu:
            os.system('perl conll/eval.pl -g ' + options.conll_test + ' -s ' +
                      tespath + ' > ' + tespath + '.txt')
        else:
            os.system(
                'python conll/evaluation_script/conll17_ud_eval.py -v -w conll/evaluation_script/weights.clas '
                + options.conll_test + ' ' + tespath + ' > ' + tespath +
                '.txt')
    else:
        print 'Preparing vocab'
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params),
                  'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing lstm mstparser:'
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            parser.Train(options.conll_train)
            conllu = (os.path.splitext(
                options.conll_dev.lower())[1] == '.conllu')
            devpath = os.path.join(
Example #6
        if options.conll_dev != "N/A":
            devPredSents = parser.predict(options.conll_dev)

            count = 0

            for idSent, devSent in enumerate(devPredSents):
                conll_devSent = [entry for entry in devSent if isinstance(entry, utils.ConllEntry)]

                for entry in conll_devSent:
                    if entry.id <= 0:
                        continue
                    count += 1

    else:
        print 'Extracting vocabulary'
        c2i = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params), 'w') as paramsfp:
            pickle.dump((c2i, options), paramsfp)
        parser = learner.Learner(c2i, options)

    highestScore = 0.0
    eId = 0
    for epoch in xrange(options.epochs):
        print '\n-----------------\nStarting epoch', epoch + 1

        if epoch % 10 == 0:
            if epoch == 0:
                parser.trainer.restart(learning_rate=0.001)
            elif epoch == 10:
                parser.trainer.restart(learning_rate=0.0005)
Example #7
                      help='Use predicate boolean flag.')
    parser.add_option("--dynet-gpu",
                      action="store_true",
                      dest="--dynet-gpu",
                      default=False,
                      help='Use GPU instead of cpu.')

    (options, args) = parser.parse_args()
    print 'Using external embedding:', options.external_embedding
    from srl import SRLLSTM

    if options.conll_train:
        print 'Preparing vocab'
        print options
        train_data = list(utils.read_conll(options.conll_train))
        words, lemmas, pos, roles, chars = utils.vocab(train_data)
        with open(os.path.join(options.outdir, options.params),
                  'w') as paramsfp:
            pickle.dump((words, lemmas, pos, roles, chars, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm srl:'
        parser = SRLLSTM(words, lemmas, pos, roles, chars, options)

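        # Bucket training sentences by length, presumably so each batch contains similarly sized sentences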
        max_len = max([len(d) for d in train_data])
        min_len = min([len(d) for d in train_data])
        buckets = [list() for i in range(min_len, max_len)]
        for d in train_data:
            buckets[len(d) - min_len - 1].append(d)
        buckets = [x for x in buckets if x != []]
Example #8



    print 'Using external embedding:', options.external_embedding, "textual file:", options.external_embedding_Textual

    if not options.predictFlag:
        # Training
        if not (options.rlFlag or options.rlMostFlag or options.headFlag):
            print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
            sys.exit()

        print 'Preparing vocab'
        if WITHCPOS:
            words, w2i, pos, cpos, GENDER, NUMBER, PERSON, CASE, rels = utils.vocab(options.conll_train, True)
        else:
            words, w2i, pos, rels = utils.vocab(options.conll_train, False)

        #print words
        print pos
        #print cpos

        if WITHCPOS:
            with open(os.path.join(options.output, options.params), 'w') as paramsfp:
                pickle.dump((words, w2i, pos, cpos, GENDER, NUMBER, PERSON, CASE, rels, options), paramsfp)
        else:
            with open(os.path.join(options.output, options.params), 'w') as paramsfp:
                pickle.dump((words, w2i, pos, rels, options), paramsfp)

        print 'Finished collecting vocab'
Example #9
import dynet as dy
import random
from utils import *
import utils
# encoding=utf8
import sys

reload(sys)
sys.setdefaultencoding('utf8')

conll_train = "/Users/huseyinalecakir/NLP_LAB/data/tr_imst-ud-train.conllu"

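# Build the character (c2i) and word (w2i) vocabularies, plus whatever feature inventory this repo's utils.vocab returns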
c2i, w2i, features = utils.vocab(conll_train)

EOS = '<s>'
characters = list("abcdefghijklmnopqrstuvwxyz ")
characters.append(EOS)

int2char = {c2i[i]: i for i in c2i}
char2int = c2i

VOCAB_SIZE = len(c2i)

LSTM_NUM_OF_LAYERS = 2
EMBEDDINGS_SIZE = 128
STATE_SIZE = 256
ATTENTION_SIZE = 64

model = dy.Model()

enc_fwd_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS, EMBEDDINGS_SIZE, STATE_SIZE,
Example #10
        print 'Initializing lstm mstparser:'
        parser = mstlstm.MSTParserLSTM(pos, rels, w2i, chars, stored_opt)
        parser.Load(options.model)
        ts = time.time()
        print 'loading buckets'
        test_buckets = [list()]
        test_data = list(utils.read_conll(open(options.conll_test, 'r')))
        for d in test_data:
            test_buckets[0].append(d)
        print 'parsing'
        test(parser, test_buckets, options.conll_test, options.conll_output)
        te = time.time()
        print 'Finished predicting test.', te - ts, 'seconds.'
    else:
        print 'Preparing vocab'
        w2i, pos, rels, chars = utils.vocab(options.conll_train)
        if not os.path.isdir(options.output): os.mkdir(options.output)
        with open(os.path.join(options.output, options.params),
                  'w') as paramsfp:
            pickle.dump((w2i, pos, rels, chars, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing lstm mstparser:'
        parser = mstlstm.MSTParserLSTM(pos, rels, w2i, chars, options)
        best_acc = -float('inf')
        t, epoch = 0, 1
        train_data = list(utils.read_conll(open(options.conll_train, 'r')))
        max_len = max([len(d) for d in train_data])
        min_len = min([len(d) for d in train_data])
        buckets = [list() for i in range(min_len, max_len)]
        for d in train_data:
Example #11
    parser.add_option("--usehead", action="store_true", dest="headFlag", default=False)
    parser.add_option("--userlmost", action="store_true", dest="rlFlag", default=False)
    parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False)
    parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
    parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512)

    (options, args) = parser.parse_args()
    print 'Using external embedding:', options.external_embedding

    if not options.predictFlag:
        if not (options.rlFlag or options.rlMostFlag or options.headFlag):
            print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
            sys.exit()

        print 'Preparing vocab'
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, w2i, options)

        for epoch in xrange(options.epochs):
            print 'Starting epoch', epoch
            parser.Train(options.conll_train)
            conllu = (os.path.splitext(options.conll_dev.lower())[1] == '.conllu')
            devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + ('.conll' if not conllu else '.conllu'))
            utils.write_conll(devpath, parser.Predict(options.conll_dev))
Example #12
def train():
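    # Assumption: utils.vocab() called with no arguments loads a previously built vocabulary from disk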
    vocab = utils.vocab()
    textcnn = TextCNN(sequence_length=100, num_classes=19, vocab_size=len(vocab), embedding_size=100,
                      filter_sizes=[3, 5, 7, 9], num_filters=64, pool_k=1, trainable=True)
    textcnn.fit(['yq_train.txt', 'yq_valid.txt', 'yq_test.txt'], 30, dropout=0.5, save_path='./model/yq/lncnn',
                everyEpochNum=500, batch_size=32)
Example #13
                      action="store_true",
                      dest="predictFlag",
                      default=False)
    parser.add_option("--dynet-mem",
                      type="int",
                      dest="dynet_mem",
                      default=1000)  # Doesn't work; the memory limit must be given on the command line

    (options, args) = parser.parse_args()

    if not options.predictFlag:  # Training
        if not (options.rlFlag or options.rlMostFlag or options.headFlag):
            print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
            sys.exit()

        jamos, j2i, chars, c2i, words, w2i, pos, rels = utils.vocab(
            options.conll_train)

        print '----------------------------'
        print len(words), 'wtypes,', len(chars), 'ctypes,', len(
            jamos), 'jtypes'
        print 'Use word?', not options.noword
        print 'Use char?', options.usechar
        print 'Use jamo?', options.usejamo
        print 'word dim:', options.wembedding_dims
        print 'char dim:', options.cembedding_dims
        print 'pos dim:', options.pembedding_dims
        print '----------------------------'

        external_embedding = {}
        if options.external_embedding is not None:
            with open(options.external_embedding,
Example #14
        path_amrs_dev = args.amr_dev+".graphs"
        
        with codecs.open(path_amrs,'rb') as f:
            amr_graphs = pickle.load(f)

        with codecs.open(path_amrs_dev,'rb') as f:
            dev_amr_graphs = pickle.load(f)
            
        with codecs.open(path_amr_templates,'rb') as ft:
            amr_graph_templates = pickle.load(ft)
        
        with codecs.open(path_multiword_templates,'rb') as ft:
            amr_multiword_graph_templates = pickle.load(ft)          

        
        words,lemmas, pos,rels, nodes, entities,deps = utils.vocab(amr_graphs)
    
        _, _, _, dev_rels, dev_nodes,_, _ =  utils.vocab(dev_amr_graphs)
                        
        with open(os.path.join(args.output, args.params), 'wb') as paramsfp:
            pickle.dump((words, lemmas, pos, rels, nodes, entities,deps,args), paramsfp)

        parser = mlp.PerceptronAMR(words,pos,rels,nodes,entities,deps,args.external_embedding, 
                                               None
                                               #args.pos_external_embedding
                                               ,None,None, None, 
                                               amr_graph_templates,
                                               amr_multiword_graph_templates,
                                               None,
                                               args)
    
Example #15
def run(om, options, i):
    outdir = options.output
    if options.multi_monoling:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir
        modelDir = cur_treebank.modelDir
    else:
        outdir = options.output
        modelDir = om.languages[i].modelDir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.include:
        cur_treebank = om.treebank

    if not options.predictFlag:

        print 'Preparing vocab'
        if options.multiling:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                om.languages, path_is_dir=True)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        with open(os.path.join(outdir, options.params), 'w') as paramsfp:
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)

        for epoch in xrange(options.first_epoch - 1,
                            options.first_epoch - 1 + options.epochs):
            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.drop_proj, options.maxCorpus))
                devdata = enumerate(utils.read_conll_dir(om.languages, "dev"))

            else:
                conllFP = open(cur_treebank.trainfile, 'r')
                traindata = list(
                    utils.read_conll(conllFP, options.drop_proj,
                                     cur_treebank.iso_id))
                if os.path.exists(cur_treebank.devfile):
                    conllFP = open(cur_treebank.devfile, 'r')
                    devdata = enumerate(
                        utils.read_conll(conllFP, False, cur_treebank.iso_id))
                else:
                    tot_sen = len(traindata)
                    #take a bit less than 5% of train sentences for dev
                    if tot_sen > 1000:
                        import random
                        random.shuffle(traindata)
                        dev_len = int(0.05 * tot_sen)
                        # tee gives two copies of the held-out slice: one to predict on, one to write as gold
                        devdata, dev_gold = itertools.tee(traindata[:dev_len])
                        devdata = enumerate(devdata)
                        dev_gold_f = os.path.join(outdir,
                                                  'dev_gold' + '.conllu')
                        utils.write_conll(dev_gold_f, dev_gold)
                        cur_treebank.dev_gold = dev_gold_f
                        traindata = traindata[dev_len:]
                    else:
                        devdata = None

            print 'Starting epoch', epoch
            parser.Train(traindata)

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(
                        l.outdir, 'dev_epoch_' + str(epoch + 1) + '.conllu')
                pred = list(parser.Predict(devdata))
                if len(pred) > 0:
                    utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, 'dev_epoch_' + str(epoch + 1) +
                    ('.conll' if not om.conllu else '.conllu'))
                if devdata:
                    pred = list(parser.Predict(devdata))
                    utils.write_conll(cur_treebank.outfilename, pred)

            if options.multiling:
                for l in om.languages:
                    utils.evaluate(l.dev_gold, l.outfilename, om.conllu)
            else:
                utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename,
                               om.conllu)

            print 'Finished predicting dev'
            parser.Save(os.path.join(outdir, options.model + str(epoch + 1)))

    else:  # predicting
        params = os.path.join(modelDir, options.params)
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modelDir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = enumerate(utils.read_conll_dir(
                    om.languages, "test"))

            if not options.multiling:
                conllFP = open(cur_treebank.testfile, 'r')
                testdata = enumerate(
                    utils.read_conll(conllFP, False, cur_treebank.iso_id))

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                cur_treebank.outfilename = os.path.join(
                    outdir, cur_treebank.outfilename)
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.predEval:
                if options.multiling:
                    for l in om.languages:
                        utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                else:
                    utils.evaluate(cur_treebank.test_gold,
                                   cur_treebank.outfilename, om.conllu)

            print 'Finished predicting test', te - ts
Example #16
import sys
import utils
# Find the % of OOVs in a dev or test dataset
# Usage: python OOVs.py path_to_train path_to_dev/test
train = sys.argv[1]
dev_test = sys.argv[2]
words, w2i, c2i, pos, rels = utils.vocab(train)
words_dev, w2i_dev, c2i_dev, pos_dev, rels_dev = utils.vocab(dev_test)

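# Count word types in the dev/test vocabulary that never appear in the training vocabulary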
OOVs = 0
for k in words_dev:
    if k not in words:
        OOVs += 1

print str(format(float(OOVs) /
                 (len(words_dev)) * 100, '.2f')) + "% OOVs on test/dev dataset"
Example #17
def run(om,options,i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict: # training

        fineTune = False
        start_from = 1
        if options.continueModel is None:
            continueTraining = False
        else:
            continueTraining = True
            trainedModel = options.continueModel
            if options.fineTune:
                fineTune = True
            else:
                start_from = options.first_epoch - 1

        if not continueTraining:
            print 'Preparing vocab'
            if options.multiling:
                path_is_dir = True
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,\
                                                                     path_is_dir,
                                                                     options.shareWordLookup,\
                                                                     options.shareCharLookup)

            else:
                words, w2i, pos, cpos, rels, langs, ch = utils.vocab(cur_treebank.trainfile)

            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'w') as paramsfp:
                print 'Saving params to ' + paramsfile
                pickle.dump((words, w2i, pos, rels, cpos, langs,
                             options, ch), paramsfp)
                print 'Finished collecting vocab'
        else:
            paramsfile = os.path.join(outdir, options.params)
            with open(paramsfile, 'rb') as paramsfp:
                print 'Load params from ' + paramsfile
                words, w2i, pos, rels, cpos, langs, options, ch = pickle.load(paramsfp)
                print 'Finished loading vocab'

        max_epochs = options.first_epoch + options.epochs
        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, options)

        if continueTraining:
            if not fineTune: 
                # continue training only, not doing fine tuning
                options.first_epoch = start_from + 1
                max_epochs = options.epochs
            else:
                # fine tune model
                options.first_epoch = options.epochs + 1
                max_epochs = options.first_epoch + 15
                print 'Fine tune model for another', max_epochs - options.first_epoch, 'epochs'

            parser.Load(trainedModel)
            

        best_multi_las = -1
        best_multi_epoch = 0
        
        if continueTraining:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'a', encoding='utf-8')
        else:
            train_stats = codecs.open(os.path.join(outdir, 'train.stats'), 'w', encoding='utf-8')
                
        for epoch in xrange(options.first_epoch, max_epochs + 1):

            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(utils.read_conll_dir(om.languages, "train", options.max_sentences))
            else:
                traindata = list(utils.read_conll(cur_treebank.trainfile, cur_treebank.iso_id,options.max_sentences))

            parser.Train(traindata)
            train_stats.write(unicode('Epoch ' + str(epoch) + '\n'))
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + '.tmp')
            parser.Save(model_file)

            if options.pred_dev: # use the model to predict on dev data
                if options.multiling:
                    pred_langs = [lang for lang in om.languages if lang.pred_dev] # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs,"dev")
                    pred = list(parser.Predict(devdata))

                    if len(pred)>0:
                        utils.write_conll_multiling(pred,pred_langs)
                    else:
                        print "Warning: prediction empty"
                    
                    if options.pred_eval:
                        total_las = 0
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            las_score = utils.evaluate(lang.dev_gold, lang.outfilename,om.conllu)
                            total_las += las_score
                            train_stats.write(unicode('Dev LAS ' + lang.name + ': ' + str(las_score) + '\n'))
                        if options.model_selection:
                            if total_las > best_multi_las:
                                best_multi_las = total_las
                                best_multi_epoch = epoch 

                else: # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile, cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(outdir, 'dev_epoch_' + str(epoch) + ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            las_score = utils.evaluate(cur_treebank.dev_gold, cur_treebank.outfilename, om.conllu)
                            if options.model_selection:
                                if las_score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, las_score]
                                    train_stats.write(unicode('Dev LAS ' + cur_treebank.name + ': ' + str(las_score) + '\n'))
                                    

            if epoch == max_epochs: # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    if options.multiling:
                        best_epoch = best_multi_epoch
                    else:
                        best_epoch = cur_treebank.dev_best[0] # will be final epoch by default if model selection not on for this treebank
                        if cur_treebank.model_selection:
                            print "Best dev score of " + str(cur_treebank.dev_best[1]) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(outdir,"barchybrid.model.tmp")
                model_file = os.path.join(outdir,"barchybrid.model")
                if fineTune:
                    model_file = os.path.join(outdir,"barchybrid.tuned.model")
                print "Best epoch: " + str(best_epoch)
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file,model_file)

        train_stats.close()

    else:  # predicting

        # import pdb;pdb.set_trace()
        eval_type = options.evaltype
        print "Eval type: ", eval_type
        if eval_type == "train":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'train')
            else:
                cur_treebank.testfile = cur_treebank.trainfile
                cur_treebank.test_gold = cur_treebank.trainfile

        elif eval_type == "dev":
            if options.multiling:
                for l in om.languages:
                    l.test_gold = l.test_gold.replace('test', 'dev')
            else:
                cur_treebank.testfile = cur_treebank.devfile
                cur_treebank.test_gold = cur_treebank.devfile

        if options.multiling:
            modeldir = options.modeldir
            if options.fineTune:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold) + '-tuned') for l in om.languages] 
            else:
                prefix = [os.path.join(outdir, os.path.basename(l.test_gold)) for l in om.languages] 
        else:
            modeldir = om.languages[i].modeldir
            if options.fineTune:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile)) + '-tuned'
            else:
                prefix = os.path.join(outdir, os.path.basename(cur_treebank.testfile))

        if not options.extract_vectors:
            prefix = None


        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i,
                               ch, stored_opt)

            if options.fineTune:
                options.model = options.model.replace('.model', '.tuned.model')
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, eval_type)
            else:
                testdata = utils.read_conll(cur_treebank.testfile, cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, eval_type + "-" + l.outfilename)
                pred = list(parser.Predict(testdata, prefix))
                utils.write_conll_multiling(pred,om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(outdir, eval_type + "-" + cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(outdir, 'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename, parser.Predict(testdata, prefix))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename, om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" %(score, l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold, cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" %(score,cur_treebank.name)

            print 'Finished predicting'
Example #18
    8: '军事',
    9: '民生',
    10: '时尚',
    11: '娱乐',
    12: '游戏',
    13: '时事',
    14: '股票',
    15: '历史',
    16: '美食',
    17: '房产',
    18: '汽车'
}
app = Flask(__name__)
topK = 100
MODEL_NAME = './model/yq/'
vocab = utils.vocab()
textcnn = TextCNN(sequence_length=100,
                  num_classes=len(label_names),
                  vocab_size=len(vocab),
                  embedding_size=100,
                  filter_sizes=[3, 5, 7, 9],
                  num_filters=64,
                  trainable=False,
                  pool_k=1)
textcnn.load_model(MODEL_NAME)


@app.route('/')
def demo():
    return render_template('index.html')
Example #19
    def __init__(self, new_options):

        print '1. Init Parser'
        print '1-1. Preparing vocab'
        if not new_options.predictFlag:
            if new_options.train_multilingual:
                vocab, w2i, pos, xpos, rels = utils.vocab_multilingual(
                    new_options.conll_train)
#                new_options.xpembedding_dims = 0
            else:
                vocab, w2i, pos, xpos, rels = utils.vocab(
                    new_options.conll_train, new_options.conll_train_language)
                new_options.xpembedding_dims = 0 if len(
                    xpos) < 5 else new_options.xpembedding_dims
            options = new_options
        else:
            with open(new_options.params, 'r') as paramsfp:
                vocab, w2i, pos, xpos, rels, ex_trn, exc_trnd, stored_opt = pickle.load(
                    paramsfp)
                self.extrnd = ex_trn
                self.exctrnd = exc_trnd

            stored_opt.conll_test = new_options.conll_test
            stored_opt.conll_test_language = new_options.conll_test_language
            stored_opt.predictFlag = new_options.predictFlag
            stored_opt.lang_vec_file = new_options.lang_vec_file

            options = stored_opt
        print "     ls it using multilingual embedding?:", options.multilingual_emb
        print '1-1. End of preparing vocab'

        self.model = Model()
        random.seed(1)
        self.trainer = AdamTrainer(self.model)

        self.activations = {
            'tanh': tanh,
            'sigmoid': logistic,
            'relu': rectify,
            'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))
        }
        self.activation = self.activations[options.activation]

        self.blstmFlag = options.blstmFlag
        self.labelsFlag = options.labelsFlag
        self.costaugFlag = options.costaugFlag
        self.bibiFlag = options.bibiFlag
        self.extConcateFlag = options.extConcateFlag

        self.ldims = options.lstm_dims
        self.wdims = options.wembedding_dims
        self.pdims = options.pembedding_dims
        self.xpdims = options.xpembedding_dims

        self.rdims = options.rembedding_dims
        self.layers = options.lstm_layers
        self.wordsCount = vocab
        self.vocab = {word: ind + 3 for word, ind in w2i.iteritems()}
        self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
        self.xpos = {word: ind + 3 for ind, word in enumerate(xpos)}
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.irels = rels

        self.train_multilingual = options.train_multilingual
        self.lang_vec_file = options.lang_vec_file
        self.multilingual_emb = options.multilingual_emb
        self.add_lang_vec = options.add_lang_vec
        self.languageVec_dic = read_languageVec(
            self.lang_vec_file)  ## read language_vec.csv file
        self.landims = len(self.languageVec_dic.values()[0].lang_vec)
        self.conll_test_language = options.conll_test_language
        self.conll_train_language = options.conll_train_language if options.conll_train_language is not None else "Unknown"
        print "     Training language: ", self.conll_train_language
        print "     Load Language vector, Dimensions: ", self.landims

        print "1-2. Load external embedding"
        self.external_embedding, self.edim = None, 0
        if options.predictFlag:
            self.elookup = self.model.add_lookup_parameters(
                (3, 1)
            )  # temporary placeholder for the model; it will be reset by model.load automatically.
            if options.external_embedding is not None:
                self.external_embedding = options.external_embedding
                self.edim = options.edim
        else:
            if options.external_embedding is not None:
                external_embedding_fp = open(options.external_embedding, 'r')
                external_embedding_fp.readline()
                self.external_embedding = {
                    line.split(' ')[0]:
                    [float(f) for f in line.strip().split(' ')[1:]]
                    for line in external_embedding_fp
                }
                external_embedding_fp.close()

                self.edim = len(self.external_embedding.values()[0])
                self.noextrn = [0.0 for _ in xrange(self.edim)]
                self.extrnd = {
                    word: i + 3
                    for i, word in enumerate(self.external_embedding)
                }
                self.elookup = self.model.add_lookup_parameters(
                    (len(self.external_embedding) + 3, self.edim))
                for word, i in self.extrnd.iteritems():
                    self.elookup.init_row(i, self.external_embedding[word])
                self.extrnd['*PAD*'] = 1
                self.extrnd['*INITIAL*'] = 2

                if options.extConcateFlag:
                    print '     Load external embedding. It will be used for an additional embedding', self.edim
                else:
                    self.wdims = self.edim
                    print '     Load external embedding. It will be used for the word dimension vector', self.edim
            else:
                self.elookup = self.model.add_lookup_parameters(
                    (3, 1)
                )  # temporary placeholder for the model; it will be reset by model.load automatically.
                self.extrnd = {}
        print "1-2. End of loading external embedding"

        print "1-3. Load external cluster embedding"
        self.external_cluster_embedding, self.ecdim = None, 0
        if options.predictFlag:
            self.eclookup = self.model.add_lookup_parameters((3, 1))
            if options.external_cluster_embedding is not None:
                self.external_cluster_embedding = options.external_cluster_embedding
                self.ecdim = options.ecdim
        else:
            if options.external_cluster_embedding is not None:
                external_cluster_embedding_fp = open(
                    options.external_cluster_embedding, 'r')
                external_cluster_embedding_fp.readline()
                self.external_cluster_embedding = {
                    line.split(' ')[0]:
                    [float(f) for f in line.strip().split(' ')[1:]]
                    for line in external_cluster_embedding_fp
                }
                external_cluster_embedding_fp.close()

                self.ecdim = len(self.external_cluster_embedding.values()[0])
                self.noexctrn = [0.0 for _ in xrange(self.ecdim)]
                self.exctrnd = {
                    word: i + 3
                    for i, word in enumerate(self.external_cluster_embedding)
                }
                self.eclookup = self.model.add_lookup_parameters(
                    (len(self.external_cluster_embedding) + 3, self.ecdim))
                for word, i in self.exctrnd.iteritems():
                    self.eclookup.init_row(
                        i, self.external_cluster_embedding[word])
                self.exctrnd['*PAD*'] = 1
                self.exctrnd['*INITIAL*'] = 2
                print '     Load external cluster embedding. It will be used for an additional embedding', self.ecdim
            else:
                self.eclookup = self.model.add_lookup_parameters((3, 1))
                self.exctrnd = {}
        print "1-3 End of loading external cluster embedding"

        ### Add language embedding
        if self.add_lang_vec:
            print "Add Language Vector", "language dims: ", self.landims
            self.llookup = self.model.add_lookup_parameters(
                (self.landims + 3, self.landims))
            for key in self.languageVec_dic.keys():
                self.llookup.init_row(
                    self.languageVec_dic.get(key).lang_num,
                    self.languageVec_dic.get(key).lang_vec)
        ## Finish language embedding

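        # Per-token input dimension: word + POS + XPOS embeddings, plus the optional language vector and external embeddings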
        self.dims = self.wdims + self.pdims + self.xpdims + (
            self.landims if self.add_lang_vec else
            0) + (self.edim if options.extConcateFlag else
                  0) + (self.ecdim if self.external_cluster_embedding else 0)
        print "Total dims: ", self.dims, "word dims: ", self.wdims

        if self.bibiFlag:
            self.builders = [
                VanillaLSTMBuilder(1, self.dims, self.ldims, self.model),
                VanillaLSTMBuilder(1, self.dims, self.ldims, self.model)
            ]
            self.bbuilders = [
                VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model)
            ]
        elif self.layers > 0:
            self.builders = [
                VanillaLSTMBuilder(self.layers, self.dims, self.ldims,
                                   self.model),
                VanillaLSTMBuilder(self.layers, self.dims, self.ldims,
                                   self.model)
            ]
        else:
            self.builders = [
                SimpleRNNBuilder(1, self.dims, self.ldims, self.model),
                SimpleRNNBuilder(1, self.dims, self.ldims, self.model)
            ]

        self.hidden_units = options.hidden_units
        self.hidden2_units = options.hidden2_units

        self.vocab['*PAD*'] = 1
        self.pos['*PAD*'] = 1
        self.xpos['*PAD*'] = 1

        self.vocab['*INITIAL*'] = 2
        self.pos['*INITIAL*'] = 2
        self.xpos['*INITIAL*'] = 2

        self.wlookup = self.model.add_lookup_parameters(
            (len(vocab) + 3, self.wdims))
        self.plookup = self.model.add_lookup_parameters(
            (len(pos) + 3, self.pdims))
        self.xplookup = self.model.add_lookup_parameters(
            (len(xpos) + 3, self.xpdims))
        self.rlookup = self.model.add_lookup_parameters(
            (len(rels), self.rdims))

        self.hidLayerFOH = self.model.add_parameters(
            (self.hidden_units, self.ldims * 2))
        self.hidLayerFOM = self.model.add_parameters(
            (self.hidden_units, self.ldims * 2))
        self.hidBias = self.model.add_parameters((self.hidden_units))

        self.hid2Layer = self.model.add_parameters(
            (self.hidden2_units, self.hidden_units))
        self.hid2Bias = self.model.add_parameters((self.hidden2_units))

        self.outLayer = self.model.add_parameters(
            (1, self.hidden2_units
             if self.hidden2_units > 0 else self.hidden_units))

        if self.labelsFlag:
            self.rhidLayerFOH = self.model.add_parameters(
                (self.hidden_units, 2 * self.ldims))
            self.rhidLayerFOM = self.model.add_parameters(
                (self.hidden_units, 2 * self.ldims))
            self.rhidBias = self.model.add_parameters((self.hidden_units))

            self.rhid2Layer = self.model.add_parameters(
                (self.hidden2_units, self.hidden_units))
            self.rhid2Bias = self.model.add_parameters((self.hidden2_units))

            self.routLayer = self.model.add_parameters(
                (len(self.irels), self.hidden2_units
                 if self.hidden2_units > 0 else self.hidden_units))
            self.routBias = self.model.add_parameters((len(self.irels)))

        if not new_options.predictFlag:
            options.edim = self.edim
            options.ecdim = self.ecdim
            with open(os.path.join(new_options.output, new_options.params),
                      'w') as paramsfp:
                pickle.dump((vocab, w2i, pos, xpos, rels, self.extrnd,
                             self.exctrnd, options), paramsfp)
            print 'Finished collecting vocab'
Example #20
        print('Predicting POS and XPOS tags')
        ts = time.time()
        test_res = list(tagger.Predict(conll_sentences, True))
        te = time.time()
        print('Finished in', te - ts, 'seconds.')
        utils.write_conll(tespath, test_res)

    else:

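        # Assumption: ext_vocab restricts the external embedding vocabulary to entries occurring in each data file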
        ext_words_train = utils.ext_vocab(options.conll_train,
                                          options.external_embedding_voc)
        ext_words_dev = utils.ext_vocab(options.conll_dev,
                                        options.external_embedding_voc)

        print('Extracting vocabulary')
        words, w2i, c2i, pos, xpos = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params),
                  'wb') as paramsfp:
            pickle.dump((words, w2i, c2i, pos, xpos, options), paramsfp)

        print('Initializing  model')
        tagger = learner.Affine_tagger(words, pos, xpos, w2i, c2i,
                                       ext_words_train, ext_words_dev, options)

        with open(options.conll_dev, 'r') as conllFP:
            devData = list(utils.read_conll(conllFP, tagger.c2i))

        conll_sentences = []
        for sentence in devData:
            conll_sentence = [
Example #21
def run(om, options, i):

    if options.multiling:
        outdir = options.outdir
    else:
        cur_treebank = om.languages[i]
        outdir = cur_treebank.outdir

    if options.shared_task:
        outdir = options.shared_task_outdir

    if not options.predict:  # training

        print 'Preparing vocab'
        if options.multiling:
            path_is_dir = True
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(om.languages,\
                                                                 path_is_dir,
                                                                 options.shareWordLookup,\
                                                                 options.shareCharLookup)

        else:
            words, w2i, pos, cpos, rels, langs, ch = utils.vocab(
                cur_treebank.trainfile)

        paramsfile = os.path.join(outdir, options.params)
        with open(paramsfile, 'w') as paramsfp:
            print 'Saving params to ' + paramsfile
            pickle.dump((words, w2i, pos, rels, cpos, langs, options, ch),
                        paramsfp)
            print 'Finished collecting vocab'

        print 'Initializing blstm arc hybrid:'
        parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch, options)
        if options.continueModel is not None:
            parser.Load(options.continueModel)

        for epoch in xrange(options.first_epoch,
                            options.first_epoch + options.epochs):

            print 'Starting epoch ' + str(epoch)

            if options.multiling:
                traindata = list(
                    utils.read_conll_dir(om.languages, "train",
                                         options.max_sentences))
            else:
                traindata = list(
                    utils.read_conll(cur_treebank.trainfile,
                                     cur_treebank.iso_id,
                                     options.max_sentences))

            parser.Train(traindata)
            print 'Finished epoch ' + str(epoch)

            model_file = os.path.join(outdir, options.model + str(epoch))
            parser.Save(model_file)

            if options.pred_dev:  # use the model to predict on dev data

                if options.multiling:
                    pred_langs = [
                        lang for lang in om.languages if lang.pred_dev
                    ]  # languages which have dev data on which to predict
                    for lang in pred_langs:
                        lang.outfilename = os.path.join(
                            lang.outdir, 'dev_epoch_' + str(epoch) + '.conllu')
                        print "Predicting on dev data for " + lang.name
                    devdata = utils.read_conll_dir(pred_langs, "dev")
                    pred = list(parser.Predict(devdata))
                    if len(pred) > 0:
                        utils.write_conll_multiling(pred, pred_langs)
                    else:
                        print "Warning: prediction empty"
                    if options.pred_eval:
                        for lang in pred_langs:
                            print "Evaluating dev prediction for " + lang.name
                            utils.evaluate(lang.dev_gold, lang.outfilename,
                                           om.conllu)
                else:  # monolingual case
                    if cur_treebank.pred_dev:
                        print "Predicting on dev data for " + cur_treebank.name
                        devdata = utils.read_conll(cur_treebank.devfile,
                                                   cur_treebank.iso_id)
                        cur_treebank.outfilename = os.path.join(
                            outdir, 'dev_epoch_' + str(epoch) +
                            ('.conll' if not om.conllu else '.conllu'))
                        pred = list(parser.Predict(devdata))
                        utils.write_conll(cur_treebank.outfilename, pred)
                        if options.pred_eval:
                            print "Evaluating dev prediction for " + cur_treebank.name
                            score = utils.evaluate(cur_treebank.dev_gold,
                                                   cur_treebank.outfilename,
                                                   om.conllu)
                            if options.model_selection:
                                if score > cur_treebank.dev_best[1]:
                                    cur_treebank.dev_best = [epoch, score]

            if epoch == options.epochs:  # at the last epoch choose which model to copy to barchybrid.model
                if not options.model_selection:
                    best_epoch = options.epochs  # take the final epoch if model selection off completely (for example multilingual case)
                else:
                    best_epoch = cur_treebank.dev_best[
                        0]  # will be final epoch by default if model selection not on for this treebank
                    if cur_treebank.model_selection:
                        print "Best dev score of " + str(
                            cur_treebank.dev_best[1]
                        ) + " found at epoch " + str(cur_treebank.dev_best[0])

                bestmodel_file = os.path.join(
                    outdir, "barchybrid.model" + str(best_epoch))
                model_file = os.path.join(outdir, "barchybrid.model")
                print "Copying " + bestmodel_file + " to " + model_file
                copyfile(bestmodel_file, model_file)

    else:  # prediction mode

        if options.multiling:
            modeldir = options.modeldir
        else:
            modeldir = om.languages[i].modeldir

        params = os.path.join(modeldir, options.params)
        print 'Reading params from ' + params
        with open(params, 'r') as paramsfp:
            words, w2i, pos, rels, cpos, langs, stored_opt, ch = pickle.load(
                paramsfp)

            parser = ArcHybridLSTM(words, pos, rels, cpos, langs, w2i, ch,
                                   stored_opt)
            model = os.path.join(modeldir, options.model)
            parser.Load(model)

            if options.multiling:
                testdata = utils.read_conll_dir(om.languages, "test")
            else:
                testdata = utils.read_conll(cur_treebank.testfile,
                                            cur_treebank.iso_id)

            ts = time.time()

            if options.multiling:
                for l in om.languages:
                    l.outfilename = os.path.join(outdir, l.outfilename)
                pred = list(parser.Predict(testdata))
                utils.write_conll_multiling(pred, om.languages)
            else:
                if cur_treebank.outfilename:
                    cur_treebank.outfilename = os.path.join(
                        outdir, cur_treebank.outfilename)
                else:
                    cur_treebank.outfilename = os.path.join(
                        outdir,
                        'out' + ('.conll' if not om.conllu else '.conllu'))
                utils.write_conll(cur_treebank.outfilename,
                                  parser.Predict(testdata))

            te = time.time()

            if options.pred_eval:
                if options.multiling:
                    for l in om.languages:
                        print "Evaluating on " + l.name
                        score = utils.evaluate(l.test_gold, l.outfilename,
                                               om.conllu)
                        print "Obtained LAS F1 score of %.2f on %s" % (score,
                                                                       l.name)
                else:
                    print "Evaluating on " + cur_treebank.name
                    score = utils.evaluate(cur_treebank.test_gold,
                                           cur_treebank.outfilename, om.conllu)
                    print "Obtained LAS F1 score of %.2f on %s" % (
                        score, cur_treebank.name)

            print 'Finished predicting'
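
The epoch loop above keeps one checkpoint per epoch, records the best dev score in cur_treebank.dev_best, and finally copies the winning checkpoint to the fixed name that prediction loads. The same bookkeeping, stripped of the treebank objects (function and argument names here are illustrative, not this repo's API):

import os
from shutil import copyfile

def select_best_model(outdir, dev_scores, model_prefix='barchybrid.model'):
    # dev_scores maps epoch number -> dev LAS collected during training.
    # Pick the best-scoring epoch and copy its checkpoint to the plain model
    # name so the predict branch can load it without knowing the epoch.
    best_epoch = max(dev_scores, key=dev_scores.get)
    src = os.path.join(outdir, model_prefix + str(best_epoch))
    dst = os.path.join(outdir, model_prefix)
    copyfile(src, dst)
    return best_epoch, dev_scores[best_epoch]
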
Ejemplo n.º 22
0
        print('Predicting POS tags and parsing dependencies')
        with open(testoutpath, 'w') as fh:
            for sentence in parser.Predict(options.conll_test):
                for entry in sentence[1:]:
                    fh.write(str(entry) + '\n')
                fh.write('\n')

    else:
        print("Training file: " + options.conll_train)
        highestScore = 0.0
        eId = 0

        print('Extracting vocabulary')
        morph_dict = utils.get_morph_dict(options.segmentation_path,
                                          options.lowerCase)
        words, w2i, c2i, m2i, t2i, pos, rels = utils.vocab(
            options.conll_train, morph_dict)

        with open(os.path.join(options.output, options.params),
                  'wb') as paramsfp:
            pickle.dump(
                (words, w2i, c2i, m2i, t2i, morph_dict, pos, rels, options),
                paramsfp)

        #print 'Initializing joint model'
        parser = learner.jPosDepLearner(words, pos, rels, w2i, c2i, m2i, t2i,
                                        morph_dict, options)

        if options.pipeline and options.morphFlag and not pretrained_flag:
            for epoch in range(5):
                print('\n-----------------\nStarting Morph2Vec epoch',
                      epoch + 1)
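
This variant extends the vocabulary with morphological tables (m2i, t2i) built from a segmentation file; get_morph_dict and the exact table contents are specific to that project. A rough sketch, assuming the segmentation maps each surface form to its list of morphemes:

def build_morph_index(morph_dict):
    # morph_dict: {word: [morpheme, ...]} (assumed format).
    # Reserve index 0 for unknown morphemes, then number the rest in the
    # order they are first seen.
    m2i = {'<unk-m>': 0}
    for morphemes in morph_dict.values():
        for m in morphemes:
            m2i.setdefault(m, len(m2i))
    return m2i
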
Ejemplo n.º 23
0
                print("POS accuracy:\t%.2f" % (posCount * 100 / count))
                print("POS&LAS:\t%.2f" % (poslasCount * 100 / count))

                score = poslasCount * 100 / count
                if score >= highestScore:
                    parser.Save(
                        os.path.join(args.output,
                                     os.path.basename(args.model)))
                    highestScore = score

                print("POS&LAS of the previous saved model: %.2f" %
                      (highestScore))

        else:
            print('Extracting vocabulary')
            words, w2i, c2i, pos, rels = utils.vocab(args.conll_train)

            with open(os.path.join(args.output, args.params),
                      'wb') as paramsfp:
                pickle.dump((words, w2i, c2i, pos, rels, args),
                            paramsfp,
                            protocol=2)

            #print 'Initializing joint model'
            parser = oldslavdep.OldSlavDep(words, pos, rels, w2i, c2i, args)

        for epoch in range(args.epochs):
            print('\n-----------------\nStarting epoch', epoch + 1)

            if epoch % 10 == 0:
                if epoch == 0:
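
The percentages printed above ("POS accuracy" and "POS&LAS") come from token-level counters accumulated over the dev set. A sketch of that accounting, where the .pos/.head/.relation attribute names and the nesting (a token counts toward POS&LAS only if tag, head and label are all correct) are assumptions about the CoNLL entry objects:

def pos_las_counts(gold_sents, pred_sents):
    # Return (count, posCount, poslasCount): total tokens, tokens with the
    # correct POS tag, and tokens whose POS tag, head and relation are all
    # correct, matching the percentages printed in the snippet above.
    count = pos_count = poslas_count = 0
    for gold, pred in zip(gold_sents, pred_sents):
        for g, p in zip(gold, pred):
            count += 1
            if g.pos == p.pos:
                pos_count += 1
                if g.head == p.head and g.relation == p.relation:
                    poslas_count += 1
    return count, pos_count, poslas_count
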
Ejemplo n.º 24
0
    parser.add_option("--dynet-mem", type="int", dest="mem", default=0)
    parser.add_option(
        "--model-type", type="int", dest="model_type", default=0
    )  # 0: none - 1: simple char rnn - 2: simple char bilstm - 3: simple prevec

    (options, args) = parser.parse_args()

    print("Training file: " + options.conll_train)
    if options.conll_dev != "N/A":
        print("Development file: " + options.conll_dev)

    highestScore = 0.0
    eId = 0

    print 'Extracting vocabulary'
    c2i, w2i, features = utils.vocab(options.conll_train)

    parser = learner.Learner(c2i, w2i, features, options)

    for epoch in xrange(options.epochs):
        print '\n-----------------\nStarting epoch', epoch + 1

        if epoch % 10 == 0:
            if epoch == 0:
                parser.trainer.restart(learning_rate=0.001)
            elif epoch == 10:
                parser.trainer.restart(learning_rate=0.0005)
            else:
                parser.trainer.restart(learning_rate=0.00025)
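
The restarts above implement a stepped learning-rate schedule: the optimizer is reset every ten epochs, halving the rate once and then holding it constant. The same schedule as a standalone helper (the values are copied from the snippet; the trainer.restart call in the usage comment is the snippet's own):

def learning_rate_for(epoch):
    # Stepped schedule mirroring the restarts above: 1e-3 for epochs 0-9,
    # 5e-4 for epochs 10-19, then 2.5e-4 from epoch 20 onwards.
    if epoch < 10:
        return 0.001
    if epoch < 20:
        return 0.0005
    return 0.00025

# e.g. restart the trainer at every 10th epoch, as the snippet does:
# if epoch % 10 == 0:
#     parser.trainer.restart(learning_rate=learning_rate_for(epoch))
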
Ejemplo n.º 25
0
                for l in f:
                    if l.startswith('UAS'):
                        print('UAS:%s' % l.strip().split()[-1])
                    elif l.startswith('LAS'):
                        print('LAS:%s' % l.strip().split()[-1])
    else:
        # Training classifier
        print(f'Training with file {options.conll_train}')
        # Added to run from IntelliJ
        train_file = os.getcwd() + options.conll_train
        dev_file = os.getcwd() + options.conll_dev
        # Added to run from IntelliJ

        print('Preparing vocabulary table')

        words, enum_word, pos, rels, onto, cpos = list(utils.vocab(train_file))
        with open(os.path.join(output_file, options.params), 'wb') as paramsfp:
            pickle.dump((words, enum_word, pos, rels, onto, cpos, options),
                        paramsfp)
        print('Finished collecting vocabulary')

        print('Initializing mst-parser:')
        parser = mstlstm.MSTParserLSTM(words, pos, rels, enum_word, options,
                                       onto, cpos)
        for epoch in range(options.epochs):
            print('Starting epoch', epoch)
            parser.train(train_file)
            parser.save(
                os.path.join(output_file,
                             os.path.basename(model_path) + str(epoch + 1)))
            # evaluate_model()
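
The per-epoch evaluate_model() call is commented out here; the next snippet below opens with the kind of report scraping such a call would do. A standalone sketch of reading UAS/LAS back from an evaluation report, assuming lines of the form 'UAS ... <score>' and 'LAS ... <score>':

def read_eval_scores(eval_path):
    # Return the last whitespace-separated field of lines starting with
    # 'UAS' or 'LAS' in an evaluation report, keyed by metric name.
    scores = {}
    with open(eval_path, 'r') as f:
        for l in f:
            if l.startswith('UAS') or l.startswith('LAS'):
                scores[l[:3]] = l.strip().split()[-1]
    return scores
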
Ejemplo n.º 26
0
                      type="int",
                      dest="dynet-autobatch",
                      default=0)
    parser.add_option("--dynet-l2", type="float", dest="dynet-l2", default=0)
    parser.add_option("--dynet-gpus",
                      action="store_true",
                      dest="dynet-gpus",
                      default=False,
                      help='Use GPU instead of cpu.')

(options, args) = parser.parse_args()
if options.train_file:
    train_data, dev_data = utils.split_data(options.train_file,
                                            options.train_t,
                                            options.dev_percent)
    words, tags, chars = utils.vocab(train_data, options.min_freq)
    max_len = max([len(d[1]) for d in train_data])
    min_len = min([len(d[1]) for d in train_data])
    # one bucket per sentence length, min_len through max_len inclusive
    buckets = [list() for i in range(min_len, max_len + 1)]
    for d in train_data:
        buckets[len(d[1]) - min_len].append(d)
    dev_buckets = [list()]
    for d in dev_data:
        dev_buckets[0].append(d)

    with open(os.path.join(options.outdir, options.params), 'w') as paramsfp:
        pickle.dump((words, tags, chars, options), paramsfp)
    t = MT(options, words, tags, chars)

    dev_batches = utils.get_batches(dev_buckets, t, False)
    best_dev = 0
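
Bucketing training items by sentence length, as done above, lets get_batches build batches of equally long sentences. The same grouping without the index arithmetic, keyed directly by length (d[1] is assumed to be the token sequence, as in the snippet):

from collections import defaultdict

def bucket_by_length(data):
    # Group items by the length of their token sequence; returns buckets
    # ordered from shortest to longest sentence length.
    buckets = defaultdict(list)
    for d in data:
        buckets[len(d[1])].append(d)
    return [buckets[length] for length in sorted(buckets)]
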