def main():
    args = parse_args()
    random.seed(args.seed)

    args = vars(args)

    print("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
    document = Document(doc, metasentences=metasentences)
    batch = DataLoader(document,
                       args['batch_size'],
                       args,
                       evaluation=True,
                       conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
def evaluate(args):
    # file paths
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand', 'mode', 'scheme']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    # TODO
    doc = Document(json.load(open(args['eval_file'])))
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    print("Start evaluation...")
    preds = []
    for i, b in enumerate(batch):
        preds += trainer.predict(b)

    system_pred_file = args['output_file']
    batch.doc.set(['misc'], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    gold_tags = batch.tags
    _, _, score = scorer.score_by_entity(preds, gold_tags)
    #print([(a,b) for a,b in zip(preds,gold_tags)])
    print("NER tagger score:")
    print("{} {:.2f}".format(args['shorthand'], score*100))
Example #3
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand']) if args['pretrain_file'] is None \
        else args['pretrain_file']
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand', 'mode']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
    doc = Document(doc, metasentences=metasentences)
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)

    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Parser score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
Example #4
def test_depparse_with_pretagged_doc():
    nlp = classla.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                  'depparse_pretagged': True})

    doc, metasentences = CoNLL.conll2dict(input_file=EN_DOC_CONLLU_PRETAGGED)
    doc = classla.Document(doc, metasentences=metasentences)
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])
def test_conllu(processed_doc):
    assert CoNLL.conll_as_string(CoNLL.convert_dict(
        processed_doc.to_dict())) == EN_DOC_CONLLU_GOLD
Example #6
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors
    vec_file = args['wordvec_file']

    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand']) if args['pretrain_file'] is None \
        else args['pretrain_file']
    pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc, metasentences = CoNLL.conll2dict(input_file=args['train_file'])
    train_doc = Document(doc, metasentences=metasentences)
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
    dev_doc = Document(doc, metasentences=metasentences)
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training tagger...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    if args['adapt_eval_interval']:
        args['eval_interval'] = utils.get_adaptive_eval_interval(dev_batch.num_examples, 2000, args['eval_interval'])
        print("Evaluating the model every {} steps...".format(args['eval_interval']))

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
                dev_batch.doc.set([UPOS, XPOS, FEATS], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Example #7
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000, orig_text=None, no_ssplit=False):
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x) for x in p])
        paragraphs += [(i, start, start+length, length+1)] # para idx, start idx, end idx, length

    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))

    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)

    batch_size = trainer.args['batch_size']
    batches = (len(paragraphs) + batch_size - 1) // batch_size

    t = 0
    for i in range(batches):
        batchparas = paragraphs[i * batch_size : (i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]
        t += sum([x[3] for x in batchparas])

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]

        N = len(batch[3][0])
        if N <= eval_limit:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
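            # Paragraph longer than eval_limit: decode in windows of at most
            # eval_limit units, advancing each paragraph to just past its last
            # predicted sentence break (labels 2 or 4) so that successive
            # windows re-align on sentence boundaries.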
            idx = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1

                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                batch = data_generator.next(eval_offsets=[x+y for x, y in zip(idx, offsets)])

            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            if pred[j][len1-1] < 2:
                pred[j][len1-1] = 2
            elif pred[j][len1-1] > 2:
                pred[j][len1-1] = 4
            all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    offset = 0
    oov_count = 0
    doc = []

    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'

    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]

        current_tok = ''
        current_sent = []

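        # Label semantics assumed here (as in stanza's tokenizer): 0 = no
        # boundary, 1 = token end, 3 = MWT token end, and 2/4 = token ends
        # that also close a sentence; hence p >= 1 finishes a token below
        # and p in (2, 4) finishes a sentence.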
        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
                oov_count += 1

            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    st = -1
                    tok_len = 0
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0: continue
                        st0 = text.index(part, char_offset) - char_offset
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) - len(lstripped))
                        char_offset += st0 + len(part)
                    additional_info = {START_CHAR: st, END_CHAR: char_offset}
                else:
                    additional_info = dict()
                current_sent.append((tok, p, additional_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        assert(len(current_tok) == 0)
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file: CoNLL.dict2conll(doc, output_file)
    return oov_count, offset, all_preds, doc
Example #8
    def process_pre_tokenized_conllu_text(self, input_src):
        """
        Pretokenized text in this case is provided in conllu format.
        """

        return CoNLL.conll2dict(input_str=input_src, generate_raw_text=True)
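
A hedged usage sketch of the method above: feeding a small CoNLL-U string to CoNLL.conll2dict with generate_raw_text=True. The import path and the toy sentence are assumptions (stanza keeps this helper at stanza.utils.conll; classla, as a fork, is assumed to mirror it).

# A minimal sketch, not from the original repo.
from classla.utils.conll import CoNLL  # assumed import path, mirroring stanza

conllu_src = (
    "1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\tVBP\t_\t0\troot\t_\t_\n"
    "\n"
)
# The return shape matches whatever process_pre_tokenized_conllu_text passes
# on to its caller; generate_raw_text=True also reconstructs the raw text.
parsed = CoNLL.conll2dict(input_str=conllu_src, generate_raw_text=True)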
Example #9
# achieved by enumerating different options of separators that different treebanks might
# use, and comparing that to treating the XPOS tags as separate categories (using a
# WordVocab).
mapping = defaultdict(list)
for sh, fn in zip(shorthands, fullnames):
    print('Resolving vocab option for {}...'.format(sh))
    if not os.path.exists('data/pos/{}.train.in.conllu'.format(sh)):
        # warn and fall back instead of aborting, as the message promises
        print('Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        mapping[key].append(sh)
        continue

    doc, metasentences = CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh))
    doc = Document(doc, metasentences=metasentences)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']: # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    mapping[key].append(sh)
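
To make the size comparison in the loop above concrete, here is a self-contained toy version of the heuristic (plain-Python stand-ins for WordVocab/XPOSVocab; the tag data is invented, and the real code also discounts len(VOCAB_PREFIX) per position):

# 4 prefixes x 4 suffixes joined by '|' give 16 distinct whole tags.
prefixes = ["Nc", "Vm", "Af", "Rg"]
suffixes = ["s1", "s2", "p1", "p2"]
tags = [p + "|" + s for p in prefixes for s in suffixes]

# WordVocab-style size: one unit per distinct whole tag.
word_vocab_size = len(set(tags))                      # 16

# XPOSVocab-style size with sep='|': distinct units per position, summed.
cols = zip(*(t.split("|") for t in tags))
xpos_vocab_size = sum(len(set(c)) for c in cols)      # 4 + 4 = 8

# The factory keeps whichever representation is smaller; here splitting on
# '|' wins (8 < 16), so XPOSVocab with sep='|' would be recorded for this
# hypothetical treebank.
print(word_vocab_size, xpos_vocab_size)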
def test_dict_to_conll():
    conll = CoNLL.convert_dict(DICT)
    assert conll == CONLL
def test_conll_to_dict():
    dicts = CoNLL.convert_conll(CONLL)
    assert dicts == DICT
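
For orientation, a toy sketch of the two shapes these round-trip tests compare (invented one-token fixtures, not the real DICT/CONLL constants; exact field types can vary between library versions):

# Dict form: a list of sentences, each a list of token dicts.
TOY_DICT = [[{'id': '1', 'text': 'Run', 'lemma': 'run', 'upos': 'VERB',
              'xpos': 'VB', 'head': 0, 'deprel': 'root'}]]
# CoNLL form: a list of sentences, each a list of 10-field CoNLL-U rows
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC).
TOY_CONLL = [[['1', 'Run', 'run', 'VERB', 'VB', '_', '0', 'root', '_', '_']]]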
Example #12
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}.pt'.format(args['model_dir'], args['model_file'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
    doc = Document(doc, metasentences=metasentences)
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict([(e[0].lower(), e[1])
                                       for e in batch.doc.get([TEXT, XPOS])])

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        if loaded_args.get('ensemble_dict', False):
            skip = trainer.skip_seq2seq([(e[0].lower(), e[1])
                                         for e in batch.doc.get([TEXT, XPOS])])
            doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
            dev_doc = Document(doc, metasentences=metasentences)
            seq2seq_batch = DataLoader(dev_doc,
                                       args['batch_size'],
                                       loaded_args,
                                       vocab=vocab,
                                       evaluation=True,
                                       skip=skip)
        else:
            seq2seq_batch = batch
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(seq2seq_batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        # preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            preds = trainer.postprocess(
                [x for x, y in zip(batch.doc.get([TEXT]), skip) if not y],
                preds,
                edits=edits)
            print("[Ensembling dict with seq2seq lemmatizer...]")
            i = 0
            preds1 = []
            for s in skip:
                if s:
                    preds1.append('')
                else:
                    preds1.append(preds[i])
                    i += 1
            preds = trainer.ensemble([(e[0].lower(), e[1])
                                      for e in batch.doc.get([TEXT, XPOS])],
                                     preds1)
        else:
            preds = trainer.postprocess(batch.doc.get([TEXT]),
                                        preds,
                                        edits=edits)
            print("[Ensembling dict with seq2seq lemmatizer...]")
            # preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Example #13
def train(args):
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    doc, metasentences = CoNLL.conll2dict(input_file=args['train_file'])
    train_doc = Document(doc, metasentences=metasentences)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    doc, metasentences = CoNLL.conll2dict(input_file=args['eval_file'])
    dev_doc = Document(doc, metasentences=metasentences)
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    lemma_dict = train_batch.doc.get([TEXT, XPOS, LEMMA])
    lemma_dict = [(e[0].lower(), e[1], e[2]) for e in lemma_dict]
    if args.get('external_dict', None) is not None:
        extra_dict = []
        for line in open(args['external_dict']):
            word, lemma, xpos = line.rstrip('\r\n').split('\t')
            extra_dict.append((word.lower(), xpos, lemma))
        # merge the external entries once, after the whole file is read
        lemma_dict = extra_dict + lemma_dict
    trainer.train_dict(lemma_dict)
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict([
        (e[0].lower(), e[1]) for e in dev_batch.doc.get([TEXT, XPOS])
    ])
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []

            dict_preds = trainer.predict_dict([
                (e[0].lower(), e[1]) for e in dev_batch.doc.get([TEXT, XPOS])
            ])
            # for i, batch in enumerate(dev_batch):
            #     preds, edits = trainer.predict(batch, args['beam_size'])
            #     dev_preds += preds
            #     if edits is not None:
            #         dev_edits += edits
            # dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)

            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                skip = trainer.skip_seq2seq([
                    (e[0].lower(), e[1])
                    for e in dev_batch.doc.get([TEXT, XPOS])
                ])
                doc, metasentences = CoNLL.conll2dict(
                    input_file=args['eval_file'])
                dev_doc = Document(doc, metasentences=metasentences)
                seq2seq_batch = DataLoader(dev_doc,
                                           args['batch_size'],
                                           args,
                                           vocab=vocab,
                                           evaluation=True,
                                           skip=skip)
                # print("[Ensembling dict with seq2seq model...]")
                # dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
            else:
                seq2seq_batch = dev_batch

            if args.get('ensemble_dict', False):
                dev_preds = trainer.postprocess([
                    x for x, y in zip(dev_batch.doc.get([TEXT]), skip) if not y
                ],
                                                dev_preds,
                                                edits=dev_edits)
                print("[Ensembling dict with seq2seq lemmatizer...]")
                i = 0
                preds1 = []
                for s in skip:
                    if s:
                        preds1.append('')
                    else:
                        preds1.append(dev_preds[i])
                        i += 1
                dev_preds = trainer.ensemble(
                    [(e[0].lower(), e[1])
                     for e in dev_batch.doc.get([TEXT, XPOS])], preds1)
            else:
                dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]),
                                                dev_preds,
                                                edits=dev_edits)

            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                    args['optim'] in ['sgd', 'adagrad']:
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
Example #14
    def to_conll(self):
        """ Produces string output in CoNLL-U format.
        """
        return CoNLL.conll_as_string(CoNLL.convert_dict(self.to_dict()))
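
A hedged usage sketch of to_conll: the pipeline setup follows the classla test earlier on this page, but the language choice and the availability of downloaded models are assumptions.

import classla

# Assumes the English models have been fetched beforehand,
# e.g. with classla.download('en').
nlp = classla.Pipeline(processors='tokenize,pos,lemma,depparse', lang='en')
doc = nlp("Dogs bark.")
print(doc.to_conll())  # CoNLL-U string, built via CoNLL.convert_dict as above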