Code Example #1
def main():
    args = parse_args()
    random.seed(args.seed)

    args = vars(args)

    print("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(document,
                       args['batch_size'],
                       args,
                       evaluation=True,
                       conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Code Example #2
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Code Example #3
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(
            batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(
                    batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("MWT expansion score: {} {:.2f}".format(
            args['shorthand'], score * 100))
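
Note: a dictionary-based MWT expander of the kind `trainer.predict_dict` implements reduces to a lookup table from a multi-word token's surface form to its most frequent expansion in training. A toy version under that assumption (not Stanza's actual Trainer):

# Toy dictionary-based MWT expansion: unseen tokens fall back to
# themselves, i.e. they are left unexpanded.
def predict_dict(tokens, expansion_dict):
    return [expansion_dict.get(t, t) for t in tokens]

expansion_dict = {'du': 'de le', 'des': 'de les'}
print(predict_dict(['du', 'chat'], expansion_dict))  # ['de le', 'chat']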
Code Example #4
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                               args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)
    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
Code Example #5
File: parser.py Project: Pandinosaurus/stanfordnlp
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    model_file = model_file_name(args)
    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score*100))
Code Example #6
def train(args):
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []
            for i, batch in enumerate(dev_batch):
                preds, edits = trainer.predict(batch, args['beam_size'])
                dev_preds += preds
                if edits is not None:
                    dev_edits += edits
            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]),
                                            dev_preds,
                                            edits=dev_edits)

            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                print("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]),
                                             dev_preds)
            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                    args['optim'] in ['sgd', 'adagrad']:
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
Code Example #7
File: utils.py Project: wannaphong/stanza
def output_predictions(output_file,
                       trainer,
                       data_generator,
                       vocab,
                       mwt_dict,
                       max_seqlen=1000,
                       orig_text=None,
                       no_ssplit=False,
                       prob=False):
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x) for x in p])
        paragraphs += [(i, start, start + length, length + 1)]  # para idx, start idx, end idx, length

    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))

    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)

    batch_size = trainer.args['batch_size']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    t = 0
    list_prob = []
    for i in range(batches):
        batchparas = paragraphs[i * batch_size:(i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]
        t += sum([x[3] for x in batchparas])

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]

        N = len(batch[3][0])
        if N <= eval_limit:
            a = trainer.predict(batch)
            pred = np.argmax(a, axis=2)
            list_prob.append(a)
        else:
            idx = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1

                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                batch = data_generator.next(
                    eval_offsets=[x + y for x, y in zip(idx, offsets)])

            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            if pred[j][len1 - 1] < 2:
                pred[j][len1 - 1] = 2
            elif pred[j][len1 - 1] > 2:
                pred[j][len1 - 1] = 4
            all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    offset = 0
    oov_count = 0
    doc = []

    text = orig_text
    char_offset = 0

    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]

        current_tok = ''
        current_sent = []

        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb
            if trainer.args['shorthand'] == 'la_ittb' and t in [":", ";"]:
                p = 2
            offset += 1
            if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
                oov_count += 1

            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    st0, tok0 = find_token(tok, text)
                    st = char_offset + st0
                    text = text[st0 + len(tok0):]
                    char_offset += st0 + len(tok0)
                    additional_info = {
                        START_CHAR: st,
                        END_CHAR: st + len(tok0)
                    }
                else:
                    additional_info = dict()
                current_sent += [(tok, p, additional_info)]
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        if len(current_tok):
            tok = vocab.normalize_token(current_tok)
            assert '\t' not in tok, tok
            if len(tok) > 0:
                if orig_text is not None:
                    st0, tok0 = find_token(tok, text)
                    st = char_offset + st0
                    text = text[st0 + len(tok0):]
                    char_offset += st0 + len(tok0)
                    additional_info = {START_CHAR: st, END_CHAR: st + len(tok0)}
                else:
                    additional_info = dict()
                current_sent += [(tok, 2, additional_info)]

        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))
    if output_file: CoNLL.dict2conll(doc, output_file)
    if prob:
        return oov_count, offset, all_preds, doc, list(list_prob)
    return oov_count, offset, all_preds, doc
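
Note: the long-paragraph branch (`N > eval_limit`) decodes in windows: predict up to `eval_limit` characters, then advance only past the last predicted sentence break (labels 2 and 4) so no sentence is split across windows, except near the end where the whole window is consumed. A simplified single-sequence version of that loop; `predict_fn` stands in for `trainer.predict`:

import numpy as np

# Windowed decoding of one long character sequence: predict at most
# `eval_limit` characters at a time, then advance to just past the last
# predicted sentence break; if no break is found (or we are near the
# end), consume the whole window.
def windowed_predict(seq, predict_fn, eval_limit=3000):
    idx, chunks = 0, []
    while idx < len(seq):
        window = seq[idx:idx + eval_limit]
        pred = predict_fn(window)            # one label per character
        breaks = np.where((pred == 2) | (pred == 4))[0]
        if len(breaks) == 0 or idx >= len(seq) - eval_limit:
            advance = len(window)
        else:
            advance = int(breaks.max()) + 1
        chunks.append(pred[:advance])
        idx += advance
    return np.concatenate(chunks)

# stub model: every 5th character of a window is a sentence end
stub = lambda w: np.array([2 if (i + 1) % 5 == 0 else 0 for i in range(len(w))])
print(windowed_predict(list('x' * 12), stub, eval_limit=6))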
Code Example #8
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Code Example #9
def train(args):
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(
        dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get(
                    'ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(
                    dev_batch.doc.get_mwt_expansions(evaluation=True),
                    dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            logger.info(
                "epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(
            best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(
                dev_batch.doc.get_mwt_expansions(evaluation=True),
                best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            best_f = max(best_f, dev_score * 100)
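
Note: the dev document is deep-copied before `set_mwt_expansions`, presumably so the gold `dev_batch.doc` stays intact for the next epoch's evaluation. The aliasing hazard in miniature:

import copy

# Assigning `doc` to another name only aliases it; deepcopy gives an
# independent structure that is safe to overwrite with predictions.
doc = {'tokens': ['du']}
deep = copy.deepcopy(doc)
deep['tokens'] = ['de', 'le']
print(doc['tokens'], deep['tokens'])  # ['du'] ['de', 'le']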
Code Example #10
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000, orig_text=None, no_ssplit=False, use_regex_tokens=True):
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x[0]) for x in p])
        paragraphs += [(i, start, start+length, length+1)] # para idx, start idx, end idx, length

    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))

    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)

    batch_size = trainer.args['batch_size']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    t = 0
    for i in range(batches):
        batchparas = paragraphs[i * batch_size : (i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]
        t += sum([x[3] for x in batchparas])

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]

        N = len(batch[3][0])
        if N <= eval_limit:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
            idx = [0] * len(batchparas)
            adv = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1

                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance
                    adv[j] = advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                batch = data_generator.next(eval_offsets=adv, old_batch=batch)

            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            if pred[j][len1-1] < 2:
                pred[j][len1-1] = 2
            elif pred[j][len1-1] > 2:
                pred[j][len1-1] = 4
            if use_regex_tokens:
                all_preds[p[0]] = update_pred_regex(raw[j], pred[j][:len1])
            else:
                all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    offset = 0
    oov_count = 0
    doc = []

    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'

    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]

        current_tok = ''
        current_sent = []

        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
                oov_count += 1

            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    st = -1
                    tok_len = 0
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0: continue
                        st0 = text.index(part, char_offset) - char_offset
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) - len(lstripped))
                        char_offset += st0 + len(part)
                    additional_info = {START_CHAR: st, END_CHAR: char_offset}
                else:
                    additional_info = dict()
                current_sent.append((tok, p, additional_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        assert(len(current_tok) == 0)
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file: CoNLL.dict2conll(doc, output_file)
    return oov_count, offset, all_preds, doc
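
Note: when `orig_text` is given, each normalized token must be located back in the raw text so character offsets can be attached; the code walks a cursor forward and finds each piece with `text.index(part, char_offset)`. A stripped-down version of that alignment, assuming tokens appear in order and only whitespace is skipped between them:

# Character-offset recovery: walk the original text with a cursor,
# locate each token with str.index, and record its (start, end) span.
def align(tokens, text):
    spans, cursor = [], 0
    for tok in tokens:
        start = text.index(tok, cursor)   # search from the cursor onward
        end = start + len(tok)
        spans.append((tok, start, end))
        cursor = end
    return spans

print(align(['Dogs', 'bark', '.'], 'Dogs  bark.'))
# [('Dogs', 0, 4), ('bark', 6, 10), ('.', 10, 11)]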
Code Example #11
def output_predictions(output_file,
                       trainer,
                       data_generator,
                       vocab,
                       mwt_dict,
                       max_seqlen=1000,
                       orig_text=None,
                       no_ssplit=False,
                       use_regex_tokens=True):
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x[0]) for x in p])
        paragraphs += [(i, start, start + length, length)]  # para idx, start idx, end idx, length

    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))

    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)

    batch_size = trainer.args['batch_size']
    skip_newline = trainer.args['skip_newline']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    for i in range(batches):
        # At evaluation time, each paragraph is treated as a single "sentence", and a batch of `batch_size` paragraphs
        # are tokenized together. `offsets` here are used by the data generator to identify which paragraphs to use
        # for the next batch of evaluation.
        batchparas = paragraphs[i * batch_size:(i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]

        N = len(batch[3][0])
        if N <= eval_limit:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
            idx = [0] * len(batchparas)
            adv = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1

                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance
                    adv[j] = advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                # once we've made predictions on a certain number of characters for each paragraph (recorded in `adv`),
                # we skip the first `adv` characters to make the updated batch
                batch = data_generator.next(eval_offsets=adv, old_batch=batch)

            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            if pred[j][len1 - 1] < 2:
                pred[j][len1 - 1] = 2
            elif pred[j][len1 - 1] > 2:
                pred[j][len1 - 1] = 4
            if use_regex_tokens:
                all_preds[p[0]] = update_pred_regex(raw[j], pred[j][:len1])
            else:
                all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    offset = 0
    oov_count = 0
    doc = []

    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'

    UNK_ID = vocab.unit2id('<UNK>')

    # Once everything is fed through the tokenizer model, it's time to decode the predictions
    # into actual tokens and sentences that the rest of the pipeline uses
    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]

        current_tok = ''
        current_sent = []

        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab.unit2id(t) == UNK_ID:
                oov_count += 1

            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    st = -1
                    tok_len = 0
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0: continue
                        if skip_newline:
                            part_pattern = re.compile(r'\s*'.join(
                                re.escape(c) for c in part))
                            match = part_pattern.search(text, char_offset)
                            st0 = match.start(0) - char_offset
                            partlen = match.end(0) - match.start(0)
                        else:
                            st0 = text.index(part, char_offset) - char_offset
                            partlen = len(part)
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) -
                                                      len(lstripped))
                        char_offset += st0 + partlen
                    position_info = (st, char_offset)
                else:
                    position_info = None
                current_sent.append((tok, p, position_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        assert (len(current_tok) == 0)
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file: CoNLL.dict2conll(doc, output_file)
    return oov_count, offset, all_preds, doc
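
Note: with `skip_newline`, a token's characters may be separated by arbitrary whitespace in the original text (e.g. a word broken across a line break), so a plain `str.index` lookup can fail; the code instead compiles a regex allowing `\s*` between every pair of characters. The same trick in isolation; `find_token` is an illustrative wrapper:

import re

# Whitespace-tolerant search: allow any run of whitespace between
# consecutive characters of the token being looked up.
def find_token(token, text, start=0):
    pattern = re.compile(r'\s*'.join(re.escape(c) for c in token))
    match = pattern.search(text, start)
    return match.start(), match.end()

text = 'both hal\nves'
print(find_token('halves', text))  # (5, 12): spans the embedded newline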
Code Example #12
File: tagger.py Project: rasimuvaikas/stanza
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    pretrain = load_pretrain(args)

    # load model
    train_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(doc=train_doc,
                      pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=False)
    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        if args['morph_dict']:
            print('Collecting morph dictionary...')
            morph_dict = MorphDictionary(args['morph_dict'])
            print('Completed.')
        else:
            morph_dict = None
        start = 0
        end = 0
        for i, b in enumerate(batch):
            end += len(b[8])  # b[8] is orig_idx
            # data_orig_idx=batch.data_orig_idx,
            preds += trainer.predict(b,
                                     morph_dict=morph_dict,
                                     start=start,
                                     end=end)
            start += len(b[8])
    else:
        # skip eval if dev data does not exist
        preds = []
    # sorting is disabled by sort_during_eval=False, no need to unsort
    # preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Tagger score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
Code Example #13
File: tagger.py Project: rasimuvaikas/stanza
def train(args):
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    # train_data is now a list of sentences, where each sentence is a
    # list of words, in which each word is a dict of conll attributes
    train_data = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data,
                      args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        return

    logger.info("Training tagger...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    if args['adapt_eval_interval']:
        args['eval_interval'] = utils.get_adaptive_eval_interval(
            dev_batch.num_examples, 2000, args['eval_interval'])
        logger.info("Evaluating the model every {} steps...".format(
            args['eval_interval']))

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
                dev_batch.doc.set([UPOS, XPOS, FEATS],
                                  [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    logger.info("Early termination: have not improved in {} steps".format(
                        args['max_steps_before_stop']))
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if len(dev_score_history) > 0:
        best_f, best_eval = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
    else:
        logger.info("Dev set never evaluated.  Saving final model.")
        trainer.save(model_file)
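
Note: the closing report converts the index of the best dev score back into a global step count by multiplying with `eval_interval`, since scores are only recorded at evaluation time. In isolation:

import numpy as np

# Dev scores are recorded once per eval_interval steps, so the argmax
# index (1-based) maps back to a global step count.
dev_score_history = [0.81, 0.84, 0.83]
eval_interval = 100
best_eval = np.argmax(dev_score_history) + 1
print("best step:", best_eval * eval_interval)  # best step: 200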