Beispiel #1
0
def get_factory(sh, fn):
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(
                fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        return key

    doc = Document(CoNLL.conll2dict(input_file=train_file))
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key
Beispiel #2
0
def main():
    args = parse_args()
    random.seed(args.seed)

    args = vars(args)

    print("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(document,
                       args['batch_size'],
                       args,
                       evaluation=True,
                       conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
Beispiel #3
0
def check_mwt(filename):
    """
    Checks whether or not there are MWTs in the given conll file
    """
    doc = Document(CoNLL.conll2dict(filename))
    data = doc.get_mwt_expansions(False)
    return len(data) > 0
Beispiel #4
0
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # laod data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(
            batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(
                    batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("MWT expansion score: {} {:.2f}".format(
            args['shorthand'], score * 100))
Beispiel #6
0
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                               args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'
                                                              ] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)
    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
def test_depparse_with_pretagged_doc():
    nlp = stanza.Pipeline(
        **{
            'processors': 'depparse',
            'dir': TEST_MODELS_DIR,
            'lang': 'en',
            'depparse_pretagged': True
        })

    doc = stanza.Document(CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])
Beispiel #8
0
def prep_conllu(tb, file_path, overwrite):
    out_file = out_dir.joinpath(file_path.name)
    if out_file.exists() and not overwrite:
        print(f"{out_file.name} exists; skipping")
        return None
    lang, tb, tb_kwargs = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, out_file)
        return None
    doc = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang,
                          processors='tokenize,mwt,pos',
                          tokenize_pretokenized=True)
    doc = nlp.processors['pos'].process(doc)
    return doc
Beispiel #9
0
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    model_file = model_file_name(args)
    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score*100))
Beispiel #10
0
    def predict(self, eval_file_or_string):
        eval_file = _read_conllu_arg(eval_file_or_string,
                                     self.feature_config,
                                     predict=True)
        doc = Document(CoNLL.conll2dict(input_file=eval_file))
        batch = DataLoader(doc,
                           self.batch_size,
                           self.loaded_args,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)

        preds = []
        if len(batch) > 0:
            for i, b in enumerate(batch):
                preds += self.trainer.predict(b)
        preds = utils.unsort(preds, batch.data_orig_idx)
        batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])

        doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
        conll_string = CoNLL.conll_as_string(doc_conll)
        return conll_string
Beispiel #11
0
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    pretrain = load_pretrain(args)

    # load model
    train_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(doc=train_doc,
                      pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'
                                                              ] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=False)
    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        if args['morph_dict']:
            print('Collecting morph dictionary...')
            morph_dict = MorphDictionary(args['morph_dict'])
            print('Completed.')
        else:
            morph_dict = None
        start = 0
        end = 0
        for i, b in enumerate(batch):
            end += len(b[8])  # b[8] is orig_idx
            # data_orig_idx=batch.data_orig_idx,
            preds += trainer.predict(b,
                                     morph_dict=morph_dict,
                                     start=start,
                                     end=end)
            start += len(b[8])
    else:
        # skip eval if dev data does not exist
        preds = []
    # sorting is disabled by sort_during_eval=False, no need to unsort
    # preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Tagger score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
def train(args):
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(
        dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get(
                    'ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(
                    dev_batch.doc.get_mwt_expansions(evaluation=True),
                    dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            logger.info(
                "epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[
                    -1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(
            best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(
                dev_batch.doc.get_mwt_expansions(evaluation=True),
                best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            best_f = max(best_f, dev_score)
Beispiel #13
0
def train(args):
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []
            for i, batch in enumerate(dev_batch):
                preds, edits = trainer.predict(batch, args['beam_size'])
                dev_preds += preds
                if edits is not None:
                    dev_edits += edits
            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]),
                                            dev_preds,
                                            edits=dev_edits)

            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                print("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]),
                                             dev_preds)
            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                    args['optim'] in ['sgd', 'adagrad']:
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
Beispiel #14
0
def create_lexicon(shorthand=None, train_path=None, external_path=None):
    """
    This function is to create a lexicon to store all the words from the training set and external dictionary.
    This lexicon will be saved with the model and will be used to create dictionary when the model is loaded.
    The idea of separating lexicon and dictionary in two different phases is a good tradeoff between time and space.
    Note that we eliminate all the long words but less frequently appeared in the lexicon by only taking 95-percentile
    list of words.

    :param shorthand - language and dataset, eg: vi_vlsp, zh_gsdsimp
    :param train_path - path to conllu train file
    :param external_path - path to extenral dict, expected to be inside the training dataset dir with format of: SHORTHAND-externaldict.txt
    :return a set lexicon object that contains all distinct words
    """
    lexicon = set()
    length_freq = []
    #this regex is to check if a character is an actual Thai character as seems .isalpha() python method doesn't pick up Thai accent characters..
    pattern_thai = re.compile(r"(?:[^\d\W]+)|\s")

    def check_valid_word(shorthand, word):
        """
        This function is to check if the word are multi-syllable words and not numbers. 
        For vi, whitespaces are syllabe-separator.
        """
        if shorthand.startswith("vi_"):
            return True if len(word.split(" ")) > 1 and any(
                map(str.isalpha,
                    word)) and not any(map(str.isdigit, word)) else False
        elif shorthand.startswith("th_"):
            return True if len(word) > 1 and any(
                map(pattern_thai.match,
                    word)) and not any(map(str.isdigit, word)) else False
        else:
            return True if len(word) > 1 and any(
                map(str.isalpha,
                    word)) and not any(map(str.isdigit, word)) else False

    #checking for words in the training set to add them to lexicon.
    if train_path is not None:
        if not os.path.isfile(train_path):
            raise FileNotFoundError(f"Cannot open train set at {train_path}")

        doc_conll, _ = CoNLL.conll2dict(input_file=train_path)

        for sent_conll in doc_conll:
            for token_conll in sent_conll:
                word = token_conll['text'].lower()
                if check_valid_word(shorthand, word) and word not in lexicon:
                    lexicon.add(word)
                    length_freq.append(len(word))
        count_word = len(lexicon)
        logger.info(
            f"Added {count_word} words from the training data to the lexicon.")

    #checking for external dictionary and add them to lexicon.
    if external_path is not None:
        if not os.path.isfile(external_path):
            raise FileNotFoundError(
                f"Cannot open external dictionary at {external_path}")

        with open(external_path, "r", encoding="utf-8") as external_file:
            lines = external_file.readlines()
        for line in lines:
            word = line.lower()
            word = word.replace("\n", "")
            if check_valid_word(shorthand, word) and word not in lexicon:
                lexicon.add(word)
                length_freq.append(len(word))
        logger.info(
            f"Added another {len(lexicon) - count_word} words from the external dict to dictionary."
        )

    #automatically calculate the number of dictionary features (window size to look for words) based on the frequency of word length
    #take the length at 95-percentile to eliminate all the longest (maybe) compounds words in the lexicon
    num_dict_feat = int(np.percentile(length_freq, 95))
    lexicon = {word for word in lexicon if len(word) <= num_dict_feat}
    logger.info(
        f"Final lexicon consists of {len(lexicon)} words after getting rid of long words."
    )

    return lexicon, num_dict_feat
Beispiel #15
0
def train(args):
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data,
                      args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history
                       ) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))
# WordVocab).
mapping = defaultdict(list)
for sh, fn in zip(shorthands, fullnames):
    print('Resolving vocab option for {}...'.format(sh))
    if not os.path.exists('data/pos/{}.train.in.conllu'.format(sh)):
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        mapping[key].append(sh)
        continue

    doc = Document(
        CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh)))
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
Beispiel #17
0
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))