def main():
    """Entry point of the identity lemmatizer.

    Parses the command line, seeds ``random``, loads the evaluation file and
    writes it back out with every token's lemma set to its surface text.
    When a gold file is configured, the written predictions are scored
    against it and the result is printed.
    """
    cli_args = parse_args()
    random.seed(cli_args.seed)
    config = vars(cli_args)

    print("[Launching identity lemmatizer...]")

    # there is nothing to fit; train mode only announces that and falls
    # through to the same evaluation path
    if config['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    conll_dict = CoNLL.conll2dict(input_file=config['eval_file'])
    loader = DataLoader(
        Document(conll_dict),
        config['batch_size'],
        config,
        evaluation=True,
        conll_only=True,
    )
    pred_path = config['output_file']
    gold_path = config['gold_file']

    # identity mapping: the predicted lemma is the token text itself
    identity_preds = loader.doc.get([TEXT])

    # write the predictions out
    loader.doc.set([LEMMA], identity_preds)
    CoNLL.dict2conll(loader.doc.to_dict(), pred_path)

    # scoring is only possible when a gold file was supplied
    if gold_path is None:
        return

    _, _, score = scorer.score(pred_path, gold_path)
    print("Lemma score:")
    print("{} {:.2f}".format(config['lang'], score * 100))
def evaluate(args):
    """Run a saved lemmatizer over the evaluation file and print its score.

    The model is read from ``<model_dir>/<lang>_lemmatizer.pt``.  Predictions
    are written to ``args['output_file']`` and, when ``args['gold_file']`` is
    not ``None``, scored against it.  The process exits with status 0 if the
    evaluation data yields no batches.

    Args:
        args: dict of run options for the current invocation.
    """
    # file paths
    pred_path = args['output_file']
    gold_path = args['gold_file']
    model_path = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model; CUDA is used only when requested and the cpu flag is not set
    trainer = Trainer(
        model_file=model_path,
        use_cuda=args['cuda'] and not args['cpu'],
    )
    loaded_args = trainer.args
    vocab = trainer.vocab
    # directory/file options and the shorthand are taken from this
    # invocation, overriding whatever was stored with the model
    for key, value in args.items():
        if key.endswith(('_dir', '_file')) or key == 'shorthand':
            loaded_args[key] = value

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(
        eval_doc,
        args['batch_size'],
        loaded_args,
        vocab=vocab,
        evaluation=True,
    )

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if not loaded_args.get('dict_only', False):
        print("Running the seq2seq model...")
        seq_preds = []
        seq_edits = []
        for minibatch in batch:
            batch_preds, batch_edits = trainer.predict(minibatch, args['beam_size'])
            seq_preds.extend(batch_preds)
            # the edit output is optional and may be None for a batch
            if batch_edits is not None:
                seq_edits.extend(batch_edits)
        preds = trainer.postprocess(batch.doc.get([TEXT]), seq_preds, edits=seq_edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)
    else:
        # dictionary-only model: the lookup results are the final answer
        preds = dict_preds

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)
    if gold_path is None:
        return

    _, _, score = scorer.score(pred_path, gold_path)
    print("Lemma score:")
    print("{} {:.2f}".format(args['lang'], score * 100))
def train(args):
    """Train a lemmatizer and keep the checkpoint that scores best on dev.

    A dictionary-based lemmatizer is always fit first and scored on the dev
    set.  If ``args['dict_only']`` is set, that model is saved and the
    function returns.  Otherwise a seq2seq model is trained for
    ``args['num_epoch']`` epochs; after every epoch the dev set is decoded
    (optionally ensembled with the dictionary), written to
    ``args['output_file']`` and scored against ``args['gold_file']``.  The
    model is saved to ``<model_dir>/<lang>_lemmatizer.pt`` on the first epoch
    and whenever the dev score beats every earlier epoch.

    Args:
        args: dict of run options.  ``vocab_size`` and ``pos_vocab_size`` are
            written into it from the training vocabulary.

    Exits the process with status 0 when the training or dev data yields no
    batches.
    """
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
        return

    # train a seq2seq model
    print("[Training seq2seq-based lemmatizer...]")
    global_step = 0
    max_steps = len(train_batch) * args['num_epoch']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # start training
    for epoch in range(1, args['num_epoch'] + 1):
        train_loss = 0
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                    max_steps, epoch, args['num_epoch'], loss, duration,
                    current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        dev_preds = []
        dev_edits = []
        for batch in dev_batch:
            preds, edits = trainer.predict(batch, args['beam_size'])
            dev_preds += preds
            # the edit output is optional and may be None for a batch
            if edits is not None:
                dev_edits += edits
        dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
        dev_batch.doc.set([LEMMA], dev_preds)
        CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
        _, _, dev_score = scorer.score(system_pred_file, gold_file)

        # avg loss per batch
        train_loss = train_loss / train_batch.num_examples * args['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
            epoch, train_loss, dev_score))

        # save best model
        if epoch == 1 or dev_score > max(dev_score_history):
            trainer.save(model_file)
            print("new best model saved.")

        # lr schedule: decay only for sgd/adagrad, once past decay_epoch, and
        # only when the dev score failed to improve on the previous epoch.
        # The history is empty during the first epoch, so it must be checked
        # before indexing; otherwise decay_epoch < 1 raises IndexError.
        if epoch > args['decay_epoch'] and dev_score_history and \
                dev_score <= dev_score_history[-1] and \
                args['optim'] in ['sgd', 'adagrad']:
            current_lr *= args['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    # one score is recorded per completed epoch, so the history length is the
    # epoch count (and stays defined when the loop never ran)
    print("Training ended with {} epochs.".format(len(dev_score_history)))

    if not dev_score_history:
        # num_epoch < 1: no epoch ran, so there is no best score to report
        return

    best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))