def main():
    """Entry point of the identity lemmatizer.

    Parses the command-line arguments, seeds ``random``, loads the
    evaluation file, and writes a prediction file in which the LEMMA field of
    the document is set to whatever its TEXT field returns. No model is
    trained or loaded. When a gold file is configured, the prediction file is
    scored against it and the score is printed as a percentage.
    """
    cli_args = parse_args()
    random.seed(cli_args.seed)
    args = vars(cli_args)

    print("[Launching identity lemmatizer...]")
    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    # Load the evaluation data.
    document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(
        document,
        args['batch_size'],
        args,
        evaluation=True,
        conll_only=True,
    )
    pred_path = args['output_file']
    gold_path = args['gold_file']

    # Identity mapping: the TEXT field is written back as the LEMMA field.
    batch.doc.set([LEMMA], batch.doc.get([TEXT]))
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)

    # Scoring is optional and only happens when a gold file was given.
    if gold_path is None:
        return

    _, _, score = scorer.score(pred_path, gold_path)
    print("Lemma score:")
    print("{} {:.2f}".format(args['lang'], score * 100))
def evaluate(args):
    """Predict lemmas for the evaluation file with a saved model and score them.

    Args:
        args: dict of run options. Keys read here: ``output_file``,
            ``gold_file``, ``model_dir``, ``lang``, ``cuda``, ``cpu``,
            ``batch_size``, ``eval_file`` and ``beam_size``. Every key ending
            in ``_dir`` or ``_file``, plus ``shorthand``, is copied over the
            arguments stored with the loaded model.

    Side effects:
        Writes predictions to ``args['output_file']`` and prints the score
        when ``args['gold_file']`` is not None. Calls ``sys.exit(0)`` when
        the evaluation data loader is empty.
    """
    # File paths.
    pred_path = args['output_file']
    gold_path = args['gold_file']
    model_path = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # Load the model; the CPU flag overrides the CUDA flag.
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_path, use_cuda=use_cuda)
    model_args = trainer.args
    vocab = trainer.vocab

    # Path-like options (and the shorthand) come from the current run rather
    # than from the arguments saved with the model.
    for key, value in args.items():
        if key.endswith(('_dir', '_file')) or key == 'shorthand':
            model_args[key] = value

    # Load data.
    print("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(
        eval_doc,
        args['batch_size'],
        model_args,
        vocab=vocab,
        evaluation=True,
    )

    # Nothing to evaluate: print an empty score line and stop.
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    # The dictionary predictions are computed up front in every mode.
    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if model_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds, edits = [], []
        for minibatch in batch:
            batch_preds, batch_edits = trainer.predict(minibatch, args['beam_size'])
            preds += batch_preds
            if batch_edits is not None:
                edits += batch_edits
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if model_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # Write to file and score.
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)
    if gold_path is None:
        return

    _, _, score = scorer.score(pred_path, gold_path)
    print("Lemma score:")
    print("{} {:.2f}".format(args['lang'], score * 100))
def process(self, document):
    """Fill in the LEMMA field of ``document`` and return the processed doc.

    Three modes are supported, chosen from ``self.use_identity`` and the
    ``dict_only`` / ``ensemble_dict`` entries of ``self.config``:

    * identity: every word's text is used as its lemma;
    * dict only: lemmas come from ``self.trainer.predict_dict``;
    * seq2seq: lemmas come from ``self.trainer.predict`` followed by
      ``postprocess``, optionally combined through ``self.trainer.ensemble``.

    Empty-string lemmas are replaced by ``'_'`` before being stored.

    Returns:
        The ``doc`` attribute of the data loader built from ``document``.
    """
    cfg = self.config

    if self.use_identity:
        batch = DataLoader(document, cfg['batch_size'], cfg, evaluation=True, conll_only=True)
        preds = [word.text for sent in batch.doc.sentences for word in sent.words]
    else:
        batch = DataLoader(document, cfg['batch_size'], cfg, vocab=self.vocab, evaluation=True)

        if cfg.get('dict_only', False):
            preds = self.trainer.predict_dict(batch.doc.get([doc.TEXT, doc.UPOS]))
        else:
            use_ensemble = cfg.get('ensemble_dict', False)

            if use_ensemble:
                # Skip the seq2seq model when we can: the trainer flags the
                # words to skip and a second loader is built with that mask.
                skip = self.trainer.skip_seq2seq(batch.doc.get([doc.TEXT, doc.UPOS]))
                seq2seq_batch = DataLoader(
                    document,
                    cfg['batch_size'],
                    cfg,
                    vocab=self.vocab,
                    evaluation=True,
                    skip=skip,
                )
            else:
                seq2seq_batch = batch

            preds, edits = [], []
            for minibatch in seq2seq_batch:
                batch_preds, batch_edits = self.trainer.predict(minibatch, cfg['beam_size'])
                preds += batch_preds
                if batch_edits is not None:
                    edits += batch_edits

            if use_ensemble:
                # Only the words that were not skipped have seq2seq output.
                kept_words = [
                    text
                    for text, skipped in zip(batch.doc.get([doc.TEXT]), skip)
                    if not skipped
                ]
                preds = self.trainer.postprocess(kept_words, preds, edits=edits)

                # Expand seq2seq predictions to the same size as all words,
                # using '' as the placeholder for skipped positions.
                expanded = []
                cursor = 0
                for skipped in skip:
                    if skipped:
                        expanded.append('')
                        continue
                    expanded.append(preds[cursor])
                    cursor += 1
                preds = self.trainer.ensemble(batch.doc.get([doc.TEXT, doc.UPOS]), expanded)
            else:
                preds = self.trainer.postprocess(batch.doc.get([doc.TEXT]), preds, edits=edits)

    # Map empty string lemmas to '_'.
    preds = [lemma if len(lemma) > 0 else '_' for lemma in preds]
    batch.doc.set([doc.LEMMA], preds)
    return batch.doc
def train(args):
    """Train a dictionary-based lemmatizer and, optionally, a seq2seq lemmatizer.

    The dictionary lemmatizer is always trained first and scored on the dev
    data. Unless ``args['dict_only']`` is set, a seq2seq model is then trained
    for ``args['num_epoch']`` epochs; after every epoch the dev data is
    predicted, written to ``args['output_file']`` and scored against
    ``args['gold_file']``, and the model is saved whenever the dev score beats
    every earlier epoch.

    Args:
        args: dict of run options. Keys read here: ``batch_size``,
            ``train_file``, ``eval_file``, ``model_dir``, ``lang``,
            ``output_file``, ``gold_file``, ``cuda``, ``dict_only``,
            ``num_epoch``, ``lr``, ``log_step``, ``beam_size``,
            ``ensemble_dict``, ``decay_epoch``, ``optim`` and ``lr_decay``.
            ``vocab_size`` and ``pos_vocab_size`` are written into it.

    Side effects:
        Creates ``args['model_dir']``, writes the model and prediction files,
        prints progress, and calls ``sys.exit(0)`` when either data loader is
        empty.
    """
    # Load data.
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # Pred and gold path.
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # Skip training if the language does not have training or dev data.
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # Start training: a dictionary-based lemmatizer comes first.
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])

    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # Save dictionaries only; no seq2seq model is trained.
        trainer.save(model_file)
        return

    # Train a seq2seq model.
    print("[Training seq2seq-based lemmatizer...]")
    global_step = 0
    max_steps = len(train_batch) * args['num_epoch']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    for epoch in range(1, args['num_epoch'] + 1):
        train_loss = 0
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    global_step, max_steps, epoch, args['num_epoch'],
                    loss, duration, current_lr))

        # Eval on dev.
        print("Evaluating on dev set...")
        dev_preds = []
        dev_edits = []
        for batch in dev_batch:
            preds, edits = trainer.predict(batch, args['beam_size'])
            dev_preds += preds
            if edits is not None:
                dev_edits += edits
        dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)

        # Try ensembling with dict if necessary.
        if args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
        dev_batch.doc.set([LEMMA], dev_preds)
        CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
        _, _, dev_score = scorer.score(system_pred_file, gold_file)

        # Avg loss per batch.
        train_loss = train_loss / train_batch.num_examples * args['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
            epoch, train_loss, dev_score))

        # Save best model. The history is empty exactly on the first epoch,
        # so the first model is always saved.
        if not dev_score_history or dev_score > max(dev_score_history):
            trainer.save(model_file)
            print("new best model saved.")

        # LR schedule. The history check prevents an IndexError on the first
        # epoch when decay_epoch is smaller than 1 (nothing to compare with).
        if (epoch > args['decay_epoch']
                and dev_score_history
                and dev_score <= dev_score_history[-1]
                and args['optim'] in ['sgd', 'adagrad']):
            current_lr *= args['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    # One score is recorded per completed epoch, so the history length is the
    # epoch count, and it stays well defined when num_epoch is 0.
    print("Training ended with {} epochs.".format(len(dev_score_history)))

    if dev_score_history:
        best_f = max(dev_score_history) * 100
        best_epoch = np.argmax(dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))