def main():
    """Entry point for the identity lemmatizer: copy each word form into its lemma.

    No model is trained; the eval file is read, every token's lemma is set to
    its own surface text, the result is written to the output file, and a
    score is printed when a gold file is available.
    """
    cli_args = parse_args()
    random.seed(cli_args.seed)
    opts = vars(cli_args)

    print("[Launching identity lemmatizer...]")
    if opts['mode'] == 'train':
        print("[No training is required; will only generate evaluation output...]")

    eval_doc = Document(CoNLL.conll2dict(input_file=opts['eval_file']))
    loader = DataLoader(eval_doc, opts['batch_size'], opts, evaluation=True, conll_only=True)

    # output / gold paths
    pred_path = opts['output_file']
    gold_path = opts['gold_file']

    # the "prediction" is simply the surface form of every word
    identity_preds = loader.doc.get([TEXT])

    # write predictions out and, if possible, score them
    loader.doc.set([LEMMA], identity_preds)
    CoNLL.dict2conll(loader.doc.to_dict(), pred_path)

    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        print("Lemma score:")
        print("{} {:.2f}".format(opts['lang'], score * 100))
def evaluate(args):
    """Evaluate a saved seq2seq lemmatizer on ``args['eval_file']`` and print its score.

    Loads the trainer from ``{model_dir}/{lang}_lemmatizer.pt``, predicts lemmas
    (dictionary-only or seq2seq with optional dict ensembling, depending on the
    options saved with the model), writes predictions to ``args['output_file']``,
    and scores against ``args['gold_file']`` when one is given.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    # the current run's paths and shorthand override the options saved with
    # the model, so evaluation uses the caller's files
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    # dictionary-based predictions from (word, upos) pairs
    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            # edits may be None when the model predicts no edit operations
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)
        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
def evaluate(args):
    """Evaluate a saved MWT expander on ``args['eval_file']`` and log its score.

    Loads the trainer from the configured save path, predicts expansions for all
    multi-word tokens (dictionary-only or seq2seq with optional dict ensembling,
    depending on the saved options), writes the expanded document to
    ``args['output_file']``, and scores against ``args['gold_file']`` if given.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    # the current run's paths and shorthand override the saved options
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)
            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score; work on a deep copy so batch.doc stays untouched
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("MWT expansion score: {} {:.2f}".format(args['shorthand'], score * 100))
def evaluate(args):
    """Evaluate a saved POS/morphology tagger on ``args['eval_file']`` and print its score.

    Predicted UPOS/XPOS/FEATS are written to ``args['output_file']`` and scored
    against ``args['gold_file']`` when one is provided.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config: current paths, shorthand and mode override the saved options
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    # restore the original document order that sort_during_eval rearranged
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
def evaluate(args):
    """Run a trained dependency parser over the eval file and report its score.

    Predicted heads and dependency relations are written to
    ``args['output_file']``; when ``args['gold_file']`` is given, the output is
    scored against it and the result logged.
    """
    # output / gold paths and the model checkpoint to restore
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    # pretrained embeddings, if the model was built with any
    pretrain = load_pretrain(args)

    # restore the trainer together with its saved config and vocab
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # the current run's paths, shorthand and mode take precedence over
    # whatever was saved with the checkpoint
    for key in args:
        is_path_opt = key.endswith('_dir') or key.endswith('_file')
        if is_path_opt or key in ['shorthand'] or key == 'mode':
            loaded_args[key] = args[key]

    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for minibatch in batch:
            preds.extend(trainer.predict(minibatch))
    else:
        # no dev data available: nothing to predict
        preds = []
    # undo the length-sorting applied by sort_during_eval
    preds = utils.unsort(preds, batch.data_orig_idx)

    # attach predictions to the document and write it out
    flat_preds = [word for sent in preds for word in sent]
    batch.doc.set([HEAD, DEPREL], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score*100))
def train(args):
    """Train a lemmatizer: a word/UPOS dictionary plus, unless ``dict_only``,
    a seq2seq model with per-epoch dev evaluation and best-model checkpointing.

    Exits early (``sys.exit(0)``) when either training or dev data is empty.
    """
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev at the end of every epoch
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []
            for i, batch in enumerate(dev_batch):
                preds, edits = trainer.predict(batch, args['beam_size'])
                dev_preds += preds
                if edits is not None:
                    dev_edits += edits
            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)
            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                print("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule: decay when dev score stopped improving
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                    args['optim'] in ['sgd', 'adagrad']:
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))
        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict,
                       max_seqlen=1000, orig_text=None, no_ssplit=False, prob=False):
    """Run the tokenizer model over the data generator and decode the output.

    The model predicts one label per character (0 = inside token, 1 = token
    end, 2 = sentence end, with 3/4 as MWT variants — inferred from how the
    labels are used below); those labels are then decoded into a document of
    sentences of (token, label, char-offset) triples.

    Returns ``(oov_count, offset, all_preds, doc)``, with the list of raw
    score arrays appended when ``prob`` is True.

    Fixes vs. the previous revision:
    * leftover debug ``print`` calls removed,
    * the trailing-token branch built ``{END_CHAR: st, END_CHAR: ...}`` — a
      duplicate dict key that silently dropped the start offset; it now uses
      ``START_CHAR`` like the main branch.
    """
    # paragraph bookkeeping: (para idx, start idx, end idx, length)
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x) for x in p])
        paragraphs += [(i, start, start + length, length + 1)]
    # sort long-to-short so similarly sized paragraphs are batched together
    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))
    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)
    batch_size = trainer.args['batch_size']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    t = 0
    # raw score arrays for the `prob` return value; NOTE: only collected in
    # the short-paragraph branch below, long paragraphs contribute nothing
    list_prob = []
    for i in range(batches):
        batchparas = paragraphs[i * batch_size:(i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]
        t += sum([x[3] for x in batchparas])

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]
        N = len(batch[3][0])
        if N <= eval_limit:
            scores = trainer.predict(batch)
            pred = np.argmax(scores, axis=2)
            list_prob.append(scores)
        else:
            # paragraph longer than the model window: predict in
            # eval_limit-sized chunks, advancing each paragraph up to its
            # last predicted sentence break (labels 2 or 4)
            idx = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1
                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                batch = data_generator.next(eval_offsets=[x + y for x, y in zip(idx, offsets)])
            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            # force the last real character of each paragraph to end a
            # sentence (2) or an MWT sentence end (4)
            if pred[j][len1 - 1] < 2:
                pred[j][len1 - 1] = 2
            elif pred[j][len1 - 1] > 2:
                pred[j][len1 - 1] = 4
            all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    # decode per-character labels into tokens and sentences
    offset = 0
    oov_count = 0
    doc = []
    text = orig_text
    char_offset = 0
    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]
        current_tok = ''
        current_sent = []
        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb: force sentence breaks on ":" and ";"
            if trainer.args['shorthand'] == 'la_ittb' and t in [":", ";"]:
                p = 2
            offset += 1
            if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
                oov_count += 1
            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    # locate the token in the original text to record offsets
                    st0, tok0 = find_token(tok, text)
                    st = char_offset + st0
                    text = text[st0 + len(tok0):]
                    char_offset += st0 + len(tok0)
                    additional_info = {START_CHAR: st, END_CHAR: st + len(tok0)}
                else:
                    additional_info = dict()
                current_sent += [(tok, p, additional_info)]
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        # flush a trailing token that had no explicit token-end label
        if len(current_tok):
            tok = vocab.normalize_token(current_tok)
            assert '\t' not in tok, tok
            if len(tok) > 0:
                if orig_text is not None:
                    st0, tok0 = find_token(tok, text)
                    st = char_offset + st0
                    text = text[st0 + len(tok0):]
                    char_offset += st0 + len(tok0)
                    # bug fix: the first key used to be END_CHAR as well,
                    # losing the start offset to dict-key overwrite
                    additional_info = {START_CHAR: st, END_CHAR: st + len(tok0)}
                else:
                    additional_info = dict()
                current_sent += [(tok, 2, additional_info)]
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file:
        CoNLL.dict2conll(doc, output_file)
    if prob:
        return oov_count, offset, all_preds, doc, list_prob
    return oov_count, offset, all_preds, doc
def train(args):
    """Train the dependency parser with periodic dev evaluation.

    Runs until ``max_steps`` or until no dev improvement for
    ``max_steps_before_stop`` steps; on the first stall it switches the
    optimizer to AMSGrad instead of stopping.  The best model (by dev score)
    is checkpointed to ``model_file``.  Exits early when training or dev data
    is empty.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
                           vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                # NOTE: this loop rebinds `batch`, shadowing the training
                # minibatch from the outer loop
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                # first stall: retry with AMSGrad; second stall: stop
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True,
                                                   lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))
    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def train(args):
    """Train an MWT expander: a dictionary model plus, unless ``dict_only``,
    a seq2seq model with per-epoch dev evaluation and best-model saving.

    Returns early when training or dev data is empty.  After seq2seq
    training, optionally re-scores the best dev predictions ensembled with
    the dictionary.
    """
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get_mwt_expansions(evaluation=True))
    # expand on a deep copy so dev_batch.doc itself is left unchanged
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev at the end of every epoch
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get('ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            logger.info("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule: decay once dev score stops improving
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))
        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            # NOTE(review): best_f is on a 0-100 scale while dev_score is
            # 0-1 — this max() mixes scales; likely meant dev_score * 100.
            # best_f is unused afterwards, so behavior is unaffected.
            best_f = max(best_f, dev_score)
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict,
                       max_seqlen=1000, orig_text=None, no_ssplit=False, use_regex_tokens=True):
    """Run the tokenizer over the data generator and decode tokens/sentences.

    The model emits one label per character (0 = inside token, 1 = token end,
    2 = sentence end, 3/4 = MWT variants — inferred from how the labels are
    used below).  Labels are decoded into a document of sentences of
    (token, label, offset-info) triples; CoNLL output is written to
    ``output_file`` when given.

    Returns ``(oov_count, offset, all_preds, doc)``.
    """
    # paragraph bookkeeping
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x[0]) for x in p])
        paragraphs += [(i, start, start+length, length+1)] # para idx, start idx, end idx, length
    # sort long-to-short so similarly sized paragraphs are batched together
    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))
    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)
    batch_size = trainer.args['batch_size']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    t = 0
    for i in range(batches):
        batchparas = paragraphs[i * batch_size : (i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]
        t += sum([x[3] for x in batchparas])

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]
        N = len(batch[3][0])
        if N <= eval_limit:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
            # paragraph longer than the model window: predict in
            # eval_limit-sized chunks, advancing each paragraph up to its
            # last predicted sentence break (labels 2 or 4)
            idx = [0] * len(batchparas)
            adv = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1
                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance
                    adv[j] = advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                # reuse the previous batch, skipping the characters already
                # predicted for each paragraph
                batch = data_generator.next(eval_offsets=adv, old_batch=batch)
            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            # force the last real character of each paragraph to end a
            # sentence (2) or an MWT sentence end (4)
            if pred[j][len1-1] < 2:
                pred[j][len1-1] = 2
            elif pred[j][len1-1] > 2:
                pred[j][len1-1] = 4
            if use_regex_tokens:
                all_preds[p[0]] = update_pred_regex(raw[j], pred[j][:len1])
            else:
                all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    # decode per-character labels into tokens and sentences
    offset = 0
    oov_count = 0
    doc = []
    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'
    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]
        current_tok = ''
        current_sent = []
        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb: force sentence breaks on ":" and ";"
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
                oov_count += 1
            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    # locate each whitespace-separated part of the raw token
                    # in the normalized original text to compute char offsets
                    st = -1
                    tok_len = 0
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0: continue
                        st0 = text.index(part, char_offset) - char_offset
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) - len(lstripped))
                        char_offset += st0 + len(part)
                    additional_info = {START_CHAR: st, END_CHAR: char_offset}
                else:
                    additional_info = dict()
                current_sent.append((tok, p, additional_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        # every paragraph's last character was forced to a token end above,
        # so nothing should remain buffered here
        assert(len(current_tok) == 0)
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file:
        CoNLL.dict2conll(doc, output_file)
    return oov_count, offset, all_preds, doc
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict,
                       max_seqlen=1000, orig_text=None, no_ssplit=False, use_regex_tokens=True):
    """Run the tokenizer over the data generator and decode tokens/sentences.

    The model emits one label per character (0 = inside token, 1 = token end,
    2 = sentence end, 3/4 = MWT variants — inferred from how the labels are
    used below).  Labels are decoded into a document of sentences of
    (token, label, position-info) triples; CoNLL output is written to
    ``output_file`` when given.

    Returns ``(oov_count, offset, all_preds, doc)``.
    """
    # paragraph bookkeeping: (para idx, start idx, end idx, length)
    paragraphs = []
    for i, p in enumerate(data_generator.sentences):
        start = 0 if i == 0 else paragraphs[-1][2]
        length = sum([len(x[0]) for x in p])
        paragraphs += [(i, start, start + length, length)]
    # sort long-to-short so similarly sized paragraphs are batched together
    paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))
    all_preds = [None] * len(paragraphs)
    all_raw = [None] * len(paragraphs)

    eval_limit = max(3000, max_seqlen)
    batch_size = trainer.args['batch_size']
    skip_newline = trainer.args['skip_newline']
    batches = int((len(paragraphs) + batch_size - 1) / batch_size)

    for i in range(batches):
        # At evaluation time, each paragraph is treated as a single "sentence", and a batch of `batch_size` paragraphs
        # are tokenized together. `offsets` here are used by the data generator to identify which paragraphs to use
        # for the next batch of evaluation.
        batchparas = paragraphs[i * batch_size:(i + 1) * batch_size]
        offsets = [x[1] for x in batchparas]

        batch = data_generator.next(eval_offsets=offsets)
        raw = batch[3]
        N = len(batch[3][0])
        if N <= eval_limit:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
            # paragraph longer than the model window: predict in
            # eval_limit-sized chunks, advancing each paragraph up to its
            # last predicted sentence break (labels 2 or 4)
            idx = [0] * len(batchparas)
            adv = [0] * len(batchparas)
            Ns = [p[3] for p in batchparas]
            pred = [[] for _ in batchparas]
            while True:
                ens = [min(N - idx1, eval_limit) for idx1, N in zip(idx, Ns)]
                en = max(ens)
                batch1 = batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]]
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(len(batchparas)):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= Ns[j] - eval_limit:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1
                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance
                    adv[j] = advance

                if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
                    break
                # once we've made predictions on a certain number of characters for each paragraph (recorded in `adv`),
                # we skip the first `adv` characters to make the updated batch
                batch = data_generator.next(eval_offsets=adv, old_batch=batch)
            pred = [np.concatenate(p, 0) for p in pred]

        for j, p in enumerate(batchparas):
            len1 = len([1 for x in raw[j] if x != '<PAD>'])
            # force the last real character of each paragraph to end a
            # sentence (2) or an MWT sentence end (4)
            if pred[j][len1 - 1] < 2:
                pred[j][len1 - 1] = 2
            elif pred[j][len1 - 1] > 2:
                pred[j][len1 - 1] = 4
            if use_regex_tokens:
                all_preds[p[0]] = update_pred_regex(raw[j], pred[j][:len1])
            else:
                all_preds[p[0]] = pred[j][:len1]
            all_raw[p[0]] = raw[j]

    offset = 0
    oov_count = 0
    doc = []
    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'
    UNK_ID = vocab.unit2id('<UNK>')

    # Once everything is fed through the tokenizer model, it's time to decode the predictions
    # into actual tokens and sentences that the rest of the pipeline uses
    for j in range(len(paragraphs)):
        raw = all_raw[j]
        pred = all_preds[j]
        current_tok = ''
        current_sent = []
        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack la_ittb: force sentence breaks on ":" and ";"
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab.unit2id(t) == UNK_ID:
                oov_count += 1
            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok)
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue
                if orig_text is not None:
                    # locate each whitespace-separated part of the raw token
                    # in the normalized original text to compute char offsets
                    st = -1
                    tok_len = 0
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0:
                            continue
                        if skip_newline:
                            # allow arbitrary whitespace between the part's
                            # characters, since newlines were skipped upstream
                            part_pattern = re.compile(r'\s*'.join(re.escape(c) for c in part))
                            match = part_pattern.search(text, char_offset)
                            st0 = match.start(0) - char_offset
                            partlen = match.end(0) - match.start(0)
                        else:
                            st0 = text.index(part, char_offset) - char_offset
                            partlen = len(part)
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) - len(lstripped))
                        char_offset += st0 + partlen
                    position_info = (st, char_offset)
                else:
                    position_info = None
                current_sent.append((tok, p, position_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        # every paragraph's last character was forced to a token end above,
        # so nothing should remain buffered here
        assert (len(current_tok) == 0)
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    if output_file:
        CoNLL.dict2conll(doc, output_file)
    return oov_count, offset, all_preds, doc
def evaluate(args):
    """Evaluate a saved tagger, optionally constrained by a morphological dictionary.

    Loads the model, predicts UPOS/XPOS/FEATS for ``args['eval_file']``
    (passing a ``MorphDictionary`` built from ``args['morph_dict']`` when one
    is configured), writes predictions to ``args['output_file']``, and scores
    against ``args['gold_file']`` when given.

    Fix vs. the previous revision: the three bare ``print()`` calls are
    replaced with the module logger for consistency with the rest of this
    function, and commented-out code is removed.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)
    pretrain = load_pretrain(args)

    # load model
    train_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(doc=train_doc, pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config: current paths, shorthand and mode override the saved options
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=False)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        if args['morph_dict']:
            logger.info('Collecting morph dictionary...')
            morph_dict = MorphDictionary(args['morph_dict'])
            logger.info('Completed.')
        else:
            morph_dict = None
        # track the [start, end) range of each minibatch within the document
        # so the predictor can look up the corresponding sentences
        start = 0
        end = 0
        for i, b in enumerate(batch):
            end += len(b[8])  # b[8] is orig_idx
            preds += trainer.predict(b, morph_dict=morph_dict, start=start, end=end)
            start += len(b[8])
    else:
        # skip eval if dev data does not exist
        preds = []

    # sorting is disabled by sort_during_eval=False, so no unsort is needed

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Tagger score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
def train(args):
    """Train a tagger and save the best (or final) model to disk.

    Trains on args['train_file'] (optionally augmented with punctuation
    variants), evaluates periodically on args['eval_file'], keeps the
    checkpoint with the best dev score, and switches the optimizer to AMSGrad
    once progress stalls before finally terminating early.

    Args:
        args: dict of run options (paths, batch size, lr, step limits, etc.).

    Side effects: writes the model file, dev predictions file, and log output.
    """
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    # train_data is now a list of sentences, where each sentence is a
    # list of words, in which each word is a dict of conll attributes
    train_data = CoNLL.conll2dict(input_file=args['train_file'])

    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data, args['augment_nopunct'],
        keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))

    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain,
        evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    # sort_during_eval=True, so dev predictions must be unsorted back below
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
        vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        return

    logger.info("Training tagger...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain,
        use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    if args['adapt_eval_interval']:
        # scale the eval interval to the dev-set size (target ~2000 examples)
        args['eval_interval'] = utils.get_adaptive_eval_interval(
            dev_batch.num_examples, 2000, args['eval_interval'])
        logger.info("Evaluating the model every {} steps...".format(
            args['eval_interval']))

    using_amsgrad = False
    # step at which the best dev score so far was observed; drives early stop
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                    current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                # NOTE: this inner loop shadows the outer `batch` variable;
                # harmless here because `batch` is reassigned by the outer for
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                # restore original sentence order before writing predictions
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([UPOS, XPOS, FEATS],
                    [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history
                    ) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                # first stall: restart patience with AMSGrad;
                # second stall: terminate training early
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                        amsgrad=True, lr=args['lr'], betas=(.9,
                        args['beta2']), eps=1e-6)
                else:
                    logger.info(
                        "Early termination: have not improved in {} steps".
                        format(args['max_steps_before_stop']))
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if len(dev_score_history) > 0:
        best_f, best_eval = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
    else:
        # no eval ever ran, so nothing was saved above — save the final model
        logger.info("Dev set never evaluated. Saving final model.")
        trainer.save(model_file)