def get_factory(sh, fn):
    """Choose the vocab constructor expression with the smallest XPOS vocabulary.

    Reads ``data/pos/<sh>.train.in.conllu``, builds a ``WordVocab`` over the
    XPOS column and, when that vocabulary has more than 20 entries beyond
    ``VOCAB_PREFIX``, tries an ``XPOSVocab`` for each candidate separator and
    keeps whichever option yields the smallest total size.

    Args:
        sh: treebank shorthand; used in the training-file path and passed to
            the vocab constructors.
        fn: treebank full name; only used in the error message.

    Returns:
        A string holding the source text of the selected constructor call.

    Raises:
        UserWarning: if the training file does not exist.  The original code
            had a ``WordVocab`` fallback after this ``raise`` that could never
            run; the dead code is removed and the message no longer promises
            a fallback that does not happen.
    """
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        # Without the training file there is nothing to measure.
        raise UserWarning(
            'Training data for {} not found in the data directory. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(fn, fn))

    doc = Document(CoNLL.conll2dict(input_file=train_file))
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')

    # Baseline: one vocab entry per distinct XPOS tag.
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # candidate separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            # Total size across all sub-vocabularies, prefix entries excluded.
            length = sum(
                len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key
def main():
    """Run the identity lemmatizer: every token's text becomes its lemma.

    Parses the command line, seeds ``random``, loads the evaluation file,
    copies the TEXT field into LEMMA, writes the result to ``output_file``
    and, when ``gold_file`` is given, prints the lemma score.
    """
    parsed = parse_args()
    random.seed(parsed.seed)
    args = vars(parsed)

    print("[Launching identity lemmatizer...]")
    if args['mode'] == 'train':
        print(
            "[No training is required; will only generate evaluation output...]"
        )

    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       args,
                       evaluation=True,
                       conll_only=True)
    pred_path = args['output_file']
    gold_path = args['gold_file']

    # Identity mapping: the surface form is the predicted lemma.
    batch.doc.set([LEMMA], batch.doc.get([TEXT]))
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)

    if gold_path is None:
        return
    _, _, score = scorer.score(pred_path, gold_path)
    print("Lemma score:")
    print("{} {:.2f}".format(args['lang'], score * 100))
def check_mwt(filename):
    """
    Return True when the CoNLL file at `filename` yields at least one
    MWT expansion, False otherwise.
    """
    conll_doc = Document(CoNLL.conll2dict(filename))
    expansions = conll_doc.get_mwt_expansions(False)
    return len(expansions) > 0
def evaluate(args):
    """Evaluate a saved lemmatizer on ``args['eval_file']``.

    Loads the model from ``<model_dir>/<lang>_lemmatizer.pt``, predicts lemmas
    (dictionary only, or seq2seq with optional dictionary ensembling), writes
    the predictions to ``args['output_file']`` and prints the score when
    ``args['gold_file']`` is set.  Exits the process with status 0 when the
    evaluation data yields no batches.
    """
    pred_path = args['output_file']
    gold_path = args['gold_file']
    model_path = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # Load the model together with the configuration it was saved with.
    trainer = Trainer(model_file=model_path,
                      use_cuda=args['cuda'] and not args['cpu'])
    loaded_args = trainer.args
    vocab = trainer.vocab

    # Path-like options and the shorthand come from the current invocation.
    for name, value in args.items():
        if name.endswith(('_dir', '_file')) or name == 'shorthand':
            loaded_args[name] = value

    # Load data.
    print("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    # Nothing to evaluate: print an empty score line and stop.
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds, edits = [], []
        for chunk in batch:
            chunk_preds, chunk_edits = trainer.predict(chunk, args['beam_size'])
            preds += chunk_preds
            if chunk_edits is not None:
                edits += chunk_edits
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # Write predictions and score them.
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)
    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))
def evaluate(args):
    """Evaluate a saved MWT expander on ``args['eval_file']``.

    The model path is ``save_dir/save_name`` when ``save_name`` is given,
    otherwise ``<save_dir>/<shorthand>_mwt_expander.pt``.  Predictions are
    written to ``args['output_file']`` on a deep copy of the loaded document
    and scored against ``args['gold_file']`` when it is set.
    """
    pred_path = args['output_file']
    gold_path = args['gold_file']
    if args['save_name'] is not None:
        model_path = args['save_dir'] + '/' + args['save_name']
    else:
        model_path = '{}/{}_mwt_expander.pt'.format(args['save_dir'],
                                                    args['shorthand'])

    # Load the model together with the configuration it was saved with.
    trainer = Trainer(model_file=model_path,
                      use_cuda=args['cuda'] and not args['cpu'])
    loaded_args = trainer.args
    vocab = trainer.vocab

    # Path-like options and the shorthand come from the current invocation.
    for name, value in args.items():
        if name.endswith(('_dir', '_file')) or name == 'shorthand':
            loaded_args[name] = value
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # Load data.
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    if len(batch) == 0:
        # No dev data: write the document back with no expansions.
        preds = []
    else:
        dict_preds = trainer.predict_dict(
            batch.doc.get_mwt_expansions(evaluation=True))
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for chunk in batch:
                preds += trainer.predict(chunk)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(
                    batch.doc.get_mwt_expansions(evaluation=True), preds)

    # Write predictions on a copy so the loaded document is left untouched.
    out_doc = copy.deepcopy(batch.doc)
    out_doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(out_doc.to_dict(), pred_path)

    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        logger.info("MWT expansion score: {} {:.2f}".format(
            args['shorthand'], score * 100))
def evaluate(args):
    """Evaluate a saved tagger on ``args['eval_file']``.

    The model path is ``save_dir/save_name`` when ``save_name`` is given,
    otherwise ``<save_dir>/<shorthand>_tagger.pt``.  Predicted UPOS, XPOS and
    FEATS are written to ``args['output_file']`` and scored against
    ``args['gold_file']`` when it is set.
    """
    pred_path = args['output_file']
    gold_path = args['gold_file']
    if args['save_name'] is not None:
        model_path = args['save_dir'] + '/' + args['save_name']
    else:
        model_path = '{}/{}_tagger.pt'.format(args['save_dir'],
                                              args['shorthand'])

    # Load pretrain; note that the pretrain file is allowed to be non-existent.
    pretrain = Pretrain('{}/{}.pretrain.pt'.format(args['save_dir'],
                                                   args['shorthand']))

    # Load the model together with the configuration it was saved with.
    print("Loading model from: {}".format(model_path))
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_path,
                      use_cuda=args['cuda'] and not args['cpu'])
    loaded_args = trainer.args
    vocab = trainer.vocab

    # Path-like options, shorthand and mode come from the current invocation.
    for name, value in args.items():
        if name.endswith(('_dir', '_file')) or name in ('shorthand', 'mode'):
            loaded_args[name] = value

    # Load data.
    print("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)

    preds = []
    if len(batch) > 0:
        print("Start evaluation...")
        for chunk in batch:
            preds += trainer.predict(chunk)
    # Otherwise there is no dev data and predictions stay empty.
    preds = utils.unsort(preds, batch.data_orig_idx)

    # Flatten per-sentence predictions, write them and score.
    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([UPOS, XPOS, FEATS], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)

    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
def test_depparse_with_pretagged_doc():
    """Run a depparse-only pipeline on a pretagged document and compare with gold."""
    pipeline_config = {
        'processors': 'depparse',
        'dir': TEST_MODELS_DIR,
        'lang': 'en',
        'depparse_pretagged': True,
    }
    nlp = stanza.Pipeline(**pipeline_config)

    pretagged = stanza.Document(
        CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
    parsed = nlp(pretagged)

    rendered = '\n\n'.join(
        [sentence.dependencies_string() for sentence in parsed.sentences])
    assert EN_DOC_DEPENDENCY_PARSES_GOLD == rendered
def prep_conllu(tb, file_path, overwrite):
    """Tag one CoNLL-U file with the POS processor, or copy it through.

    The target path is ``out_dir`` (a name defined outside this function)
    joined with the input file's name.

    Returns:
        The document returned by the pipeline's ``pos`` processor, or None
        when the target already exists and `overwrite` is false, or when
        ``determine_treebank`` returns a falsy language (in which case the
        file is copied to the target unchanged).
    """
    target = out_dir.joinpath(file_path.name)
    already_done = target.exists() and not overwrite
    if already_done:
        print(f"{target.name} exists; skipping")
        return None

    lang, _, _ = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, target)
        return None

    parsed = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang,
                          processors='tokenize,mwt,pos',
                          tokenize_pretokenized=True)
    # Only the POS processor is applied to the already-tokenized document.
    return nlp.processors['pos'].process(parsed)
def evaluate(args):
    """Evaluate a saved dependency parser on ``args['eval_file']``.

    Predicted HEAD and DEPREL values are written to ``args['output_file']``
    and scored against ``args['gold_file']`` when it is set.
    """
    pred_path = args['output_file']
    gold_path = args['gold_file']
    model_path = model_file_name(args)

    # Load pretrained vectors if needed.
    pretrain = load_pretrain(args)

    # Load the model together with the configuration it was saved with.
    logger.info("Loading model from: {}".format(model_path))
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_path,
                      use_cuda=args['cuda'] and not args['cpu'])
    loaded_args = trainer.args
    vocab = trainer.vocab

    # Path-like options, shorthand and mode come from the current invocation.
    for name, value in args.items():
        if name.endswith(('_dir', '_file')) or name in ('shorthand', 'mode'):
            loaded_args[name] = value

    # Load data.
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)

    preds = []
    if len(batch) > 0:
        logger.info("Start evaluation...")
        for chunk in batch:
            preds += trainer.predict(chunk)
    # Otherwise there is no dev data and predictions stay empty.
    preds = utils.unsort(preds, batch.data_orig_idx)

    # Flatten per-sentence predictions, write them and score.
    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([HEAD, DEPREL], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)

    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score*100))
def predict(self, eval_file_or_string):
    """Parse the given CoNLL-U input and return the result as a CoNLL string.

    The argument is first resolved by ``_read_conllu_arg``; the resulting
    document is batched, HEAD and DEPREL are predicted with ``self.trainer``
    and written back into the document before it is serialized.
    """
    eval_file = _read_conllu_arg(eval_file_or_string,
                                 self.feature_config,
                                 predict=True)
    parsed = Document(CoNLL.conll2dict(input_file=eval_file))
    batch = DataLoader(parsed,
                       self.batch_size,
                       self.loaded_args,
                       self.pretrain,
                       vocab=self.vocab,
                       evaluation=True,
                       sort_during_eval=True)

    predictions = []
    if len(batch) > 0:
        for chunk in batch:
            predictions += self.trainer.predict(chunk)
    # Restore the original sentence order before writing back.
    predictions = utils.unsort(predictions, batch.data_orig_idx)

    flat = [token for sentence in predictions for token in sentence]
    batch.doc.set([HEAD, DEPREL], flat)

    return CoNLL.conll_as_string(CoNLL.convert_dict(batch.doc.to_dict()))
def evaluate(args):
    """Evaluate a saved tagger, optionally with a morphological dictionary.

    The evaluation file is read twice: once for the document handed to the
    ``Trainer`` constructor and once for the ``DataLoader``.  Batches are not
    sorted (``sort_during_eval=False``), so predictions are not unsorted
    afterwards.  Each ``trainer.predict`` call receives the running
    ``start``/``end`` offsets computed from ``len(b[8])`` of the batches seen
    so far.
    """
    pred_path = args['output_file']
    gold_path = args['gold_file']
    model_path = model_file_name(args)

    pretrain = load_pretrain(args)

    # Load the model together with the configuration it was saved with.
    trainer_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    logger.info("Loading model from: {}".format(model_path))
    trainer = Trainer(doc=trainer_doc,
                      pretrain=pretrain,
                      model_file=model_path,
                      use_cuda=args['cuda'] and not args['cpu'])
    loaded_args = trainer.args
    vocab = trainer.vocab

    # Path-like options, shorthand and mode come from the current invocation.
    for name, value in args.items():
        if name.endswith(('_dir', '_file')) or name in ('shorthand', 'mode'):
            loaded_args[name] = value

    # Load data.
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    eval_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(eval_doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=False)

    preds = []
    if len(batch) > 0:
        logger.info("Start evaluation...")
        morph_dict = None
        if args['morph_dict']:
            print('Collecting morph dictionary...')
            morph_dict = MorphDictionary(args['morph_dict'])
            print('Completed.')

        offset = 0
        for chunk in batch:
            size = len(chunk[8])  # chunk[8] is orig_idx
            preds += trainer.predict(chunk,
                                     morph_dict=morph_dict,
                                     start=offset,
                                     end=offset + size)
            offset += size
    # Otherwise there is no dev data and predictions stay empty.

    # Flatten per-sentence predictions, write them and score.
    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([UPOS, XPOS, FEATS], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), pred_path)

    if gold_path is not None:
        _, _, score = scorer.score(pred_path, gold_path)
        logger.info("Tagger score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
def _score_mwt_predictions(dev_batch, preds, system_pred_file, gold_file):
    """Write `preds` as MWT expansions on a copy of the dev doc and return its score."""
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, score = scorer.score(system_pred_file, gold_file)
    return score


def train(args):
    """Train an MWT expander: a dictionary first, then optionally a seq2seq model.

    Args:
        args: dict of options.  Keys read here include ``train_file``,
            ``eval_file``, ``output_file``, ``gold_file``, ``save_dir``,
            ``save_name``, ``shorthand``, ``batch_size``, ``max_dec_len``,
            ``cuda``, ``num_epoch``, ``lr``, ``lr_decay``, ``decay_epoch``,
            ``log_step`` and the optional flags ``dict_only``,
            ``ensemble_dict`` and ``ensemble_early_stop``.
            ``args['vocab_size']`` is set as a side effect.

    Returns:
        None.  Returns early, without training, when either the training or
        the dev data yields no batches.

    Fixes relative to the previous version:
      * the final ensemble comparison now uses the same scale (percent) for
        both operands of ``max``;
      * the learning-rate schedule no longer indexes an empty score history
        when ``decay_epoch`` is below 1.
    """
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(
        dev_batch.doc.get_mwt_expansions(evaluation=True))
    dev_f = _score_mwt_predictions(dev_batch, dev_preds, system_pred_file,
                                   gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
        return

    # train a seq2seq model
    logger.info("Training seq2seq-based MWT expander...")
    global_step = 0
    max_steps = len(train_batch) * args['num_epoch']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # start training
    for epoch in range(1, args['num_epoch'] + 1):
        train_loss = 0
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        global_step, max_steps, epoch, args['num_epoch'],
                        loss, duration, current_lr))

        # eval on dev
        logger.info("Evaluating on dev set...")
        dev_preds = []
        for dev_chunk in dev_batch:
            dev_preds += trainer.predict(dev_chunk)
        if args.get('ensemble_dict', False) and args.get(
                'ensemble_early_stop', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(
                dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
        dev_score = _score_mwt_predictions(dev_batch, dev_preds,
                                           system_pred_file, gold_file)

        train_loss = train_loss / train_batch.num_examples * args[
            'batch_size']  # avg loss per batch
        logger.info(
            "epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                epoch, train_loss, dev_score))

        # save best model
        if epoch == 1 or dev_score > max(dev_score_history):
            trainer.save(model_file)
            logger.info("new best model saved.")
            best_dev_preds = dev_preds

        # lr schedule; the history is empty during the first epoch, so there
        # is no previous score to compare against yet.
        if (dev_score_history and epoch > args['decay_epoch']
                and dev_score <= dev_score_history[-1]):
            current_lr *= args['lr_decay']
            trainer.change_lr(current_lr)

        dev_score_history += [dev_score]

    logger.info("Training ended with {} epochs.".format(epoch))

    best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(
        best_f, best_epoch))

    # try ensembling with dict if necessary
    if args.get('ensemble_dict', False):
        logger.info("[Ensembling dict with seq2seq model...]")
        dev_preds = trainer.ensemble(
            dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
        dev_score = _score_mwt_predictions(dev_batch, dev_preds,
                                           system_pred_file, gold_file)
        logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
        # best_f is a percentage, so scale dev_score before comparing.
        best_f = max(best_f, dev_score * 100)
def train(args):
    """Train a lemmatizer: a dictionary first, then optionally a seq2seq model.

    Args:
        args: dict of options.  Keys read here include ``train_file``,
            ``eval_file``, ``output_file``, ``gold_file``, ``model_dir``,
            ``lang``, ``batch_size``, ``cuda``, ``num_epoch``, ``lr``,
            ``lr_decay``, ``decay_epoch``, ``optim``, ``log_step``,
            ``beam_size`` and the optional flags ``dict_only`` and
            ``ensemble_dict``.  ``args['vocab_size']`` and
            ``args['pos_vocab_size']`` are set as a side effect.

    Exits the process with status 0 when either the training or the dev data
    yields no batches.

    Fix relative to the previous version: the learning-rate schedule no
    longer indexes an empty score history when ``decay_epoch`` is below 1.
    """
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
        return

    # train a seq2seq model
    print("[Training seq2seq-based lemmatizer...]")
    global_step = 0
    max_steps = len(train_batch) * args['num_epoch']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # start training
    for epoch in range(1, args['num_epoch'] + 1):
        train_loss = 0
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        global_step, max_steps, epoch, args['num_epoch'],
                        loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        dev_preds = []
        dev_edits = []
        for dev_chunk in dev_batch:
            preds, edits = trainer.predict(dev_chunk, args['beam_size'])
            dev_preds += preds
            if edits is not None:
                dev_edits += edits
        dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]),
                                        dev_preds,
                                        edits=dev_edits)

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]),
                                         dev_preds)
        dev_batch.doc.set([LEMMA], dev_preds)
        CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
        _, _, dev_score = scorer.score(system_pred_file, gold_file)

        train_loss = train_loss / train_batch.num_examples * args[
            'batch_size']  # avg loss per batch
        print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
            epoch, train_loss, dev_score))

        # save best model
        if epoch == 1 or dev_score > max(dev_score_history):
            trainer.save(model_file)
            print("new best model saved.")

        # lr schedule; the history is empty during the first epoch, so there
        # is no previous score to compare against yet.
        if (dev_score_history and epoch > args['decay_epoch']
                and dev_score <= dev_score_history[-1]
                and args['optim'] in ['sgd', 'adagrad']):
            current_lr *= args['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    print("Training ended with {} epochs.".format(epoch))

    best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
def create_lexicon(shorthand=None, train_path=None, external_path=None):
    """
    Create a lexicon holding the words from the training set and an external dictionary.

    The lexicon is meant to be saved with the model and used to build the
    dictionary when the model is loaded.  Separating lexicon and dictionary
    into two phases is a tradeoff between time and space.

    Long, infrequent words are dropped by keeping only words whose length is
    at most the 95th percentile of the collected word lengths.

    :param shorthand - language and dataset, eg: vi_vlsp, zh_gsdsimp; None is
        treated like a shorthand with no language-specific rule
    :param train_path - path to conllu train file
    :param external_path - path to external dict, expected to be inside the
        training dataset dir with format of: SHORTHAND-externaldict.txt
    :return a tuple (lexicon, num_dict_feat): the set of distinct words and
        the 95th-percentile length used as the cutoff; (empty set, 0) when no
        valid word was found
    :raises FileNotFoundError - if a given path is not an existing file
    """
    lexicon = set()
    length_freq = []
    # Without a shorthand, fall through to the generic rule instead of
    # failing on None.startswith.
    shorthand = shorthand or ""
    # This regex checks whether a character is an actual Thai character, as
    # the .isalpha() method does not seem to pick up Thai accent characters.
    pattern_thai = re.compile(r"(?:[^\d\W]+)|\s")

    def check_valid_word(shorthand, word):
        """
        Check that the word is a multi-syllable word and contains no digits.
        For vi, whitespace is the syllable separator.
        """
        if any(map(str.isdigit, word)):
            return False
        if shorthand.startswith("vi_"):
            return len(word.split(" ")) > 1 and any(map(str.isalpha, word))
        if shorthand.startswith("th_"):
            return len(word) > 1 and any(map(pattern_thai.match, word))
        return len(word) > 1 and any(map(str.isalpha, word))

    def add_word(word):
        """Record a new valid word together with its length."""
        if check_valid_word(shorthand, word) and word not in lexicon:
            lexicon.add(word)
            length_freq.append(len(word))

    # Defined up front so the external-dictionary report below works even
    # when no training file was given.
    count_word = 0

    # Collect words from the training set.
    if train_path is not None:
        if not os.path.isfile(train_path):
            raise FileNotFoundError(f"Cannot open train set at {train_path}")

        doc_conll, _ = CoNLL.conll2dict(input_file=train_path)

        for sent_conll in doc_conll:
            for token_conll in sent_conll:
                add_word(token_conll['text'].lower())
        count_word = len(lexicon)
        logger.info(
            f"Added {count_word} words from the training data to the lexicon.")

    # Collect words from the external dictionary.
    if external_path is not None:
        if not os.path.isfile(external_path):
            raise FileNotFoundError(
                f"Cannot open external dictionary at {external_path}")

        with open(external_path, "r", encoding="utf-8") as external_file:
            for line in external_file:
                add_word(line.lower().replace("\n", ""))
        logger.info(
            f"Added another {len(lexicon) - count_word} words from the external dict to dictionary."
        )

    # Automatically calculate the number of dictionary features (window size
    # to look for words) from the distribution of word lengths: the length at
    # the 95th percentile cuts off the longest (maybe compound) words.
    # np.percentile cannot be applied to an empty sample.
    num_dict_feat = int(np.percentile(length_freq, 95)) if length_freq else 0
    lexicon = {word for word in lexicon if len(word) <= num_dict_feat}
    logger.info(
        f"Final lexicon consists of {len(lexicon)} words after getting rid of long words."
    )

    return lexicon, num_dict_feat
def train(args):
    """Train a dependency parser with periodic dev evaluation and early stopping.

    Training data is augmented with the output of ``augment_punct`` before
    batching.  Every ``eval_interval`` steps the dev set is parsed, written to
    ``output_file`` and scored against ``gold_file``; the model is saved
    whenever the dev score improves.  After ``max_steps_before_stop`` steps
    without improvement the optimizer is replaced once by Adam with
    ``amsgrad=True``; a second such stall, or reaching ``max_steps``, ends
    training.

    Args:
        args: dict of options.  Keys read here include ``train_file``,
            ``eval_file``, ``output_file``, ``gold_file``, ``batch_size``,
            ``augment_nopunct``, ``cuda``, ``lr``, ``beta2``, ``log_step``,
            ``eval_interval``, ``max_steps`` and ``max_steps_before_stop``.

    Exits the process with status 0 when either the training or the dev data
    yields no batches.

    Fix relative to the previous version: when training stops before the
    first dev evaluation, the final summary no longer calls ``max`` on an
    empty score history.
    """
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data,
                      args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = args['lr']
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(
                        dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # First stall: restart the patience window with AMSGrad.
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    # Second stall: give up.
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if not dev_score_history:
        # Models are only saved during dev evaluation, so none exists here.
        logger.info(
            "No dev evaluation was run before training stopped; no model was saved."
        )
        return

    best_f, best_eval = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))
# WordVocab). mapping = defaultdict(list) for sh, fn in zip(shorthands, fullnames): print('Resolving vocab option for {}...'.format(sh)) if not os.path.exists('data/pos/{}.train.in.conllu'.format(sh)): raise UserWarning( 'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the ' 'XPOS vocabulary for this treebank properly, please run the following command first:\n' '\tbash scripts/prep_pos_data.sh {}'.format(fn, fn)) # without the training file, there's not much we can do key = 'WordVocab(data, shorthand, idx=2)' mapping[key].append(sh) continue doc = Document( CoNLL.conll2dict(input_file='data/pos/{}.train.in.conllu'.format(sh))) data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True) print(f'Original length = {len(data)}') data = filter_data(data, idx=2) print(f'Filtered length = {len(data)}') vocab = WordVocab(data, sh, idx=2, ignore=["_"]) key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])' best_size = len(vocab) - len(VOCAB_PREFIX) if best_size > 20: for sep in ['', '-', '+', '|', ',', ':']: # separators vocab = XPOSVocab(data, sh, idx=2, sep=sep) length = sum( len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values()) if length < best_size: key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep) best_size = length
def train(args):
    """Train a dependency parser with periodic dev evaluation and early stopping.

    Every ``eval_interval`` steps the dev set is parsed, written to
    ``output_file`` and scored against ``gold_file``; the model is saved
    whenever the dev score improves.  After ``max_steps_before_stop`` steps
    without improvement the optimizer is replaced once by Adam with
    ``amsgrad=True``; a second such stall, or reaching ``max_steps``, ends
    training.

    Args:
        args: dict of options.  Keys read here include ``save_dir``,
            ``save_name``, ``shorthand``, ``pretrain``, ``wordvec_file``,
            ``wordvec_dir``, ``pretrain_max_vocab``, ``train_file``,
            ``eval_file``, ``output_file``, ``gold_file``, ``batch_size``,
            ``cuda``, ``lr``, ``beta2``, ``log_step``, ``eval_interval``,
            ``max_steps`` and ``max_steps_before_stop``.

    Exits the process with status 0 when either the training or the dev data
    yields no batches.

    Fix relative to the previous version: when training stops before the
    first dev evaluation, the final summary no longer calls ``max`` on an
    empty score history.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(
            args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                                   args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file,
                            args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        global_step, max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".
                      format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(
                        dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # First stall: restart the patience window with AMSGrad.
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    # Second stall: give up.
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    if not dev_score_history:
        # Models are only saved during dev evaluation, so none exists here.
        print(
            "No dev evaluation was run before training stopped; no model was saved."
        )
        return

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))