def training_setup(self, train_data_fname, dev_data_fname):
    """Prepare vectorized train/dev data for morphological inflection training.

    Reads both CoNLL files, extracts (lemma, form, feature-dict) triples,
    filters length outliers, character-aligns lemma/form pairs with MCMC,
    builds the vocabulary from the TRAIN portion only, and stores the
    vectorized splits in ``self.train`` / ``self.dev``.

    :param train_data_fname: path to the CoNLL training file
    :param dev_data_fname: path to the CoNLL development file
    """
    assert train_data_fname is not None
    assert dev_data_fname is not None
    train_raw = read_conll_data_file(train_data_fname)
    dev_raw = read_conll_data_file(dev_data_fname)
    # extract from TRAIN lemmas, forms, feats if in training mode
    train_lemmas_l, train_forms_l, train_feat_d = \
        BaseMorphData.extract_lemmas_forms_feature_dicts(train_raw)
    dev_lemmas_l, dev_forms_l, dev_feat_d = \
        BaseMorphData.extract_lemmas_forms_feature_dicts(dev_raw)
    self.set_char_freqlist(train_lemmas_l)
    self.max_src_len = self.config['max_src_len']
    self.max_tgt_len = self.config['max_tgt_len']
    # FIX: corrected typo in log message ("lenghts" -> "lengths")
    logger.info('Max lemma/form lengths: %d/%d', self.max_src_len, self.max_tgt_len)
    # drop items exceeding the configured max source/target lengths
    train_lemmas_l, train_forms_l, train_feat_d = self.filter_data_outliers(
        train_lemmas_l, train_forms_l, train_feat_d)
    dev_lemmas_l, dev_forms_l, dev_feat_d = self.filter_data_outliers(
        dev_lemmas_l, dev_forms_l, dev_feat_d)
    # align forms and lemmas: train and dev are aligned jointly, then split
    all_aligned_pairs = mcmc_align(
        wordpairs=list(zip(train_lemmas_l + dev_lemmas_l,
                           train_forms_l + dev_forms_l)),
        align_symbol=ALIGN_SYMBOL)
    # splitting aligned pairs back into train/dev portions
    num_train_pairs = len(train_lemmas_l)
    train_aligned_pairs = all_aligned_pairs[:num_train_pairs]
    dev_aligned_pairs = all_aligned_pairs[num_train_pairs:]
    # vectorize the data; vocabulary is built from TRAIN data only
    self.vocab.setup(vocab_path=self.vocab_fn,
                     data=(train_lemmas_l, train_forms_l, train_feat_d),
                     lower=self.lower,
                     source='lemma_form_feat')
    self.train = self.train_data_to_ids(data=list(
        zip(train_lemmas_l, train_feat_d, train_aligned_pairs, train_forms_l)))
    self.dev = self.train_data_to_ids(data=list(
        zip(dev_lemmas_l, dev_feat_d, dev_aligned_pairs, dev_forms_l)))
def get_all_stats(train_fn, dev_fn):
    """Compute corpus statistics for a CoNLL training file and plot them.

    Currently only the lemma-to-form ratio plot is produced, saved to
    'lem2form_ratio.pdf'.  (The original draft listed further planned stats —
    lengths, branching factor, unique feats, OOV rates, proper-noun count,
    noise/junk — none of which were implemented.)

    :param train_fn: path to the CoNLL training file
    :param dev_fn: path to the CoNLL dev file; accepted for interface
        compatibility but not used by the current implementation
    """
    train_raw = read_conll_data_file(train_fn)
    train_tokens = raw_data_to_tokens(train_raw)
    # NOTE(review): the original also built lowercased lemma/form lists here,
    # but never used them — removed as dead work.
    # lem2form_ratio plot (title: 'Lemma-to-form ratio')
    plot_lem2form_ratio(train_tokens, fname='lem2form_ratio.pdf')
def training_setup(self, train_data_fname, dev_data_fname):
    """Prepare vectorized train/dev data for training (no character alignment).

    Reads both CoNLL files, extracts (lemma, form, feature-dict) triples,
    filters length outliers, builds the vocabulary from the TRAIN portion
    only, and stores the vectorized splits in ``self.train`` / ``self.dev``.

    :param train_data_fname: path to the CoNLL training file
    :param dev_data_fname: path to the CoNLL development file
    """
    assert train_data_fname is not None
    assert dev_data_fname is not None
    train_raw = read_conll_data_file(train_data_fname)
    dev_raw = read_conll_data_file(dev_data_fname)
    # extract from TRAIN lemmas, forms, feats if in training mode
    train_lemmas_l, train_forms_l, train_feat_d = \
        BaseMorphData.extract_lemmas_forms_feature_dicts(train_raw)
    dev_lemmas_l, dev_forms_l, dev_feat_d = \
        BaseMorphData.extract_lemmas_forms_feature_dicts(dev_raw)
    self.set_char_freqlist(train_lemmas_l)
    self.max_src_len = self.config['max_src_len']
    self.max_tgt_len = self.config['max_tgt_len']
    # FIX: corrected typo in log message ("lenghts" -> "lengths")
    logger.info('Max lemma/form lengths: %d/%d', self.max_src_len, self.max_tgt_len)
    # drop items exceeding the configured max source/target lengths
    train_lemmas_l, train_forms_l, train_feat_d = self.filter_data_outliers(
        train_lemmas_l, train_forms_l, train_feat_d)
    dev_lemmas_l, dev_forms_l, dev_feat_d = self.filter_data_outliers(
        dev_lemmas_l, dev_forms_l, dev_feat_d)
    # vectorize the data; vocabulary is built from TRAIN data only
    self.vocab.setup(vocab_path=self.vocab_fn,
                     data=(train_lemmas_l, train_forms_l, train_feat_d),
                     lower=self.lower,
                     source='lemma_form_feat')
    self.train = self.train_data_to_ids(data=list(zip(
        train_lemmas_l, train_feat_d, train_forms_l)))
    self.dev = self.train_data_to_ids(data=list(zip(
        dev_lemmas_l, dev_feat_d, dev_forms_l)))
def predict_from_file(self, model, data_fname, vocab):
    """Run the model on a CoNLL data file and return its predictions.

    :param model: trained model to evaluate
    :param data_fname: path to the CoNLL file to predict on
    :param vocab: vocabulary used to encode/decode the data
    :return: predictions produced by ``predict_from_raw_data``
    """
    conll_data = read_conll_data_file(data_fname)
    return self.predict_from_raw_data(model, conll_data, vocab)
def training_setup(self, train_data_fname, dev_data_fname):
    """Load train/dev CoNLL data as dependency graphs, build the vocabulary,
    and vectorize both splits into ``self.train`` / ``self.dev``.

    :param train_data_fname: path to the CoNLL training file
    :param dev_data_fname: path to the CoNLL development file
    """
    assert train_data_fname is not None
    assert dev_data_fname is not None
    # Parse each CoNLL sentence into a dependency graph.
    train_graphs = [
        dg_from_tokens(sentence_tokens)
        for sentence_tokens in raw_data_to_tokens(read_conll_data_file(train_data_fname))
    ]
    dev_graphs = [
        dg_from_tokens(sentence_tokens)
        for sentence_tokens in raw_data_to_tokens(read_conll_data_file(dev_data_fname))
    ]
    self.vocab.setup(vocab_path=self.vocab_fn,
                     data=train_graphs,
                     lower=self.lower,
                     source='depgraphs')
    # Vectorize both splits.
    self.train = self.vectorize_graphs(train_graphs)
    self.dev = self.vectorize_graphs(dev_graphs)
    # Record the number of extracted features (needed by the NN):
    # take one vectorized instance, all columns except the last (label).
    sample_features = self.dev[0, :-1]
    self.num_features = len(sample_features)
def training_start(self, model, data, evaluator, nlgen):
    """Run the full training loop with per-epoch evaluation on dev data.

    Each epoch: train, compute dev loss, predict on raw dev data (no oracle
    decoding), score with external metrics (BLEU + edit distance), and keep
    the weights of the best-scoring epoch.  After training, the best weights
    are saved to disk along with the score table and the learning curve plot.

    :param model: model to train (moved to CUDA if ``self.use_cuda``)
    :param data: data container with vectorized train/dev sets and filenames
    :param evaluator: records losses, runs external metrics, saves scores
    :param nlgen: generator used for prediction/serialization on dev data
    """
    logger.debug("Preparing training data")
    dev_data_fname = data.fnames.dev_fn
    # NOTE: the assert message (logger.error) is only evaluated on failure
    assert os.path.exists(dev_data_fname), logger.error(
        'File %s does not exist', dev_data_fname)
    # dev data for evaluation
    dev_data_ref_fname = data.fnames.dev_ref_fn
    dev_data_raw = read_conll_data_file(data.fnames.dev_fn)
    logger.info('Saving Syn reference --> %s', data.fnames.dev_ref_fn)
    save_txt(itemlist=conll2snt(dev_data_raw), fname=dev_data_ref_fname)
    train_batches = data.batchify_vectorized_data(
        data.train, self.batch_size)  # [(np_x, np_y_1hot), ...]
    dev_batches = data.batchify_vectorized_data(data.dev, self.batch_size)
    # need to move the model before setting the optimizer
    # see: http://pytorch.org/docs/stable/optim.html
    if self.use_cuda:
        model.cuda()
    self.set_optimizer(model, self.config['optimizer'])
    self.set_train_criterion(len(data.vocab.id2tok), PAD_ID)
    training_start_time = time.time()
    logger.info("Start training")
    best_score = 0
    best_model_fn = None
    best_weights = None
    for epoch_idx in range(1, self.n_epochs + 1):
        epoch_start = time.time()
        logger.info('Epoch %d/%d', epoch_idx, self.n_epochs)
        # compute loss on train and dev data
        train_loss = self.train_epoch(epoch_idx, model, train_batches)
        dev_loss = self.compute_val_loss(model, dev_batches)
        evaluator.record_loss(train_loss, dev_loss)
        # run on dev data in prediction mode (no oracle decoding)
        predictions_fname = self.get_predictions_fname(epoch_idx)
        depgraphs = nlgen.predict_from_raw_data(model, dev_data_raw, data.vocab)
        nlgen.save_predictions(depgraphs, predictions_fname)
        # evaluate using metrics
        scores = evaluator.external_metric_eval(ref_fn=dev_data_ref_fname,
                                                pred_fn=predictions_fname)
        avg_score = (scores.bleu + scores.edist) / 2
        model_fn = os.path.join(
            self.model_dir,
            'weights.epoch%d_%0.3f_%0.3f' % (epoch_idx, scores.bleu, scores.edist))
        if avg_score > best_score:
            best_score = avg_score
            best_model_fn = model_fn
            best_weights = model.state_dict()
        # FIX: reconstructed log string that was broken across lines in the
        # original ('Time = <newline> %s' is not valid Python)
        logger.debug('Time = %s', asMinutes(time.time() - epoch_start))
    # FIX: lazy logger args instead of eager %-formatting
    logger.info('Total training time=%s', asMinutes(time.time() - training_start_time))
    self.best_model_fn = best_model_fn
    # FIX: guard against no epoch ever beating the initial best_score (0) —
    # the original would call torch.save(None, None) and crash
    if best_weights is not None:
        logger.debug('Saving model to --> %s', best_model_fn)
        torch.save(best_weights, best_model_fn)
    else:
        logger.warning('No epoch improved over the initial score; '
                       'no model weights were saved')
    score_fname = os.path.join(self.model_dir, 'scores.csv')
    scores = evaluator.get_scores_to_save()
    evaluator.save_scores(scores, self.score_file_header, score_fname)
    evaluator.plot_lcurve(fname=os.path.join(self.model_dir, "lcurve.pdf"),
                          title=self.model_type)