    def training_setup(self, train_data_fname, dev_data_fname):

        assert train_data_fname is not None
        assert dev_data_fname is not None

        train_raw = read_conll_data_file(train_data_fname)
        dev_raw = read_conll_data_file(dev_data_fname)

        # extract lemmas, forms and feature dicts from the train and dev data
        train_lemmas_l, train_forms_l, train_feat_d = BaseMorphData.extract_lemmas_forms_feature_dicts(
            train_raw)
        dev_lemmas_l, dev_forms_l, dev_feat_d = BaseMorphData.extract_lemmas_forms_feature_dicts(
            dev_raw)
        self.set_char_freqlist(train_lemmas_l)

        self.max_src_len = self.config['max_src_len']
        self.max_tgt_len = self.config['max_tgt_len']

        logger.info('Max lemma/form lengths: %d/%d', self.max_src_len,
                    self.max_tgt_len)

        train_lemmas_l, train_forms_l, train_feat_d = self.filter_data_outliers(
            train_lemmas_l, train_forms_l, train_feat_d)

        dev_lemmas_l, dev_forms_l, dev_feat_d = self.filter_data_outliers(
            dev_lemmas_l, dev_forms_l, dev_feat_d)

        # align forms and lemmas
        all_aligned_pairs = mcmc_align(
            wordpairs=list(zip(train_lemmas_l + dev_lemmas_l,
                               train_forms_l + dev_forms_l)),
            align_symbol=ALIGN_SYMBOL)

        # split the aligned pairs back into train and dev portions
        num_train_pairs = len(train_lemmas_l)
        train_aligned_pairs = all_aligned_pairs[:num_train_pairs]
        dev_aligned_pairs = all_aligned_pairs[num_train_pairs:]

        # vectorize the data
        self.vocab.setup(vocab_path=self.vocab_fn,
                         data=(train_lemmas_l, train_forms_l, train_feat_d),
                         lower=self.lower,
                         source='lemma_form_feat')

        self.train = self.train_data_to_ids(data=list(
            zip(train_lemmas_l, train_feat_d, train_aligned_pairs,
                train_forms_l)))

        self.dev = self.train_data_to_ids(data=list(
            zip(dev_lemmas_l, dev_feat_d, dev_aligned_pairs, dev_forms_l)))
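
The method above delegates length filtering to filter_data_outliers, which is not shown in this listing. A minimal sketch of what such a filter could look like, assuming it simply drops pairs that exceed the configured max_src_len/max_tgt_len (the signature and behavior are inferred from the call site, not confirmed by the source):

def filter_data_outliers(lemmas, forms, feats, max_src_len, max_tgt_len):
    # Keep only (lemma, form, feats) triples within the configured lengths;
    # the real method may apply additional criteria.
    kept = [(lem, form, feat)
            for lem, form, feat in zip(lemmas, forms, feats)
            if len(lem) <= max_src_len and len(form) <= max_tgt_len]
    if not kept:
        return [], [], []
    lemmas_f, forms_f, feats_f = (list(col) for col in zip(*kept))
    return lemmas_f, forms_f, feats_f
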
Example #2
def get_all_stats(train_fn, dev_fn):
    """
    # stats to retrieve
    self.lem2form_ratio = []
    self.lengths = None
    self.branching_factor = None
    self.unique_feats = None

    self.oov_lemmas = None
    self.oov_forms = None
    self.oov_chars = None

    self.proper_nouns_count = 0
    self.noise = None
    self.junk = None

    :param dev_fn:
    :return:
    """
    train_raw = read_conll_data_file(train_fn)
    train_tokens = raw_data_to_tokens(train_raw)

    train_lemmas = [t.LEMMA.lower() for sent in train_tokens for t in sent]
    train_forms = [t.GOLD_FORM.lower() for sent in train_tokens for t in sent]

    # plot the lemma-to-form ratio distribution
    plot_lem2form_ratio(train_tokens, fname='lem2form_ratio.pdf')
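
plot_lem2form_ratio itself is not part of this listing. A minimal sketch of the ratio computation it presumably performs, assuming tokens expose LEMMA and GOLD_FORM attributes as in the comprehensions above (the plotting details are an assumption):

import matplotlib.pyplot as plt

def plot_lem2form_ratio(tokens, fname):
    # Relate form length to lemma length for each token and plot a histogram.
    ratios = [len(t.GOLD_FORM) / max(len(t.LEMMA), 1)
              for sent in tokens
              for t in sent]
    plt.hist(ratios, bins=50)
    plt.xlabel('len(form) / len(lemma)')
    plt.title('Lemma-to-form ratio')
    plt.savefig(fname)
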
    def training_setup(self, train_data_fname, dev_data_fname):

        assert train_data_fname is not None
        assert dev_data_fname is not None

        train_raw = read_conll_data_file(train_data_fname)
        dev_raw = read_conll_data_file(dev_data_fname)

        # extract lemmas, forms and feature dicts from the train and dev data
        train_lemmas_l, train_forms_l, train_feat_d = BaseMorphData.extract_lemmas_forms_feature_dicts(train_raw)
        dev_lemmas_l, dev_forms_l, dev_feat_d = BaseMorphData.extract_lemmas_forms_feature_dicts(dev_raw)
        self.set_char_freqlist(train_lemmas_l)

        self.max_src_len = self.config['max_src_len']
        self.max_tgt_len = self.config['max_tgt_len']

        logger.info('Max lemma/form lengths: %d/%d', self.max_src_len, self.max_tgt_len)

        train_lemmas_l, train_forms_l, train_feat_d = self.filter_data_outliers(train_lemmas_l,
                                                                                train_forms_l,
                                                                                train_feat_d)

        dev_lemmas_l, dev_forms_l, dev_feat_d = self.filter_data_outliers(dev_lemmas_l,
                                                                          dev_forms_l,
                                                                          dev_feat_d)

        # vectorize the data
        self.vocab.setup(vocab_path=self.vocab_fn,
                         data=(train_lemmas_l, train_forms_l, train_feat_d),
                         lower=self.lower,
                         source='lemma_form_feat')

        self.train = self.train_data_to_ids(data=list(zip(
            train_lemmas_l,
            train_feat_d,
            train_forms_l)))

        self.dev = self.train_data_to_ids(data=list(zip(
            dev_lemmas_l,
            dev_feat_d,
            dev_forms_l)))
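
Both training_setup variants funnel their tuples through train_data_to_ids, which is not shown. A minimal sketch of the character-level id conversion for the (lemma, feats, form) variant, assuming a tok2id mapping and an UNK_ID fallback (both names are assumptions; the first variant would additionally carry the aligned pairs through):

UNK_ID = 1  # assumed id for unknown symbols

def train_data_to_ids(data, tok2id):
    # Map lemma/form characters and "key=value" feature strings to vocab ids.
    ids = []
    for lemma, feats, form in data:
        src = [tok2id.get(ch, UNK_ID) for ch in lemma]
        fts = [tok2id.get('%s=%s' % kv, UNK_ID)
               for kv in sorted(feats.items())]
        tgt = [tok2id.get(ch, UNK_ID) for ch in form]
        ids.append((src, fts, tgt))
    return ids
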
    def predict_from_file(self, model, data_fname, vocab):
        """
        Evaluating model on files
        :param model:
        :param dev_data:
        :return:
        """

        raw_data = read_conll_data_file(data_fname)
        predictions = self.predict_from_raw_data(model, raw_data, vocab)

        return predictions
Example #5
    def training_setup(self, train_data_fname, dev_data_fname):

        assert train_data_fname is not None
        assert dev_data_fname is not None

        train_raw = read_conll_data_file(train_data_fname)
        dev_raw = read_conll_data_file(dev_data_fname)

        train_graphs = [dg_from_tokens(toks) for toks in raw_data_to_tokens(train_raw)]
        dev_graphs = [dg_from_tokens(toks) for toks in raw_data_to_tokens(dev_raw)]

        self.vocab.setup(vocab_path=self.vocab_fn,
                         data=train_graphs,
                         lower=self.lower,
                         source='depgraphs')

        # Vectorize data
        self.train = self.vectorize_graphs(train_graphs)
        self.dev = self.vectorize_graphs(dev_graphs)

        # Set the number of extracted features (needed by the NN)
        random_instance_x = self.dev[0, :-1]
        self.num_features = len(random_instance_x)
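
The indexing self.dev[0, :-1] implies the vectorized data is a 2D NumPy array with the label in the last column, so the feature count can equally be read off the array shape. A toy illustration under that assumption:

import numpy as np

# Toy vectorized data: 3 instances, 4 features each, label in the last column.
dev = np.array([[1, 0, 2, 5, 1],
                [3, 1, 0, 2, 0],
                [2, 2, 1, 1, 1]])

random_instance_x = dev[0, :-1]          # features of the first instance
num_features = len(random_instance_x)    # 4
assert num_features == dev.shape[1] - 1  # equivalent, shape-based reading
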
    def training_start(self, model, data, evaluator, nlgen):

        logger.debug("Preparing training data")

        dev_data_fname = data.fnames.dev_fn
        assert os.path.exists(dev_data_fname), \
            'File %s does not exist' % dev_data_fname

        # dev data for evaluation
        dev_data_ref_fname = data.fnames.dev_ref_fn
        dev_data_raw = read_conll_data_file(data.fnames.dev_fn)
        logger.info('Saving Syn reference --> %s', data.fnames.dev_ref_fn)
        save_txt(itemlist=conll2snt(dev_data_raw), fname=dev_data_ref_fname)

        train_batches = data.batchify_vectorized_data(
            data.train, self.batch_size)  # [(np_x, np_y_1hot), ...]
        dev_batches = data.batchify_vectorized_data(data.dev, self.batch_size)

        # need to move the model before setting the optimizer
        # see: http://pytorch.org/docs/stable/optim.html
        if self.use_cuda:
            model.cuda()

        self.set_optimizer(model, self.config['optimizer'])
        self.set_train_criterion(len(data.vocab.id2tok), PAD_ID)

        training_start_time = time.time()
        logger.info("Start training")

        best_score = 0
        best_model_fn = None
        best_weights = None

        for epoch_idx in range(1, self.n_epochs + 1):
            epoch_start = time.time()
            logger.info('Epoch %d/%d', epoch_idx, self.n_epochs)

            # compute loss on train and dev data
            train_loss = self.train_epoch(epoch_idx, model, train_batches)
            dev_loss = self.compute_val_loss(model, dev_batches)
            evaluator.record_loss(train_loss, dev_loss)

            # run on dev data in prediction mode (no oracle decoding)
            predictions_fname = self.get_predictions_fname(epoch_idx)
            depgraphs = nlgen.predict_from_raw_data(model, dev_data_raw,
                                                    data.vocab)
            nlgen.save_predictions(depgraphs, predictions_fname)

            # evaluate using metrics
            scores = evaluator.external_metric_eval(ref_fn=dev_data_ref_fname,
                                                    pred_fn=predictions_fname)
            avg_score = (scores.bleu + scores.edist) / 2
            model_fn = os.path.join(
                self.model_dir, 'weights.epoch%d_%0.3f_%0.3f' %
                (epoch_idx, scores.bleu, scores.edist))

            if avg_score > best_score:
                best_score = avg_score
                best_model_fn = model_fn
                best_weights = model.state_dict()

            logger.debug('Time = %s', asMinutes(time.time() - epoch_start))

        logger.info('Total training time = %s',
                    asMinutes(time.time() - training_start_time))

        self.best_model_fn = best_model_fn
        logger.debug('Saving model to --> %s', best_model_fn)
        torch.save(best_weights, best_model_fn)

        score_fname = os.path.join(self.model_dir, 'scores.csv')
        scores = evaluator.get_scores_to_save()
        evaluator.save_scores(scores, self.score_file_header, score_fname)

        evaluator.plot_lcurve(fname=os.path.join(self.model_dir, "lcurve.pdf"),
                              title=self.model_type)
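
batchify_vectorized_data is not included in this listing; judging by the comment "# [(np_x, np_y_1hot), ...]" above, it slices the vectorized instances into fixed-size batches of inputs and one-hot targets. A minimal sketch under that assumption (the signature and one-hot encoding are inferred, not confirmed):

import numpy as np

def batchify_vectorized_data(data, batch_size, num_classes):
    # Split a 2D array (features + label column) into (x, y_one_hot) batches.
    batches = []
    for start in range(0, len(data), batch_size):
        chunk = data[start:start + batch_size]
        x = chunk[:, :-1]
        y = chunk[:, -1].astype(int)
        y_1hot = np.eye(num_classes)[y]
        batches.append((x, y_1hot))
    return batches
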