def batchify_training_data(self, xy_ids_and_targets, batch_size, is_dev_data):
        """Cut the data into length-sorted batches.

        The parallel sequences in ``xy_ids_and_targets`` are zipped into
        per-example tuples, sorted by the length of each tuple's first
        element (longest first) and handed to ``self.make_one_batch`` in
        consecutive slices of at most ``batch_size`` examples.

        Args:
            xy_ids_and_targets: parallel sequences; the first one supplies
                the sort key (``len`` of each of its items).
            batch_size: maximum number of examples per batch.
            is_dev_data: when true, the forms collected from all batches are
                stored in ``self.dev_references`` and written to
                ``self.fnames.dev_ref_fn``.

        Returns:
            A list with the batch tuple produced by ``self.make_one_batch``
            for every batch.

        Raises:
            AssertionError: if the number of collected forms differs from the
                number of examples.
        """
        logger.debug('Batchifying data')

        # sort according to the src lengths -> xy pairs
        sorted_data = sorted(zip(*xy_ids_and_targets), key=lambda p: len(p[0]), reverse=True)
        data_size = len(sorted_data)
        # Ceiling division: covers the last (smaller) batch without producing
        # an extra, empty batch when data_size is a multiple of batch_size.
        num_batches = (data_size + batch_size - 1) // batch_size
        data_indices = self.index_data(data_size, mode='no_shuffling')

        batch_pairs = []
        batch_forms_all = []
        for bi in range(num_batches):
            batch_data_tuple, batch_forms = self.make_one_batch(sorted_data, data_indices, bi, batch_size)
            batch_pairs.append(batch_data_tuple)
            batch_forms_all.extend(batch_forms)

        # Validate before writing anything, so a mismatch never leaves a
        # truncated reference file behind.
        assert data_size == len(batch_forms_all)

        if is_dev_data:
            self.dev_references = batch_forms_all
            logger.info('Saving dev (training) references to --> %s', self.fnames.dev_ref_fn)
            save_txt(self.dev_references, self.fnames.dev_ref_fn)

        return batch_pairs
Beispiel #2
0
    def save_predictions(self, predictions, fname):
        """Write string predictions to ``fname`` with ``save_txt``.

        Args:
            predictions: sequence of predictions; only the first element is
                type-checked.
            fname: destination passed through to ``save_txt``.

        Raises:
            AssertionError: if the first prediction is not a string.
        """
        # isinstance also accepts str subclasses; the length guard keeps an
        # empty prediction list from failing with IndexError on predictions[0].
        assert len(predictions) == 0 or isinstance(predictions[0], str), \
            'Predictions are not strings -- consider re-implementing the method!'

        logger.debug('Saving Morph predictions to --> %s', fname)
        save_txt(predictions, fname)
    def batchify_training_data(self, xy_ids, batch_size, is_dev_data):
        """Group the id sequences into (input, target) tensor batches.

        Args:
            xy_ids: triple ``(x_data_ids, y_data_ids, forms)``; the first two
                are indexed per example, ``forms`` is used as the dev
                reference list.
            batch_size: maximum number of examples per batch.
            is_dev_data: when true, ``forms`` is stored in
                ``self.dev_references`` and written to
                ``self.fnames.dev_ref_fn``.

        Returns:
            A list of ``(batch_x, batch_y)`` pairs, each element being the
            result of ``cuda_if_gpu(Variable(torch.LongTensor(...)))``.

        Raises:
            AssertionError: if ``forms`` and ``x_data_ids`` differ in length.
        """
        logger.debug('Batchifying data')

        x_data_ids, y_data_ids, forms = xy_ids
        data_size = len(x_data_ids)
        # Validate up front, before any batch is built or any file written.
        assert data_size == len(forms)

        # Ceiling division: covers the last (smaller) batch without producing
        # an extra, empty batch when data_size is a multiple of batch_size.
        num_batches = (data_size + batch_size - 1) // batch_size
        data_indices = self.index_data(data_size, mode='no_shuffling')

        batch_pairs = []
        for bi in range(num_batches):
            curr_batch_indices = data_indices[bi * batch_size:(bi + 1) * batch_size]

            # No defensive copy: torch.LongTensor builds a new tensor from the
            # nested sequences, so the source lists are never aliased.
            batch_x = [x_data_ids[idx] for idx in curr_batch_indices]
            batch_y = [y_data_ids[idx] for idx in curr_batch_indices]

            batch_enc_x_var = cuda_if_gpu(Variable(torch.LongTensor(batch_x)))
            batch_dec_y_var = cuda_if_gpu(Variable(torch.LongTensor(batch_y)))

            batch_pairs.append((batch_enc_x_var, batch_dec_y_var))

        if is_dev_data:
            self.dev_references = forms
            logger.info('Saving dev (training) references to --> %s',
                        self.fnames.dev_ref_fn)
            save_txt(self.dev_references, self.fnames.dev_ref_fn)

        return batch_pairs
    def save_predictions(self, depgraphs, fname):
        """Linearize dependency graphs into sentences and write them out.

        Whether predicted or gold forms are used is decided once, from the
        first graph only: if node ``'1'`` of ``depgraphs[0]`` carries no
        ``'PRED_FORM'`` entry, gold (oracle) forms are used for every graph.
        Tokens follow each graph's ``graph['node_order']`` and are joined
        with single spaces; the sentences are written with ``save_txt``.
        """
        first_graph = depgraphs[0]
        has_predicted_forms = 'PRED_FORM' in first_graph.node['1']

        if has_predicted_forms:
            logger.debug('*** USING PREDICTED FORMS *** ')
            form_of = SynAlgo.get_node_pred_form
        else:
            logger.debug('*** USING ORACLE FORMS *** ')
            form_of = SynAlgo.get_node_gold_form

        sentences = []
        for graph in depgraphs:
            tokens = [form_of(graph, nid) for nid in graph.graph['node_order']]
            sentences.append(' '.join(tokens))

        logger.debug('Saving Syn predictions to --> %s', fname)
        save_txt(sentences, fname)
    def training_start(self, model, data, evaluator, nlgen):
        """Train ``model`` for ``self.n_epochs`` epochs and keep the best weights.

        Each epoch trains on the batched training data, computes the dev
        loss, generates predictions for the raw dev data, and scores them
        with ``evaluator.external_metric_eval``. The epoch with the highest
        mean of ``scores.bleu`` and ``scores.edist`` is kept; its weights are
        written with ``torch.save`` once training ends. Scores and a learning
        curve are saved into ``self.model_dir`` afterwards.

        Args:
            model: the network to train; moved to the GPU when
                ``self.use_cuda`` is set.
            data: provides ``fnames``, ``train``, ``dev``, ``vocab`` and
                ``batchify_vectorized_data``.
            evaluator: records losses, computes the external metrics, and
                saves scores and the learning curve.
            nlgen: produces and saves predictions for the dev data.

        Raises:
            AssertionError: if the dev data file does not exist.
        """
        # Local import so this method does not depend on a module-level one.
        import copy

        logger.debug("Preparing training data")

        dev_data_fname = data.fnames.dev_fn
        # Explicit check instead of `assert cond, logger.error(...)`: it still
        # logs and raises AssertionError, but it survives `python -O` and the
        # exception now carries a message.
        if not os.path.exists(dev_data_fname):
            logger.error('File %s does not exist', dev_data_fname)
            raise AssertionError('File %s does not exist' % dev_data_fname)

        # dev data for evaluation
        dev_data_ref_fname = data.fnames.dev_ref_fn
        dev_data_raw = read_conll_data_file(dev_data_fname)
        logger.info('Saving Syn reference --> %s', dev_data_ref_fname)
        save_txt(itemlist=conll2snt(dev_data_raw), fname=dev_data_ref_fname)

        train_batches = data.batchify_vectorized_data(
            data.train, self.batch_size)  # [(np_x, np_y_1hot), ...]
        dev_batches = data.batchify_vectorized_data(data.dev, self.batch_size)

        # need to move the model before setting the optimizer
        # see: http://pytorch.org/docs/stable/optim.html
        if self.use_cuda:
            model.cuda()

        self.set_optimizer(model, self.config['optimizer'])
        self.set_train_criterion(len(data.vocab.id2tok), PAD_ID)

        training_start_time = time.time()
        logger.info("Start training")

        # -inf so that the first evaluated epoch is always kept, even when
        # every score is 0; otherwise there would be nothing to save.
        best_score = float('-inf')
        best_model_fn = None
        best_weights = None

        for epoch_idx in range(1, self.n_epochs + 1):
            epoch_start = time.time()
            logger.info('Epoch %d/%d', epoch_idx, self.n_epochs)

            # compute loss on train and dev data
            train_loss = self.train_epoch(epoch_idx, model, train_batches)
            dev_loss = self.compute_val_loss(model, dev_batches)
            evaluator.record_loss(train_loss, dev_loss)

            # run on dev data in prediction mode (no oracle decoding)
            predictions_fname = self.get_predictions_fname(epoch_idx)
            depgraphs = nlgen.predict_from_raw_data(model, dev_data_raw,
                                                    data.vocab)
            nlgen.save_predictions(depgraphs, predictions_fname)

            # evaluate using metrics
            scores = evaluator.external_metric_eval(ref_fn=dev_data_ref_fname,
                                                    pred_fn=predictions_fname)
            avg_score = (scores.bleu + scores.edist) / 2
            model_fn = os.path.join(
                self.model_dir, 'weights.epoch%d_%0.3f_%0.3f' %
                (epoch_idx, scores.bleu, scores.edist))

            if avg_score > best_score:
                best_score = avg_score
                best_model_fn = model_fn
                # state_dict() returns references to the live parameters,
                # which later epochs keep updating in place; snapshot them so
                # the saved weights really are those of the best epoch.
                best_weights = copy.deepcopy(model.state_dict())

            logger.debug('Time = %s', asMinutes(time.time() - epoch_start))

        logger.info('Total training time=%s',
                    asMinutes(time.time() - training_start_time))

        self.best_model_fn = best_model_fn
        if best_weights is None:
            # Happens when no epoch was run; torch.save(None, None) would fail.
            logger.warning('No model weights were selected; nothing to save')
        else:
            logger.debug('Saving model to --> %s', best_model_fn)
            torch.save(best_weights, best_model_fn)

        score_fname = os.path.join(self.model_dir, 'scores.csv')
        scores = evaluator.get_scores_to_save()
        evaluator.save_scores(scores, self.score_file_header, score_fname)

        evaluator.plot_lcurve(fname=os.path.join(self.model_dir, "lcurve.pdf"),
                              title=self.model_type)
Beispiel #6
0
 def save_dev_references(self, fname):
     """Write ``self.targets`` to ``fname`` using ``save_txt``."""
     logger.debug('Saving Morph references to --> %s', fname)
     references = self.targets
     save_txt(references, fname)