Example #1
    def testset_evaluation(self,
                           D_test,
                           D_C_test,
                           prod_to_sent,
                           flic,
                           num_samples=20,
                           random_sampling=False):

        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        sampled_sentences = []
        costs = []
        num_errors = 0.0

        for context, x, y in training_sample(D_test, D_C_test):

            mean_prob, mean_cost = self.validate_t(flic[context], flic[x], y)
            costs.append(mean_cost)

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

            if abs(mean_prob - y) > 0.5:
                num_errors += 1

        return sampled_sentences, 1 - num_errors / (probs_pos_count + probs_neg_count),  \
               (tot_cost / (probs_pos_count + probs_neg_count), probs_pos / probs_pos_count, probs_neg / probs_neg_count)
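Every example in this listing iterates over a training_sample(D, D_C) generator that is not shown on this page. A minimal sketch of what such a helper could look like, assuming D holds positive (context, x) pairs and D_C the corresponding negative pairs, is:

# Hypothetical sketch of the training_sample helper assumed by the examples.
# Assumption: D contains positive (context, x) pairs (label y = 1) and D_C
# contains negative (context, x) pairs (label y = 0); the triples are yielded
# in random order. The real helper may shuffle or balance the sets differently.
import random

def training_sample(D, D_C):
    labeled = [(context, x, 1) for context, x in D] + \
              [(context, x, 0) for context, x in D_C]
    random.shuffle(labeled)
    for context, x, y in labeled:
        yield context, x, y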
Example #2
    def validation_cost(self,
                        D_valid,
                        D_C_valid,
                        prod_to_sent,
                        flic,
                        num_samples=10):
        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        num_errors = 0.0

        for context, x, y in training_sample(D_valid, D_C_valid):

            mean_prob, mean_cost = self.validate_t(flic[context], flic[x], y)

            if abs(mean_prob - y) > 0.5:
                num_errors += 1

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

        count = probs_pos_count + probs_neg_count

        if probs_pos_count > 0 and probs_neg_count > 0:
            return 1 - num_errors / count, tot_cost / count, probs_pos / probs_pos_count, probs_neg / probs_neg_count
        else:
            return 0, 0, 0, 0
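Both functions above share the same bookkeeping: they accumulate the cost, track the mean probability separately for positive and negative pairs, and count a prediction as an error when abs(mean_prob - y) > 0.5. A self-contained sketch of exactly that accounting, useful for checking the returned tuple:

# Standalone sketch of the score accounting used in testset_evaluation and
# validation_cost: given (mean_prob, mean_cost, y) triples it returns
# (accuracy, mean cost, mean prob. on positives, mean prob. on negatives).
def summarize_scores(results):
    probs = {1: 0.0, 0: 0.0}
    counts = {1: 0, 0: 0}
    tot_cost = 0.0
    num_errors = 0.0
    for mean_prob, mean_cost, y in results:
        tot_cost += mean_cost
        probs[y] += mean_prob
        counts[y] += 1
        if abs(mean_prob - y) > 0.5:   # wrong side of 0.5 for the label
            num_errors += 1
    count = counts[1] + counts[0]
    return (1 - num_errors / count, tot_cost / count,
            probs[1] / counts[1], probs[0] / counts[0])

print(summarize_scores([(0.9, 0.1, 1), (0.2, 0.3, 0), (0.4, 0.8, 1)]))
# approx. (0.667, 0.4, 0.65, 0.2): one of three predictions is on the wrong side of 0.5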
Example #3
    def testset_evaluation(self, D_test, D_C_test, prod_to_sent, flic, num_samples=20, random_sampling=False):

        self.dropout_encoder.set_value(0.0)
        self.dropout_generator.set_value(0.0)

        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        sampled_sentences = []
        costs = []
        num_errors = 0.0

        for context, x, y in training_sample(D_test, D_C_test):

            sent_embs = prod_to_sent[x][0]

            samples, max_sents_count = self.generator.sample(sent_embs.shape[0], num_samples)
            mean_prob, mean_cost = self.validate_t(sent_embs, samples, max_sents_count, flic[context], y)

            sampled_sentences.append((context, x, get_strings_from_samples(prod_to_sent[x][1], samples)))
            costs.append(mean_cost)

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

            if abs(mean_prob - y) > 0.5:
                num_errors += 1


        self.dropout_generator.set_value(self.args.dropout_generator)
        self.dropout_encoder.set_value(self.args.dropout_encoder)

        return sampled_sentences, 1 - num_errors / (probs_pos_count + probs_neg_count),  \
               (tot_cost / (probs_pos_count + probs_neg_count), probs_pos / probs_pos_count, probs_neg / probs_neg_count)
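This variant additionally draws sentence subsets with self.generator.sample and converts them back to strings via get_strings_from_samples, which is not part of the listing. A possible sketch, under the assumption that prod_to_sent[x][1] holds the sentence strings and that samples is a 0/1 selection matrix with one column per sampled set:

# Hypothetical sketch of get_strings_from_samples. Assumptions: `sentences` is
# the list of sentence strings from prod_to_sent[x][1], and `samples` is a
# (num_sentences x num_samples) 0/1 matrix whose column j marks the sentences
# selected in the j-th sampled set. The real generator may encode samples differently.
import numpy as np

def get_strings_from_samples(sentences, samples):
    samples = np.asarray(samples)
    sampled_sets = []
    for j in range(samples.shape[1]):
        chosen = [sentences[i] for i in range(samples.shape[0]) if samples[i, j]]
        sampled_sets.append(chosen)
    return sampled_sets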
Example #4
    def validation_cost(self, D_valid, D_C_valid, prod_to_sent, flic, num_samples=10):
        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        num_errors = 0.0

        self.dropout_encoder.set_value(0.0)
        self.dropout_generator.set_value(0.0)

        for context, x, y in training_sample(D_valid, D_C_valid):

            sent_embs = prod_to_sent[x][0]

            samples, max_sents_count = self.generator.sample(sent_embs.shape[0], num_samples)
            mean_prob, mean_cost = self.validate_t(sent_embs, samples, max_sents_count, flic[context], y)

            if abs(mean_prob - y) > 0.5:
                num_errors += 1

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

        self.dropout_generator.set_value(self.args.dropout_generator)
        self.dropout_encoder.set_value(self.args.dropout_encoder)

        count = probs_pos_count + probs_neg_count

        if probs_pos_count > 0 and probs_neg_count > 0:
            return 1 - num_errors / count, tot_cost / count, probs_pos / probs_pos_count, probs_neg / probs_neg_count
        else:
            return 0, 0, 0, 0
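The two methods in this pair disable dropout before evaluating and restore the configured rates afterwards by calling set_value on the shared dropout variables. The same pattern, factored into a reusable helper as a sketch (assuming Theano-style shared variables with get_value/set_value, as used above):

# Sketch of the "disable dropout during evaluation" pattern used above, written
# as a context manager. Assumption: each argument is a shared variable exposing
# get_value()/set_value(), as the dropout variables in these examples do.
from contextlib import contextmanager

@contextmanager
def dropout_disabled(*shared_dropouts):
    old_values = [v.get_value() for v in shared_dropouts]
    try:
        for v in shared_dropouts:
            v.set_value(0.0)
        yield
    finally:
        # Restore the configured dropout rates even if evaluation raises.
        for v, old in zip(shared_dropouts, old_values):
            v.set_value(old)

# Hypothetical usage inside the class:
#     with dropout_disabled(self.dropout_encoder, self.dropout_generator):
#         ...run the evaluation loop...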
Example #5
    def train(self, D, D_C, D_valid, D_C_valid, flic, prod_to_sent):

        #---------------------------------------------------------------------------------------------------------------
        # Looping through the data (the training for-loop).
        #---------------------------------------------------------------------------------------------------------------

        # For time measurements
        t_training = 0.0
        t_0 = 0.0

        print("Start training.")
        print("Num links in training set = {}".format(len(D) + len(D_C)))
        for epoch in range(self.args.max_epochs):
            epoch += self.epochs_done
            iteration = 0
            cost_epoch = 0
            cost_tmp = 0
            probs_pos = 0
            probs_neg = 0
            probs_pos_count = 0
            probs_neg_count = 0
            probs_pos_tmp = 0
            probs_neg_tmp = 0
            probs_pos_count_tmp = 0
            probs_neg_count_tmp = 0

            for context, x, y in training_sample(D, D_C):

                #-------------------------------------------------------------------------------------------------------
                # Train model
                #-------------------------------------------------------------------------------------------------------

                t_0 = timer()
                mean_prob, mean_cost = self.train_model_t(
                    flic[context], flic[x], y)
                t_training += (timer() - t_0)

                #-------------------------------------------------------------------------------------------------------
                # Periodic output of the training and validation scores
                #-------------------------------------------------------------------------------------------------------

                cost_epoch += mean_cost
                cost_tmp += mean_cost
                if y == 1:
                    probs_pos += mean_prob
                    probs_pos_tmp += mean_prob
                    probs_pos_count += 1
                    probs_pos_count_tmp += 1
                else:
                    probs_neg += mean_prob
                    probs_neg_tmp += mean_prob
                    probs_neg_count += 1
                    probs_neg_count_tmp += 1

                iteration += 1

                # Every 1000 data points
                if iteration % 1000 == 0:
                    print("Cost on the training set after {} iterations: {}".
                          format(
                              iteration, cost_tmp /
                              (probs_pos_count_tmp + probs_neg_count_tmp)))

                if iteration % 10000 == 0 and self.args.save_model:
                    self.store(self.args.save_model, epoch, iteration)
                    print("Model saved. Epoch = {}, Iteration = {}".format(
                        epoch, iteration))

                if self.sig_handler.exit:
                    if self.best_val_error == sys.float_info.max and self.args.save_model:
                        self.best_val_error = -1
                        self.store(
                            self.args.save_model + '/best_valerr_' +
                            str("{0:3f}.pkl".format(self.best_val_error)))
                    return (self.best_val_error, "SIGNAL")

            # ----------------------------------------------------------------------------------------------------------
            # Output per epoch
            # ----------------------------------------------------------------------------------------------------------

            if self.args.save_model:
                self.store(self.args.save_model, epoch, 0)
                print("\nEpoch {} done. Model saved! ".format(epoch))
            else:
                print("\nEpoch {} done.".format(epoch))

            # Timing output
            if self.args.measure_timing:
                print("Time Training = {}".format(t_training / iteration))

            print("Calculating the average cost in this epoch.")
            if D_valid is not None and D_C_valid is not None:
                accuracy, valid_cost, valid_pos_score, valid_neg_score = self.validation_cost(
                    D_valid, D_C_valid, prod_to_sent, flic)
                self._print_scores(
                    (valid_cost, valid_pos_score, valid_neg_score),
                    (cost_epoch / (probs_pos_count + probs_neg_count),
                     probs_pos / probs_pos_count, probs_neg / probs_neg_count))
                if valid_cost < self.best_val_error and self.args.save_model:
                    self.best_val_error = min(self.best_val_error, valid_cost)
                    self.store(self.args.save_model + '/best_valerr_' +
                               str("{0:3f}.pkl".format(valid_cost)))

                # Decrease the learning rate and reload the best model if the
                # validation cost has stopped decreasing.
                if self.args.adaptive_lrs:
                    lrs_adapted = self.adaptive_learning_rate.adapt_learning_rate(
                        valid_cost, self.lr)
                if self.args.save_model and self.args.adaptive_lrs and lrs_adapted:
                    lr_value = self.lr.get_value()
                    self.load(self.args.save_model + '/best_valerr_' +
                              str("{0:3f}.pkl".format(self.best_val_error)))
                    self.lr.set_value(lr_value)
                    print("Best model loaded.")

                # Stop training once the learning rate becomes very small.
                if self.lr.get_value() < 1e-5:
                    return (self.best_val_error, "END")
            else:
                self._print_scores(
                    None,
                    (cost_epoch / (probs_pos_count + probs_neg_count),
                     probs_pos / probs_pos_count, probs_neg / probs_neg_count))
            print("")

            # Reset the per-epoch accumulators.
            cost_epoch = 0
            if epoch + 1 >= self.args.max_epochs:
                return (self.best_val_error, "END")
            if (epoch + 1) % 10 == 0:
                self.epochs_done += 10
                self.store(self.args.save_model + '/after_10_epochs_' +
                           str("{0:3f}.pkl".format(valid_cost)))
                return (valid_cost, "10_epochs")
            probs_neg = 0
            probs_pos = 0
            probs_pos_count = 0
            probs_neg_count = 0
        return (self.best_val_error, "END")
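The epoch loop above delegates learning-rate decay to self.adaptive_learning_rate.adapt_learning_rate(valid_cost, self.lr), which is not shown. A plausible sketch of such a helper, assuming it halves the shared learning rate once the validation cost has stopped improving for a few epochs and returns True whenever it decays the rate:

# Hypothetical sketch of the adaptive learning-rate helper used in Example #5.
# Assumptions: `lr` is a shared variable with get_value()/set_value() holding a
# numpy value; the rate is multiplied by `decay` after `patience` epochs without
# improvement. The real implementation may use different rules.
import numpy as np

class AdaptiveLearningRate(object):

    def __init__(self, patience=2, decay=0.5):
        self.patience = patience
        self.decay = decay
        self.best_cost = float('inf')
        self.bad_epochs = 0

    def adapt_learning_rate(self, valid_cost, lr):
        if valid_cost < self.best_cost:
            self.best_cost = valid_cost
            self.bad_epochs = 0
            return False
        self.bad_epochs += 1
        if self.bad_epochs < self.patience:
            return False
        old = lr.get_value()
        # Keep the dtype of the shared variable (e.g. float32 under Theano).
        lr.set_value(np.asarray(old * self.decay, dtype=old.dtype))
        self.bad_epochs = 0
        return True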
Example #6
    def testset_evaluation(self,
                           D_test,
                           D_C_test,
                           prod_to_sent,
                           flic,
                           num_samples=20,
                           return_all_probs=False):

        self.dropout_encoder.set_value(0.0)
        self.dropout_generator.set_value(0.0)

        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        sampled_sentences = []
        costs = []
        num_errors = 0.0

        for context, x, y in training_sample(D_test, D_C_test):

            sent_embs = prod_to_sent[x][0]

            if not self.args.sample_all_sentences:
                # Sampling
                L = self.get_L_t(sent_embs, flic[context])
                samples, max_sents_count = self.generator.sample(
                    L, num_samples)

                mean_prob, mean_cost = self.validate_t(sent_embs, samples,
                                                       max_sents_count,
                                                       flic[context], y)

                # return_all_probs is only needed for the script which prints out sampled sentences
                if return_all_probs:
                    sampled_sentences.append(
                        (context, x, y, mean_prob,
                         get_strings_from_samples(prod_to_sent[x][1],
                                                  samples)))
                else:
                    sampled_sentences.append(
                        (context, x,
                         get_strings_from_samples(prod_to_sent[x][1],
                                                  samples)))
                costs.append(mean_cost)
            else:
                mean_prob, mean_cost = self.validate_t(sent_embs,
                                                       flic[context], y)

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

            if abs(mean_prob - y) > 0.5:
                num_errors += 1

        self.dropout_generator.set_value(self.args.dropout_generator)
        self.dropout_encoder.set_value(self.args.dropout_encoder)

        return sampled_sentences, 1 - num_errors / (probs_pos_count + probs_neg_count),  \
               (tot_cost / (probs_pos_count + probs_neg_count), probs_pos / probs_pos_count, probs_neg / probs_neg_count)
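A hypothetical call to this evaluation method, only to show how the three-part return value unpacks; the model instance, the test sets, prod_to_sent and flic are assumed to exist already:

# Hypothetical usage of testset_evaluation; all names below are assumptions.
# It returns the sampled sentences, the accuracy, and a tuple with the mean
# cost and the mean probabilities for positive and negative pairs.
sampled, accuracy, (mean_cost, pos_score, neg_score) = model.testset_evaluation(
    D_test, D_C_test, prod_to_sent, flic, num_samples=20)

print("Test accuracy: {:.3f}".format(accuracy))
print("Mean cost: {:.3f}, mean P(pos): {:.3f}, mean P(neg): {:.3f}".format(
    mean_cost, pos_score, neg_score))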
Example #7
    def validation_cost(self,
                        D_valid,
                        D_C_valid,
                        prod_to_sent,
                        flic,
                        num_samples=10):
        """ Evaluates the model on the validation set.

        :param D_valid: Validation set (dataset D)
        :param D_C_valid: Validation set (dataset D_C)
        :type flic: FLIC
        :param flic: The FLIC model trained on the product data set
        :type prod_to_sent: dict: string -> Set of sentences
        :param prod_to_sent: A dict from product ID's to a set of (sentence, sent_embedding) pair.
        :type num_samples: int
        :param num_samples: The number of reinforcement sampled sets.

        :return value: (accuracy, mean_cost, mean_pos_probs, mean_neg_probs)

        """

        probs_pos = 0
        probs_neg = 0
        probs_pos_count = 0
        probs_neg_count = 0
        tot_cost = 0
        num_errors = 0.0

        self.dropout_encoder.set_value(0.0)
        self.dropout_generator.set_value(0.0)

        for context, x, y in training_sample(D_valid, D_C_valid):

            sent_embs = prod_to_sent[x][0]

            if not self.args.sample_all_sentences:
                L = self.get_L_t(sent_embs, flic[context])

                samples, max_sents_count = self.generator.sample(
                    L, num_samples)
                mean_prob, mean_cost = self.validate_t(sent_embs, samples,
                                                       max_sents_count,
                                                       flic[context], y)
            else:
                mean_prob, mean_cost = self.validate_t(sent_embs,
                                                       flic[context], y)

            if abs(mean_prob - y) > 0.5:
                num_errors += 1

            tot_cost += mean_cost
            if y == 1:
                probs_pos += mean_prob
                probs_pos_count += 1
            else:
                probs_neg += mean_prob
                probs_neg_count += 1

        self.dropout_generator.set_value(self.args.dropout_generator)
        self.dropout_encoder.set_value(self.args.dropout_encoder)

        count = probs_pos_count + probs_neg_count

        if probs_pos_count > 0 and probs_neg_count > 0:
            return 1 - num_errors / count, tot_cost / count, probs_pos / probs_pos_count, probs_neg / probs_neg_count
        else:
            return 0, 0, 0, 0
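Finally, a hypothetical training-time use of validation_cost, mirroring the model-selection logic from Example #5; train_one_epoch, the checkpoint path, and the surrounding objects are placeholders, not part of the original code:

# Hypothetical model-selection loop around validation_cost; all names below
# are assumptions. validation_cost returns
# (accuracy, mean_cost, mean_pos_probs, mean_neg_probs).
best_val_cost = float('inf')
for epoch in range(max_epochs):
    train_one_epoch(model, D, D_C)   # placeholder for one pass over the training data
    accuracy, val_cost, pos_probs, neg_probs = model.validation_cost(
        D_valid, D_C_valid, prod_to_sent, flic, num_samples=10)
    print("Epoch {}: accuracy = {:.3f}, validation cost = {:.3f}".format(
        epoch, accuracy, val_cost))
    if val_cost < best_val_cost:
        best_val_cost = val_cost
        model.store("checkpoints/best_valerr_{:.3f}.pkl".format(val_cost))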