    def getFinalResultsForTask(self, setting_dict):
        """Rebuild and retrain the network for a single task using the chosen settings,
        save its test-set predictions to a CSV, and return (accuracy, AUC, predictions)."""
        if self.users_as_tasks:
            task_df = self.data_df[self.data_df['user_id'] == setting_dict['task_name']]
            target_label = [self.target_label]
        else:
            task_df = self.data_df
            target_label = [helper.getOfficialLabelName(setting_dict['task_name'])]
        self.net = tfnet.TensorFlowNetwork(task_df, copy.deepcopy(self.wanted_feats),
                                           target_label, verbose=False, val_type=self.val_type)
        self.net.setParams(l2_beta=setting_dict['l2_beta'],
                           initial_learning_rate=setting_dict['learning_rate'],
                           decay=setting_dict['decay'],
                           decay_steps=setting_dict['decay_steps'],
                           decay_rate=setting_dict['decay_rate'],
                           batch_size=setting_dict['batch_size'],
                           optimizer=setting_dict['optimizer'],
                           dropout=setting_dict['dropout'])
        self.constructNetwork(setting_dict['hidden_layers'])

        self.net.setUpGraph()
        preds = self.net.runGraph(self.test_steps, print_test=True, return_test_preds=True)

        preds_df = self.net.get_preds_for_df()
        label_name = setting_dict['task_name']
        preds_path = self.results_path + "Preds-" + self.save_prefix + label_name + '.csv'
        preds_df.to_csv(preds_path)
        print "Preds df saved to", preds_path

        return self.net.final_test_results['acc'], self.net.final_test_results['auc'], preds
    def getFinalResultsAndSave(self, results_dict):
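        """Retrain the MTMKL classifier on the full training data with the best
        hyperparameters found during the search, evaluate on the held-out test tasks,
        save the learned eta values, and (when a test CSV is set) save predictions."""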
        print "\nRetraining on full training data with the best settings..."
        self.drop20 = False
        self.initializeAndTrainMTMKL(self.train_tasks,
                                     results_dict['C'],
                                     results_dict['beta'],
                                     results_dict['kernel'],
                                     results_dict['v'],
                                     results_dict['regularizer'],
                                     verbose=True)

        print "\nEvaluating results on held-out test set!! ..."
        all_preds = []
        all_true_y = []
        per_task_accs = [np.nan] * self.n_tasks
        per_task_aucs = [np.nan] * self.n_tasks
        per_task_f1 = [np.nan] * self.n_tasks
        per_task_precision = [np.nan] * self.n_tasks
        per_task_recall = [np.nan] * self.n_tasks
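        # NaN placeholders: tasks skipped below (no predictions or labels) stay NaN and
        # are therefore ignored by the np.nanmean averages computed after the loop.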
        for t in range(self.n_tasks):
            preds = self.classifier.predictOneTask(self.test_tasks, t)
            true_y = list(self.test_tasks[t]['Y'].flatten())

            if len(preds) == 0 or len(true_y) == 0:
                print "no y for task", t, "... skipping"
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs[t] = t_acc
            per_task_aucs[t] = t_auc
            per_task_f1[t] = t_f1
            per_task_precision[t] = t_precision
            per_task_recall[t] = t_recall

        print "\nPlotting cool stuff about the final model..."
        self.saveImagePlot(self.classifier.eta, 'Etas')
        pd.DataFrame(
            self.classifier.eta).to_csv(self.etas_path + self.save_prefix +
                                        "-etas.csv")

        print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
        acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
            all_preds, all_true_y)
        print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

        print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = self.test_tasks[t]['Name']
                task_name = helper.getFriendlyLabelName(task_name)
                print "\t\t", task_name, "- Acc:", per_task_accs[
                    t], "AUC:", per_task_aucs[t], 'F1:', per_task_f1[
                        t], 'Precision:', per_task_precision[
                            t], 'Recall:', per_task_recall[t]

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if 'Big5GenderKMeansCluster' in self.file_prefix:
                task_column = 'Big5GenderKMeansCluster'
                tasks_are_ints = True
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.classifier.predict_01,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=tasks_are_ints)
            elif not self.users_as_tasks:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.classifier.predict_01,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            else:
                print "Error! Cannot determine what type of model you are training and therefore cannot save predictions."
                return
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"
    def retrainAndPlot(self, setting_dict):
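        """Retrain the network with the best settings, plot validation results (per label
        when multilabel or per-task printing is enabled), save held-out test predictions
        if a test CSV is set, and save a copy of the final model."""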
        print "\nRETRAINING WITH THE BEST SETTINGS:"

        self.net.verbose = True
        self.net.setParams(l2_beta=setting_dict['l2_beta'],
                           initial_learning_rate=setting_dict['learning_rate'],
                           decay=setting_dict['decay'],
                           decay_steps=setting_dict['decay_steps'],
                           decay_rate=setting_dict['decay_rate'],
                           batch_size=setting_dict['batch_size'],
                           optimizer=setting_dict['optimizer'],
                           dropout=setting_dict['dropout'])
        self.constructNetwork(setting_dict['hidden_layers'])

        self.net.setUpGraph()
        self.net.runGraph(self.test_steps, print_test=True)

        if self.multilabel:
            for label in self.optimize_labels:
                friendly_label = helper.getFriendlyLabelName(label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.eps',
                                        label=label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.png',
                                        label=label)
                print "Final validation results for", friendly_label,"... Acc:", \
                  self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
        elif self.print_per_task:
            for label in self.wanted_labels:
                friendly_label = helper.getFriendlyLabelName(label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.eps',
                                        label=label)
                self.net.plotValResults(save_path=self.figures_path +
                                        self.val_output_prefix + '-' +
                                        friendly_label + '.png',
                                        label=label)
                print "Final validation results for", friendly_label,"... Acc:", \
                 self.net.training_val_results_per_task['acc'][label][-1], "Auc:", self.net.training_val_results_per_task['auc'][label][-1]
        else:
            self.net.plotValResults(save_path=self.figures_path +
                                    self.val_output_prefix + '.eps')
            self.net.plotValResults(save_path=self.figures_path +
                                    self.val_output_prefix + '.png')
            print "Final AUC:", self.net.training_val_results['auc'][-1]

        if self.test_csv_filename is not None:
            if self.multitask:
                task_column = None
                if 'Cluster' in self.dataset_name:
                    print "Guessing the task column is Big5GenderKMeansCluster - if this is incorrect expect errors"
                    task_column = 'Big5GenderKMeansCluster'
                    tasks_are_ints = True

                if 'User' in self.dataset_name:
                    print "Guessing the task column is user_id - if this is incorrect expect errors"
                    task_column = 'user_id'
                    tasks_are_ints = False

                if task_column is not None:
                    label_name = helper.getFriendlyLabelName(self.dataset_name)
                    wanted_label = helper.getOfficialLabelName(label_name)
                    test_preds_df = helper.get_test_predictions_for_df_with_task_column(
                        self.net.predict,
                        self.test_csv_filename,
                        task_column,
                        self.net.test_tasks,
                        wanted_label=wanted_label,
                        num_feats_expected=np.shape(
                            self.net.test_tasks[0]['X'])[1],
                        label_name=label_name,
                        tasks_are_ints=tasks_are_ints)
                else:
                    test_preds_df = helper.get_test_predictions_for_df_with_no_task_column(
                        self.net.predict,
                        self.test_csv_filename,
                        self.net.test_tasks,
                        num_feats_expected=np.shape(
                            self.net.test_tasks[0]['X'])[1])
            else:
                test_preds_df = self.net.get_preds_for_df()
            print "Got a test preds df! Saving it to:", self.results_path + "Preds-" + self.val_output_prefix + '.csv'
            test_preds_df.to_csv(self.results_path + 'Preds-' +
                                 self.val_output_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"

        print "Saving a copy of the final model!"
        self.net.save_model(self.val_output_prefix, self.results_path)
    def getFinalResultsAndSave(self, setting_dict):
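        """Retrain the HBLR classifier with the best settings, save diagnostics (the phi
        matrix and convergence plots), evaluate on the held-out test tasks, optionally
        save test predictions, and pickle the final classifier."""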
        if self.val_type == 'cross':
            print "\nPlotting cross-validation results for best settings..."
            self.getCrossValidationResults(dict(),
                                           setting_dict['tau10'],
                                           setting_dict['tau20'],
                                           setting_dict['sigma_multiplier'],
                                           setting_dict['mu_multiplier'],
                                           save_plots=True)

        print "\nRetraining on training data with the best settings..."
        self.initializeHBLRModel(self.train_tasks)
        self.classifier.verbose = True
        self.setClassifierToSetting(setting_dict['tau10'],
                                    setting_dict['tau20'],
                                    setting_dict['sigma_multiplier'],
                                    setting_dict['mu_multiplier'])
        self.classifier.trainUntilConverged()

        print "\nPlotting and saving cool stuff about the final model..."
        self.saveImagePlot(self.classifier.phi, 'Phi')
        pd.DataFrame(self.classifier.phi).to_csv(self.results_path +
                                                 self.save_prefix + "-phi.csv")
        self.saveConvergencePlots()

        print "\nEvaluating results on held-out test set!! ..."
        all_preds = []
        all_true_y = []
        all_X_data = []
        per_task_accs = [np.nan] * self.n_tasks
        per_task_aucs = [np.nan] * self.n_tasks
        per_task_f1 = [np.nan] * self.n_tasks
        per_task_precision = [np.nan] * self.n_tasks
        per_task_recall = [np.nan] * self.n_tasks
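        # NaN placeholders let np.nanmean ignore any task that is skipped in the loop below.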
        for t in range(self.n_tasks):
            preds = self.classifier.predictBinary(self.test_tasks[t]['X'], t)
            true_y = list(self.test_tasks[t]['Y'].flatten())

            if len(preds) == 0 or len(true_y) == 0:
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)
            all_X_data.extend(self.test_tasks[t]['X'])

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs[t] = t_acc
            per_task_aucs[t] = t_auc
            per_task_f1[t] = t_f1
            per_task_precision[t] = t_precision
            per_task_recall[t] = t_recall

        print "\tHELD OUT TEST METRICS COMPUTED BY APPENDING ALL PREDS"
        acc, auc, f1, precision, recall = helper.computeAllMetricsForPreds(
            all_preds, all_true_y)
        print '\t\tAcc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall

        print "\n\tHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print '\t\tAcc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        print "\n\tHELD OUT TEST METRICS COMPUTED FOR EACH TASK"
        if not self.users_as_tasks:
            for t in range(self.n_tasks):
                task_name = self.test_tasks[t]['Name']
                # The enclosing check already guarantees users_as_tasks is False here.
                task_name = helper.getFriendlyLabelName(task_name)
                print "\t\t", task_name, "- Acc:", per_task_accs[t], "AUC:", per_task_aucs[t], \
                    'F1:', per_task_f1[t], 'Precision:', per_task_precision[t], \
                    'Recall:', per_task_recall[t]

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if self.users_as_tasks:
                task_column = 'user_id'
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.classifier.predictBinary,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=False)
            else:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.classifier.predictBinary,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"

        print "\t SAVING CLASSIFIER"
        with open(
                self.results_path + "PickledModel-" + self.save_prefix + '.p',
                "w") as f:
            pickle.dump(self.classifier, f)
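    # Hedged sketch (not part of the source): the pickled classifier saved above could later be
    # reloaded and queried along these lines, provided the classifier's class is importable and
    # results_path / save_prefix match the values used at save time:
    #
    #     import pickle
    #     with open(results_path + "PickledModel-" + save_prefix + '.p', "rb") as f:
    #         classifier = pickle.load(f)
    #     preds = classifier.predictBinary(X_test, task_index)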
    def get_final_results(self, optimize_for='val_acc'):
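        """Evaluate on the held-out test set using, for each task, the hyperparameter
        setting that scored best on `optimize_for`; report pooled and task-averaged
        metrics and optionally save test predictions to CSV."""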
        if self.users_as_tasks and not self.check_test:
            print "check_test is set to false, Will not evaluate performance on held-out test set."
            return
        print "\nAbout to evaluate results on held-out test set!!"
        print "Will use the settings that produced the best", optimize_for

        all_preds = []
        all_true_y = []
        per_task_accs = []
        per_task_aucs = []
        per_task_f1 = []
        per_task_precision = []
        per_task_recall = []

        for t in range(self.n_tasks):
            task_settings = self.find_best_setting_for_task(
                t, optimize_for=optimize_for)
            assert task_settings['task_num'] == t
            if not self.users_as_tasks:
                print "\nBEST SETTING FOR TASK", t, "-", task_settings['task_name']
                print "The highest", optimize_for, "of", task_settings[optimize_for], \
                    "was found with the following settings:"
                print task_settings

            task_settings = self.convert_param_dict_for_use(task_settings)
            preds, true_y = self.get_preds_true_for_task(
                self.train_tasks, self.test_tasks, task_settings)
            if preds is None or true_y is None:
                continue

            all_preds.extend(preds)
            all_true_y.extend(true_y)

            # save the per-task results
            t_acc, t_auc, t_f1, t_precision, t_recall = helper.computeAllMetricsForPreds(
                preds, true_y)
            per_task_accs.append(t_acc)
            per_task_aucs.append(t_auc)
            per_task_f1.append(t_f1)
            per_task_precision.append(t_precision)
            per_task_recall.append(t_recall)

            if not self.users_as_tasks:
                print "\nFINAL TEST RESULTS FOR", helper.getFriendlyLabelName(
                    self.train_tasks[t]['Name'])
                print 'Acc:', t_acc, 'AUC:', t_auc, 'F1:', t_f1, 'Precision:', t_precision, 'Recall:', t_recall

        print "\nHELD OUT TEST METRICS COMPUTED BY AVERAGING OVER TASKS"
        avg_acc = np.nanmean(per_task_accs)
        avg_auc = np.nanmean(per_task_aucs)
        avg_f1 = np.nanmean(per_task_f1)
        avg_precision = np.nanmean(per_task_precision)
        avg_recall = np.nanmean(per_task_recall)
        print 'Acc:', avg_acc, 'AUC:', avg_auc, 'F1:', avg_f1, 'Precision:', avg_precision, 'Recall:', avg_recall

        if self.test_csv_filename is not None:
            print "\tSAVING HELD OUT PREDICITONS"
            if self.users_as_tasks:
                task_column = 'user_id'
                label_name = helper.getFriendlyLabelName(self.file_prefix)
                wanted_label = helper.getOfficialLabelName(label_name)
                predictions_df = helper.get_test_predictions_for_df_with_task_column(
                    self.predict_task,
                    self.test_csv_filename,
                    task_column,
                    self.test_tasks,
                    wanted_label=wanted_label,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1],
                    label_name=label_name,
                    tasks_are_ints=False)
            else:
                predictions_df = helper.get_test_predictions_for_df_with_no_task_column(
                    self.predict_task,
                    self.test_csv_filename,
                    self.test_tasks,
                    num_feats_expected=np.shape(self.test_tasks[0]['X'])[1])
            predictions_df.to_csv(self.results_path + "Preds-" +
                                  self.save_prefix + '.csv')
        else:
            print "Uh oh, the test csv filename was not set, can't save test preds"