def train(x_train: np.ndarray, y_train: np.ndarray,
          x_val: np.ndarray, y_val: np.ndarray, save_path=None):
    """Fit a LightGBM booster and return its predictions on the validation set.

    Training is configured by the module-level ``PARAM`` and ``NUM_ROUND``
    values; both the training and the validation dataset are passed as
    ``valid_sets``. After prediction, ``accuracy(y_val, pred)`` is called and
    its return value is discarded.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.
        save_path: Optional destination for the trained model. When given,
            the booster is written there via ``Booster.save_model``. When
            ``None`` (the default) nothing is written, which is what the
            function did before for every call.

    Returns:
        The booster's predictions for ``x_val``.
    """
    train_data = lgb.Dataset(x_train, label=y_train)
    val_data = lgb.Dataset(x_val, label=y_val)

    # NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0;
    # confirm against the version this project pins before upgrading.
    gbm = lgb.train(PARAM, train_data, NUM_ROUND,
                    valid_sets=[train_data, val_data],
                    verbose_eval=True)

    if save_path is not None:
        # str() keeps pathlib.Path destinations working on LightGBM versions
        # that only accept a string filename.
        gbm.save_model(str(save_path))

    pred = gbm.predict(x_val)
    accuracy(y_val, pred)
    return pred
def plot(self): if self.values is None or (self.cols is None and self.rows is None): self.draw_empty() else: value_type = self.frame.value_type.GetItemLabel( self.frame.value_type.GetSelection()) if value_type == 'metric': agg = stats.aggregate(self.df, subplots=self.subplots, rows=self.rows, cols=self.cols, yerr=self.yerr, values=self.values) elif value_type == 'accuracy': correct = list( self.frame.panel_corr.check_correct.GetCheckedStrings()) incorrect = list( self.frame.panel_corr.check_incorrect.GetCheckedStrings()) agg = stats.accuracy(self.df, subplots=self.subplots, rows=self.rows, cols=self.cols, yerr=self.yerr, values=self.values, correct=correct, incorrect=incorrect) #import pdb; pdb.set_trace() self.redraw(agg) self.frame.list_agg.DeleteAllItems() for i in range(self.frame.list_agg.GetColumnCount()): self.frame.list_agg.DeleteColumn(0) aggr = self.frame.list_agg.stack(agg) self.frame.list_agg.set_data(aggr) self.frame.aggr = aggr
def plot_accuracy_t_sweep(M, T, T_hat, output_dir):
    """Plot test-set and zero-set accuracy for thresholds t = 0.0, 0.1, ..., 1.0.

    ``T_hat`` is passed through ``normalize_predictions`` once, then
    ``accuracy(M, T, T_hat_norm, t)`` is evaluated for each of the 11 evenly
    spaced ``t`` values in [0, 1]. Each call is expected to return a
    ``(test_set_accuracy, zero_set_accuracy)`` pair. Both curves are drawn on
    the current pyplot figure and saved as ``accuracy.png`` in ``output_dir``.

    Args:
        M, T: Forwarded unchanged to ``accuracy``.
        T_hat: Raw predictions; normalized before the sweep.
        output_dir: Directory for the image. May be a ``str`` or any
            ``os.PathLike`` (e.g. ``pathlib.Path``).

    Note:
        This function neither creates nor closes a figure, so it draws onto
        whatever figure is current when it is called.
    """
    # Local import so this block does not depend on a file-level `import os`.
    import os

    print('Plotting t sweep over accuracy...')
    T_hat_norm = normalize_predictions(T_hat)
    t_values = np.linspace(0, 1, 11)

    sweep = [accuracy(M, T, T_hat_norm, t) for t in t_values]
    test_set_accs = [test_acc for test_acc, _ in sweep]
    zero_set_accs = [zero_acc for _, zero_acc in sweep]

    plt.plot(t_values, test_set_accs, label='Test Set Accuracy')
    plt.plot(t_values, zero_set_accs, label='Zero Set Accuracy')
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.title('t Sweep of Model Accuracy')
    plt.xlabel('Value of t')
    plt.ylabel('Accuracy')
    plt.legend()

    # os.path.join instead of `output_dir + '/accuracy.png'`: accepts Path
    # objects and does not turn an empty directory into the root path.
    plt.savefig(os.path.join(output_dir, 'accuracy.png'))
def plot(self): if self.values is None or (self.cols is None and self.rows is None): self.draw_empty() else: value_type = self.frame.value_type.GetItemLabel(self.frame.value_type.GetSelection()) if value_type == 'metric': agg = stats.aggregate(self.df, subplots=self.subplots, rows=self.rows, cols=self.cols, yerr=self.yerr, values=self.values) elif value_type == 'accuracy': correct = list(self.frame.panel_corr.check_correct.GetCheckedStrings()) incorrect = list(self.frame.panel_corr.check_incorrect.GetCheckedStrings()) agg = stats.accuracy(self.df, subplots=self.subplots, rows=self.rows, cols=self.cols, yerr=self.yerr, values=self.values, correct=correct, incorrect=incorrect) #import pdb; pdb.set_trace() self.redraw(agg) self.frame.list_agg.DeleteAllItems() for i in range(self.frame.list_agg.GetColumnCount()): self.frame.list_agg.DeleteColumn(0) aggr = self.frame.list_agg.stack(agg) self.frame.list_agg.set_data(aggr) self.frame.aggr = aggr
# NOTE(review): this span begins in the middle of a call -- everything before
# `output_fname=False)` is outside the visible source -- and its original line
# breaks/indentation are lost, so loop boundaries cannot be told from here.
# Visible flow, in statement order:
#   1. Unzip `val_iterator` into x/y batches, concatenate each, flatten y.
#   2. Look up the model class for `model_name` in MODEL2CLASS, call
#      K.clear_session(), build the model from `weights_path`, and predict on
#      x (predictions are flattened).
#   3. Store `y_pred * 10` in DATA_DF['pred'] for the rows in `test_df.index`
#      and check that DATA_DF['rank'] for those rows almost-equals `y * 10`.
#   4. Call accuracy(...) for this fold, then again on `DATA_DF.dropna()`,
#      which is also written to the PREDS_DF path as CSV without the index.
#   5. Call accuracy(..., verbose=False) per (datasetId, baseSf) group, print
#      the mean of each result's first element, and show a histogram of those
#      values. Presumably that element is a Spearman correlation, judging by
#      the variable name -- confirm against accuracy().
output_fname=False) x, y = zip(*val_iterator) x = np.concatenate(x) y = np.concatenate(y).flatten() model_class = MODEL2CLASS[model_name] K.clear_session() model = model_class(weights=weights_path) y_pred = model.predict(x).flatten() DATA_DF.loc[test_df.index, 'pred'] = y_pred * 10 np.testing.assert_array_almost_equal( DATA_DF.loc[test_df.index, 'rank'], y * 10) accuracy(y * 10, y_pred * 10, f'\nFold {test_fold} accuracy:') df_preds = DATA_DF.dropna() df_preds.to_csv(PREDS_DF, index=False) accuracy(df_preds['rank'], df_preds['pred'], f'\nTotal preds {len(df_preds)}:') spearman = [] for g in df_preds.groupby(['datasetId', 'baseSf']): spearman.append(accuracy(g[1]['rank'], g[1]['pred'], verbose=False)) print(f'\nMean spearman {np.array([s[0] for s in spearman]).mean()}') plt.figure() plt.hist(np.array([s[0] for s in spearman])) plt.show()
# NOTE(review): the statements up to and including `return pred` are the tail
# of a function whose header is outside this span. It looks like the end of
# the `train(...)` function that the script below calls -- confirm. The
# original line breaks/indentation are lost, so block boundaries are inferred.
#
# Script entry point (`if __name__ == '__main__':`), in statement order:
#   1. Parse an optional `-features` argument (path to unsupervised features).
#   2. Set MODEL_TYPE to 'gbt'; FEATURE_DICT_PATH is the `-features` path if
#      given, otherwise MODEL_DIR / 'features.ncomp20.naugs40.pkl';
#      PREDS_DF_PATH is PREDS_DIR / 'preds_gbt.csv'.
#   3. Copy DATA_DF into PREDS_DF.
#   4. For each fold in FOLDS: load the train/test split with load_data(...),
#      assert that the two splits together have len(DATA_DF) rows, train via
#      train(...), and store `pred * 10` in PREDS_DF['pred'] for the rows in
#      `df_index`.
#   5. Write PREDS_DF to PREDS_DF_PATH as CSV (no index) and call
#      accuracy(DATA_DF['rank'], PREDS_DF['pred']). Whether these two
#      statements run per fold or once after the loop cannot be determined
#      from the collapsed text.
# NOTE(review): the split-size check uses `assert`, which is skipped when
# Python runs with -O.
val_data = lgb.Dataset(x_val, label=y_val) gbm = lgb.train(PARAM, train_data, NUM_ROUND, valid_sets=[train_data, val_data], verbose_eval=True) pred = gbm.predict(x_val) accuracy(y_val, pred) return pred if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-features', default=None, required=False, help='path to unsupervised features') args = parser.parse_args() MODEL_TYPE = 'gbt' MODEL_DIR = Path(CURRENT_DIR / 'models/unsupervised_model') FEATURE_DICT_PATH = Path(args.features) if args.features else MODEL_DIR / 'features.ncomp20.naugs40.pkl' PREDS_DF_PATH = PREDS_DIR / 'preds_{}.csv'.format(MODEL_TYPE) PREDS_DF = DATA_DF.copy() for fold in FOLDS: print(f'Fold {fold}/{len(FOLDS)}') (train_features, train_y), (test_features, test_y), df_index = load_data(FEATURE_DICT_PATH, DATA_DF, fold, N_FOLDS) assert len(train_features) + len(test_features) == len(DATA_DF) pred = train(train_features, train_y, test_features, test_y) PREDS_DF.loc[df_index, 'pred'] = pred * 10 PREDS_DF.to_csv(PREDS_DF_PATH, index=False) accuracy(DATA_DF['rank'], PREDS_DF['pred'])