def isotonicPredict(trainData, evalData):
    """
    Fit an isotonic regression of column '600' on column 'HT' and predict.

    ``trainData`` is split 75 % / 25 % (``random_state=42``). The model is
    fitted on the larger part, scored with R^2 on the held-out part and then
    used to predict a value for every 'HT' entry of ``evalData``. The model
    is created with ``out_of_bounds='clip'``.

    :trainData: pandas dataframe with the columns 'HT' and '600'
    :evalData: pandas dataframe with the column 'HT'
    :returns: dict with the keys 'r2' (hold-out R^2), 'pred' (1-D array of
        predictions for ``evalData``) and 'name' (always 'isotonic')
    """
    predictor = trainData['HT'].values.reshape(-1, 1)
    response = trainData['600'].values.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(
        predictor, response, test_size=.25, random_state=42)

    # IsotonicRegression is fed 1-D arrays, hence the ravel() calls.
    model = IsotonicRegression(out_of_bounds='clip')
    model.fit(X_train.ravel(), y_train.ravel())

    holdout_pred = model.predict(X_test.ravel())
    holdout_r2 = r2_score(y_test, holdout_pred)
    eval_pred = model.predict(evalData['HT'].values.ravel())

    return {'r2': holdout_r2, 'pred': eval_pred.flatten(), 'name': 'isotonic'}
def test_fast_predict():
    # The faster prediction path must not change out-of-sample predictions:
    # https://github.com/scikit-learn/scikit-learn/pull/6206
    n_samples = 10**3
    rng = np.random.RandomState(123)

    # X values over the -10,10 range
    X_train = 20.0 * rng.rand(n_samples) - 10
    draws = rng.rand(n_samples)
    y_train = np.less(draws, expit(X_train)).astype('int64').astype('float64')

    # Everything must still work when some weights are 0.
    weights = rng.rand(n_samples)
    zero_weight = rng.rand(n_samples) < 0.1
    weights[zero_weight] = 0

    # Reference model: build the interpolation function from ALL input data,
    # i.e. repeat the two steps of .fit() without trimming redundant points.
    slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
    X_train_fit, y_train_fit = slow_model._build_y(
        X_train, y_train, sample_weight=weights, trim_duplicates=False)
    slow_model._build_f(X_train_fit, y_train_fit)

    # Regular model: fitted with just the necessary data.
    fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
    fast_model.fit(X_train, y_train, sample_weight=weights)

    X_test = 20.0 * rng.rand(n_samples) - 10
    assert_array_equal(slow_model.predict(X_test), fast_model.predict(X_test))
def test_fast_predict():
    # Regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/6206:
    # trimming redundant points in fit() must leave out-of-sample
    # predictions untouched.
    rng = np.random.RandomState(123)
    n_samples = 10 ** 3

    def uniform_x():
        # X values over the -10,10 range
        return 20.0 * rng.rand(n_samples) - 10

    X_train = uniform_x()
    y_train = np.less(rng.rand(n_samples),
                      expit(X_train)).astype('int64').astype('float64')

    weights = rng.rand(n_samples)
    # a share of the weights is set to 0 to cover that case as well
    weights[rng.rand(n_samples) < 0.1] = 0

    models = {
        name: IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
        for name in ("slow", "fast")
    }

    # "slow": interpolation function built from every input point (the two
    # steps of .fit(), minus the removal of unnecessary points).
    X_full, y_full = models["slow"]._build_y(
        X_train, y_train, sample_weight=weights, trim_duplicates=False)
    models["slow"]._build_f(X_full, y_full)

    # "fast": the regular fit, which keeps only the necessary data.
    models["fast"].fit(X_train, y_train, sample_weight=weights)

    X_test = uniform_x()
    y_pred_slow = models["slow"].predict(X_test)
    y_pred_fast = models["fast"].predict(X_test)
    assert_array_equal(y_pred_slow, y_pred_fast)
def do_preds(train, test, files):
    """Fit one isotonic regression on ``train`` and write test predictions.

    Both frames are sorted by "bin" (descending) and their "bin" column is
    multiplied by -33.219281. The regression maps the flattened values of
    the ensemble columns (second item of every entry in ``files``) to
    "target"; the train RMSE and a few summaries are printed, and the test
    predictions are written to ``path_const.OUTPUT_ENS``.

    NOTE(review): the flattened feature array only lines up with "target"
    when ``files`` names a single column -- confirm with the caller.
    """
    bin_factor = -33.219281
    train = train.sort_values(by="bin", ascending=False)
    test = test.sort_values(by="bin", ascending=False)
    train["bin"] = train["bin"] * bin_factor
    test["bin"] = test["bin"] * bin_factor

    print(train.head())
    print(train.tail())
    print("------- do preds --------")

    ensemble_col = [entry[1] for entry in files]
    train_x = train[ensemble_col].values.reshape(-1)
    test_x = test[ensemble_col].values.reshape(-1)

    reg = IsotonicRegression()
    reg.fit(train_x, train["target"])
    print(evaluator.rmse(train["target"], reg.predict(train_x)))

    sub = pd.DataFrame({"card_id": test["card_id"],
                        "target": reg.predict(test_x)})

    print(train["target"].describe())
    print(sub["target"].describe())
    sub.to_csv(path_const.OUTPUT_ENS, index=False)
def calibrate_row(row):
    """Calibrate column ``row`` of the module-level frames in place.

    An isotonic regression bounded to [0, 1] is fitted on the rows of
    ``lab`` whose ``row`` value is not NaN, using ``lab['labels']`` as the
    target. Column ``row`` of ``lab``, ``amb``, ``unl`` and ``scr`` is then
    replaced by the calibrated values.
    """
    known = lab[~np.isnan(lab[row])]
    calibrator = IsotonicRegression(y_min=0, y_max=1)
    calibrator.fit(known[row].values, known['labels'].values)

    # Same order as before: lab first, then amb, unl, scr.
    for frame in (lab, amb, unl, scr):
        frame[row] = calibrator.predict(frame[row].values)
def do_cv_pred(train, test, files):
    """Cross-validated isotonic regression of "target" on the ensemble column.

    Runs a 5-fold stratified split (stratified on ``target < -30``). For
    every fold an isotonic regression is fitted, the validation RMSE is
    printed, and the fold's test predictions are accumulated. Writes the
    fold-averaged test predictions to ``path_const.OUTPUT_SUB`` and the
    clipped out-of-fold predictions to ``path_const.OUTPUT_OOF``, then
    prints the overall out-of-fold RMSE.

    NOTE(review): features are flattened with ``reshape(-1)``, which only
    matches the targets when ``files`` names a single column -- confirm.
    """
    print("------- do preds --------")
    split_num = 5
    ensemble_col = [entry[1] for entry in files]

    train_x = train[ensemble_col]
    train_y = train["target"]
    test_x = test[ensemble_col].values.reshape(-1)

    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0

    outliers = (train["target"] < -30).astype(int).values
    skf = model_selection.StratifiedKFold(
        n_splits=split_num, shuffle=True, random_state=4590)

    train_preds = []
    for fit_rows, valid_rows in skf.split(train, outliers):
        fit_x = train_x.iloc[fit_rows].values.reshape(-1)
        valid_x = train_x.iloc[valid_rows].values.reshape(-1)
        fit_y = train_y.iloc[fit_rows]
        valid_y = train_y.iloc[valid_rows]

        reg = IsotonicRegression()
        reg.fit(fit_x, fit_y)
        valid_set_pred = reg.predict(valid_x)

        print(valid_y.describe())
        print(pd.DataFrame(valid_set_pred).describe())
        print(evaluator.rmse(valid_y, valid_set_pred))

        # Accumulate; the average over folds is taken after the loop.
        submission["target"] = submission["target"] + reg.predict(test_x)

        fold_frame = pd.DataFrame()
        fold_frame["card_id"] = train.iloc[valid_rows]["card_id"]
        fold_frame["cv_pred"] = valid_set_pred
        train_preds.append(fold_frame)

    train_output = pd.concat(train_preds, axis=0)

    submission["target"] = submission["target"] / split_num
    submission.to_csv(path_const.OUTPUT_SUB, index=False)

    train_output["cv_pred"] = np.clip(
        train_output["cv_pred"], -33.219281, 18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)

    df_pred = pd.merge(
        train[["card_id", "target"]], train_output, on="card_id")
    print(evaluator.rmse(df_pred["target"], df_pred["cv_pred"]))
def test_isotonic_regression_oob_raise():
    # With out_of_bounds="raise", predicting outside the fitted x-range
    # must raise a ValueError.
    targets = np.array([3, 7, 5, 9, 8, 7, 10])
    positions = np.arange(len(targets))

    model = IsotonicRegression(increasing='auto', out_of_bounds="raise")
    model.fit(positions, targets)

    outside = [min(positions) - 10, max(positions) + 10]
    expected = 'A value in x_new is below the interpolation range'
    with pytest.raises(ValueError, match=expected):
        model.predict(outside)
def test_isotonic_regression_oob_clip():
    # With out_of_bounds="clip", predictions for x far outside the fitted
    # range must have the same extremes as the in-range predictions.

    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="clip")
    ir.fit(x, y)

    # Predict from training and test x and check that min/max match.
    y1 = ir.predict([min(x) - 10, max(x) + 10])
    y2 = ir.predict(x)

    # Plain asserts replace the nose-style assert_equal helper: pytest
    # rewrites them into informative failures and no helper import is needed.
    assert max(y1) == max(y2)
    assert min(y1) == min(y2)
def test_isotonic_regression_oob_clip():
    # out_of_bounds="clip": the extremes predicted far outside the fitted
    # range must equal the extremes predicted on the training x.
    targets = np.array([3, 7, 5, 9, 8, 7, 10])
    positions = np.arange(len(targets))

    model = IsotonicRegression(increasing='auto', out_of_bounds="clip")
    model.fit(positions, targets)

    far_outside = [min(positions) - 10, max(positions) + 10]
    clipped = model.predict(far_outside)
    in_range = model.predict(positions)

    assert max(clipped) == max(in_range)
    assert min(clipped) == min(in_range)
class Recalibrator:
    """Isotonic map from sorted model outputs to sorted labels.

    At construction the model is evaluated once on ``data[0]``. Outputs and
    labels (``data[1]``) are flattened and sorted *independently*, and an
    increasing isotonic regression with ``out_of_bounds='clip'`` is fitted
    from the sorted outputs to the sorted labels.
    """

    def __init__(self, model, data, args):
        self.args = args
        self.model = model

        device = args.device
        inputs = data[0].to(device)
        labels = data[1].to(device)
        with torch.no_grad():
            outputs = model(inputs)
            sorted_labels, _ = torch.sort(labels.flatten())
            sorted_outputs, _ = torch.sort(outputs.flatten())
            sorted_labels = sorted_labels.cpu().numpy()
            sorted_outputs = sorted_outputs.cpu().numpy()

        regressor = IsotonicRegression(out_of_bounds='clip', increasing=True)
        self.iso = regressor.fit(sorted_outputs, sorted_labels)

    def adjust(self, original_y):
        """Apply the fitted map to ``original_y``.

        The result keeps the shape of ``original_y`` and is moved to
        ``self.args.device``.
        """
        shape = original_y.shape
        flat_input = original_y.cpu().flatten()
        recalibrated = torch.from_numpy(self.iso.predict(flat_input))
        return recalibrated.view(shape).to(self.args.device)
def cali(fname, predict_name, out_name, mode='ctr'):
    """Calibrate predicted probabilities with isotonic regression.

    An isotonic regression is fitted on the calibration file ``fname``
    (predicted probability -> observed outcome) and applied to the
    probabilities in ``predict_name``. The calibrated values are written to
    ``out_name``, one per line, without a trailing newline.

    Both CSV files are read header-less, with the module-level ``columns``
    list as column names.

    :param fname: CSV file used to fit the calibrator
    :param predict_name: CSV file whose probabilities are calibrated
    :param out_name: path of the text file to write
    :param mode: 'ctr' (columns 'actual_click' / 'ctr') or
        'cvr' (columns 'actual_purchase' / 'cvr')
    :raises ValueError: if ``mode`` is neither 'ctr' nor 'cvr'
    """
    columns_by_mode = {
        'ctr': ('actual_click', 'ctr'),
        'cvr': ('actual_purchase', 'cvr'),
    }
    if mode not in columns_by_mode:
        # Previously an unknown mode surfaced later as an unbound-local error.
        raise ValueError(
            "mode must be 'ctr' or 'cvr', got {!r}".format(mode))
    true_col, prob_col = columns_by_mode[mode]

    pred_df = pd.read_csv(predict_name, names=columns)
    df = pd.read_csv(fname, names=columns)

    ir = IsotonicRegression()
    ir.fit(df[prob_col].values, df[true_col].values)
    y_pred = ir.predict(pred_df[prob_col].values)

    # The context manager closes the file even if a write fails.
    with open(out_name, 'w') as h:
        h.write('\n'.join(str(value) for value in y_pred))
def regression_model(freqs, deg=2, same=False, method='isotonic'):
    '''
    Smooth 40 per-quality frequencies (qualities 1..40).

    - freqs: array of 40 frequencies, indexed by quality - 1; entries that
      are NaN or not strictly positive are ignored
    - same: the isotonic fit of -log10(freq) is increasing unless `same`
      is true
    - method: 'isotonic' or 'polynomial'

    Returns the fitted frequencies for qualities 1..40, or 40 zeros when no
    usable entry exists. An unknown method prints a message and exits with
    status 1.

    NOTE(review): `deg` is not used; the polynomial degree is fixed to 3.
    '''
    qualities = np.arange(1, 41)
    usable = (~np.isnan(freqs)) & (freqs > 0)

    x = qualities[usable]
    if len(x) == 0:
        return np.zeros(40)

    # The fit is done on the -log10 scale and mapped back with 10**-(...).
    y = -np.log10(freqs[usable])

    if method == 'polynomial':
        polynom = np.poly1d(np.polyfit(x, y, 3))
        return 10**-polynom(qualities)

    if method == 'isotonic':
        ir = IsotonicRegression(y_min=0, out_of_bounds='clip',
                                increasing=not same)
        ir.fit(x, y)
        return 10**-ir.predict(qualities)

    print('Unknown method: {}. Aborting.'.format(method))
    exit(1)
def test_isotonic_regression():
    # Known solutions of the plain isotonic_regression function.
    first_y = np.array([3, 7, 5, 9, 8, 7, 10])
    first_expected = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(first_expected, isotonic_regression(first_y))

    y = np.array([10, 0, 2])
    expected = np.array([4, 4, 4])
    assert_array_equal(expected, isotonic_regression(y))

    # Estimator API: fit + transform, fit_transform and predict must agree.
    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # The result must not depend on the order of the samples.
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    permuted = ir.fit_transform(x[perm], y[perm])
    assert_array_equal(permuted, ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # All x equal must not crash; every prediction is the mean of y.
    ir = IsotonicRegression()
    constant_x = np.ones(len(x))
    assert_array_equal(ir.fit_transform(constant_x, y), np.mean(y))
def test_isotonic_2darray_more_than_1_feature():
    # fit, predict and transform must all reject input with 2 features.
    X = np.arange(10)
    y = np.arange(10)
    two_features = np.c_[X, X]
    expected = "should be a 1d array or 2d array with 1 feature"

    with pytest.raises(ValueError, match=expected):
        IsotonicRegression().fit(two_features, y)

    fitted = IsotonicRegression().fit(X, y)
    for method in (fitted.predict, fitted.transform):
        with pytest.raises(ValueError, match=expected):
            method(two_features)
def isotonic_regression(scores_dev: torch.Tensor, scores_test: torch.Tensor,
                        labels_dev: torch.Tensor, labels_test: torch.Tensor):
    """Calibrates confidence scores using scikit-learn implementation of
    isotonic regression.

    Isotonic regression does not require bins for recalibration.

    Args:
        scores_dev: [n, t] Tensor of confidence scores (e.g. softmaxed
            logits) for dev set.
        scores_test: [n, t] Tensor of confidence scores (e.g. softmaxed
            logits) for test set.
        labels_dev: [n, t] One-hot tensor of labels for dev set.
        labels_test: [n, t] One-hot tensor of labels for test set.
            Not used by this function.

    Returns:
        Flat float tensor with one calibrated score (bounded to [0, 1]) per
        element of ``scores_test``. It is placed on the GPU when CUDA is
        available, otherwise it stays on the CPU.
    """
    logger.info("Starting isotonic regression...")

    # sklearn needs host-side arrays. detach() lets tensors that still
    # track gradients be converted as well.
    flattened_scores_dev = scores_dev.detach().reshape(-1).cpu().numpy()
    flattened_labels_dev = labels_dev.detach().reshape(-1).cpu().numpy()
    flattened_scores_test = scores_test.detach().reshape(-1).cpu().numpy()

    model = IsotonicRegression(y_min=0, y_max=1)
    model.fit(X=flattened_scores_dev, y=flattened_labels_dev)

    predictions = torch.Tensor(model.predict(flattened_scores_test))
    # The unconditional .cuda() crashed on CPU-only machines; keep the GPU
    # placement only where a GPU actually exists.
    if torch.cuda.is_available():
        predictions = predictions.cuda()
    return predictions
def eval(scores_dir, out_path='/home/yh1844/inference-2019/eval/eval.txt'):
    """Calibrate pooled scores with isotonic regression and report metrics.

    Every file in ``scores_dir`` is read as comma-separated text; column 3
    is taken as the raw score and column 4 as the true label. The pooled
    scores are calibrated with an isotonic regression fitted on the same
    data, ROC-AUC, average precision, F1 and accuracy (threshold 0.5) are
    printed, and an array with the columns (raw score, label, calibrated
    score) is saved to ``out_path``.

    :param scores_dir: directory holding the score files
    :param out_path: destination of the saved array; defaults to the
        location that used to be hard-coded
    """
    y_preds = []
    y_trues = []
    # sorted(): os.listdir order is arbitrary, so make the row order stable.
    for fn in sorted(os.listdir(scores_dir)):
        print(fn)
        # ndmin=2 keeps a one-row file as length-1 arrays; without it the
        # unpacked values are scalars and list.extend() fails.
        y_pred, y_true = np.loadtxt(os.path.join(scores_dir, fn),
                                    delimiter=',', usecols=(3, 4),
                                    unpack=True, ndmin=2)
        y_preds.extend(y_pred)
        y_trues.extend(y_true)
    y_preds = np.array(y_preds)
    y_trues = np.array(y_trues)

    iso_reg = IsotonicRegression().fit(y_preds, y_trues)
    y_probs = iso_reg.predict(y_preds)

    auc_roc = roc_auc_score(y_trues, y_probs)
    auc_pr = average_precision_score(y_trues, y_probs)
    y_predicts = np.where(y_probs > 0.5, 1.0, 0.0)
    accuracy = accuracy_score(y_trues, y_predicts)
    f1 = f1_score(y_trues, y_predicts)
    print(
        f'total length is {len(y_trues)} \n auc score for roc is {auc_roc} and the auc for pr is {auc_pr}, f1 score is {f1}, accuracy is {accuracy}'
    )

    res = np.vstack((y_preds, y_trues, y_probs)).T
    np.savetxt(out_path, res)
class IsotonicRegressionCalibrator:
    """Score calibrator backed by an isotonic regression (``self.Calibrator``)."""

    def __init__(self):
        self.Calibrator = IsotonicRegression(out_of_bounds='clip')

    def train(self, h0, h1):
        """Fit the calibrator on two groups of scores.

        Scores in ``h0`` get label 0 (normal), scores in ``h1`` get label 1
        (anomalous). Both may be any array-like holding one-dimensional data
        (list, numpy array, m x 1, 1 x m, ...); they are flattened first.
        """
        n0, n1 = np.size(h0), np.size(h1)
        scores = np.concatenate([np.reshape(h0, n0), np.reshape(h1, n1)])
        labels = np.concatenate([np.zeros(n0), np.ones(n1)])
        self.Calibrator.fit(scores, labels)

    def test(self, H):
        """Return the fitted calibrator's predictions for the scores ``H``.

        ``H`` holds classifier scores as a one-dimensional array-like.
        """
        return self.Calibrator.predict(H)

    def toString(self):
        """Return the display name of this calibrator."""
        return "Isotonic"
def test_calibration_ensemble_false(data, method):
    # `ensemble=False` must give the same probabilities as training the
    # calibrator by hand on `cross_val_predict` outputs.
    X, y = data
    clf = LinearSVC(random_state=7)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=3, ensemble=False)
    cal_clf.fit(X, y)
    cal_probas = cal_clf.predict_proba(X)

    # Manual route: unbiased decision values from cross-validation ...
    unbiased_preds = cross_val_predict(
        clf, X, y, cv=3, method="decision_function")
    calibrator = (IsotonicRegression(out_of_bounds="clip")
                  if method == "isotonic" else _SigmoidCalibration())
    calibrator.fit(unbiased_preds, y)

    # ... applied to the decision values of `clf` fitted on all data.
    clf.fit(X, y)
    manual_probas = calibrator.predict(clf.decision_function(X))

    assert_allclose(cal_probas[:, 1], manual_probas)
def fnIsotonicRegression(self, year, avgTemp, predictYear):
    """Fit an isotonic regression of ``avgTemp`` on ``year``.

    10 % of the data is held out (``random_state=42``) for scoring.

    Returns a tuple ``(score, prediction)``: the model's ``score`` on the
    held-out part and its prediction for ``predictYear``.
    """
    split = train_test_split(year, avgTemp, test_size=0.1, random_state=42)
    years_fit, years_holdout, temps_fit, temps_holdout = split

    model = IsotonicRegression()
    model.fit(years_fit, temps_fit)

    holdout_score = model.score(years_holdout, temps_holdout)
    return (holdout_score, model.predict(predictYear))
def _gspv_interpolate_cloud(powers, velocities):
    """Return a monotone, piecewise-linear power -> velocity interpolator.

    An isotonic regression is fitted to the (power, velocity) cloud and
    sampled on an evenly spaced grid between the smallest and largest power
    (``np.linspace`` default number of points). The samples are wrapped in
    a spline created with ``k=1`` and ``ext=3``.
    """
    from sklearn.isotonic import IsotonicRegression
    from scipy.interpolate import InterpolatedUnivariateSpline

    fitted = IsotonicRegression().fit(powers, velocities)
    grid = np.linspace(min(powers), max(powers))
    return InterpolatedUnivariateSpline(grid, fitted.predict(grid),
                                        k=1, ext=3)
def test_isotonic_mismatched_dtype(y_dtype):
    # Regression test for #15004: when X and y have different dtypes the
    # data must be converted, and predictions carry the dtype of X.
    targets = np.array([2, 1, 4, 3, 5], dtype=y_dtype)
    features = np.arange(len(targets), dtype=np.float32)

    model = IsotonicRegression()
    model.fit(features, targets)

    predicted = model.predict(features)
    assert predicted.dtype == features.dtype
def test_isotonic_non_regression_inf_slope():
    # Non-regression test: predictions must stay finite for extremely
    # close x values, see
    # https://github.com/scikit-learn/scikit-learn/issues/10903
    train_x = np.array([0., 4.1e-320, 4.4e-314, 1.])
    train_y = np.array([0.42, 0.42, 0.44, 0.44])
    query_x = np.array([0, 2.1e-319, 5.4e-316, 1e-10])

    model = IsotonicRegression().fit(train_x, train_y)
    predicted = model.predict(query_x)

    assert np.all(np.isfinite(predicted))
def test_isotonic_duplicate_min_entry():
    # A duplicated minimum x value must not produce non-finite predictions.
    x = [0, 0, 1]
    y = [0, 0, 1]

    ir = IsotonicRegression(increasing=True, out_of_bounds="clip")
    ir.fit(x, y)
    all_predictions_finite = np.all(np.isfinite(ir.predict(x)))

    # Plain assert replaces the nose-style assert_true helper: pytest
    # rewrites it into an informative failure and no helper import is needed.
    assert all_predictions_finite
def test_isotonic_duplicate_min_entry():
    # The smallest x appears twice; every prediction must still be finite.
    samples = [0, 0, 1]
    targets = [0, 0, 1]

    model = IsotonicRegression(increasing=True, out_of_bounds="clip")
    model.fit(samples, targets)

    predicted = model.predict(samples)
    assert np.all(np.isfinite(predicted))
def _configure_classifier(self, X1, test_size=0.35):
    """Fit ``self.clf`` on uniform-vs-observed data and calibrate it.

    The classifier is trained to tell uniform random points (label 0) from
    the observed points ``X1`` (label 1). Its class-1 probabilities are then
    calibrated by hand on a held-out part of ``X1`` and a fresh uniform
    sample, and the calibration curve is stored as ``self.p1f_``.

    :param X1: array_like, shape (n_samples, n_features)
        List of n_features-dimensional data points to be modelled. Each row
        corresponds to a single data point.
    :param float test_size: fraction of X1 used for probability calibration.
        Default is 0.35.
    """
    seed = self.random_state
    np.random.seed(seed)

    # Observed points carry label 1; the held-out part is kept back for the
    # calibration below.
    observed_fit, observed_cal, observed_fit_labels, _ = train_test_split(
        X1, np.ones(X1.shape[0]), test_size=test_size, random_state=seed)

    # An equally shaped uniform sample carries label 0.
    uniform_fit = np.random.uniform(size=observed_fit.shape)
    features = np.concatenate([uniform_fit, observed_fit], axis=0)
    labels = np.concatenate(
        [np.zeros(observed_fit.shape[0]), observed_fit_labels], axis=None)
    self.clf = self.clf.fit(features, labels)

    # Manual probability calibration (used for the weights calculation).
    uniform_cal = np.random.uniform(size=(1000000, X1.shape[1]))
    p0 = self.clf.predict_proba(uniform_cal)[:, 1]
    p1 = self.clf.predict_proba(observed_cal)[:, 1]

    counts_p0, bin_edges = np.histogram(p0, bins=100, range=(0, 1))
    counts_p1, bin_edges = np.histogram(p1, bins=100, range=(0, 1))
    # Bins are 0.01 wide, so the centers sit 0.005 above the left edges.
    bin_centers = bin_edges[:-1] + 0.005

    share_p0 = counts_p0 / counts_p0.sum()
    share_p1 = counts_p1 / counts_p1.sum()
    share_total = share_p0 + share_p1
    # Calibrated class-1 probability per bin; bins with no entries stay 0.
    p1cb = np.divide(share_p1, share_total,
                     out=np.zeros_like(share_p1), where=share_total != 0)

    # Isotonic regression smooths fluctuations in the per-bin values. It
    # assumes the calibrated probability does not decrease as the
    # classifier's probability rises -- not guaranteed, but a generally
    # safe assumption.
    smoother = IsotonicRegression().fit(bin_centers, p1cb)
    p1pred = smoother.predict(bin_centers)

    self.p1f_ = interpolate.interp1d(
        bin_edges[:-1], p1pred, kind='previous',
        bounds_error=False, fill_value="extrapolate")
def mir_calibrate(logit, label, logit_eval):
    """Calibrate with a single isotonic regression shared by all classes.

    The softmax probabilities of ``logit`` are flattened together with
    ``label`` and used to fit one isotonic regression
    (``out_of_bounds='clip'``). The fitted map is applied to the flattened
    softmax probabilities of ``logit_eval``.

    :param logit: 2-D array of logits, one row per sample
    :param label: array with the same number of elements as ``logit``
    :param logit_eval: 2-D array of logits to calibrate
    :returns: array shaped like ``logit_eval``: the calibrated values plus
        ``1e-9`` times the uncalibrated softmax probabilities
    """
    def _softmax(z):
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits (inf / inf -> nan).
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    p = _softmax(logit)
    p_eval = _softmax(logit_eval)

    ir = IsotonicRegression(out_of_bounds='clip')
    ir.fit(p.flatten(), label.flatten())
    yt_ = ir.predict(p_eval.flatten())

    return yt_.reshape(logit_eval.shape) + 1e-9 * p_eval
def isotonic(self):
    """Fit an isotonic regression on the training data and predict the test set.

    ``self.train_x``, ``self.train_y`` and ``self.test_x`` are converted
    with ``to_list()`` before use. Returns the predictions for
    ``self.test_x``.
    """
    features = self.train_x.to_list()
    targets = self.train_y.to_list()
    queries = self.test_x.to_list()

    model = IsotonicRegression()
    model.fit(features, targets)
    return model.predict(queries)
def linear_regression(self, exp1, exp2, min_samples=5):
    """Robustly regress the values of ``exp1`` on the values of ``exp2``.

    Only keys present in both mappings are used, in sorted key order. With
    fewer than ``min_samples`` shared keys the result is ``(0, 0, False)``.
    Otherwise an isotonic fit over the keys is used to choose a start index
    ``si``; the data before it is dropped and a ``HuberRegressor`` is
    fitted on the rest.

    :param exp1: mapping key -> value, used as the regression target (Y)
    :param exp2: mapping key -> value, used as the regressor (X)
    :param min_samples: minimum number of shared keys, and of Huber inliers,
        required to report a fit
    :returns: tuple ``(rscore, slope, warning)``; ``rscore`` is the Huber
        score on its inliers, ``slope`` the Huber coefficient (both 0 when
        there is too little data) and ``warning`` the first value returned
        by ``check_increasing``

    NOTE(review): ``check_increasing`` is unpacked into two values here;
    presumably a project-local variant -- confirm.
    """
    X = []
    Y = []
    Xi = []
    # Collect the shared keys (Xi) and the paired values, in key order.
    for i in sorted(exp1):
        if i in exp2:
            Xi.append(i)
            X.append(exp2[i])
            Y.append(exp1[i])
    X = np.r_[X]
    Y = np.r_[Y]
    Xi = np.r_[Xi]
    if X.size < min_samples:
        rscore = 0
        slope = 0
        warning = False
    else:
        # clean the inputs by isotonic regression
        warning, increasing_bool = check_increasing(Xi, Y)
        IR = IsotonicRegression(increasing=increasing_bool)
        IR.fit(Xi, Y)
        Y1 = IR.predict(Xi)
        # Indices where the isotonic fit decreases, grouped into runs of
        # consecutive indices.
        vi = np.where(np.diff(Y1) < 0)[0]
        pieces = np.split(vi, np.where(np.diff(vi) != 1)[0] + 1)
        si = 0
        # Take the start of the first run that covers more than half of the
        # distance to the start of the following run.
        for i in range(len(pieces) - 1):
            p1 = pieces[i]
            p2 = pieces[i + 1]
            if p1.size / (p2[0] - p1[0]) > 0.5:
                si = p1[0]
                break
        # Dropping more than 30 % of the data is not accepted: fall back to
        # the first decreasing index, and to 0 if that is still too far in.
        # (The original comment read "1/4"; the threshold in the code is 0.3.)
        if si / X.size > 0.3:
            si = vi[0]
            if si / X.size > 0.3:
                si = 0
        X = X[si:]
        Y = Y[si:]
        X = X[:, np.newaxis]
        huber = HuberRegressor().fit(X, Y)
        inlier_mask = np.logical_not(huber.outliers_)
        if inlier_mask.sum() < min_samples:
            rscore = 0
            slope = 0
        else:
            sX = X[inlier_mask]
            sY = Y[inlier_mask]
            rscore = huber.score(sX, sY)
            slope = huber.coef_[0]
    return rscore, slope, warning
def calibrate_col(col):
    """Calibrate column ``col`` of the module-level frames in place.

    An isotonic regression bounded to [0, 1] is fitted on the rows of
    ``lab`` whose ``col`` value is not NaN, with ``lab['labels']`` as
    target. Column ``col`` of ``lab``, ``amb``, ``unl`` and ``scr`` is then
    replaced by the calibrated values. Column-vector input is tried first
    (accepted by old sklearn); on ``ValueError`` the same steps are repeated
    with flat arrays (newer sklearn).
    """
    # isotonic not the best here, and faces numerical issues
    calibrator = IsotonicRegression(y_min=0, y_max=1)
    known = lab[~np.isnan(lab[col])]
    x = known[col].values
    y = known['labels'].values
    frames = (lab, amb, unl, scr)

    try:
        # Old sklearn
        calibrator.fit(x.reshape(-1, 1), y)
        for frame in frames:
            frame[col] = calibrator.predict(frame[col].values.reshape(-1, 1))
    except ValueError:
        # Newer sklearn
        calibrator.fit(x.ravel(), y)
        for frame in frames:
            frame[col] = calibrator.predict(frame[col].values.ravel())
def test_isotonic_make_unique_tolerance():
    # Targets of duplicate X must be averaged, where "duplicate" takes the
    # tolerance into account (1 and 1 + 1e-16 count as the same X).
    train_x = np.array([0, 1, 1 + 1e-16, 2], dtype=np.float64)
    train_y = np.array([0, 1, 2, 3], dtype=np.float64)
    model = IsotonicRegression().fit(train_x, train_y)

    predicted = model.predict([0, 0.5, 1, 1.5, 2])
    assert_array_equal(predicted, np.array([0, 0.75, 1.5, 2.25, 3]))

    expected_thresholds = (
        (model.X_thresholds_, np.array([0., 1., 2.])),
        (model.y_thresholds_, np.array([0., 1.5, 3.])),
    )
    for actual, expected in expected_thresholds:
        assert_array_equal(actual, expected)
def irova_calibrate(logit, label, logit_eval):
    """Calibrate every class separately with its own isotonic regression.

    For each class column, an isotonic regression
    (``out_of_bounds='clip'``) is fitted from the softmax probability of
    ``logit`` to the matching column of ``label`` and applied to the softmax
    probability of ``logit_eval``.

    :param logit: 2-D array of logits, one row per sample
    :param label: 2-D array with one column per class, same shape as
        ``logit``
    :param logit_eval: 2-D array of logits to calibrate
    :returns: array shaped like ``logit_eval``; each column holds the
        calibrated values plus ``1e-9`` times the uncalibrated probabilities
    """
    def _softmax(z):
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits (inf / inf -> nan).
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    p = _softmax(logit)
    p_eval = _softmax(logit_eval)

    for ii in range(p_eval.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        ir.fit(p[:, ii], label[:, ii])
        p_eval[:, ii] = ir.predict(p_eval[:, ii]) + 1e-9 * p_eval[:, ii]

    # Single return after the loop, so every class column is calibrated;
    # the duplicated return statement is gone.
    return p_eval
def test_input_shape_validation():
    # Test from #15012: a 2d array with a single feature must be handled
    # exactly like the equivalent 1d array.
    X = np.arange(10)
    X_2d = X.reshape(-1, 1)
    y = np.arange(10)

    iso_reg = IsotonicRegression().fit(X, y)
    iso_reg_2d = IsotonicRegression().fit(X_2d, y)

    for scalar_attr in ("X_max_", "X_min_", "y_max", "y_min"):
        assert getattr(iso_reg, scalar_attr) == getattr(iso_reg_2d,
                                                        scalar_attr)
    for array_attr in ("X_thresholds_", "y_thresholds_"):
        assert_array_equal(getattr(iso_reg, array_attr),
                           getattr(iso_reg_2d, array_attr))

    assert_allclose(iso_reg.predict(X), iso_reg_2d.predict(X_2d))
def test_isotonic_regression_pickle():
    # A fitted model must predict the same after a pickle round trip.
    targets = np.array([3, 7, 5, 9, 8, 7, 10])
    positions = np.arange(len(targets))

    model = IsotonicRegression(increasing='auto', out_of_bounds="clip")
    model.fit(positions, targets)

    restored = pickle.loads(pickle.dumps(model, pickle.HIGHEST_PROTOCOL))
    np.testing.assert_array_equal(model.predict(positions),
                                  restored.predict(positions))
def isotonicFit(thr, prec, maxThr=999):
    """Fit an isotonic regression mapping thresholds to precision.

    Only the points whose threshold is ``<= maxThr`` are used; the fitted
    values are bounded to [0, 1].

    :param thr: sequence of thresholds
    :param prec: sequence of precisions, aligned with ``thr``
    :param maxThr: largest threshold that takes part in the fit
    :returns: tuple ``(predict, "isotonic")`` where ``predict(x)`` evaluates
        the fitted regression at ``x``
    """
    thr = np.array(thr)
    prec = np.array(prec)

    # One mask keeps both arrays aligned.
    keep = thr <= maxThr
    thr = thr[keep]
    prec = prec[keep]

    isoReg = IsotonicRegression(y_min=0, y_max=1)
    isoReg.fit(thr, prec)
    return lambda x: isoReg.predict(x), "isotonic"
def test_isotonic_regression_oob_nan():
    # out_of_bounds="nan": both out-of-range queries must come back as NaN.
    targets = np.array([3, 7, 5, 9, 8, 7, 10])
    positions = np.arange(len(targets))

    model = IsotonicRegression(increasing='auto', out_of_bounds="nan")
    model.fit(positions, targets)

    outside = [min(positions) - 10, max(positions) + 10]
    predicted = model.predict(outside)
    assert sum(np.isnan(predicted)) == 2
def test_isotonic_regression_oob_nan():
    # With out_of_bounds="nan", queries outside the fitted x-range must be
    # predicted as NaN.

    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="nan")
    ir.fit(x, y)

    # Predict from training and test x and check that we have two NaNs.
    y1 = ir.predict([min(x) - 10, max(x) + 10])

    # Plain assert replaces the nose-style assert_equal helper: pytest
    # rewrites it into an informative failure and no helper import is needed.
    assert sum(np.isnan(y1)) == 2
def test_isotonic_regression():
    # Known solution of the plain isotonic_regression function.
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    expected = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(expected, isotonic_regression(y))

    # Estimator API: fit + transform, fit_transform and predict must agree.
    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # The result must not depend on the order of the samples.
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    permuted = ir.fit_transform(x[perm], y[perm])
    assert_array_equal(permuted, ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
def test_isotonic_dtype():
    # For every combination of target dtype and sample-weight dtype, both
    # isotonic_regression and IsotonicRegression.predict must return the
    # dtype that check_array picks for the target (float64 or float32).
    y = [2, 1, 4, 3, 5]
    weights = np.array([.9, .9, .9, .9, .9], dtype=np.float64)
    reg = IsotonicRegression()

    for dtype in (np.int32, np.int64, np.float32, np.float64):
        for sample_weight in (None, weights.astype(np.float32), weights):
            y_np = np.array(y, dtype=dtype)
            expected_dtype = check_array(
                y_np, dtype=[np.float64, np.float32], ensure_2d=False).dtype

            # Plain asserts replace the nose-style assert_equal helper.
            res = isotonic_regression(y_np, sample_weight=sample_weight)
            assert res.dtype == expected_dtype

            X = np.arange(len(y)).astype(dtype)
            reg.fit(X, y_np, sample_weight=sample_weight)
            res = reg.predict(X)
            assert res.dtype == expected_dtype
def main_old():
    """Train the network on MNIST, optionally recalibrating with isotonic regression.

    All settings (``nb_classes``, ``optimizer``, ``loss``, ``train_size``,
    ``num_epochs``, ``batch_size``, ``output_activation``, ...) are read
    from module-level names. When ``output_activation`` equals
    'isotonic_regression', an ``IsotonicRegression`` is fitted on the
    network's training scores, the minibatch labels are replaced by values
    derived from the calibrated probabilities and their gradients, and the
    regression is refitted after every minibatch. After each epoch the loss
    and accuracy on the full training and validation sets are stored in the
    diary and the plots are refreshed.

    NOTE(review): the loop nesting below was reconstructed from a source
    without indentation; confirm that the refit (step 4) belongs to the
    minibatch loop and the evaluation (step 5) to the epoch loop.
    NOTE(review): ``score_lin`` is not assigned in this function; presumably
    a module-level array -- confirm.
    """
    model = create_model(nb_classes, optimizer, loss)
    if keras_plot_available:
        plot(model, to_file='model.png')

    (X_train, Y_train), (X_val, Y_val) = get_mnist_data(
        train_size, binarize, add_noise, noise_proportion, test=False)

    print('Showing data samples')
    imshow_samples(X_train, Y_train, X_val, Y_val, 5)
    diary.save_figure(plt, filename='samples', extension='svg')

    # Index 0 holds the performance before training; index e that of epoch e.
    print('Creating error and accuracy vectors')
    error_train = np.zeros(num_epochs+1)
    error_val = np.zeros(num_epochs+1)
    accuracy_train = np.zeros(num_epochs+1)
    accuracy_val = np.zeros(num_epochs+1)

    print('Model predict training scores')
    score_train = model.predict(X_train).flatten()

    if output_activation == 'isotonic_regression':
        # 4. Calibrate the network with isotonic regression in the full training
        ir = IsotonicRegression(increasing=True, out_of_bounds='clip',
                                y_min=_EPSILON, y_max=(1-_EPSILON))
        # b. Calibrate the scores
        print('Learning Isotonic Regression from TRAINING set')
        ir.fit(score_train, Y_train)

    # 5. Evaluate the performance with probabilities
    # b. Evaluation on validation set
    print('Model predict validation scores')
    score_val = model.predict(X_val).flatten()
    if output_activation == 'isotonic_regression':
        prob_train = ir.predict(score_train)
        print('IR predict validation probabilities')
        prob_val = ir.predict(score_val)
    else:
        # Without calibration the raw scores are used as probabilities.
        prob_train = score_train
        prob_val = score_val

    error_train[0] = compute_loss(prob_train, Y_train, loss)
    accuracy_train[0] = compute_accuracy(prob_train, Y_train)
    error_val[0] = compute_loss(prob_val, Y_val, loss)
    accuracy_val[0] = compute_accuracy(prob_val, Y_val)

    # SHOW INITIAL PERFORMANCE
    print(("train: error = {}, acc = {}\n"
           "valid: error = {}, acc = {}").format(
               error_train[0], accuracy_train[0],
               error_val[0], accuracy_val[0]))
    diary.add_entry('training', [error_train[0], accuracy_train[0]])
    diary.add_entry('validation', [error_val[0], accuracy_val[0]])

    num_minibatches = np.ceil(np.true_divide(train_size,batch_size)).astype('int')
    for epoch in range(1,num_epochs+1):
        for iteration in range(num_minibatches):
            # Given that the probabilities are calibrated
            # 1. Choose the next minibatch
            print('EPOCH {}'.format(epoch))
            minibatch_id = get_minibatch_id(train_size, batch_size,
                                            method=minibatch_method,
                                            iteration=iteration)
            X_train_mb = X_train[minibatch_id]
            Y_train_mb = Y_train[minibatch_id]

            if output_activation == 'isotonic_regression':
                # 2. Compute the new values for the labels on this minibatch
                # a. Predict the scores using the network
                print('\tMODEL PREDICTING TRAINING SCORES')
                score_train_mb = model.predict(X_train_mb).flatten()
                # b. Predict the probabilities using IR
                print('\tIR PREDICTING TRAINING PROBABILITIES')
                prob_train_mb = ir.predict(score_train_mb.flatten())
                # c. Compute the gradients of IR
                g_prob_train_mb = isotonic_gradients(ir, prob_train_mb)
                # d. Compute new values for the labels:
                #    p + (p - y) * g / (p * (1 - p))
                Y_train_mb_new = prob_train_mb + \
                    np.divide(np.multiply(prob_train_mb - Y_train_mb,
                                          g_prob_train_mb),
                              np.multiply(prob_train_mb, 1 - prob_train_mb))
            else:
                Y_train_mb_new = Y_train_mb

            # 3. Train the network on this minibatch
            # Be advised that the errors shown by Keras on the training
            # set are really for this minibatch.
            print('\tTRAINING MODEL')
            model.fit(X_train_mb, Y_train_mb_new, nb_epoch=1,
                      batch_size=inner_batch_size, show_accuracy=True,
                      verbose=1, validation_data=(X_val,Y_val))

            if output_activation == 'isotonic_regression':
                # 4. Calibrate the network with isotonic regression in the full training
                # a. Get the new scores from the model
                print('\tModel predict training scores')
                score_train = model.predict(X_train).flatten()
                # b. Calibrate the scores
                print('\tLearning Isotonic Regression from TRAINING set')
                ir.fit(score_train, Y_train)

        # Evaluate epoch on the full training and validation set
        # 5. Evaluate the performance with the calibrated probabilities
        print('\tModel predict training scores')
        score_train = model.predict(X_train).flatten()
        print('\tModel predict validation scores')
        score_val = model.predict(X_val).flatten()
        if output_activation == 'isotonic_regression':
            # a. Evaluation on TRAINING set
            print('\tIR predict training probabilities')
            prob_train = ir.predict(score_train.flatten())
            # b. Evaluation on VALIDATION set
            print('\tIR predict validation probabilities')
            prob_val = ir.predict(score_val.flatten())
        else:
            prob_train = score_train
            prob_val = score_val

        error_train[epoch] = compute_loss(prob_train, Y_train, loss)
        accuracy_train[epoch] = compute_accuracy(prob_train, Y_train)
        error_val[epoch] = compute_loss(prob_val, Y_val, loss)
        accuracy_val[epoch] = compute_accuracy(prob_val, Y_val)

        # SHOW PERFORMANCE ON MINIBATCH
        print(("\ttrain: error = {}, acc = {}\n"
               "\tvalid: error = {}, acc = {}").format(
                   error_train[epoch], accuracy_train[epoch],
                   error_val[epoch], accuracy_val[epoch]))

        # SAVE PERFORMANCE ON epoch
        diary.add_entry('training', [error_train[epoch], accuracy_train[epoch]])
        diary.add_entry('validation', [error_val[epoch], accuracy_val[epoch]])

        # PLOTS
        print('\tUpdating all plots')
        if output_activation == 'isotonic_regression':
            prob_lin = ir.predict(score_lin)
            plot_reliability_diagram(prob_train, Y_train, prob_val, Y_val,
                                     epoch, score_lin=score_lin,
                                     prob_lin=prob_lin)
        else:
            plot_reliability_diagram(prob_train, Y_train, prob_val, Y_val,
                                     epoch)
        diary.save_figure(plt, filename='reliability_diagram', extension='svg')
        plot_histogram_scores(prob_train, epoch)
        diary.save_figure(plt, filename='histogram_scores', extension='svg')
        plot_accuracy(accuracy_train, accuracy_val, epoch)
        diary.save_figure(plt, filename='accuracy', extension='svg')
        plot_error(error_train, error_val, epoch, loss)
        diary.save_figure(plt, filename='error', extension='svg')
        plt.pause(0.0001)
def main(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
         dataset='mnist.pkl.gz', batch_size=20, n_hidden=None):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron, evaluating (and optionally isotonic-calibrating) its scores
    after every epoch.

    This is demonstrated on MNIST.

    Besides its arguments, the function reads these names from the enclosing
    module: ``add_noise``, ``noise_proportion``, ``nb_classes``,
    ``binarize``, ``output_activation``, ``loss``, ``score_lin``,
    ``_EPSILON`` and ``diary``.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type n_hidden: list
    :param n_hidden: forwarded unchanged to ``MLP`` as ``n_hidden``
                     (presumably the hidden layer sizes); ``None`` means
                     ``[500, 500]``
    """
    if n_hidden is None:
        # Build the default per call; a list literal in the signature would
        # be one object shared by every invocation.
        n_hidden = [500, 500]

    if add_noise:
        datasets = load_data(dataset, nb_classes=nb_classes,
                             binarize=binarize, noise_prop=noise_proportion)
    else:
        datasets = load_data(dataset, nb_classes=nb_classes,
                             binarize=binarize)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    print('Showing data samples')
    if nb_classes == 2:
        labels = ['odd', 'even']
    else:
        labels = list(range(10))
    imshow_samples(train_set_x.get_value(), train_set_y,
                   valid_set_x.get_value(), valid_set_y, num_samples=4,
                   labels=labels)
    plt.pause(0.0001)
    diary.save_figure(plt, filename='samples', extension='svg')

    # compute number of (whole) minibatches for training, validation and
    # testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    # Symbolic slice selecting minibatch number `index`; `data[batch]` is
    # the same expression as
    # `data[index * batch_size:(index + 1) * batch_size]`.
    batch = slice(index * batch_size, (index + 1) * batch_size)

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=nb_classes
    )

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    # compiling Theano functions that compute the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: test_set_x[batch], y: test_set_y[batch]}
    )
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[batch], y: valid_set_y[batch]}
    )

    # compiling Theano functions that compute the model scores on a
    # minibatch of the training / validation data
    training_scores_model = theano.function(
        inputs=[index],
        outputs=classifier.scores(),
        givens={x: train_set_x[batch]}
    )
    validation_scores_model = theano.function(
        inputs=[index],
        outputs=classifier.scores(),
        givens={x: valid_set_x[batch]}
    )

    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients are kept in the list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # plain SGD: one (variable, update expression) pair per parameter
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]}
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.

    # Slot 0 holds the performance before any training; slot e the
    # performance after epoch e.
    error_tra = np.zeros(n_epochs + 1)
    error_val = np.zeros(n_epochs + 1)
    accuracy_tra = np.zeros(n_epochs + 1)
    accuracy_val = np.zeros(n_epochs + 1)

    # Scores are gathered from whole minibatches only, so keep exactly the
    # labels of those examples; this is a no-op when the set size is a
    # multiple of batch_size.
    Y_train = train_set_y.eval()[:n_train_batches * batch_size]
    Y_valid = valid_set_y.eval()[:n_valid_batches * batch_size]

    calibrated = output_activation == 'isotonic_regression'
    ir = None
    if calibrated:
        ir = IsotonicRegression(increasing=True, out_of_bounds='clip',
                                y_min=_EPSILON, y_max=(1 - _EPSILON))

    def collect_scores(scores_model, n_batches):
        # Stack the per-minibatch scores and keep column 1 of the
        # (examples, nb_classes) matrix.
        stacked = np.asarray([scores_model(i) for i in range(n_batches)])
        return stacked.reshape(-1, nb_classes)[:, 1]

    def evaluate_epoch(epoch_index):
        # Score both sets, refit the calibrator when enabled, and record
        # loss/accuracy for `epoch_index` in the arrays and the diary.
        print('Model predict training scores')
        score_train = collect_scores(training_scores_model, n_train_batches)
        if calibrated:
            # Calibrate the network with isotonic regression on the
            # training scores
            print('Learning Isotonic Regression from TRAINING set')
            ir.fit(score_train, Y_train)

        print('Model predict validation scores')
        score_val = collect_scores(validation_scores_model, n_valid_batches)
        if calibrated:
            prob_train = ir.predict(score_train)
            print('IR predict validation probabilities')
            prob_val = ir.predict(score_val)
        else:
            prob_train = score_train
            prob_val = score_val

        error_tra[epoch_index] = compute_loss(prob_train, Y_train, loss)
        error_val[epoch_index] = compute_loss(prob_val, Y_valid, loss)
        accuracy_tra[epoch_index] = compute_accuracy(prob_train, Y_train)
        accuracy_val[epoch_index] = compute_accuracy(prob_val, Y_valid)
        diary.add_entry('training', [error_tra[epoch_index],
                                     accuracy_tra[epoch_index]])
        diary.add_entry('validation', [error_val[epoch_index],
                                       accuracy_val[epoch_index]])
        return prob_train, prob_val

    epoch = 0
    # Performance of the untrained network
    evaluate_epoch(epoch)

    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # called for its side effect: applies `updates`
            train_model(minibatch_index)
            # iteration number
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

        prob_train, prob_val = evaluate_epoch(epoch)

        plot_error(error_tra, error_val, epoch, 'loss')
        diary.save_figure(plt, filename='error', extension='svg')
        plot_accuracy(accuracy_tra, accuracy_val, epoch)
        diary.save_figure(plt, filename='accuracy', extension='svg')
        if nb_classes == 2:
            if calibrated:
                prob_lin = ir.predict(score_lin)
                plot_reliability_diagram(prob_train, Y_train, prob_val,
                                         Y_valid, epoch, prob_lin, score_lin)
            else:
                plot_reliability_diagram(prob_train, Y_train, prob_val,
                                         Y_valid, epoch)
            diary.save_figure(plt, filename='reliability_diagram',
                              extension='svg')
            plot_histogram_scores(prob_train, prob_val, epoch=epoch)
            diary.save_figure(plt, filename='histogram_scores',
                              extension='svg')

        plt.pause(0.0001)

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
def sgd_optimization_gauss(learning_rate=0.13, n_epochs=1000, batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model, refitting an isotonic regression on its training scores after
    every epoch.

    The data come from ``generate_opposite_cs_data``; metrics and figures
    are stored through a ``Diary`` created with name 'experiment' and path
    'results'.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    diary = Diary(name='experiment', path='results')
    diary.add_notebook('training')
    diary.add_notebook('validation')
    diary.add_notebook('data')

    samples = [4000, 10000]
    diary.add_entry('data', ['samples', samples])
    diary.add_entry('data', ['num_classes', len(samples)])
    diary.add_entry('data', ['batch_size', batch_size])

    datasets = generate_opposite_cs_data(samples)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # The data never change during training, so evaluate them once instead
    # of on every use.
    train_x = train_set_x.eval()
    train_labels = train_set_y.eval()
    valid_labels = valid_set_y.eval()

    diary.add_entry('data', ['train_size', len(train_labels)])
    diary.add_entry('data', ['valid_size', len(valid_labels)])
    diary.add_entry('data', ['test_size', len(test_set_y.eval())])

    pt = PresentationTier()
    pt.plot_samples(train_x, train_labels)

    # compute number of (whole) minibatches for training, validation and
    # testing; floor division keeps these integers on Python 2 and 3
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # Scores are gathered from whole minibatches only, so keep exactly the
    # labels of those examples; this is a no-op when the set size is a
    # multiple of batch_size.
    y_train = train_labels[:n_train_batches * batch_size]
    y_valid = valid_labels[:n_valid_batches * batch_size]

    # Regular delta x delta grid spanning the range of the first two input
    # features, used to draw the contour lines of the model scores.
    delta = 20
    x_min = numpy.min(train_x, axis=0)
    x_max = numpy.max(train_x, axis=0)
    x1_lin = numpy.linspace(x_min[0], x_max[0], delta)
    x2_lin = numpy.linspace(x_min[1], x_max[1], delta)
    MX1, MX2 = numpy.meshgrid(x1_lin, x2_lin)
    x_grid = numpy.asarray([MX1.flatten(), MX2.flatten()]).T
    grid_set_x = theano.shared(numpy.asarray(x_grid,
                                             dtype=theano.config.floatX),
                               borrow=True)
    grid_x = grid_set_x.eval()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the input examples, one per row
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    # Symbolic slice selecting minibatch number `index`; `data[batch]` is
    # the same expression as
    # `data[index * batch_size:(index + 1) * batch_size]`.
    batch = slice(index * batch_size, (index + 1) * batch_size)

    # construct the logistic regression class, sized from the data
    n_in = train_x.shape[-1]
    n_out = max(train_labels) + 1
    classifier = LogisticRegression(input=x, n_in=n_in, n_out=n_out)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling Theano functions that compute the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: test_set_x[batch], y: test_set_y[batch]})

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[batch], y: valid_set_y[batch]})

    # Scores
    grid_scores_model = theano.function(
        inputs=[],
        outputs=classifier.scores(),
        givens={x: grid_set_x})

    training_scores_model = theano.function(
        inputs=[index],
        outputs=classifier.scores(),
        givens={x: train_set_x[batch]})

    validation_scores_model = theano.function(
        inputs=[index],
        outputs=classifier.scores(),
        givens={x: valid_set_x[batch]})

    # compute the gradient of cost with respect to theta = (w, b)
    g_w = T.grad(cost=cost, wrt=classifier.w)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.w, classifier.w - learning_rate * g_w),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    # Accuracy
    validation_accuracy_model = theano.function(
        inputs=[index],
        outputs=classifier.accuracy(y),
        givens={x: valid_set_x[batch], y: valid_set_y[batch]})

    training_accuracy_model = theano.function(
        inputs=[index],
        outputs=classifier.accuracy(y),
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    # Error
    # NOTE(review): these report `classifier.errors(y)`, yet they are plotted
    # next to the isotonic cross-entropy under a 'cross-entropy' label --
    # confirm this is intended.
    training_error_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    validation_error_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[batch], y: valid_set_y[batch]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    print('Creating error and accuracy vectors')
    # Slot e holds the value measured after epoch e; slot 0 is never filled.
    error_train = numpy.zeros(n_epochs + 1)
    error_val = numpy.zeros(n_epochs + 1)
    accuracy_train = numpy.zeros(n_epochs + 1)
    accuracy_val = numpy.zeros(n_epochs + 1)
    # Results for Isotonic Regression
    error_train_ir = numpy.zeros(n_epochs + 1)
    error_val_ir = numpy.zeros(n_epochs + 1)
    accuracy_train_ir = numpy.zeros(n_epochs + 1)
    accuracy_val_ir = numpy.zeros(n_epochs + 1)

    best_validation_loss = numpy.inf
    test_score = 0.
    # time.clock() no longer exists on Python >= 3.8; time.time() is
    # available everywhere and measures wall-clock time.
    start_time = time.time()

    ir = IsotonicRegression(increasing=True, out_of_bounds='clip',
                            y_min=0, y_max=1)

    legend = ['train', 'valid', 'iso. train', 'iso. valid']
    labels_set = (y_train, y_valid, y_train, y_valid)

    scores_grid = None
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # called for its side effect: applies `updates`
            train_model(minibatch_index)
            # iteration number
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of best'
                           ' model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

        scores_grid = grid_scores_model()
        fig = pt.update_contourline(grid_x, scores_grid, delta)
        diary.save_figure(fig, filename='contour_lines', extension='svg')

        scores_train = numpy.asarray(
            [training_scores_model(i)
             for i in range(n_train_batches)]).flatten()
        scores_val = numpy.asarray(
            [validation_scores_model(i)
             for i in range(n_valid_batches)]).flatten()

        print('Learning Isotonic Regression from TRAINING set')
        ir.fit(scores_train, y_train)
        scores_train_ir = ir.predict(scores_train)
        print('IR predict validation probabilities')
        scores_val_ir = ir.predict(scores_val)

        scores_set = (scores_train, scores_val, scores_train_ir,
                      scores_val_ir)
        fig = pt.plot_reliability_diagram(scores_set, labels_set, legend)
        diary.save_figure(fig, filename='reliability_diagram',
                          extension='svg')

        # TODO add reliability map
        # NOTE(review): the map and the histogram receive the raw training
        # scores only (the parentheses previously around `scores_train` did
        # not build a tuple), while the map still gets the four-entry
        # labels/legend -- confirm what PresentationTier expects here.
        fig = pt.plot_reliability_map(scores_train, labels_set, legend)
        diary.save_figure(fig, filename='reliability_map', extension='svg')

        fig = pt.plot_histogram_scores(scores_train)
        diary.save_figure(fig, filename='histogram_scores', extension='svg')

        # Performance
        accuracy_train[epoch] = numpy.asarray(
            [training_accuracy_model(i)
             for i in range(n_train_batches)]).flatten().mean()
        accuracy_val[epoch] = numpy.asarray(
            [validation_accuracy_model(i)
             for i in range(n_valid_batches)]).flatten().mean()
        error_train[epoch] = numpy.asarray(
            [training_error_model(i)
             for i in range(n_train_batches)]).flatten().mean()
        error_val[epoch] = numpy.asarray(
            [validation_error_model(i)
             for i in range(n_valid_batches)]).flatten().mean()

        accuracy_train_ir[epoch] = compute_accuracy(scores_train_ir, y_train)
        accuracy_val_ir[epoch] = compute_accuracy(scores_val_ir, y_valid)
        error_train_ir[epoch] = compute_cross_entropy(scores_train_ir,
                                                      y_train)
        error_val_ir[epoch] = compute_cross_entropy(scores_val_ir, y_valid)

        diary.add_entry('training', [error_train[epoch],
                                     accuracy_train[epoch]])
        diary.add_entry('validation', [error_val[epoch],
                                       accuracy_val[epoch]])

        # Epochs 1..epoch inclusive: the upper bound is epoch + 1 so the
        # values just computed are part of the plots.
        done = slice(1, epoch + 1)
        accuracy_set = (accuracy_train[done], accuracy_val[done],
                        accuracy_train_ir[done], accuracy_val_ir[done])
        fig = pt.plot_accuracy(accuracy_set, legend)
        diary.save_figure(fig, filename='accuracy', extension='svg')

        error_set = (error_train[done], error_val[done],
                     error_train_ir[done], error_val_ir[done])
        fig = pt.plot_error(error_set, legend, 'cross-entropy')
        diary.save_figure(fig, filename='error', extension='svg')

    # Final contour plot with labels; skipped when no epoch was run and
    # there are therefore no grid scores.
    if scores_grid is not None:
        pt.update_contourline(grid_x, scores_grid, delta, clabel=True)

    end_time = time.time()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
sum_x = 0.0 sum_y = 0.0 for j in range(len(learner.predictions[i])): sum_x += learner.predictions[i][j][0] sum_y += learner.predictions[i][j][1] if sum_x > 0: regression_x.append(sum_x/max(1, len(learner.predictions[i]))) regression_y.append(sum_y/max(1, len(learner.predictions[i]))) regression_x.append(1.0) regression_y.append(1.0) ir = IsotonicRegression(increasing=True) fit = ir.fit(regression_x, regression_y) y_ = ir.predict(regression_x) plt.plot(regression_x, regression_y, 'g.', markersize=12) plt.plot(regression_x, y_, 'r-', markersize=5) plt.show() #learner.validation_set=zip(learner.validation_set) predictions_calibrated = ir.predict(learner.validation_set[0]).tolist() predictions_combined = learner.validation_set[0] for i in range(len(predictions_combined)): if 0.2 < predictions_combined[i] < 0.8: predictions_combined[i] = predictions_calibrated[i] loss_combined = logloss_arr(predictions_combined,learner.validation_set[1])/len(learner.validation_set[1]) loss_calibrated = logloss_arr(predictions_calibrated,learner.validation_set[1])/len(learner.validation_set[1])
, 'Class_5' : y_5 , 'Class_6' : y_6 , 'Class_7' : y_7 , 'Class_8' : y_8 , 'Class_9' : y_9 }) cols = cv10fold.calibrated.columns.tolist() cols = cols[-1:] + cols[:-1] cv10fold.calibrated = cv10fold.calibrated[cols] #for validation purposes cv10fold.calibrated.to_csv('csvs\\cv10fold.calibrated.csv', index=False) yt_1 = ir1.predict(test.ix[:,0]) yt_2 = ir2.predict(test.ix[:,1]) yt_3 = ir3.predict(test.ix[:,2]) yt_4 = ir4.predict(test.ix[:,3]) yt_5 = ir5.predict(test.ix[:,4]) yt_6 = ir6.predict(test.ix[:,5]) yt_7 = ir7.predict(test.ix[:,6]) yt_8 = ir8.predict(test.ix[:,7]) yt_9 = ir9.predict(test.ix[:,8]) test.calibrated = pd.DataFrame({'id' : testId , 'Class_1' : yt_1 , 'Class_2' : yt_2 , 'Class_3' : yt_3 , 'Class_4' : yt_4 , 'Class_5' : yt_5
1,0,0,0,0,0,1,1,1,1, 1,0,0,0,0,0,1,1,1,1, 1,0,0,0,0,0,1,1,1,1, 1,0,0,0,0,0,1,1,1,1, 1,0,0,0,0,0,1,1,1,1]) print('Learning Isotonic Regression') ir = IsotonicRegression(increasing=True, out_of_bounds='clip', y_min=_EPSILON, y_max=(1-_EPSILON)) ir.fit(S, Y) print('Learning Logistic Regression') lr = LogisticRegression(C=1., solver='lbfgs') lr.fit(S.reshape(-1,1), Y) scores_set = [S, ir.predict(S), lr.predict_proba(S.reshape(-1,1))[:,1]] labels_set = [Y, Y, Y] legend = ['Y', 'IR', 'LR'] pt = PresentationTier() fig = pt.plot_reliability_diagram(scores_set, labels_set, legend, original_first=True, alpha=alpha) scores_lin = np.linspace(0,1,100) scores_set = [S, scores_lin, scores_lin] prob_set = [Y, ir.predict(scores_lin), lr.predict_proba(scores_lin.reshape(-1,1))[:,1]] fig = pt.plot_reliability_map(scores_set, prob_set, legend, original_first=True, alpha=alpha)
def IsotonicRegression_pred(y_train, predictions_train, test_preds, bin_step, y_test):
    """
    Fit an isotonic regression of the training targets on the training
    predictions and use it to transform the test predictions.

    :param y_train: training targets; converted to a float array and
        reordered to follow the sorted training predictions
    :param predictions_train: training predictions, the input the
        regressor is fitted on (assumed 1-D and aligned with ``y_train``
        -- TODO confirm against callers)
    :param test_preds: test predictions to transform. A float copy is
        used internally, so the caller's object is not modified.
    :param bin_step: unused; kept so existing call sites keep working
    :param y_test: unused; kept so existing call sites keep working
    :return: tuple ``(predictions_ir, y_ir_pred, params, score_test_ir)``:
        the transformed test predictions, the fitted values for the sorted
        training predictions, ``ir.get_params(deep=True)``, and a score
        placeholder that is always ``0``
    """
    predictions_np = np.array(predictions_train, float)
    predictions_sorted = np.sort(predictions_np)
    predictions_sorted_indexes = predictions_np.argsort()
    # Targets reordered to line up with the sorted predictions.
    y_train_arranged = np.array(y_train, float)[predictions_sorted_indexes].ravel()

    ir = IsotonicRegression()
    ir.fit(predictions_sorted, y_train_arranged)
    y_ir_pred = ir.predict(predictions_sorted)

    # Cap the test predictions at the largest training target, working on
    # a copy so the input stays untouched.
    # NOTE(review): the cap is taken from the targets, not from the training
    # predictions the regressor was fitted on -- confirm this is intended.
    test_scores = np.array(test_preds, dtype=float)
    y_cap = y_train_arranged.max()
    test_scores[test_scores > y_cap] = y_cap

    predictions_ir = ir.predict(test_scores)

    # Replace NaN outputs with the smallest non-NaN output. When nothing or
    # everything is NaN there is nothing useful to fill in.
    # NOTE(review): an input above the fitted range therefore also receives
    # the minimum; IsotonicRegression(out_of_bounds='clip') may be what is
    # wanted -- confirm before changing.
    nan_mask = np.isnan(predictions_ir)
    if nan_mask.any() and not nan_mask.all():
        predictions_ir[nan_mask] = np.nanmin(predictions_ir)

    # Scoring against y_test is disabled; a constant placeholder is returned.
    score_test_ir = 0
    return predictions_ir, y_ir_pred, ir.get_params(deep=True), score_test_ir