def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
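# A minimal pool-adjacent-violators (PAV) sketch -- not sklearn's implementation -- showing
# how the expected vectors in the test above arise: neighbouring values that violate the
# increasing constraint are pooled into blocks whose value is the block mean.
import numpy as np

def pav_sketch(y):
    """Unweighted isotonic (increasing) fit of y against its index."""
    y = np.asarray(y, dtype=float)
    merged = []                              # list of [block mean, block size]
    for v in y:
        merged.append([v, 1])
        # pool backwards while the monotonicity constraint is violated
        while len(merged) > 1 and merged[-2][0] > merged[-1][0]:
            m2, s2 = merged.pop()
            m1, s1 = merged.pop()
            merged.append([(m1 * s1 + m2 * s2) / (s1 + s2), s1 + s2])
    return np.concatenate([np.full(s, m) for m, s in merged])

print(pav_sketch([3, 7, 5, 9, 8, 7, 10]))   # [ 3.  6.  6.  8.  8.  8. 10.]
print(pav_sketch([10, 0, 2]))               # [4. 4. 4.]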
def train_classifier_with_calib(classifier, data, use_all_data=False, normalize=False): X_train = data.X_train y_train = data.y_train X_cv = data.X_cv y_cv = data.y_cv if normalize: X_train, X_cv = normalize_data(X_train, X_cv) if not use_all_data: ir = IR() score, S = train(classifier, X_train, y_train, X_cv, y_cv, data.y_classes) predictions_proba = classifier.predict_proba(X_cv) proba = predictions_proba[:,1]; ir.fit_transform(proba,y_cv) print proba print ir return { 'classifier': classifier, 'score': score, 'S_auc': S, 'IR':ir, 'prange':[np.amin(proba),np.amax(proba)] } else: train_all_data(classifier, X_train, y_train, X_cv, y_cv) return { 'classifier': classifier }
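# Hedged usage sketch (assumed, not from the original repo): apply the bundle returned by
# train_classifier_with_calib() to new data.  `result` and `X_new` are hypothetical names,
# and IR is assumed to be sklearn's IsotonicRegression; clipping to the stored training
# probability range keeps the isotonic map inside the region it was fitted on (the default
# out_of_bounds='nan' would otherwise yield NaNs outside that range).
import numpy as np

def calibrate_probabilities(result, X_new):
    raw = result['classifier'].predict_proba(X_new)[:, 1]
    lo, hi = result['prange']
    clipped = np.clip(raw, lo, hi)
    return result['IR'].predict(clipped)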
def test_isotonic_regression_ties_max(): # Setup examples with ties on maximum x = [1, 2, 3, 4, 5, 5] y = [1, 2, 3, 4, 5, 6] y_true = [1, 2, 3, 4, 5.5, 5.5] # Check that we get identical results for fit/transform and fit_transform ir = IsotonicRegression() ir.fit(x, y) assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y)) assert_array_equal(y_true, ir.fit_transform(x, y))
def test_isotonic_sample_weight_parameter_default_value(): # check if default value of sample_weight parameter is one ir = IsotonicRegression() # random test data rng = np.random.RandomState(42) n = 100 x = np.arange(n) y = rng.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) # check if value is correctly used weights = np.ones(n) y_set_value = ir.fit_transform(x, y, sample_weight=weights) y_default_value = ir.fit_transform(x, y) assert_array_equal(y_set_value, y_default_value)
def fit_iso_transform(self, bx, by):
    with torch.no_grad():
        # np.float was removed from recent NumPy releases; use the builtin float instead
        cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(float)
    cdf = np.sort(cdf)
    lin = np.linspace(0, 1, int(cdf.shape[0]))
    # Insert an extra 0 and 1 to ensure the range is always [0, 1], and trim CDF for numerical stability
    cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6)
    cdf = np.insert(np.insert(cdf, -1, 1), 0, 0)
    lin = np.insert(np.insert(lin, -1, 1), 0, 0)
    iso_transform = IsotonicRegression()
    iso_transform.fit_transform(cdf, lin)
    return iso_transform
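# Hedged usage sketch (not part of the original class): the regressor returned by
# fit_iso_transform() recalibrates raw model CDF values, so coverage or quantile checks on a
# held-out batch can go through iso_transform.predict.  `model`, `bx_val`, `by_val` are
# hypothetical placeholders with the same interface as above.
import numpy as np
import torch

def recalibrated_cdf(model, iso_transform, bx_val, by_val):
    with torch.no_grad():
        raw_cdf = model.eval_all(bx_val, by_val)[0].cpu().numpy()[:, 0].astype(float)
    raw_cdf = np.clip(raw_cdf, 1e-6, 1.0 - 1e-6)
    return iso_transform.predict(raw_cdf)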
def plot(): results = [] for f in glob('lengths*npz'): d = np.load(f) l = d['lengths'] l = l[l > 0.] print d['mu'], l.shape results.append([d['mu'], l.mean()]) results = sorted(results) results = np.array(results).T muvals, mean_length = results f = plt.figure() f.clf() ax = f.gca() iso = IsotonicRegression(increasing=False) mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]), mean_length) ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU') ax.plot([muvals.min(), muvals.max()], [2 * ndist.ppf(0.975)] * 2, c='red', label='Sample splitting', linewidth=2) ax.plot([muvals.min(), muvals.max()], [np.sqrt(2) * ndist.ppf(0.975)] * 2, 'k--') ax.set_xlabel(r'$\mu$', fontsize=20) ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20) ax.legend(loc='lower right') ax.set_ylim([0, 4]) ax.set_xlim([-2, 9]) f.savefig('figure_b.pdf') output = np.array(zip(muvals, mean_length)) np.savetxt('equal_tailed_lengths.csv', output, delimiter=',')
def apply_isotonic(arr): n_krn = len(gus_krn) arr_smt = np.convolve(np.hstack([np.repeat(arr[0], n_krn * 2), arr]), gus_krn, mode='same')[n_krn * 2:] # plt.plot(range(len(arr_smt)), arr_smt, alpha=0.5, label='Smooth') ir = IsotonicRegression(y_min=0, increasing=False) # plt.plot(range(len(arr_smt)), bin_prd, alpha=0.5, label='Pred') return ir.fit_transform(range(len(arr_smt)), arr_smt)
def _fit_isotonic(model,train_loader): t_start = perf_counter() means,stds,ys = model.mc_prediction_loader(train_loader) N = means.shape[0] dist = Normal(means,stds) cdf = dist.cdf(ys) sorted_cdf,ind = cdf.sort() #[N] y = torch.arange(1.0,N+1)/N #[N] ir = IsotonicRegression(out_of_bounds='clip') x = sorted_cdf.cpu().numpy() #[N] y = y.numpy() #[N] x_app = np.insert(x,0,0.0) y_app = np.insert(y,0,0.0) y_ = ir.fit_transform(x_app, y_app)#[N] delta = _delta(means,stds,ys) #for synchronizing cuda calls torch.cuda.synchronize() #stop and measure the time taken for postprocessing method t_stop = perf_counter() iso_time = torch.tensor(t_stop - t_start) return ir,delta,sorted_cdf,iso_time
def main(self): x_field = self.fields_by_key('x')[0] y_field = self.fields_by_key('y')[0] x = np.array(self.slice_data(x_field,int)) y = np.array(self.slice_data(y_field,int)) n = len(x) render = StringIO.StringIO() ############################################################################### # Fit IsotonicRegression and LinearRegression models ir = IsotonicRegression() y_ = ir.fit_transform(x, y) lr = LinearRegression() lr.fit(x[:, np.newaxis], y) # x needs to be 2d for LinearRegression ############################################################################### # plot result segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)] lc = LineCollection(segments, zorder=0) lc.set_array(np.ones(len(y))) lc.set_linewidths(0.5 * np.ones(n)) fig = plt.figure() plt.plot(x, y, 'r.', markersize=12) plt.plot(x, y_, 'g.-', markersize=12) plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-') plt.gca().add_collection(lc) plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right') plt.title('Isotonic regression') plt.savefig(render,format='png') return render
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, plot): ranking = dt.import2dArray(ranking_fn) names = dt.import1dArray(cluster_names_fn) frq = [] counter = 0 for name in names: frq.append(readFreq(name)) pav_classes = [] for f in range(len(frq)): print(names[f]) x = np.asarray(frq[f]) y = ranking[f] ir = IsotonicRegression() y_ = ir.fit_transform(x, y) pav_classes.append(y_) if plot: plot(x, y, y_) print(f) dt.write2dArray( pav_classes, "../data/movies/finetune/" + file_name + "PavTermFrequency.txt") return pav_classes
def cali(fname, predict_name, out_name, mode='ctr'): if mode == 'ctr': true_col = 'actual_click' prob_col = 'ctr' if mode == 'cvr': true_col = 'actual_purchase' prob_col = 'cvr' pred_df = pd.read_csv(predict_name, names=columns) nn = pred_df.shape[0] df = pd.read_csv(fname, names=columns) n = df.shape[0] y_true = df[true_col].values y_prob = df[prob_col].values #fraction_of_positives, mean_predicted_value = cali.calibration_curve(y_true, y_prob, normalize=False, n_bins=10) #plt.figure() #plt.plot(mean_predicted_value,fraction_of_positives) #plt.show() #plt.close() ir = IsotonicRegression() y = ir.fit_transform(y_prob, y_true) y_pred = ir.predict(pred_df[prob_col].values) nn = y_pred.shape[0] h = open(out_name, 'w') for i in range(nn): if i < nn - 1: h.write(str(y_pred[i]) + '\n') else: h.write(str(y_pred[i])) h.close()
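# Optional diagnostic (a sketch, not part of cali()): compare the reliability of the raw and
# isotonically calibrated probabilities on the labelled file, using standard sklearn helpers.
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

def calibration_report(y_true, y_prob, ir, n_bins=10):
    y_cal = ir.predict(y_prob)
    print('Brier (raw):      %.5f' % brier_score_loss(y_true, y_prob))
    print('Brier (isotonic): %.5f' % brier_score_loss(y_true, y_cal))
    frac_pos, mean_pred = calibration_curve(y_true, y_cal, n_bins=n_bins)
    for fp, mp in zip(frac_pos, mean_pred):
        print('mean predicted %.3f -> observed %.3f' % (mp, fp))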
def test_isotonic_regression_ties_secondary_(): """ Test isotonic regression fit, transform and fit_transform against the "secondary" ties method and "pituitary" data from R "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair, Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods Set values based on pituitary example and the following R command detailed in the paper above: > library("isotone") > data("pituitary") > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary") > res1$x `isotone` version: 1.0-2, 2014-09-07 R version: R version 3.1.1 (2014-07-10) """ x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14] y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25] y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 24.25, 24.25] # Check fit, transform and fit_transform ir = IsotonicRegression() ir.fit(x, y) assert_array_almost_equal(ir.transform(x), y_true, 4) assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
def isoreg(filename): ds = pd.read_csv(filename, names=["1", "2"], skiprows=1) ds["1"] = pd.to_datetime(ds["1"], format="%Y-%m") ds["d"] = (ds["1"] - ds["1"].min()) / np.timedelta64(1,'D') X = ds["d"] # put your dates in here y = ds["2"] # put your kwh in here model = IsotonicRegression() model.fit_transform(X, y) X_predict = ds["d"] # put the dates of which you want to predict kwh here y_predict = model.predict(X_predict) fig = plt.figure(figsize=(12, 6)) plt.plot(y) plt.plot(y_predict) fig.savefig("files/" + str(os.path.splitext(os.path.basename(filename))[0]) + "_isoreg.png")
def compare_PAVA_implementations(): trials = 10 rs = check_random_state(0) times = [] dimensions = [int(1e1), int(1e2), int(1e3), int(1e4), int(1e5), int(1e6)] #dimensions = [int(1e6)] for n in dimensions: print 'dimensionality', n x = np.arange(n) for trial in range(trials): y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) # scikit-learn PAVA if n <= int(1e5): #if n <= int(1e6): ir = IsotonicRegression() y_copy = np.copy(y) start_time = time.time() ir.fit_transform(x, y_copy) time1 = time.time() - start_time else: time1 = -1. # in-place PAVA y_copy = np.copy(y) start_time = time.time() isotonic_regression_c_2(y_copy, 0, n) time2 = time.time() - start_time # in-place PAVA++ y_copy = np.copy(y) start_time = time.time() isotonic_regression_c(y_copy, 0, n) time3 = time.time() - start_time times.append([time1, time2, time3]) index = [] for n in ['1e1','1e2','1e3','1e4','1e5','1e6']: index += [n]*trials #for n in ['1e6']: index += [n]*trials tuples = zip() df = pd.DataFrame(times, index=index, columns=['sklearn', 'PAVA+', 'PAVA++']) print df df.save('results/PAVA_comparison_5.pkl')
def mir_calibrate(logit,label,logit_eval): p = np.exp(logit)/np.sum(np.exp(logit),1)[:,None] p_eval = np.exp(logit_eval)/np.sum(np.exp(logit_eval),1)[:,None] ir = IsotonicRegression(out_of_bounds='clip') y_ = ir.fit_transform(p.flatten(), (label.flatten())) yt_ = ir.predict(p_eval.flatten()) p = yt_.reshape(logit_eval.shape)+1e-9*p_eval return p
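# Usage sketch for mir_calibrate (multi-class isotonic regression on the flattened softmax
# probabilities).  It assumes `label` is one-hot encoded with the same shape as `logit`;
# the arrays below are synthetic placeholders for illustration only.
import numpy as np

rng = np.random.RandomState(0)
logit_val = rng.randn(500, 10)                          # validation logits used to fit the calibrator
label_val = np.eye(10)[rng.randint(0, 10, size=500)]    # one-hot validation labels
logit_test = rng.randn(200, 10)

p_test_calibrated = mir_calibrate(logit_val, label_val, logit_test)
assert p_test_calibrated.shape == (200, 10)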
def sklearn_isotonic_regression_multi(self, y, blocks): ir = IsotonicRegression() n = len(y) x = np.arange(n) z = np.zeros(n) z[:blocks[0]] = y[:blocks[0]] for start, end in zip(blocks, np.append(blocks[1:], [n])): z[start:end] = ir.fit_transform(x[start:end], y[start:end]) return z
def generate_calibration_model(df, pred_col, actual_col): ir = IsotonicRegression() y_ = ir.fit_transform(df[pred_col], df[actual_col]) calib = {} calib['method'] = 'ir' calib['mod'] = ir calib['max_obs'] = max(df[pred_col]) calib['max_cal'] = max(y_) return calib
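# Companion sketch (assumed, not in the original code): apply the calibration model built by
# generate_calibration_model() to new predictions, capping inputs at the largest prediction
# seen during fitting so the default IsotonicRegression never has to extrapolate upwards.
import numpy as np

def apply_calibration(calib, preds):
    preds = np.minimum(np.asarray(preds, dtype=float), calib['max_obs'])
    if calib['method'] == 'ir':
        return calib['mod'].predict(preds)
    raise ValueError('unknown calibration method: %s' % calib['method'])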
def test_proj_PAV(self): n = 10 x = np.arange(n) rs = check_random_state(0) for i in range(10): y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) ir = IsotonicRegression() truth = ir.fit_transform(x, y) self.assertTrue(np.linalg.norm(proj_PAV(y) - truth) < 1e-8)
def _apply_isotonic_regression(df, mag, magErr): df.sort_values(by=[mag], inplace=True) df = df.reset_index(drop=True) x = df[mag] y = df[magErr] ir = IsotonicRegression() y_expected = ir.fit_transform(x, y) return ir, x, y, y_expected
def test_isotonic_min_max_boundaries(): # check if min value is used correctly ir = IsotonicRegression(y_min=2, y_max=4) n = 6 x = np.arange(n) y = np.arange(n) y_test = [2, 2, 2, 3, 4, 4] y_result = np.round(ir.fit_transform(x, y)) assert_array_equal(y_result, y_test)
def test_isotonic_sample_weight(): ir = IsotonicRegression() x = [1, 2, 3, 4, 5, 6, 7] y = [1, 41, 51, 1, 2, 5, 24] sample_weight = [1, 2, 3, 4, 5, 6, 7] expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24] received_y = ir.fit_transform(x, y, sample_weight=sample_weight) assert_array_equal(expected_y, received_y)
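# Where the expected 13.95 in the test above comes from: with sample weights, PAV replaces a
# violating block by its *weighted* mean.  Here x = 2..6 (y = 41, 51, 1, 2, 5) forms one
# pooled block, while the endpoints y = 1 and y = 24 are left untouched.
import numpy as np

y_block = np.array([41, 51, 1, 2, 5], dtype=float)
w_block = np.array([2, 3, 4, 5, 6], dtype=float)
print(np.average(y_block, weights=w_block))   # 13.95 = (82 + 153 + 4 + 10 + 30) / 20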
def compute_correlations(vectors, dissimilarities, distance_function): """ Computes the correlation between vector distances and actual dissimilarities, using the given distance function between the vectors. Returns a dictionary from correlation metric to its corresponding value. For convenience, this dictionary also contains both the vector of target dissimilarities and the vector of predicted similarities. """ import numpy as np from sklearn.isotonic import IsotonicRegression from sklearn.linear_model import LinearRegression from sklearn.metrics import r2_score from scipy.stats import pearsonr, spearmanr, kendalltau # initialize dissimilarities with ones (arbitrary, will be overwritten anyways) dissimilarity_scores = np.ones(dissimilarities.shape) for i in range(len(vectors)): for j in range(len(vectors)): vec_i = vectors[i] vec_j = vectors[j] score = distance_function(vec_i, vec_j)[0][0] dissimilarity_scores[i][j] = score # transform dissimilarity matrices into vectors for correlation computation target_vector = np.reshape(dissimilarities, (-1, 1)) sim_vector = np.reshape(dissimilarity_scores, (-1, 1)) # compute correlations pearson, _ = pearsonr(sim_vector, target_vector) spearman, _ = spearmanr(sim_vector, target_vector) kendall, _ = kendalltau(sim_vector, target_vector) # compute least squares regression for R² metric linear_regression = LinearRegression() linear_regression.fit(sim_vector, target_vector) predictions = linear_regression.predict(sim_vector) r2_linear = r2_score(target_vector, predictions) # compute isotonic regression for R² metric x = np.reshape(dissimilarity_scores, (-1)) y = np.reshape(dissimilarities, (-1)) isotonic_regression = IsotonicRegression() predictions = isotonic_regression.fit_transform(x, y) r2_isotonic = r2_score(y, predictions) return { 'pearson': pearson[0], 'spearman': spearman, 'kendall': kendall, 'r2_linear': r2_linear, 'r2_isotonic': r2_isotonic, 'targets': target_vector, 'predictions': sim_vector }
def _minCllr(self, targetScoreValues, nonTargetScoreValues, ): """ Computes the 'minimum cost of log likelihood ratio' measure as given in IDIAP's bob calibration.py We don't however use pavx here, as used in many other implementations, but sklearn's isotonic regression, which is equivalent and frees us from linking to c++ code. """ # First, sort both scores. neg = sorted(nonTargetScoreValues) pos = sorted(targetScoreValues) N = len(neg) P = len(pos) I = N + P # Now, iterate through both score sets and add a 0 for negative and 1 for positive scores. n, p = 0, 0 idealSequence = np.zeros(I) neg_indices = [0] * N pos_indices = [0] * P for i in range(I): if n == N or neg[n] > pos[p]: pos_indices[p] = i p += 1 idealSequence[i] = 1 else: neg_indices[n] = i n += 1 # Run the pool adjacent violaters method on the ideal LLR scores. # pavx implements isotonic regression. Python's sklearn contains code to do just that. ir = IsotonicRegression() # Calculate the isotonic regression. popt = ir.fit_transform(np.arange(len(idealSequence)), idealSequence) # disable runtime warnings for a short time since log(0) will raise a warning. old_warn_setup = np.seterr(divide='ignore') # ... compute logs. # Lets assume the prior odds on a target score is the ratio #target scores / #non target scores. log_prior_odds = math.log(float(P) / float(N)) posterior_log_odds = np.log(popt) - np.log(1.0 - popt) # ... activate old warnings. np.seterr(**old_warn_setup) llrs = posterior_log_odds - log_prior_odds # Unmix positive and negative scores. new_neg = np.zeros(N) for n in range(N): new_neg[n] = llrs[neg_indices[n]] new_pos = np.zeros(P) for p in range(P): new_pos[p] = llrs[pos_indices[p]] # Compute cllr of these new 'optimal' LLR scores. minCllr = self._cllr(new_pos, new_neg) return minCllr
def regression_monotone_initial(data_training,data_target,data): regression_data = [] #Realisation de la regression monotone qui nous donnes un bruit de fond théorique #On fit et transform les données pour chacun des replicats indépendamment ir = IsotonicRegression() for i in range(len(data_target)-1): regression = ir.fit_transform(data_training[0:len(data_training),1],data_target[i+1,0:len(data)]) regression_data.append(regression) regression_data = np.asarray(regression_data) return regression_data
def irova_calibrate(logit, label, logit_eval):
    p = np.exp(logit) / np.sum(np.exp(logit), 1)[:, None]
    p_eval = np.exp(logit_eval) / np.sum(np.exp(logit_eval), 1)[:, None]
    # one-vs-all isotonic regression: fit one calibrator per class column
    for ii in range(p_eval.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        y_ = ir.fit_transform(p[:, ii], label[:, ii])
        p_eval[:, ii] = ir.predict(p_eval[:, ii]) + 1e-9 * p_eval[:, ii]
    return p_eval
def pavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
            rewrite_files=False, limit_entities=False, classification="genres",
            lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
        print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    pav_classes = []
    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]
            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            # fixed misplaced parenthesis: len() was being called with three arguments
            print(names[f], "len ppmi", len(frq[f]), "len ranking", len(ranking[f]))
            exit()
        print(f)
    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
def cal(refn, out_fn, base_folder='data/round2models', example_folder_name='example_data'): """ :param refn: :param out_fn: :param base_folder: :return: """ from sklearn.isotonic import IsotonicRegression from sklearn.metrics import log_loss, roc_auc_score import os calpath = 'calibration/data/' + out_fn + '_caldata.p' if os.path.exists(calpath): try: with open(calpath, 'rb')as f: ldirs, pcal, mags = pickle.load(f) return ldirs, pcal, mags except: with open(calpath, 'rb')as f: ldirs, pcal = pickle.load(f) return ldirs, pcal mags = [] y = [] dirs = os.listdir(path=base_folder) for dir in dirs: adv_path = os.path.join(base_folder, dir, example_folder_name, refn) if os.path.exists(adv_path): mag = get_blur_mag(adv_path, sigma=2.0) truth_fn = os.path.join(base_folder, dir, 'config.json') cls = utils.get_class(truth_fn, classtype='binary', file=True) mags.append(mag) y.append(cls) ir_model = IsotonicRegression(out_of_bounds='clip') pcal = ir_model.fit_transform(mags, y) kld = log_loss(y, pcal) # print(kld) roc1 = roc_auc_score(y, np.array(pcal)) print(out_fn, 'AUC:', roc1, 'KLD:', kld) # dump(ir_model, 'data/classifiers/blur' + '_ir.joblib') dump(ir_model, 'calibration/fitted/' + out_fn) pcal = pcal[np.argsort(dirs)] dirs.sort() with open(calpath,'wb') as f: pickle.dump([dirs, pcal, mags], f) return dirs, pcal, mags
def test_permutation_invariance(): # check that fit is permutation invariant. # regression test of missing sorting of sample-weights ir = IsotonicRegression() x = [1, 2, 3, 4, 5, 6, 7] y = [1, 41, 51, 1, 2, 5, 24] sample_weight = [1, 2, 3, 4, 5, 6, 7] x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0) y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight) y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x) assert_array_equal(y_transformed, y_transformed_s)
def test_isotonic_regression(self): self.setUp() times = [] rs = check_random_state(0) for n in [int(1e1), int(1e2), int(1e3), int(1e4)]: x = np.arange(n) y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n)) ir = IsotonicRegression() start_time = time.time() y1 = ir.fit_transform(x, y) times.append(time.time() - start_time) print 'test isotonic_regression' print times
def regression_monotone(data_target,data): regression_data = [] #Realisation de la regression monotone qui nous donnes un bruit de fond théorique #On fit et transform les données pour chacun des replicats indépendamment ir = IsotonicRegression() #data_target_transpose = np.transpose(data_target) for replicat in data_target: regression = ir.fit_transform(np.arange(0,len(replicat),1),replicat) regression_data.append(regression) regression_data = np.asarray(regression_data) #print(len(regression_data)) return regression_data
def fit(self, counts_matrix, lengths=None): ''' NMDS fit Function, scale low dimension matrix to high dimension Parameters ---------- counts_matrix: ndarray Returns ---------- fit_matrix: ndarray ''' if not sparse.isspmatrix_coo(counts_matrix): counts_matrix = sparse.coo_matrix(counts_matrix) for i in range(self.max_iter_outer): if i == 0: fit_matrix = Multi_Dimensional_Scaling_Base.estimate_model( counts_matrix, alpha=self.alpha, beta=self.beta, ini=self.init, verbose=self.verbose, precompute_distances=self.precompute_distances, use_zero_entries=False, random_state=self.random_state, bias=self.bias, factr=self.factr, maxiter=self.max_iter) else: ir = IsotonicRegression() distances = np.sqrt( ((fit_matrix[counts_matrix.row] - fit_matrix[counts_matrix.col])**2).sum(axis=1)) wish_distances = ir.fit_transform(1. / counts_matrix.data, distances) fit_matrix = Multi_Dimensional_Scaling_Base.estimate_model( sparse.coo_matrix( (wish_distances, (counts_matrix.row, counts_matrix.col))), alpha=self.alpha, beta=self.beta, ini=fit_matrix, verbose=self.verbose, use_zero_entries=False, precompute_distances='precomputed', random_state=self.random_state, factr=self.factr, maxiter=self.max_iter, ) return fit_matrix
def isotonic_regression(ax, x, y, w=[]): """ INPUT: ax: an Axes object x: (N, ) np.array y: (N, ) np.array w: None or a list of length N. OUTPUT: ax: an Axes object """ if len(w) == 0: w = [1.0 for _ in y] n = len(y) # Fit IsotonicRegression and LinearRegression models ir = IsotonicRegression() y_ = ir.fit_transform(x, y, sample_weight=w) lr = LinearRegression() lr.fit(x[:, np.newaxis], y, sample_weight=w) # x needs to be 2d for LinearRegression # Plot result segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)] ax.plot(x, y, 'r.', markersize=12, alpha=0.2) ax.plot(x, y_, 'g^', markersize=12, alpha=0.2) ax.plot(x, lr.predict(x[:, np.newaxis]), 'b-') ax.set_xlim(-0.1, 1.1) ax.set_ylim(-0.1, 1.1) # compute ece and acc after calibration ece = EceEval(np.array([1 - y_, y_]).T, y, num_bins=20) y_predict = y_ > 0.5 acc = (y_predict == y).mean() ax.text(0.05, 0.8, 'ECE=%.4f\nACC=%.4f' % (ece, acc), size=14, ha='left', va='center', bbox={ 'facecolor': 'green', 'alpha': 0.5, 'pad': 4 }) return ax
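# EceEval is referenced above but not defined in this snippet.  The sketch below is one
# plausible implementation of expected calibration error matching the call signature used
# (probabilities as an (N, n_classes) array, integer/boolean labels, equal-width bins); it is
# an assumption, not the original helper.
import numpy as np

def EceEval(probabilities, labels, num_bins=20):
    confidences = probabilities.max(axis=1)
    predictions = probabilities.argmax(axis=1)
    labels = np.asarray(labels).astype(int)
    edges = np.linspace(0.0, 1.0, num_bins + 1)
    ece, n = 0.0, len(labels)
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            acc = (predictions[mask] == labels[mask]).mean()
            conf = confidences[mask].mean()
            ece += mask.sum() / n * abs(acc - conf)
    return ece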
def sklearn_pav(y_true, y_score): """ Binary PAV algorithm, algorithm to solve Isotonic regression NOTE: sklearn isotonic regression is used y_true: 1D array y_score: 1D array """ id_permute = np.argsort(y_score) y_sort = y_true[id_permute] p_sort = np.sort(y_score) ir = IsotonicRegression() p_calibrated = ir.fit_transform(p_sort, y_sort) return y_sort, p_calibrated
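# Usage sketch: sklearn_pav returns labels and calibrated scores in score-sorted order.  If
# the calibrated probabilities are needed in the original sample order, the same argsort
# permutation can be inverted (synthetic arrays shown for illustration).
import numpy as np

y_true = np.array([0, 1, 0, 1, 1])
y_score = np.array([0.3, 0.8, 0.2, 0.5, 0.9])
y_sorted, p_cal_sorted = sklearn_pav(y_true, y_score)

order = np.argsort(y_score)
p_cal = np.empty_like(p_cal_sorted)
p_cal[order] = p_cal_sorted          # calibrated probability for each original sample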
def ensure_monotone_increasing(arr_, fromright=True, fromleft=True, newmode=True): r""" Args: arr_ (ndarray): Returns: ndarray: arr CommandLine: python -m vtool.math --test-ensure_monotone_increasing --show Example: >>> # DISABLE_DOCTEST >>> from vtool.math import * # NOQA >>> rng = np.random.RandomState(0) >>> size_ = 100 >>> domain = np.arange(size_) >>> offset = ut.get_argval('--offset', type_=float, default=2.3) >>> arr_ = np.sin(np.pi * (domain / 100) - offset) + (rng.rand(len(domain)) - .5) * .1 >>> arr = ensure_monotone_increasing(arr_, fromleft=False, fromright=True) >>> result = str(arr) >>> print(result) >>> ut.quit_if_noshow() >>> import plottool as pt >>> pt.plot2(domain, arr_, 'r-', fnum=1, pnum=(2, 1, 1), title='before', equal_aspect=False) >>> pt.plot2(domain, arr, 'r-', fnum=1, pnum=(2, 1, 2), title='after monotonization (increasing)', equal_aspect=False) >>> ut.show_if_requested() """ if newmode: from sklearn.isotonic import IsotonicRegression ir = IsotonicRegression() arr = ir.fit_transform(np.arange(len(arr_)), arr_) else: arr = arr_.copy() size = len(arr) # Ensure increasing from right if fromright: for lx in range(1, size): rx = (size - lx - 1) if arr[rx] > arr[rx + 1]: arr[rx] = arr[rx + 1] if fromleft: # ensure increasing from left for lx in range(0, size - 1): if arr[lx] > arr[lx + 1]: arr[lx + 1] = arr[lx] return arr
def test_isotonic_regression_auto_increasing(): # Set y and x for decreasing y = np.array([5, 6.1, 6, 7, 10, 9, 10]) x = np.arange(len(y)) # Create model and fit_transform ir = IsotonicRegression(increasing='auto') with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") y_ = ir.fit_transform(x, y) # work-around for pearson divide warnings in scipy <= 0.17.0 assert_true(all(["invalid value encountered in " in str(warn.message) for warn in w])) # Check that relationship increases is_increasing = y_[0] < y_[-1] assert_true(is_increasing)
def plot(): results = [] for f in glob('umau_lengths*npz'): d = np.load(f) l = d['lengths'] l = l[~np.isnan(l)] l = l[np.isfinite(l)] l = l[l>0] results.append([d['mu'], l.mean()]) for f in glob('miller/lengths*npz'): d = np.load(f) if d['mu'] not in [r[0] for r in results]: l = d['lengths'] l = l[np.isfinite(l)] l = l[~np.isnan(l)] l = l[l>0] results.append([d['mu'], l.mean()]) else: idx = [r[0] for r in results].index(d['mu']) l = d['lengths'] l = l[np.isfinite(l)] l = l[~np.isnan(l)] l = l[l>0] results[idx][1] = 0.5 * (results[idx][1] + l.mean()) results = sorted(results) results = np.array(results).T muvals, mean_length = results f = plt.figure() f.clf() ax = f.gca() iso = IsotonicRegression(increasing=False) mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]), mean_length) ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU') ax.plot([muvals.min(), muvals.max()], [2*ndist.ppf(0.975)]*2, c='red', label='Sample splitting', linewidth=2) ax.plot([muvals.min(), muvals.max()], [np.sqrt(2)*ndist.ppf(0.975)]*2, 'k--') ax.set_xlabel(r'$\mu$', fontsize=20) ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20) ax.legend(loc='lower right') ax.set_ylim([0,4]) ax.set_xlim([-2,9]) f.savefig('figure_b_umau.pdf')
def fit(self, counts, lengths=None): """ """ if not sparse.isspmatrix_coo(counts): counts = sparse.coo_matrix(counts) for i in range(self.max_iter_outer): if i == 0: X = estimate_X( counts, alpha=self.alpha, beta=self.beta, ini=self.init, verbose=self.verbose, use_zero_entries=False, random_state=self.random_state, bias=self.bias, factr=self.factr, maxiter=self.max_iter, ) else: ir = IsotonicRegression() dis = np.sqrt(((X[counts.row] - X[counts.col]) ** 2).sum(axis=1)) wish_distances = ir.fit_transform(1.0 / counts.data, dis) X = estimate_X( sparse.coo_matrix((wish_distances, (counts.row, counts.col))), alpha=self.alpha, beta=self.beta, ini=X, verbose=self.verbose, use_zero_entries=False, precompute_distances="precomputed", random_state=self.random_state, bias=self.bias, factr=self.factr, maxiter=self.max_iter, ) print "writing wish distances" return X
def isotonic_deg_sequence(deg_seq, eps):
    # index-sort deg_seq
    idx = [i[0] for i in sorted(enumerate(deg_seq), key=lambda x: x[1])]
    sorted_deg_seq = [deg_seq[idx[i]] for i in range(len(deg_seq))]

    alpha = math.exp(-eps / 4)   # global sensitivity = 4

    # s1 (sorted, noisy)
    s1 = [0.0 for _ in range(len(deg_seq))]
    for i in range(len(deg_seq)):
        s1[i] = sorted_deg_seq[i] + geometric_mechanism(alpha, 1)   # geometric noise

    # s: enforce monotonicity of the noisy sorted sequence
    ir = IsotonicRegression()
    s = ir.fit_transform(range(len(deg_seq)), s1)

    return s, s1, sorted_deg_seq
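# geometric_mechanism() is used above but not defined in this snippet.  A common construction
# (assumed here, not necessarily the original) draws two-sided geometric noise with
# P(k) proportional to alpha**|k| as the difference of two one-sided geometric variables;
# the second argument is taken to be the number of draws.
import numpy as np

def geometric_mechanism(alpha, size=1, rng=np.random):
    # difference of two iid Geometric(1 - alpha) variables is two-sided geometric noise
    noise = rng.geometric(1.0 - alpha, size=size) - rng.geometric(1.0 - alpha, size=size)
    return noise[0] if size == 1 else noise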
def test_isotonic_regression_with_ties_in_differently_sized_groups(): """ Non-regression test to handle issue 9432: https://github.com/scikit-learn/scikit-learn/issues/9432 Compare against output in R: > library("isotone") > x <- c(0, 1, 1, 2, 3, 4) > y <- c(0, 0, 1, 0, 0, 1) > res1 <- gpava(x, y, ties="secondary") > res1$x `isotone` version: 1.1-0, 2015-07-24 R version: R version 3.3.2 (2016-10-31) """ x = np.array([0, 1, 1, 2, 3, 4]) y = np.array([0, 0, 1, 0, 0, 1]) y_true = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.]) ir = IsotonicRegression() ir.fit(x, y) assert_array_almost_equal(ir.transform(x), y_true) assert_array_almost_equal(ir.fit_transform(x, y), y_true)
def _smacof_single_p(similarities, n_uq, metric=True, n_components=2, init=None,
                     max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """ Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    n_uq

    similarities: symmetric ndarray, shape [n * n]
        similarities between the points

    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overwritten if initial array is provided.

    init: {None or ndarray}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-6
        relative tolerance w.r.t stress to declare converge

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
        coordinates of the n_samples points in a n_components-space

    stress_: float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points)

    n_iter : int
        Number of iterations run.
    """
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    W = np.ones((n_samples, n_samples))
    W[:n_uq, :n_uq] = 0.0
    W[n_uq:, n_uq:] = 0.0
    # W[np.arange(len(W)), np.arange(len(W))] = 0.0

    V = -W
    V[np.arange(len(V)), np.arange(len(V))] = W.sum(axis=1)
    e = np.ones((n_samples, 1))

    Vp = np.linalg.inv(V + np.dot(e, e.T) / n_samples) - np.dot(e, e.T) / n_samples
    # Vp = np.linalg.pinv(V)

    # sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat = similarities.ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            # dis_flat = dis.ravel()
            # # similarities with 0 are considered as missing values
            # dis_flat_w = dis_flat[sim_flat != 0]
            # # Compute the disparities using a monotonic regression
            # disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            # disparities = dis_flat.copy()
            # disparities[sim_flat != 0] = disparities_flat
            # disparities = disparities.reshape((n_samples, n_samples))
            # disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
            #                        (disparities ** 2).sum())

            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())
            disparities[similarities == 0] = 0

        # Compute stress
        # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        _stress = (W.ravel() * ((dis.ravel() - disparities.ravel()) ** 2)).sum() / 2

        # Update X using the Guttman transform
        # dis[dis == 0] = 1e-5
        # ratio = disparities / dis
        # B = - ratio
        # B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        # X = 1. / n_samples * np.dot(B, X)
        # print (1. / n_samples * np.dot(B, X))[:5].T
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        _B = - W * ratio
        _B[np.arange(len(_B)), np.arange(len(_B))] += (W * ratio).sum(axis=1)
        X = np.dot(Vp, np.dot(_B, X))
        # print X[:5].T

        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose >= 2:
            print('it: %d, stress %s' % (it, _stress))
        if old_stress is not None:
            if (old_stress - _stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (it, _stress))
                break
        old_stress = _stress / dis

    return X, _stress, it + 1
def run_nmds(directory): print directory if os.path.exists(os.path.join(directory, "config.ini")): config_file = os.path.join(directory, "config.ini") else: config_file = None options = parse(config_file) run_mds(directory) for i in range(0, max_iter): if i == 0: try: X = np.loadtxt( os.path.join(directory, "MDS." + options["output_name"] + ".txt")) except IOError: return else: X = np.loadtxt( os.path.join(directory, '%d.NMDS.' % (i) + options["output_name"] + ".txt")) X = X.reshape((len(X) / 3, 3)) dis = euclidean_distances(X) * 1000 counts = np.load( os.path.join(directory, options["counts"])) counts[np.isnan(counts)] = 0 wish_distances = np.zeros(counts.shape) print "Fitting isotonic regression..." ir = IsotonicRegression() wish_distances[counts != 0] = ir.fit_transform( 1. / counts[counts != 0], dis[counts != 0]) print "writing wish distances" lengths = np.loadtxt( os.path.join(directory, options["organism_structure"])) try: len(lengths) except TypeError: lengths = np.array([lengths]) write(wish_distances, os.path.join(directory, '%d.NMDS.wish_distances.txt' % i), lengths=lengths, resolution=options["resolution"]) if i == 0: shutil.copy( os.path.join(directory, "MDS." + options["output_name"] + ".txt"), os.path.join(directory, '%d.NMDS.' % (i + 1) + options["output_name"] + ".temp.txt")) else: shutil.copy( os.path.join(directory, '%d.NMDS.' % i + options["output_name"] + ".txt"), os.path.join(directory, '%d.NMDS.' % (i + 1) + options["output_name"] + ".temp.txt")) locus_coord = options["output_name"].replace(".pdb",".bed") cmd = CMD_MDS % (options["binary_mds"], os.path.join(directory, "%d.NMDS." % (i + 1) + options["output_name"]), options["resolution"], os.path.join(directory, options["organism_structure"]), os.path.join(directory, "%d.NMDS.wish_distances.txt" % (i)), os.path.join(directory, locus_coord), options["adjacent_beads"], options["chromosomes"], os.path.join(directory, str(i + 1) + '.NMDS.log')) filename = os.path.join(directory, str(i + 1) + '.NMDS.sh') fileptr = open(filename, 'wb') fileptr.write(cmd) fileptr.close() st = os.stat(filename) os.chmod(filename, st.st_mode | stat.S_IXUSR) p =subprocess.Popen(filename.split(), shell='True') p.wait()
def _smacof_with_anchors_single(config, similarities, metric=True, n_components=2,
                                init=None, max_iter=300, verbose=0, eps=1e-3,
                                random_state=None, estimated_dist_weights=None):
    """ Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    config : Config object
        configuration object for anchor-tag deployment parameters

    similarities: symmetric ndarray, shape [n * n]
        similarities between the points

    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overwritten if initial array is provided.

    init: {None or ndarray}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-6
        relative tolerance w.r.t stress to declare converge

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is given,
        it fixes the seed. Defaults to the global numpy random number generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
        coordinates of the n_samples points in a n_components-space

    stress_: float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points)

    n_iter : int
        Number of iterations run

    last_positions: ndarray [X1,...,Xn]
        An array of computed Xs.
    """
    NO_OF_TAGS, NO_OF_ANCHORS = config.no_of_tags, config.no_of_anchors
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
        # uncomment the following if weight matrix W is not hollow
        # X[:-2] = Xa
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()

    # setup weight matrix
    if getattr(config, 'weights', None) is not None:
        weights = config.weights
    else:
        weights = np.ones((n_samples, n_samples))
    if getattr(config, 'missingdata', None):
        weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = 0
    if estimated_dist_weights is not None:
        weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = estimated_dist_weights
    diag = np.arange(n_samples)
    weights[diag, diag] = 0

    last_n_configs = []
    Xa = config.anchors
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())

        # Compute stress
        stress = (weights.ravel() * (dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        # stress = ((dis[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel() -
        #            disparities[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel()) ** 2).sum()

        # Update X using the Guttman transform
        dis[dis == 0] = 1e5
        ratio = weights * disparities / dis
        B = - ratio
        B[diag, diag] = 0
        B[diag, diag] = -B.sum(axis=1)

        # Apply update to only tag configuration since anchor config is already known
        V = - weights
        V[diag, diag] += weights.sum(axis=1)
        # V_inv = np.linalg.pinv(V)
        V12 = V[-NO_OF_TAGS:, :-NO_OF_TAGS]
        B11 = B[-NO_OF_TAGS:, -NO_OF_TAGS:]
        Zu = X[-NO_OF_TAGS:]
        B12 = B[-NO_OF_TAGS:, :-NO_OF_TAGS]
        V11_inv = np.linalg.inv(V[-NO_OF_TAGS:, -NO_OF_TAGS:])
        Xu = V11_inv.dot(B11.dot(Zu) + (B12 - V12).dot(Xa))

        # merge known anchors config with new tags config
        X = np.concatenate((Xa, Xu))
        last_n_configs.append(X)

        # X = (1/n_samples)*B.dot(X)
        # dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        dis = (weights * dis ** 2).sum() / 2

        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (it, stress))
                break
        old_stress = stress / dis

    return X, stress, it + 1, np.array(last_n_configs)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

###############################################################################
# Fit IsotonicRegression and LinearRegression models
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)
lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

###############################################################################
# plot result
segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(0.5 * np.ones(n))

fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'g.-', markersize=12)
test = pd.read_csv(r'csvs\\submit_xB.csv') testId = test.id.values test = test.drop('id', axis=1) ##one vs all ir1 = IsotonicRegression() ir2 = IsotonicRegression() ir3 = IsotonicRegression() ir4 = IsotonicRegression() ir5 = IsotonicRegression() ir6 = IsotonicRegression() ir7 = IsotonicRegression() ir8 = IsotonicRegression() ir9 = IsotonicRegression() y_1 = ir1.fit_transform(cv10fold.ix[:,0], y_train.ix[:,0]) y_2 = ir2.fit_transform(cv10fold.ix[:,1], y_train.ix[:,1]) y_3 = ir3.fit_transform(cv10fold.ix[:,2], y_train.ix[:,2]) y_4 = ir4.fit_transform(cv10fold.ix[:,3], y_train.ix[:,3]) y_5 = ir5.fit_transform(cv10fold.ix[:,4], y_train.ix[:,4]) y_6 = ir6.fit_transform(cv10fold.ix[:,5], y_train.ix[:,5]) y_7 = ir7.fit_transform(cv10fold.ix[:,6], y_train.ix[:,6]) y_8 = ir8.fit_transform(cv10fold.ix[:,7], y_train.ix[:,7]) y_9 = ir9.fit_transform(cv10fold.ix[:,8], y_train.ix[:,8]) #container cv10fold.calibrated = pd.DataFrame({'id' : id , 'Class_1' : y_1 , 'Class_2' : y_2 , 'Class_3' : y_3 , 'Class_4' : y_4
def IsotonicRegression_pred(y_train, predictions_train, test_preds, bin_step, y_test): # Y Training Target sort the y_test # X Training Data use the indexes of sorted(y_test) # y_train_len=len(y_train) # if bin_step<1: # step_count = 1/bin_step # else: # step_count = int(math.floor(y_train_len/bin_step)) # step_element_count = int(math.floor(y_train_len/step_count)) # bin_start_indexes=np.array(range(0,step_count))*step_element_count predictions_np = np.array(predictions_train, float) predictions_sorted = np.sort(predictions_np) predictions_sorted_indexes = predictions_np.argsort() y_train_arranged = np.array(y_train, float)[predictions_sorted_indexes].ravel() # not_binned_y_train_arranged = y_train_arranged[:] # for index in range(len(bin_start_indexes)-1): # pin = bin_start_indexes[index] # pend = bin_start_indexes[index+1] # y_train_arranged[pin:pend] = np.average(y_train_arranged[pin:pend]) # if bin_start_indexes[-1]<y_train_len: # pin = bin_start_indexes[-1] # pend = y_train_len # y_train_arranged[pin:pend] = np.average(y_train_arranged[pin:pend]) ir = IsotonicRegression() y_ir = ir.fit_transform(predictions_sorted, y_train_arranged) y_ir_pred = ir.predict(predictions_sorted) # print "min(y_train_arranged) :", min(y_train_arranged) # print "max(y_train_arranged) :", max(y_train_arranged) # print "min(predictions_sorted) :", min(predictions_sorted) # print "max(predictions_sorted) :", max(predictions_sorted) # print "min(test_preds) :", min(test_preds) # print "max(test_preds) :", max(test_preds) # if max(test_preds)>=max(y_train_arranged): # np.arrya(test_preds>max(y_train_arranged))==True max_indexes = np.array((np.where(test_preds > max(y_train_arranged))), int).ravel() if len(max_indexes) != 0: for m_i in max_indexes: test_preds[m_i] = max(y_train_arranged) test_preds_sorted = np.sort(np.array(test_preds)) predictions_ir = ir.predict(test_preds) ind = np.where(np.isnan(predictions_ir))[0] preds_test_min = np.nanmin(predictions_ir) if len(ind) != 0: for i in ind: predictions_ir[i] = preds_test_min # ==============WRITING TO CSV================ # d_train={'y_train' :np.array(y_train,float)[predictions_sorted_indexes].ravel(), # 'y_train_bin' :np.array(y_train_arranged).ravel(), # 'train_preds' :np.array(predictions_sorted).ravel(), # 'train_preds_ir' :y_ir} # df_train=pd.DataFrame(d_train) # df_train.to_csv("train_IR.csv") # d_test={'y_test' :np.array(y_test).ravel(), # 'test_preds' :np.array(test_preds).ravel(), # 'test_preds_ir' :predictions_ir} # df_test=pd.DataFrame(d_test) # df_test.to_csv("test_IR.csv") # score_test_ir=ir.score(test_preds,y_test) score_test_ir = 0 return predictions_ir, y_ir_pred, ir.get_params(deep=True), score_test_ir
def interpolation_estimate(Z, Z_constraint, lower=0.5, upper=4, npts=30, ndraw=5000, burnin=1000, estimator='truncated'): """ Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$ where $C$ is the convex set encoded by `Z_constraint` .. math:: C = \left\{z: Az+b \geq 0 \right\} with $(A,b)$ being `(Z_constraints.inequality, Z_constraints.inequality_offset)`. The algorithm proceeds by estimating $\|Z\|^2_2$ by Monte Carlo for a range of `npts` values starting from `lower*np.linalg.norm(Z)/np.sqrt(n)` to `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`. These values are then used to compute the GCM (Greated Convex Minorant) which is interpolated and solved for an arguments such that the expected value matches the observed value `(Z**2).sum()`. Parameters ---------- Z : `np.float` Observed data to be used to estimate $\sigma$. Should be in the cone specified by `Z_constraints`. Z_constraint : `constraints` Constraints under which we observe $Z$. lower : float Multiple of naive estimate to use as lower endpoint. upper : float Multiple of naive estimate to use as upper endpoint. npts : int Number of points in interpolation grid. ndraw : int Number of Gibbs steps to use for estimating each expectation. burnin : int How many Gibbs steps to use for burning in. Returns ------- sigma_hat : float The root of the interpolant derived from GCM values. interpolant : `interp1d` The interpolant, to be used for plotting or other diagnostics. WARNING ------- * It is assumed that `Z_constraints.equality` is `None`. * Uses `rpy2` and `fdrtool` library to compute the GCM. """ initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0]) Svalues = np.linspace(lower*initial,upper*initial, npts) Evalues = [] n = Z.shape[0] L, V, U, S = quadratic_bounds(Z, np.identity(n), Z_constraint) if estimator == 'truncated': def _estimator(S, Z, Z_constraint): L, V, U, _ = quadratic_bounds(Z, np.identity(n), Z_constraint) num = mpquad(lambda x: mpexp(-x**2/(2*S**2) -L*x / S**2 + (n-1) * mplog((x+L)/S) + 2 * mplog(x+L)), [0, U-L]) den = mpquad(lambda x: mpexp(-x**2/(2*S**2) -L*x / S**2 + (n-1) * mplog((x+L)/S)), [0, U-L]) print num / den, V**2, S, (L, U) return num / den elif estimator == 'simulate': state = Z.copy() rpy.r.assign('state', state) def _estimator(S, state, Z_constraint): Z_constraint.covariance = S**2 * np.identity(Z.shape[0]) e, v, _state = expected_norm_squared(state, Z_constraint, ndraw=ndraw, burnin=burnin) state[:] = _state return e state = Z.copy() for S in Svalues: Evalues.append(_estimator(S, state, Z_constraint)) ir = IsotonicRegression() if DEBUG: print Svalues, Evalues Eiso = ir.fit_transform(Svalues, Evalues) Sinterp, Einterp = Svalues, Eiso # rpy.r.assign('S', Svalues) # rpy.r.assign('E', np.array(Evalues)) # rpy.r(''' # library(fdrtool); # G = gcmlcm(S, E, 'gcm'); # Sgcm = G$x.knots; # Egcm = G$y.knots; # ''') # Sgcm = np.asarray(rpy.r('Sgcm')) # Egcm = np.asarray(rpy.r('Egcm')) # interpolant = interp1d(Sgcm, Egcm - (Z**2).sum()) interpolant = interp1d(Sinterp, Einterp - (Z**2).sum()) try: sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max()) except: raise ValueError('''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max())) return sigma_hat, interpolant
def truncated_estimate(Z, Z_constraint, lower=0.5, upper=2, npts=15): """ Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$ where $C$ is the convex set encoded by `Z_constraints` .. math:: C = \left\{z: Az+b \geq 0 \right\} with $(A,b)$ being `(Z_constraints.inequality, Z_constraints.inequality_offset)`. The algorithm proceeds by estimating $\|Z\|^2_2$ by Monte Carlo for a range of `npts` values starting from `lower*np.linalg.norm(Z)/np.sqrt(n)` to `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`. These values are then used to compute the GCM (Greated Convex Minorant) which is interpolated and solved for an arguments such that the expected value matches the observed value `(Z**2).sum()`. Parameters ---------- Z : `np.float` Observed data to be used to estimate $\sigma$. Should be in the cone specified by `Z_constraints`. Z_constraint : `constraints` Constraints under which we observe $Z$. lower : float Multiple of naive estimate to use as lower endpoint. upper : float Multiple of naive estimate to use as upper endpoint. npts : int Number of points in interpolation grid. Returns ------- sigma_hat : float The root of the interpolant derived from GCM values. interpolant : `interp1d` The interpolant, to be used for plotting or other diagnostics. WARNING ------- * It is assumed that `Z_constraints.equality` is `None`. * Uses `rpy2` and `fdrtool` library to compute the GCM. """ initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0]) Svalues = np.linspace(lower*initial,upper*initial, npts) Evalues = [] # use truncated chi to estimate integral # with scipy.integrate.quad n = Z.shape[0] operator = np.identity(n) L, V, U, S = quadratic_bounds(Z, operator, Z_constraint) for S in Svalues: num = quad(lambda x: np.exp(-x**2/(2*S**2) + (n+1) * np.log(x)), L, U) den = quad(lambda x: np.exp(-x**2/(2*S**2) + (n-1) * np.log(x)), L, U) Evalues.append(num[0] / den[0]) print num, den ir = IsotonicRegression() if DEBUG: print Svalues, Evalues Eiso = ir.fit_transform(Svalues, Evalues) Sinterp, Einterp = Svalues, Eiso interpolant = interp1d(Sinterp, Einterp - (Z**2).sum()) try: sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max()) except: raise ValueError('''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)''' % ((Z**2).sum(), Einterp.min(), Einterp.max())) return sigma_hat, interpolant print L, V, U, S