def test_cross_val_generator_with_indices():
    """Check that every CV generator yields integer indices, not masks.

    Passing ``indices`` explicitly is deprecated, so each constructor that
    receives it is wrapped in ``assert_warns``. ``Bootstrap`` takes no
    ``indices`` argument and is built directly.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=True)
    b = cval.Bootstrap(2)  # only in index mode
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            # Both halves of the split must be index arrays; the original
            # checked ``train`` twice and never looked at ``test``.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
def bootstrap(X_train, y_train, d, nIter=100, random_state=0):
    """Estimate the bootstrap-zero and .632 errors of a linear SVM.

    For every bootstrap split, features are selected on the resampled
    training part with ``FeaSelTtest``, a linear ``svm.SVC`` is fitted on
    those features, and two error figures are recorded.

    X_train -> n x D feature array
    y_train -> n labels
    d       -> selected feature size (forwarded to ``FeaSelTtest``)

    Returns ``(err_bs0, err_bs632)``: the means over all splits of the
    held-out error and of ``0.368 * resubstitution + 0.632 * held-out``.
    """
    splits = cross_validation.Bootstrap(len(y_train), n_iter=nIter,
                                        random_state=random_state)
    held_out_errors = []
    dot632_errors = []
    for fit_rows, eval_rows in splits:
        X_fit, y_fit = X_train[fit_rows], y_train[fit_rows]
        X_eval, y_eval = X_train[eval_rows], y_train[eval_rows]

        # Feature selection only ever sees the resampled training part.
        chosen = FeaSelTtest(X_fit, y_fit, d)
        X_fit_sel = X_fit[:, chosen]
        X_eval_sel = X_eval[:, chosen]

        model = svm.SVC(kernel='linear')
        model.fit(X_fit_sel, y_fit)

        held_out = 1 - model.score(X_eval_sel, y_eval)
        resub = 1 - model.score(X_fit_sel, y_fit)
        held_out_errors.append(held_out)
        dot632_errors.append((1 - 0.632) * resub + 0.632 * held_out)

    return (np.mean(held_out_errors), np.mean(dot632_errors))
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    """Turn a cross-validation spec into a cross-validation iterator.

    ``cv`` is returned untouched unless it is a string. Recognised strings
    are 'shuffle', 'loo', 'kfold', 'boot', 'boot632' and, for regression,
    '_shuffle' and '_kfold'. Any other string raises ``ValueError``.
    """
    # Non-string specs are handed back as-is.
    if not isinstance(cv, basestring):
        return cv

    if cv == 'shuffle':
        return cross_validation.StratifiedShuffleSplit(
            y, n_cv, test_size=test_size, random_state=random_state)
    if cv == 'loo':
        # NOTE(review): n_cv is passed where LeaveOneOut takes its first
        # positional argument -- confirm this is the intended sample count.
        return cross_validation.LeaveOneOut(n_cv)
    if cv == 'kfold':
        return cross_validation.StratifiedKFold(y, n_folds=n_cv)
    if cv == 'boot':
        return cross_validation.Bootstrap(
            len(y), n_iter=n_cv, train_size=(1 - test_size),
            random_state=random_state)
    if cv == 'boot632':
        return bootstrap_632(len(y), n_iter=n_cv, random_state=random_state)

    # for regression
    if cv == '_shuffle':
        return cross_validation.ShuffleSplit(
            len(y), n_iter=n_cv, test_size=test_size,
            random_state=random_state)
    if cv == '_kfold':
        return cross_validation.KFold(len(y), n_folds=n_cv)

    raise ValueError("bad cv:%s" % cv)
def _bootstrapped_fit_transform(self, data, n_iter=100, thresh=0.6,
                                min_samples=10):
    """Resample the rows of ``data`` ``n_iter`` times and vote on the result.

    Each bootstrap iteration pools its train and test indices into a single
    resample, drops columns with fewer than ``min_samples`` non-null values,
    and runs ``self._single_fit_transform`` on what is left. The
    per-iteration results are counted per column, turned into fractions,
    and fractions below ``thresh`` are masked out before
    ``self._max_assignment`` is applied column-wise.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are resampled; columns are processed independently.
    n_iter : int
        Number of bootstrap iterations.
    thresh : float
        Minimum fraction of iterations needed to keep an assignment.
    min_samples : int
        Forwarded to ``DataFrame.dropna(axis=1, thresh=...)``.

    Returns
    -------
    The column-wise result of ``self._max_assignment`` with missing entries
    filled with the string 'unassigned'.
    """
    # Local import keeps this method self-contained.
    import numpy as np

    bs = cross_validation.Bootstrap(data.shape[0], n_iter=n_iter)

    assignments = pd.DataFrame(columns=data.columns, index=range(n_iter))
    for i, (train_index, test_index) in enumerate(bs):
        # Bootstrap yields NumPy index arrays, for which ``+`` is an
        # element-wise sum (or a broadcasting error), not a join. Pool the
        # two halves explicitly.
        index = np.concatenate([train_index, test_index])
        # ``.loc`` with labels taken from data.index replaces the removed
        # ``.ix`` indexer without changing which rows are selected.
        psi = data.loc[data.index[index], :]
        psi = psi.dropna(axis=1, thresh=min_samples)
        assignments.loc[i] = self._single_fit_transform(
            psi, do_not_memoize=True)

    counts = assignments.apply(
        lambda x: pd.Series(collections.Counter(x.dropna())))
    fractions = counts / counts.sum().astype(float)
    thresh_assignments = fractions[fractions >= thresh].apply(
        self._max_assignment, axis=0)
    return thresh_assignments.fillna('unassigned')
def sample_random_n(table, n, stratified=False, replace=False,
                    random_state=None):
    """Return the first ``(train, test)`` split of a random sampler.

    With ``replace`` a ``Bootstrap`` sampler with ``train_size=n`` is used.
    Otherwise a single-iteration ``StratifiedShuffleSplit`` is used when
    ``stratified`` is set and the class variable is discrete, and a
    single-iteration ``ShuffleSplit`` in every other case. In the two
    shuffle cases the train size is raised to at least the number of class
    values and the test size is whatever remains (never negative).
    """
    assert n > 0
    n = int(n)
    n_rows = len(table)

    if replace:
        sampler = cross_validation.Bootstrap(
            n_rows, train_size=n, random_state=random_state)
        return next(iter(sampler))

    by_class = stratified and is_discrete(table.domain.class_var)
    # NOTE(review): ``class_var.values`` is read on the non-stratified path
    # as well -- confirm that is intended for non-discrete class variables.
    n_train = max(len(table.domain.class_var.values), n)
    n_test = max(n_rows - n_train, 0)

    if by_class:
        sampler = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1, test_size=n_test,
            train_size=n_train, random_state=random_state)
    else:
        sampler = cross_validation.ShuffleSplit(
            n_rows, n_iter=1, test_size=n_test, train_size=n_train,
            random_state=random_state)
    return next(iter(sampler))
def __call__(self, data, fitters):
    """Evaluate ``fitters`` on bootstrap resamples of ``data``.

    For each of ``self.n_resamples`` bootstrap splits every fitter is
    trained on the train part and applied to the test part. Test-row
    indices, actual values, predictions and (for a discrete class
    variable) probabilities are concatenated over all splits and stored on
    a new ``Results`` object, which is returned. ``results.folds`` holds
    one slice per split into those concatenated arrays.
    """
    resamples = cross_validation.Bootstrap(
        len(data), n_iter=self.n_resamples, train_size=self.p,
        random_state=self.random_state)
    results = Results(data, len(fitters), store_data=self.store_data)

    results.folds = []
    if self.store_models:
        results.models = []

    tested_rows = []
    truths = []
    values_per_model = [[] for _ in fitters]
    probs_per_model = [[] for _ in fitters]
    target = data.domain.class_var

    offset = 0
    for train_rows, test_rows in resamples:
        train_part = data[train_rows]
        test_part = data[test_rows]
        n_test = len(test_rows)

        results.folds.append(slice(offset, offset + n_test))
        tested_rows.append(test_rows)
        truths.append(test_part.Y.flatten())

        if self.store_models:
            models_here = []
            results.models.append(models_here)

        for slot, fitter in enumerate(fitters):
            fitted = fitter(train_part)
            if self.store_models:
                models_here.append(fitted)

            if is_discrete(target):
                vals, prbs = fitted(test_part, fitted.ValueProbs)
                values_per_model[slot].append(vals)
                probs_per_model[slot].append(prbs)
            elif is_continuous(target):
                values_per_model[slot].append(
                    fitted(test_part, fitted.Value))

        offset += n_test

    # Flatten the per-split pieces into single arrays.
    all_rows = np.hstack(tested_rows)
    all_truths = np.hstack(truths)
    all_values = np.array([np.hstack(chunk) for chunk in values_per_model])
    discrete = is_discrete(target)
    if discrete:
        all_probs = np.array(
            [np.vstack(chunk) for chunk in probs_per_model])

    n_rows = len(all_truths)
    results.nrows = n_rows
    results.row_indices = all_rows
    results.actual = all_truths
    results.predicted = all_values.reshape(len(all_values), n_rows)
    if discrete:
        results.probabilities = all_probs
    return results
def fit(self, X, y):
    """Fit every ensemble member on its own bootstrap-drawn feature set.

    ``self.n_estimators`` feature-index sets of size ``self.n_features``
    are drawn from the columns of ``X`` with ``Bootstrap``; member ``i`` of
    ``self.ensemble`` is then replaced by
    ``fit_model(member, X, y, feature_set_i)``. Returns ``self``.
    """
    _, n_columns = X.shape

    # randomly select features
    sampler = cross_validation.Bootstrap(n_columns,
                                         n_iter=self.n_estimators,
                                         train_size=self.n_features)
    chosen_sets = []
    for train_part, _unused in sampler:
        chosen_sets.append(train_part)
    self.feature_sets = chosen_sets

    # A joblib variant of the loop below was previously kept here, disabled:
    #   Parallel(n_jobs=-1)(delayed(fit_model)(self.ensemble[i], X, y,
    #                                          self.feature_sets[i])
    #                       for i in xrange(self.n_estimators))
    fitted_members = []
    for i in xrange(self.n_estimators):
        fitted_members.append(
            fit_model(self.ensemble[i], X, y, self.feature_sets[i]))
    self.ensemble = fitted_members
    return self
def cross_phenotype_jsd(data, groupby, bins, n_iter=100):
    """Jensen-Shannon divergence of features across phenotypes

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_samples, n_features) Dataframe
    groupby : mappable
        A samples to phenotypes mapping
    bins : object
        Binning specification, forwarded unchanged to ``binify_and_jsd``
    n_iter : int
        Number of bootstrap resampling iterations to perform for the
        within-group comparisons

    Returns
    -------
    jsd_df : pandas.DataFrame
        One column per unordered pair of phenotypes (a phenotype paired
        with itself included), as produced by ``binify_and_jsd``. The
        within-phenotype columns are the mean over the bootstrap iterations.
    """
    grouped = data.groupby(groupby)
    visited = set()
    columns = []

    for name_a, frame_a in grouped:
        for name_b, frame_b in grouped:
            key = tuple(sorted([name_a, name_b]))
            if key in visited:
                continue
            visited.add(key)

            if name_a != name_b:
                columns.append(binify_and_jsd(frame_a, frame_b, key, bins))
                continue

            # Same phenotype on both sides: compare bootstrap halves of the
            # group against each other and average the results.
            resampler = cross_validation.Bootstrap(
                frame_a.shape[0], n_iter=n_iter, train_size=0.5)
            per_iteration = []
            for rows_a, rows_b in resampler:
                half_a = frame_a.iloc[rows_a, :]
                half_b = frame_b.iloc[rows_b, :]
                per_iteration.append(
                    binify_and_jsd(half_a, half_b, None, bins))
            averaged = pd.concat(per_iteration, axis=1,
                                 names=None).mean(axis=1)
            averaged.name = key
            columns.append(averaged)

    return pd.concat(columns, axis=1)
def test_cross_val_generator_with_indices():
    """Smoke test: splits from index-mode generators can index X and y."""
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])

    generators = [
        cval.LeaveOneOut(4, indices=True),
        cval.LeavePOut(4, 2, indices=True),
        cval.KFold(4, 2, indices=True),
        cval.StratifiedKFold(y, 2, indices=True),
        cval.LeaveOneLabelOut(labels, indices=True),
        cval.LeavePLabelOut(labels, 2, indices=True),
        cval.Bootstrap(2),  # only in index mode
        cval.ShuffleSplit(2, indices=True),
    ]

    for generator in generators:
        for train, test in generator:
            # Indexing is the whole check: it must not raise.
            X_train = X[train]
            X_test = X[test]
            y_train = y[train]
            y_test = y[test]
def test_cross_val_generator_with_default_indices():
    """Check that CV generators built with defaults yield index arrays.

    No ``indices`` argument is passed; both the train and the test part of
    every split must be non-boolean and usable to index ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            # The original asserted on ``train`` twice; the second check
            # belongs to ``test``.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
train = data[train_index] test = data[test_index] train_target_vals = target_vals[train_index] test_traget_vals = target_vals[test_index] models = all_model(alpha) for model, model_type in models: model.fit(train, train_target_vals) predict = model.predict(test) accuracy = metrics.accuracy_score(test_traget_vals, predict) list_accuracy.append(accuracy) # print model_type, " = ", accuracy return max(list_accuracy) if __name__ == "__main__": data, targets = parse() data = np.array(data) data = data.astype(np.float) data = normalize(data, axis=1) # print (data[1][0]) targets = np.array(targets) # print (targets[0]) alpha_vals = np.linspace(750, 1000, 20) bs = cv.Bootstrap(targets.size, n_iter=100) max_vals = [] for alpha in alpha_vals: print '\n', alpha max_vals.append(trails(data, targets, bs, alpha)) print max(max_vals)
#svm, X1, Y1, cv=bs)#, score_func=metrics.f1_score) #print 'score: %f +- %f' % (scores.mean(), scores.std()) #pred_Y = svm.predict(test_X) #print metrics.precision_score(test_Y, pred_Y) #print metrics.recall_score(test_Y, pred_Y) #print metrics.f1_score(test_Y, pred_Y) #pred_Y = svm.predict(X1) #print metrics.precision_score(Y1, pred_Y) #print metrics.recall_score(Y1, pred_Y) #print metrics.f1_score(Y1, pred_Y) alpha_arr = [] for label in np.unique(labels): n = np.sum(labels == label) alpha_arr.append(n / float(labels.shape[0])) alpha_arr = np.array(alpha_arr) alpha = np.max(alpha_arr) print alpha bs = cross_validation.Bootstrap(data.shape[0], 3) for train_indices, test_indices in bs: svm.fit(data[train_indices], labels[train_indices]) score = svm.score(data[test_indices], labels[test_indices]) print score, (score - alpha) / (1 - alpha) #pred = svm.predict(data[test_indices])
def main():
    """Train ExtraTrees models on bootstrap resamples and write averaged predictions.

    Reads "../Data/train.csv" and "../Data/test.csv" through ``csv_io``,
    fits one ``ExtraTreesClassifier`` per bootstrap split, prints a log-loss
    style score per split, and writes the averaged test-set predictions to
    "../Submissions/et_stack_avg_benchmark.csv". Blocks on ``raw_input``
    before returning.

    NOTE(review): the indentation below was reconstructed from a flattened
    source -- confirm the nesting of the aggregation code that follows the
    bootstrap loop.
    """
    #random.seed(5)
    #random.random()
    startCol = 0
    endCol = 1775 # max = 1775
    trainBase = csv_io.read_data("../Data/train.csv")
    result = 100
    avg = 0
    bootstraps = 9 # should be odd for median
    rnd_start = 456
    predicted_list = []
    spanDistance = 12
    bootstrapLists = []
    if ( True):
        predicted_list = []
        # Row 0 of trainBase is excluded from sampling (hence ``- 1`` here
        # and ``i+1`` below) -- presumably a header row; verify.
        bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.7, random_state=0)
        for train_index, test_index in bs:
            trainBaseTemp = [trainBase[i+1] for i in train_index]
            #trainBaseTemp = trainBase
            # Column 0 is the target, the remaining columns are features.
            target = [x[0] for x in trainBaseTemp]#[1001:3700]
            train = [x[1:] for x in trainBaseTemp]#[1001:3700]
            testBaseTemp = [trainBase[i+1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]#[1:1000]
            trainTest = [x[1:] for x in testBaseTemp]#[1:1000]
            # The test file is re-read on every bootstrap iteration.
            test = csv_io.read_data("../Data/test.csv")
            test = [x[0:] for x in test]
            # Opened and closed every iteration; nothing is written to it
            # (the only write is commented out below).
            fo = open("rf_stats.txt", "a+")
            rf = ExtraTreesClassifier(n_estimators=200, criterion='gini', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=False, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0)
            rf.fit(train, target)
            prob = rf.predict_proba(trainTest) # was test
            probSum = 0
            totalOffByHalf = 0
            for i in range(0, len(prob)):
                probX = prob[i][1] # [1]
                # Clamp away from 0 and 1 so both log() calls stay finite.
                if ( probX > 0.999999999999):
                    probX = 0.999999999999;
                if ( probX < 0.000000000001):
                    probX = 0.000000000001;
                #print i, probSum, probX, target[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
                if ( math.fabs(probX - targetTest[i]) > 0.5 ):
                    totalOffByHalf = totalOffByHalf + 1
            print "Total Off By > 0.5 ", totalOffByHalf
            print -probSum/len(prob)
            #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob)));
            # Running mean of the per-split score over all bootstraps.
            avg += (-probSum/len(prob))/bootstraps
            predicted_probs = rf.predict_proba(test) # was test
            predicted_list.append([x[1] for x in predicted_probs])
            fo.close()
        # Combine the per-split predictions row by row.
        avg_list = []
        med_list = []
        for p in range(0, len(test)):
            temp_list =[]
            for q in range(0, len(predicted_list)):
                temp_list.append( predicted_list[q][p])
            avg_list.append( mean(temp_list) )
            med_list.append( getMedian(temp_list) )
            print p, q, temp_list, mean(temp_list), getMedian(temp_list)
        # Only the averages are carried forward; med_list is not used again.
        bootstrapLists.append(avg_list)
    if ( len(bootstrapLists) > 1 ):
        finalList = []
        for p in range(0, len(test)):
            temp_list =[]
            for q in range(0, len(bootstrapLists)):
                temp_list.append( bootstrapLists[q][p])
            finalList.append( meanSpan(temp_list, spanDistance) )
            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]
    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/et_stack_avg_benchmark.csv", avg_values)
    print "Average: ", avg
    # Keeps the console open until the user presses Enter.
    var = raw_input("Enter to terminate.")
def main():
    """Fit an RBF SVM repeatedly and write median and mean test predictions.

    Reads "../Data/train.csv" and "../Data/test.csv" through ``csv_io``,
    appends one score record per fit to "svm_stats.txt", and writes
    "../Submissions/svm_med_benchmark.csv" and
    "../Submissions/svm_avg_benchmark.csv". Blocks on ``raw_input`` before
    returning.

    NOTE(review): the indentation below was reconstructed from a flattened
    source -- confirm the nesting, in particular whether the final
    ``predict_proba(test)`` belongs inside the C/g loops.
    """
    #random.seed(5)
    #random.random()
    # this method does not seem to benefit from using less than all columns of data.
    startCol = 0
    endCol = 1775 # max = 1775
    trainBase = csv_io.read_data("../Data/train.csv")
    result = 100
    avg = 0
    avg_sum = 0
    avg_counter = 0
    bootstraps = 9
    predicted_list = []
    bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.7, random_state=0)
    for train_index, test_index in bs:
        # NOTE(review): the bootstrap sample is overwritten on the next
        # line, so every iteration trains on rows 1001:3700 and scores on
        # rows 1:1000 of trainBase -- confirm this is intended.
        trainBaseTemp = [trainBase[i + 1] for i in train_index]
        trainBaseTemp = trainBase
        target = [x[0] for x in trainBaseTemp][1001:3700]
        train = [x[startCol + 1:endCol + 1] for x in trainBaseTemp][1001:3700]
        testBaseTemp = [trainBase[i + 1] for i in test_index]
        testBaseTemp = trainBase
        targetTest = [x[0] for x in testBaseTemp][1:1000]
        trainTest = [x[startCol + 1:endCol + 1] for x in testBaseTemp][1:1000]
        # The test file is re-read on every bootstrap iteration.
        test = csv_io.read_data("../Data/test.csv")
        test = [x[startCol:endCol] for x in test]
        fo = open("svm_stats.txt", "a+")
        # good for rbf method
        CC = [0.0]
        gg = [-5.5]
        # sigmoid use C=-8
        #C=-8
        #poly use
        #CC = [12, 10, 8, 6,4 ,2]
        #gg = [-19, -17, -15, -13, -11, -9]
        for g in gg:
            for C in CC:
                #for g in range(-19, -10, 2):
                #for C in range(14, -10, -2):
                #if (True):
                #for y in range(0, 6):
                # NOTE(review): the classifier uses a fixed C=10**-8 and no
                # gamma; the loop variables C and g only reach the printed
                # and logged output.
                svc = svm.SVC(probability=True, C=10**-8, cache_size=800, coef0=0.0, degree=3, kernel='rbf', shrinking=True, tol=0.001)
                #svc = svm.SVC(probability=True, C=10**C[y], gamma=2**g[y],cache_size=800, coef0=0.0, degree=3, kernel='poly', shrinking=True, tol=0.01)
                svc.fit(train, target)
                prob = svc.predict_proba(trainTest) # was test
                probSum = 0
                totalOffByHalf = 0
                for i in range(0, len(prob)):
                    #print i, probSum, prob[i][1], target[i]
                    #print target[i]*log(prob[i][1]), (1-target[i])*log(1-prob[i][1])
                    # Probabilities are not clamped before log() here.
                    probSum += targetTest[i] * log( prob[i][1]) + (1 - targetTest[i]) * log(1 - prob[i][1])
                    if (math.fabs(prob[i][1] - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1
                #print probSum
                #print len(prob)
                print "Total Off By > 0.5 ", totalOffByHalf
                print "C: ", 10**C, " gamma: ", 2**g
                #print "C: ", 10**C[y], " gamma: " ,2**g[y]
                print -probSum / len(prob)
                #fo.write(str(C[y]) + "," + str(g[y]) + "," + str(-probSum/len(prob)));
                # The record is written without a trailing newline.
                fo.write( str(C) + "," + str(g) + "," + str(-probSum / len(prob)))
                avg_sum += (-probSum / len(prob))
                avg_counter = avg_counter + 1
                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = svc.predict_proba(test) # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"
                predicted_probs = svc.predict_proba(test) # was test
                predicted_list.append([x[1] for x in predicted_probs])
        fo.close()
    # Combine the collected predictions row by row.
    avg_list = []
    med_list = []
    for p in range(0, len(test)):
        temp_list = []
        for q in range(0, len(predicted_list)):
            temp_list.append(predicted_list[q][p])
        avg_list.append(mean(temp_list))
        med_list.append(getMedian(temp_list))
        print p, q, temp_list, mean(temp_list), getMedian(temp_list)
    med_values = ["%f" % x for x in med_list]
    csv_io.write_delimited_file("../Submissions/svm_med_benchmark.csv", med_values)
    avg_values = ["%f" % x for x in avg_list]
    csv_io.write_delimited_file("../Submissions/svm_avg_benchmark.csv", avg_values)
    print "Average: ", (avg_sum / avg_counter)
    # Keeps the console open until the user presses Enter.
    var = raw_input("Enter to terminate.")
def __init__(self, data, learners, n_resamples=10, p=0.75, random_state=0,
             store_data=False, store_models=False):
    """Evaluate ``learners`` on bootstrap resamples of ``data``.

    For each of ``n_resamples`` bootstrap splits (train fraction ``p``)
    every learner is trained on the train part and applied to the test
    part. Test-row indices, actual values, predictions and (for a discrete
    class variable) probabilities are concatenated over all splits and
    stored on ``self``. ``self.folds`` holds one slice per split into those
    concatenated arrays.
    """
    super().__init__(data, len(learners), store_data=store_data,
                     store_models=store_models)
    self.store_models = store_models
    self.n_resamples = n_resamples
    self.p = p
    self.random_state = random_state

    resamples = skl_cross_validation.Bootstrap(
        len(data), n_iter=self.n_resamples, train_size=self.p,
        random_state=self.random_state
    )

    self.folds = []
    if self.store_models:
        self.models = []

    tested_rows = []
    truths = []
    values_per_model = [[] for _ in learners]
    probs_per_model = [[] for _ in learners]
    target = data.domain.class_var

    offset = 0
    for train_rows, test_rows in resamples:
        train_part = data[train_rows]
        test_part = data[test_rows]
        n_test = len(test_rows)

        self.folds.append(slice(offset, offset + n_test))
        tested_rows.append(test_rows)
        truths.append(test_part.Y.flatten())

        if self.store_models:
            models_here = []
            self.models.append(models_here)

        for slot, learner in enumerate(learners):
            fitted = learner(train_part)
            if self.store_models:
                models_here.append(fitted)

            if is_discrete(target):
                vals, prbs = fitted(test_part, fitted.ValueProbs)
                values_per_model[slot].append(vals)
                probs_per_model[slot].append(prbs)
            elif is_continuous(target):
                values_per_model[slot].append(
                    fitted(test_part, fitted.Value))

        offset += n_test

    # Flatten the per-split pieces into single arrays.
    all_rows = np.hstack(tested_rows)
    all_truths = np.hstack(truths)
    all_values = np.array([np.hstack(chunk) for chunk in values_per_model])
    discrete = is_discrete(target)
    if discrete:
        all_probs = np.array(
            [np.vstack(chunk) for chunk in probs_per_model])

    n_rows = len(all_truths)
    self.nrows = n_rows
    self.row_indices = all_rows
    self.actual = all_truths
    self.predicted = all_values.reshape(len(all_values), n_rows)
    if discrete:
        self.probabilities = all_probs
def main(kernel, CC, gg, bootstraps, spanDistance):
    """Fit SVMs for paired (C, g) settings and write stacked predictions.

    Parameters
    ----------
    kernel : str
        One of 'poly', 'sigmoid', 'rbf' or 'linear'. Any other value leaves
        ``svc`` as None, so ``svc.fit`` fails.
    CC, gg : sequences
        Read in parallel by position: entry ``a`` of each gives ``C`` and
        ``g``. The classifier gets ``C=10**C`` (``1 * C`` for 'sigmoid')
        and ``gamma=2**g``.
    bootstraps : int
        Passed to ``Bootstrap`` as ``n_bootstraps``.
    spanDistance
        Passed through to ``meanSpan`` when more than one (C, g) pair ran.

    Reads "../Data/train.csv" and "../Data/test.csv" through ``csv_io``,
    appends score records to "svm_stats.txt", and writes
    "../Submissions/svm-<kernel>-bootstrap-stack_meanSpan_benchmark.csv".
    Blocks on ``raw_input`` before returning.

    NOTE(review): the indentation below was reconstructed from a flattened
    source -- confirm the nesting of the per-pair aggregation.
    """
    #meanSpan([0.197, 0.384,0.382,0.268,0.248,0.280,0.248,0.417], 4)
    #exit()
    #random.seed(5)
    #random.random()
    # this method does not seem to benefit from using less than all columns of data.
    startCol = 0
    endCol = 1775 # max = 1775
    trainBase = csv_io.read_data("../Data/train.csv")
    result = 100
    avg = 0
    avg_sum = 0
    avg_counter = 0
    #bootstraps = 5
    predicted_list = []
    #spanDistance = 15
    #poly use
    #Cc = [12, 10, 8, 6,4 ,2]
    #gg = [-19, -17, -15, -13, -11, -9]
    #CC = [10, 9.5, 9, 8.5, 8, 7.5, 7, 6.5, 6]
    #gg = [-17, -16.5, -16, -15.5, -15, -14.5, -14, -13.5, 13.0]
    # poly optimized set.
    #CC = [10, 8.5, 8.5, 8, 8, 7, 6.5, 6]
    #gg = [-17, -16.5, -16, -15.5, -15, -14.5, -14, -13.5]
    #CC = [10]
    #gg = [-17]
    bootstrapLists = []
    #for g in gg:
    #for C in CC:
    if (True):
        for a in range(0, len(CC)):
            C = CC[a]
            g = gg[a]
            # Predictions are collected afresh for every (C, g) pair.
            predicted_list = []
            bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.7, random_state=0)
            for train_index, test_index in bs:
                # NOTE(review): the bootstrap sample is overwritten on the
                # next line, so every iteration trains on rows 1001:3700
                # and scores on rows 1:1000 of trainBase -- confirm this is
                # intended.
                trainBaseTemp = [trainBase[i + 1] for i in train_index]
                trainBaseTemp = trainBase
                target = [x[0] for x in trainBaseTemp][1001:3700]
                train = [x[startCol + 1:endCol + 1] for x in trainBaseTemp][1001:3700]
                testBaseTemp = [trainBase[i + 1] for i in test_index]
                testBaseTemp = trainBase
                targetTest = [x[0] for x in testBaseTemp][1:1000]
                trainTest = [x[startCol + 1:endCol + 1] for x in testBaseTemp][1:1000]
                # The test file is re-read on every bootstrap iteration.
                test = csv_io.read_data("../Data/test.csv")
                test = [x[startCol:endCol] for x in test]
                fo = open("svm_stats.txt", "a+")
                svc = None
                if (kernel == 'poly'):
                    svc = svm.SVC(probability=True, C=10**C, gamma=2**g, cache_size=800, coef0=0.0, degree=3, kernel='poly', shrinking=True, tol=0.01)
                if (kernel == 'sigmoid'):
                    svc = svm.SVC(probability=True, C=1 * C, gamma=2**g, cache_size=800, coef0=0.0, degree=3, kernel='sigmoid', shrinking=True, tol=0.01)
                if (kernel == 'rbf'):
                    svc = svm.SVC(probability=True, C=10**C, gamma=2**g, cache_size=800, coef0=0.0, degree=3, kernel='rbf', shrinking=True, tol=0.01)
                if (kernel == 'linear'):
                    svc = svm.SVC(probability=True, C=10**C, gamma=2**g, cache_size=800, coef0=0.0, degree=3, kernel='linear', shrinking=True, tol=0.01)
                svc.fit(train, target)
                prob = svc.predict_proba(trainTest) # was test
                probSum = 0
                totalOffByHalf = 0
                for i in range(0, len(prob)):
                    #print i, probSum, prob[i][1], target[i]
                    #print target[i]*log(prob[i][1]), (1-target[i])*log(1-prob[i][1])
                    # Probabilities are not clamped before log() here.
                    probSum += targetTest[i] * log( prob[i][1]) + (1 - targetTest[i]) * log(1 - prob[i][1])
                    if (math.fabs(prob[i][1] - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1
                #print probSum
                #print len(prob)
                print "Total Off By > 0.5 ", totalOffByHalf
                # Always prints 10**C, including for 'sigmoid' where the
                # classifier was given 1 * C.
                print "C: ", 10**C, " gamma: ", 2**g
                #print "C: ", 10**C[y], " gamma: " ,2**g[y]
                print -probSum / len(prob)
                #fo.write(str(C[y]) + "," + str(g[y]) + "," + str(-probSum/len(prob)));
                # The record is written without a trailing newline.
                fo.write( str(C) + "," + str(g) + "," + str(-probSum / len(prob)))
                avg_sum += (-probSum / len(prob))
                avg_counter = avg_counter + 1
                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = svc.predict_proba(test) # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"
                predicted_probs = svc.predict_proba(test) # was test
                predicted_list.append([x[1] for x in predicted_probs])
                fo.close()
            # Combine this pair's per-split predictions row by row.
            avg_list = []
            med_list = []
            for p in range(0, len(test)):
                temp_list = []
                for q in range(0, len(predicted_list)):
                    temp_list.append(predicted_list[q][p])
                avg_list.append(mean(temp_list))
                med_list.append(getMedian(temp_list))
                #print p, q, temp_list, mean(temp_list), getMedian(temp_list)
            # Only the medians are carried forward; avg_list is not used again.
            bootstrapLists.append(med_list)
    if (len(bootstrapLists) > 1):
        # Several (C, g) pairs ran: merge their median lists with meanSpan.
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])
            finalList.append(meanSpan(temp_list, spanDistance))
            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]
    final_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file( "../Submissions/svm-" + kernel + "-bootstrap-stack_meanSpan_benchmark.csv", final_values)
    print "Average: ", (avg_sum / avg_counter)
    # Keeps the console open until the user presses Enter.
    var = raw_input("Enter to terminate.")
def test_bootstrap_test_sizes():
    """Bootstrap resolves float, int and ``None`` test_size to a row count."""
    as_fraction = cval.Bootstrap(10, test_size=0.2)
    assert_equal(as_fraction.test_size, 2)

    as_count = cval.Bootstrap(10, test_size=2)
    assert_equal(as_count.test_size, 2)

    as_default = cval.Bootstrap(10, test_size=None)
    assert_equal(as_default.test_size, 5)
def main():
    """Grid over gradient-boosting settings and write pooled predictions.

    For every ``(n_est, learn_r)`` combination a
    ``GradientBoostingClassifier`` is fitted and scored, and its test-set
    probabilities are added to one shared list. The row-wise median and
    mean of that list are written to "../Submissions/gb_med_benchmark.csv"
    and "../Submissions/gb_avg_benchmark.csv". Reads "../Data/train.csv"
    and "../Data/test.csv" through ``csv_io`` and blocks on ``raw_input``
    before returning.

    NOTE(review): the indentation below was reconstructed from a flattened
    source -- confirm the nesting of the aggregation after the loops.
    """
    #random.seed(5)
    #random.random()
    startCol = 0
    endCol = 1775 # max = 1775
    trainBase = csv_io.read_data("../Data/train.csv")
    result = 100
    avg = 0
    bootstraps = 1 # should be odd for median
    rnd_start = 456
    predicted_list = []
    for n_est in [160, 320, 640, 1280, 4000, 8000, 16000]:
        for learn_r in [0.5, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001]:
            bs = cross_validation.Bootstrap(len(trainBase) - 1, n_bootstraps=bootstraps, train_size=0.6, random_state=0)
            # The bootstrap indices are not used in the body: training is
            # always on rows 1001:3700 and scoring on rows 1:1000 of
            # trainBase. The loop only sets the number of repetitions.
            for train_index, test_index in bs:
                print n_est, learn_r
                #trainBaseTemp = [trainBase[i+1] for i in train_index]
                trainBaseTemp = trainBase
                target = [x[0] for x in trainBaseTemp][1001:3700]
                train = [x[1:] for x in trainBaseTemp][1001:3700]
                #testBaseTemp = [trainBase[i+1] for i in test_index]
                testBaseTemp = trainBase
                targetTest = [x[0] for x in testBaseTemp][1:1000]
                trainTest = [x[1:] for x in testBaseTemp][1:1000]
                # The test file is re-read on every iteration.
                test = csv_io.read_data("../Data/test.csv")
                test = [x[0:] for x in test]
                # Opened and closed every iteration; nothing is written to
                # it (the only write is commented out below).
                fo = open("gb_stats.txt", "a+")
                #learn_rate=0.1, n_estimators=200
                rf = GradientBoostingClassifier( loss='deviance', learn_rate=learn_r, n_estimators=n_est, subsample=1.0, min_samples_split=1, min_samples_leaf=1, max_depth=3, init=None, random_state=rnd_start) # , max_features=None
                rf.fit(train, target)
                prob = rf.predict_proba(trainTest) # was test
                probSum = 0
                totalOffByHalf = 0
                for i in range(0, len(prob)):
                    probX = prob[i][1] # [1]
                    # Clamp away from 0 and 1 so both log() calls stay finite.
                    if (probX > 0.999999999999):
                        probX = 0.999999999999
                    if (probX < 0.000000000001):
                        probX = 0.000000000001
                    #print i, probSum, probX, target[i]
                    #print target[i]*log(probX), (1-target[i])*log(1-probX)
                    probSum += targetTest[i] * log(probX) + ( 1 - targetTest[i]) * log(1 - probX)
                    if (math.fabs(probX - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1
                print "Total Off By > 0.5 ", totalOffByHalf
                print -probSum / len(prob)
                #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob)));
                # avg keeps accumulating across every (n_est, learn_r)
                # combination; it is never reset.
                avg += (-probSum / len(prob)) / bootstraps
                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = rf.predict_proba(test) # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"
                predicted_probs = rf.predict_proba(test) # was test
                predicted_list.append([x[1] for x in predicted_probs])
                fo.close()
    # Combine the predictions of every fitted model row by row.
    avg_list = []
    med_list = []
    for p in range(0, len(predicted_list[0])):
        temp_list = []
        for q in range(0, len(predicted_list)):
            temp_list.append(predicted_list[q][p])
        avg_list.append(mean(temp_list))
        med_list.append(getMedian(temp_list))
        #print p, q, temp_list, mean(temp_list), getMedian(temp_list)
    med_values = ["%f" % x for x in med_list]
    csv_io.write_delimited_file("../Submissions/gb_med_benchmark.csv", med_values)
    avg_values = ["%f" % x for x in avg_list]
    csv_io.write_delimited_file("../Submissions/gb_avg_benchmark.csv", avg_values)
    print "Average: ", avg
    # Keeps the console open until the user presses Enter.
    var = raw_input("Enter to terminate.")