def get_sub_set_with_size(self, data, set_size, random_state=1):
    """Draw one shuffled subset of ``data``.

    :param data: pair ``(X, y)``; both are indexed with the same index array.
    :param set_size: forwarded to ``ShuffleSplit`` as ``train_size``.
    :param random_state: seed forwarded to ``ShuffleSplit``.
    :returns: tuple ``(X_sub, y_sub)`` with the selected entries.
    """
    X, y = data
    ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
                      test_size=0, random_state=random_state)
    # next(iter(...)) works on Python 2.6+ and 3; the iterator's .next()
    # method only exists on Python 2.
    sub_index, _ = next(iter(ss))
    return X[sub_index], y[sub_index]
def grid_search(self, n_iter=5):
    """Grid-search ``self.param_grid`` for precision and recall, printing reports.

    A single 90/10 shuffled split of ``self.data`` / ``self.labels`` is made
    with a fixed seed. For each scoring name a ``GridSearchCV`` is fitted on
    the 90% part, its grid scores and a classification report on the 10% part
    are printed, and ``self.calculateAMS`` is reported for both index sets.

    :param n_iter: not used by this method; kept so existing callers work.
    """
    rs = ShuffleSplit(self.data.shape[0], n_iter=1, test_size=.1,
                      random_state=0)
    # next(iter(...)) instead of .next(): the latter is Python 2-only.
    train, test = next(iter(rs))
    x_train, y_train = self.data[train], self.labels[train]
    x_test, y_test = self.data[test], self.labels[test]

    for score in ['precision', 'recall']:
        print("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(self.classifier, self.param_grid, cv=2,
                           scoring=score, verbose=3, n_jobs=2)
        clf.fit(x_train, y_train)
        print("Best parameters set found on development set:")
        print(clf.best_estimator_)
        print("Grid scores on development set:")
        # fold_scores: distinct name so the per-grid-point scores do not
        # shadow the list of scoring names iterated by the outer loop.
        for params, mean_score, fold_scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean_score, fold_scores.std() / 2, params))
        print("Detailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, clf.predict(x_test)
        print(classification_report(y_true, y_pred))
        # calculateAMS returns (ams, threshold); only the AMS is reported.
        test_ams, _ = self.calculateAMS(test, clf.best_estimator_)
        train_ams, _ = self.calculateAMS(train, clf.best_estimator_)
        print(('Test AMS %f, Train AMS %f') % (test_ams, train_ams))
def get_sub_set_with_size(self, data, set_size):
    """Return a fixed-seed shuffled subset of ``data``.

    :param data: ``[X, y]``; both are indexed with the same index array.
    :param set_size: forwarded to ``ShuffleSplit`` as ``train_size``.
    :returns: tuple ``(X_sub, y_sub)``.
    """
    X, y = data
    ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
                      test_size=0, random_state=1)
    # Built-in next() is portable; iterator.next() is Python 2-only.
    sub_index, _ = next(iter(ss))
    X_sub = X[sub_index]
    y_sub = y[sub_index]
    return X_sub, y_sub
def toNumpyDominant():
    """Load ``default_train_file`` and split off a 5640-sample test set.

    The split is a single shuffled split with a fixed seed; the sizes of both
    parts are printed.

    :returns: tuple ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = load_svmlight_file(default_train_file)
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=5640, random_state=1)
    # next(iter(...)) runs on Python 2 and 3; .next() is Python 2-only.
    train_index, test_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Single-argument print(...) with %-formatting emits the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print('loaded snippet dataset')
    print('entire training set size %s' % (y_train.size,))
    print('test set size %s' % (y_test.size,))
    return X_train, y_train, X_test, y_test
def toNumpy():
    """Load the data, split it 50/50 and keep the 200 best chi2 features.

    The selector is fitted on the training half only and then applied to the
    test half; both halves are converted with ``toarray()`` before returning.

    :returns: tuple ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = load_data()
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    # Portable replacement for the Python 2-only ss.__iter__().next().
    train_index, test_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    sel = SelectKBest(chi2, 200)
    X_train = sel.fit_transform(X_train, y_train)
    X_test = sel.transform(X_test)

    X_train = X_train.toarray()
    X_test = X_test.toarray()
    return X_train, y_train, X_test, y_test
def toNumpy():
    """Load ``default_train_file``, split it 50/50 and keep 2000 chi2 features.

    :returns: tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("News 20 dataset is being loaded")
    X, y = load_libsvm(default_train_file)
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    # next(iter(...)) instead of the Python 2-only .next().
    train_index, test_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Let's perform feature selection to decrease the memory requirements.
    # The selector is fitted on the training half only.
    f = SelectKBest(chi2, k=2000)
    X_train = f.fit_transform(X_train, y_train)
    X_test = f.transform(X_test)
    return X_train, y_train, X_test, y_test
def get_sub_set_with_size(self, data, set_size):
    '''Select a shuffled subset of ``data``.

    :param data: ``[X, y]``; both are indexed with the same index array.
    :param set_size: forwarded to ``ShuffleSplit`` as ``train_size``.
    :returns: tuple ``(X_sub, y_sub)``.
    '''
    X, y = data
    # random_state is fixed for reproducibility
    ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
                      test_size=0, random_state=1)
    # iterator.next() does not exist on Python 3; built-in next() does on both.
    sub_index, _ = next(iter(ss))
    X_sub = X[sub_index]
    y_sub = y[sub_index]
    return X_sub, y_sub
def get_sub_set_with_size(self, data, set_size):
    '''Select a shuffled subset of ``data`` using a hard-coded seed.

    :param data: ``[X, y]``; both are indexed with the same index array.
    :param set_size: forwarded to ``ShuffleSplit`` as ``train_size``.
    :returns: tuple ``(X_sub, y_sub)``.
    '''
    X, y = data
    # TODO: you might want to change random_state
    ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
                      test_size=0, random_state=1)
    # Use the built-in next(): the .next() method is Python 2-only.
    sub_index, _ = next(iter(ss))
    X_sub = X[sub_index]
    y_sub = y[sub_index]
    return X_sub, y_sub
def toNumpy(err = 0.3, n_sample = 10000, n_class = 2, train_err = 0):
    """Generate a synthetic dataset and split it 20% train / 80% test.

    :param err: forwarded to the generator function.
    :param n_sample: number of samples requested from the generator.
    :param n_class: 2 or 4; selects ``generate_binary_class`` or
        ``generate_four_class``.
    :param train_err: forwarded to ``inject_noise`` for the training labels.
    :returns: tuple ``(X_train, y_train, X_test, y_test)``.
    :raises ValueError: if ``n_class`` is neither 2 nor 4.
    """
    if n_class == 2:
        X, y = generate_binary_class(n_sample, err)
    elif n_class == 4:
        X, y = generate_four_class(n_sample, err)
    else:
        raise ValueError("n_class must be 2 or 4, got %r" % (n_class,))

    ss = ShuffleSplit(y.size, n_iter=1, test_size=0.8, random_state=0)
    # next(iter(...)) is portable; .next() exists only on Python 2 iterators.
    train_index, test_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Noise is injected into the training labels only.
    y_train = inject_noise(y_train, train_err)
    return X_train, y_train, X_test, y_test
def grow_single(args):
    """Fit one base classifier on a random split and score the held-out part.

    :param args: tuple ``(rand_int, labeled_data, parent)``: the split seed,
        an ``(X, y)`` pair, and an object providing ``split_r`` (held-out
        size) and ``base_clf_class`` (classifier factory).
    :returns: ``(clf, y_trues, y_preds_prob)`` where ``y_preds_prob`` is
        column 1 of ``predict_proba`` on the held-out part, or ``None`` when
        either part contains a single class.
    """
    rand_int, labeled_data, parent = args
    X, y = labeled_data
    ss = ShuffleSplit(y.size, n_iter=1, test_size=parent.split_r,
                      random_state=rand_int)
    # Built-in next() is portable; iterator.next() is Python 2-only.
    train_index, calib_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_calib, y_calib = X[calib_index], y[calib_index]

    if len(set(y_train)) == 1 or len(set(y_calib)) == 1:
        # extreme case.
        return None

    clf = parent.base_clf_class()
    clf.fit(X_train, y_train)
    y_preds_prob = clf.predict_proba(X_calib)[:, 1]
    y_trues = y_calib
    return clf, y_trues, y_preds_prob
def toNumpy():
    """Load ``default_train_file`` and return 10000-sample train/test subsets.

    The data is split 50/50, each half is subsampled to 10000 entries via
    ``get_sub_set_with_size``, and labels 1 and 2 are rewritten to 0 and 1.

    :returns: tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("Covtype dataset is being loaded")
    X, y = load_libsvm(default_train_file)
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    # next(iter(...)) instead of the Python 2-only .next().
    train_index, test_index = next(iter(ss))
    train_data = X[train_index], y[train_index]
    test_data = X[test_index], y[test_index]

    X_train, y_train = get_sub_set_with_size(train_data, 10000)
    X_test, y_test = get_sub_set_with_size(test_data, 10000)

    # Order matters: 1 -> 0 must run before 2 -> 1, otherwise the freshly
    # written 1s would be rewritten to 0 as well.
    for labels in (y_train, y_test):
        labels[labels == 1] = 0
        labels[labels == 2] = 1
    return X_train, y_train, X_test, y_test
def get_sub_set_with_size(self, data, set_size):
    '''Select a fixed-seed shuffled subset of ``data`` and print class counts.

    :param data: ``[X, y]``; both are indexed with the same index array.
    :param set_size: forwarded to ``ShuffleSplit`` as ``train_size``.
    :returns: tuple ``(X_sub, y_sub)``.
    '''
    from collections import Counter

    X, y = data
    ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
                      test_size=0, random_state=1)
    # Built-in next() is portable; iterator.next() is Python 2-only.
    sub_index, _ = next(iter(ss))
    X_sub = X[sub_index]
    y_sub = y[sub_index]

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Class counts")
    print(Counter(y_sub))
    return X_sub, y_sub
def randomSplit(df, y_var_name, x_var_names, testSize=0.35, seedIn=None):
    """Impute, scale and randomly split *df* into train and test partitions.

    Rows whose target is NaN are dropped first. The imputer and the scaler
    are fitted on the training predictors only and then applied to both
    partitions.

    :param df: the pandas data frame to split.
    :param y_var_name: name of the column of *df* to predict.
    :type y_var_name: str
    :param x_var_names: list of the predictor columns.
    :type x_var_names: list(str)
    :param testSize: forwarded to ``ShuffleSplit`` as ``test_size``.
    :param seedIn: seed for the random number generator, for reproducible
        results; ``None`` leaves the split unseeded.
    :returns: dict -- {'train_x', 'train_y', 'test_x', 'test_y'}
    """
    scaler = preprocessing.StandardScaler()
    imputer = preprocessing.Imputer(missing_values="NaN", strategy="mean",
                                    axis=0)

    # Remove rows where the target is missing. Working on the underlying
    # array avoids Series.nonzero(), which newer pandas no longer provides.
    good_inds = np.flatnonzero(~np.isnan(df[y_var_name].values))
    d = df.iloc[good_inds]

    # random_state=None is ShuffleSplit's own default, so seedIn can be
    # forwarded directly without branching on it.
    ss = ShuffleSplit(d.shape[0], n_iter=1, test_size=testSize,
                      random_state=seedIn)
    # next(iter(...)) is portable; .next() is Python 2-only.
    training_inds, test_inds = next(iter(ss))
    training_rows = d.iloc[training_inds]
    test_rows = d.iloc[test_inds]

    data_tr = training_rows[x_var_names]
    imputer.fit(data_tr)
    # Impute the training predictors once and reuse the result for both
    # fitting and applying the scaler.
    data_tr_imputed = imputer.transform(data_tr)
    scaler.fit(data_tr_imputed)
    data_tr_scaled = scaler.transform(data_tr_imputed)

    data_test = test_rows[x_var_names]
    data_test_scaled = scaler.transform(imputer.transform(data_test))

    return {
        'test_x': data_test_scaled,
        'test_y': test_rows[y_var_name],
        'train_x': data_tr_scaled,
        'train_y': training_rows[y_var_name],
    }
def VisualizeModelLearning(X, y):
    """Plot learning curves for decision-tree regressors of several depths.

    For each depth a learning curve (r2 scoring) is computed over ten
    shuffled 80/20 splits and drawn in its own subplot of a 3x2 grid, with a
    band of one standard deviation around the mean scores. Progress and mean
    test scores are printed along the way.

    :param X: feature matrix; ``X.shape[0]`` is used as the sample count.
    :param y: targets passed to ``curves.learning_curve``.
    """
    n_samples = X.shape[0]

    # Ten cross-validation sets for training and testing.
    cv = ShuffleSplit(n_samples, n_iter=10, test_size=.2, random_state=0)
    print("ShuffleSplit sets: {}".format(cv))

    # Nine training-set sizes spread evenly over the available range.
    size_grid = np.linspace(1, n_samples * .8 - 1, 9)
    train_sizes = np.rint(size_grid).astype(int)
    print("Visualize training set sizes: {}".format(train_sizes))

    fig = pl.figure(figsize=(10, 8))

    # One model (and one subplot) per max_depth value.
    for k, depth in enumerate([1, 3, 4, 5, 6, 10]):
        regressor = DecisionTreeRegressor(max_depth=depth)

        print("Evaluating depth {}".format(depth))
        sizes, train_scores, test_scores = curves.learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Mean and standard deviation across the splits, used for the bands.
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        series = (
            (train_mean, train_std, 'r', 'Training Score'),
            (test_mean, test_std, 'g', 'Testing Score'),
        )

        ax = fig.add_subplot(3, 2, k + 1)
        # Lines first, then bands, keeping the original artist order.
        for centre, _, colour, caption in series:
            ax.plot(sizes, centre, 'o-', color=colour, label=caption)
        for centre, spread, colour, _ in series:
            ax.fill_between(sizes, centre - spread, centre + spread,
                            alpha=.15, color=colour)
        print("Results for depth {}: {}".format(depth, test_mean))

        # Labels
        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, n_samples * 0.8])
        ax.set_ylim([-.05, 1.05])

    # Aesthetics: the legend is attached to the last subplot drawn.
    ax.legend(loc='best')
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()
def ModelLearning(X, y):
    """Plot learning curves for decision-tree regressors at four depths.

    Learning curves (r2 scoring) are computed over ten shuffled 80/20 splits
    for ``max_depth`` 1, 3, 6 and 10 and drawn in a 2x2 grid, each with a
    band of one standard deviation around the mean scores.
    """
    total = X.shape[0]

    # Ten cross-validation sets for training and testing.
    cv = ShuffleSplit(total, n_iter=10, test_size=0.2, random_state=0)

    # Nine training-set sizes spread evenly over the available range.
    train_sizes = np.rint(np.linspace(1, total * 0.8 - 1, 9)).astype(int)

    fig = pl.figure(figsize=(10, 7))

    for k, depth in enumerate([1, 3, 6, 10]):
        regressor = DecisionTreeRegressor(max_depth=depth)

        sizes, train_scores, test_scores = curves.learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Mean and standard deviation across splits, used for the bands.
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        curves_to_draw = (
            (train_mean, train_std, 'r', 'Training Score'),
            (test_mean, test_std, 'g', 'Testing Score'),
        )

        ax = fig.add_subplot(2, 2, k + 1)
        # Lines first, then bands, keeping the original artist order.
        for centre, _, colour, caption in curves_to_draw:
            ax.plot(sizes, centre, 'o-', color=colour, label=caption)
        for centre, spread, colour, _ in curves_to_draw:
            ax.fill_between(sizes, centre - spread, centre + spread,
                            alpha=0.15, color=colour)

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, total * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics: one legend, anchored relative to the last subplot.
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left',
              borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` over ten shuffled 70/30 splits and print a summary.

    For every split the classifier is refitted, train/test accuracy is
    recorded and the area under the precision-recall curve is computed from
    column 1 of ``predict_proba``. The printed line holds mean and standard
    deviation of test accuracy and of the PR-AUC.

    :param clf: classifier with ``fit``, ``score`` and ``predict_proba``.
    :param X: samples, indexable with an index array.
    :param Y: labels, indexable with an index array.
    :param name: label forwarded to ``plot_pr`` when ``plot`` is true.
    :param plot: when true, plot the PR curve of the median-scoring split.
    :returns: tuple ``(mean train error, mean test error)``.
    """
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls = [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an int on Python 3, where "/" on
        # ints yields a float that cannot be used as an index.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]
        # `phase` is not defined in this function; it is resolved from the
        # enclosing scope at call time.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def toNumpyBalanced():
    """Load ``default_train_file`` and return a class-balanced training set.

    A 5640-sample test set is split off with a fixed seed. The training part
    is then subsampled so that each of the classes 0, 1, 2 and 3 contributes
    the same number of rows (the size of the rarest class present), taking
    the first rows of each class.

    :returns: tuple ``(X_train, y_train, X_test, y_test)``; ``y_train`` is
        rebuilt as a float array of the class labels in class order.
    """
    from collections import Counter

    X, y = load_svmlight_file(default_train_file)
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=5640, random_state=1)
    # next(iter(...)) runs on Python 2 and 3; .next() is Python 2-only.
    train_index, test_index = next(iter(ss))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Subsample the training set to make the number of instances for each
    # class equal.
    class_labels = (0, 1, 2, 3)
    sample_num = min(Counter(y_train).values())
    per_class = []
    for label in class_labels:
        rows = X_train[np.where(y_train == label)[0], :]
        per_class.append(rows[:sample_num, :])
    XX = sparse.vstack(per_class)
    # Fails when one of the four classes has fewer rows than expected,
    # e.g. when a class is absent from the training part.
    assert(XX.shape[0] == sample_num * 4)
    X_train = XX
    y_train = np.repeat(np.array(class_labels, dtype=float), sample_num)

    # Single-argument print(...) with %-formatting emits the same text on
    # Python 2 and 3.
    print('loaded snippet dataset')
    print('entire training set size %s' % (y_train.size,))
    print('test set size %s' % (y_test.size,))
    return X_train, y_train, X_test, y_test
def prepare_evaluation_splits(tasks_dir, to_dir, folds = 3, test_part = 0.3):
    """Copy the files of ``tasks_dir`` into per-fold train/test directories.

    For each of ``folds`` shuffled splits, the files are copied to
    ``to_dir/<fold>/TRAIN_DIR`` and ``to_dir/<fold>/TEST_DIR``.

    :param tasks_dir: directory whose regular files are split.
    :param to_dir: root directory receiving the fold directories.
    :param folds: number of shuffled splits to generate.
    :param test_part: forwarded to ``ShuffleSplit`` as ``test_size``.
    """
    file_names = [entry for entry in os.listdir(tasks_dir)
                  if os.path.isfile(os.path.join(tasks_dir, entry))]
    all_task_fnames = numpy.array(file_names)

    splitter = ShuffleSplit(len(all_task_fnames), n_iter=folds,
                            test_size=test_part)
    for fold_i, (train_idx, test_idx) in enumerate(splitter):
        fold_name = str(fold_i)
        # Train directory is populated before the test directory.
        for subdir, chosen in ((TRAIN_DIR, train_idx), (TEST_DIR, test_idx)):
            destination = os.path.join(to_dir, fold_name, subdir)
            ensure_dir_exists(destination)
            copy_files(tasks_dir, all_task_fnames[chosen], destination)
def univariate_test(x, y, model, names, score_type):
    """Score each column of ``x`` on its own and min-max normalise the scores.

    Every single-column model is cross-validated over three shuffled 70/30
    splits; the mean score is rounded to three decimals.

    :param x: data convertible with ``np.matrix``; one feature per column.
    :param y: targets passed to ``cross_val_score``.
    :param model: estimator passed to ``cross_val_score``.
    :param names: feature names, paired positionally with the scores.
    :param score_type: forwarded to ``cross_val_score`` as ``scoring``.
    :returns: list of ``(normalised score, name)`` pairs.
    """
    X = np.matrix(x)
    per_feature = []
    for col in range(X.shape[1]):
        single_column = X[:, col:col + 1]
        fold_scores = cross_val_score(model, single_column, y,
                                      scoring=score_type,
                                      cv=ShuffleSplit(len(X), 3, .3))
        per_feature.append(round(np.mean(fold_scores), 3))

    lowest = min(per_feature)
    spread = max(per_feature) - lowest
    normalised = (np.array(per_feature) - lowest) / spread
    return list(zip(normalised, names))
def fit(self, X, y):
    """Fit ``self.n_iter`` copies of ``self.clf`` on random subsamples.

    Each copy is trained on the "test" indices of one ``ShuffleSplit``
    iteration, whose size comes from
    ``flexible_int(n_samples, self.sample_size)``. The fitted copies are
    stored in ``self.clfs_``.

    :param X: 2-D data; converted with ``np.array``, so array-likes such as
        nested lists are accepted.
    :param y: 1-D array of targets (must expose ``shape``).
    :returns: ``self``.
    """
    train = np.array(X)
    assert len(train.shape) == 2
    assert len(y.shape) == 1
    # Take the sample count from the converted array: X itself may be a
    # plain list without a ``shape`` attribute.
    n_samples = train.shape[0]
    ss = ShuffleSplit(n=n_samples, n_iter=self.n_iter,
                      random_state=self.random_state,
                      test_size=flexible_int(n_samples, self.sample_size))
    self.clfs_ = []
    # Only the second index set of each split is used as the subsample.
    for _, indices in ss:
        tmp_clf = deepcopy(self.clf)
        tmp_clf.fit(train[indices], y[indices])
        self.clfs_.append(tmp_clf)
    return self
def binary_cbf(oversampling=(0, 0)):
    """Train and score an extra-trees model on the ``r_samples`` table.

    Features and labels are read from the SQLite database at ``DB_PATH``,
    optionally oversampled, and evaluated over five shuffled 80/20 splits;
    results are accumulated in a ``RecScorer`` and timings are printed.

    :param oversampling: Tuple(Int), double review samples with star classes
        in range
    :return: None
    """
    feature_columns = [
        'brcnt', 'bstar', 'checkins', 'compliments', 'fans',
        'rdate', 'urcnt', 'ustar', 'uvotes', 'ysince',
    ]

    t = time()
    with sqlite3.connect(DB_PATH) as conn:
        label_reformer = FeatureReformer(conn, 'r_samples', ['rstar'])
        y = label_reformer.transform('y2').transpose()[0]
        X = FeatureReformer(conn, 'r_samples', feature_columns).transform()

    # oversampling
    ovsp = over_sampling(y, oversampling)
    y = y[ovsp]
    X = X[ovsp]

    n_samples, _ = X.shape
    print(X.shape)
    print('Done with collecting & reforming data from database, using ',
          time() - t, 's')

    t = time()
    rec_scorer = RecScorer(n_class=2)
    div = ShuffleSplit(n_samples, n_iter=5, test_size=0.2, random_state=0)
    model = ExtraTreesClassifier(n_estimators=5)

    for train, test in div:
        train_idx = np.array(train)
        test_idx = np.array(test)
        model.fit(X[train_idx], y[train_idx])
        y_test = y[test_idx]
        y_pred = model.predict(X[test_idx])
        # Metrics below
        rec_scorer.record(y_true=y_test, y_pred=y_pred)
        print(time() - t, 's used >>\n')

    print('Done with 5-fold training & cross validating, using ',
          time() - t, 's')
    rec_scorer.finalScores()
def _resolve_predictor(clfname):
    """Return the prediction function registered under ``clfname``."""
    if clfname == 'Logistic Regression':
        return logistic_regression_pred
    if clfname == 'SVM':
        return svm_pred
    if clfname == 'Decision Tree':
        return decisionTree_pred
    if clfname == 'SGDClassifier':
        return SGDClassifier_pred
    if clfname == 'adaboost':
        return adaboost
    if clfname == 'LogisticRegressionCV':
        return LRCV
    if clfname == 'SVC':
        return dosvc
    if clfname == 'RFC':
        return RFC
    if clfname == 'GBC':
        return GBC
    raise ValueError("unknown classifier name: %r" % (clfname,))


def get_acc_auc_randomisedCV(clfname,X,Y,iterNo=5,test_percent=0.2):
    """Average five classification metrics over shuffled train/test splits.

    :param clfname: one of 'Logistic Regression', 'SVM', 'Decision Tree',
        'SGDClassifier', 'adaboost', 'LogisticRegressionCV', 'SVC', 'RFC',
        'GBC'.
    :param X: samples, indexable with an index array.
    :param Y: labels, indexable with an index array.
    :param iterNo: number of shuffled splits.
    :param test_percent: forwarded to ``ShuffleSplit`` as the test size.
    :returns: tuple of the mean accuracy, AUC, precision, recall and F1
        score, in that order.
    :raises ValueError: if ``clfname`` is not a known classifier name.
    """
    # Resolve once, up front: an unknown name used to fall through with
    # Y_pred = 0 and produce meaningless metrics.
    predict = _resolve_predictor(clfname)

    acc = []
    auc_ = []
    precision = []
    recall = []
    f1score = []

    rs = ShuffleSplit(len(Y), iterNo, test_percent)
    for train_index, test_index in rs:
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        Y_pred = predict(X_train, Y_train, X_test)
        # classification_metrics takes the predictions first.
        (accvalue, auc_value, precisionvalue,
         recallvalue, f1scorevalue) = classification_metrics(Y_pred, Y_test)
        acc.append(accvalue)
        auc_.append(auc_value)
        precision.append(precisionvalue)
        recall.append(recallvalue)
        f1score.append(f1scorevalue)

    return (mean(acc), mean(auc_), mean(precision), mean(recall),
            mean(f1score))
def svc(self, knl='linear'):
    """Cross-validate a bag-of-words + SVC pipeline and return recall scores.

    :param knl: kernel name passed to ``SVC``.
    :returns: the result of ``cross_val_score`` with ``scoring='recall'``.
    """
    vectorizer = CountVectorizer(tokenizer=self.tokenize,
                                 stop_words=self.stop_words)
    classifier = SVC(kernel=knl)
    model = Pipeline([('vect', vectorizer), ('clf', classifier)])

    cv = ShuffleSplit(self.len_row, random_state=self.random_state)
    # Sample weights are routed to the 'clf' step of the pipeline.
    fit_params = {'clf__sample_weight': self.weight}
    return cross_val_score(model, self.X, self.y, scoring='recall', cv=cv,
                           fit_params=fit_params)
def train_classifierGS(clf, X_train, y_train, params=None):
    """Grid-search ``params`` for ``clf`` over two shuffled 80/20 splits.

    :param clf: estimator to tune.
    :param X_train: training samples; ``shape[0]`` is the sample count.
    :param y_train: training targets.
    :param params: parameter grid forwarded to ``GridSearchCV``.
    :returns: tuple ``(best_params_, best_estimator_)`` of the fitted search.
    """
    cv_iters = 2
    cv_sets = ShuffleSplit(X_train.shape[0], n_iter=cv_iters, test_size=0.20,
                           random_state=0)
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search object
    grid = GridSearchCV(clf, params, scoring=scoring_fnc, cv=cv_sets)
    grid.fit(X_train, y_train)

    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("best_params_ for the optimal model are: {}.".format(
        str(grid.best_params_)))
    return grid.best_params_, grid.best_estimator_
def test2():
    """Print iris features ranked by single-feature random-forest r2 score.

    Each feature column is cross-validated on its own over three shuffled
    70/30 splits; ``(rounded mean score, feature name)`` pairs are printed in
    descending order.
    """
    from sklearn.cross_validation import cross_val_score, ShuffleSplit

    X, Y, names = iris.data, iris.target, iris['feature_names']
    rf = RandomForestRegressor()

    ranked = []
    for col in range(X.shape[1]):
        column = X[:, col:col + 1]
        fold_scores = cross_val_score(rf, column, Y, scoring='r2',
                                      cv=ShuffleSplit(len(X), 3, .3))
        mean_score = round(np.mean(fold_scores), 3)
        ranked.append((mean_score, names[col]))
    ranked = sorted(ranked, reverse=True)
    print(ranked)
def evaluate(self,K,Y):
    """Return the mean score of ``self.estimator`` over the CV splits.

    ``K`` is indexed as ``K[rows][:, train]``, so the estimator is fitted on
    the train-by-train block and predicts from the test-by-train block.

    :param K: square matrix indexable as above — presumably a precomputed
        kernel; confirm against the caller.
    :param Y: labels, indexable with an index array.
    :returns: mean of the per-split scores.
    """
    n = len(K)
    f_score = __score_definition__()
    cv = self.cv
    # Identity test: "== None" may be overridden by the cv object (e.g. an
    # array-like compares element-wise).
    if cv is None:
        cv = ShuffleSplit(n, n_iter=1, test_size=.25)
    score = []
    for train,test in cv:
        clf = self.estimator.fit(K[train][:,train],Y[train])
        y_pred = clf.predict(K[test][:,train])
        score.append(f_score(Y[test],y_pred))
    return np.mean(score)
def score_models(column):
    """Score every candidate model for ``column`` and store the best per size.

    For each number of terms from 1 to 28 the pickled list of model codes is
    loaded, every model is cross-validated with a no-intercept linear
    regression, and the best mean score with its model code is inserted into
    the top-score database. Term counts already present in the database are
    skipped. No database of all possible models is kept.

    :param column: string of the form ``"<equipment> <data type>"``.
    """
    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()

    model_obj = LinearRegression(fit_intercept=False)

    target = Ys[column].dropna()
    Y = target.values
    sn_Y = target.index

    my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

    equip, d_type = column.split(' ')
    top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

    for number_of_terms in range(1, 29):
        if top_db.contains(Q.n_terms == number_of_terms):
            continue

        f_name = 'All_Poss_Mod_{}_Terms'.format(number_of_terms)
        f_obj = access_file(f_name, write=False)
        # NOTE(review): unpickling executes arbitrary code; only safe while
        # these files are produced by this project itself.
        try:
            mcodes = cPickle.load(f_obj)
        finally:
            # Close the file even when unpickling fails.
            f_obj.close()

        top_score = -10000.0
        # Reset per term count so a previous iteration's winner can never be
        # stored under the wrong number of terms.
        top_mcode = None

        for mcode in mcodes:
            # Generate X for certain model and Y
            X = gen_X(sn_Y, all_full_input, mcode)
            scores = cross_val_score(model_obj, X, Y, cv=my_cv)
            score = mean(scores)
            top_score = max(score, top_score)
            if top_score == score:
                top_mcode = list(mcode)

        if top_mcode is None:
            # No model codes, or none reached the initial floor: store
            # nothing so this term count is retried on a later run.
            continue

        entry = {
            'equipment_name': equip,
            'data_type': d_type,
            'n_terms': number_of_terms,
            'top_score': top_score,
            'top_mcode': top_mcode,
        }
        top_db.insert(entry)
def _do(matrix, test_ratio=0.0):
    """Train-and-validate or classify ``matrix`` with the ``clf`` chain.

    ``labels`` and ``clf`` are taken from the enclosing scope. ``clf`` is a
    sequence whose last element is the final estimator and whose preceding
    elements are transformers.

    :param matrix: sequence of samples.
    :param test_ratio: forwarded to ``ShuffleSplit`` as ``test_size``
        (learning mode only).
    :returns: in learning mode (``labels`` is truthy) a tuple
        ``(actual, predicted)`` for the validation part; otherwise an
        iterator over the predictions for ``matrix``.
    """
    if not labels:
        # Classification mode: transform, then predict with the last step.
        X = matrix
        for opr in clf[:-1]:
            X = opr.transform(X)
        return iter(clf[-1].predict(X))

    # Learning mode: take the last of the generated train/test splits.
    shuffle = ShuffleSplit(len(matrix), test_size=test_ratio)
    trainlist, testlist = list(shuffle)[-1]

    X_train = [matrix[i] for i in trainlist]
    Y_train = [labels[i] for i in trainlist]
    X_valid = [matrix[i] for i in testlist]
    Y_valid = [labels[i] for i in testlist]

    estimator = clf[-1]
    transformers = clf[:-1]

    # Display what the underlying classifier is
    print(colored(estimator, 'yellow'))

    # Display the dimension of the training elements
    print(colored('Trainset:', 'cyan'))
    print(colored('X: {0}'.format(np.shape(X_train)), 'yellow'))
    print(colored('y: {0}'.format(np.shape(Y_train)), 'yellow'))

    # Process trainset
    for opr in transformers:
        print(colored(opr, 'yellow'))
        X_train = opr.fit_transform(X_train, Y_train)

    # NOTE: The last operation of the CLF is always a clustering algo
    estimator.fit(X_train, Y_train)

    # Display the dimension of the validation elements
    print(colored('Validation set:', 'cyan'))
    print(colored('X: {0}'.format(np.shape(X_valid)), 'yellow'))
    print(colored('y: {0}'.format(np.shape(Y_valid)), 'yellow'))

    # Process validation set with the already-fitted transformers
    for opr in transformers:
        print(colored(opr, 'yellow'))
        X_valid = opr.transform(X_valid)

    # Tuple of [actual], [prediction] on the validation set
    return (Y_valid, estimator.predict(X_valid))
def __init__(self, dataset, n_iter=10, test_size=0.1, train_size=None,
             random_state=None, **kwargs):
    """Build a ``ShuffleSplit`` over ``dataset.X`` and hand it to the base class.

    :param dataset: object whose ``X.shape[0]`` gives the sample count.
    :param n_iter: number of shuffled splits.
    :param test_size: forwarded to ``ShuffleSplit``.
    :param train_size: forwarded to ``ShuffleSplit``.
    :param random_state: forwarded to ``ShuffleSplit``.
    :param kwargs: passed through to the parent initialiser.
    """
    n_samples = dataset.X.shape[0]
    splitter = ShuffleSplit(
        n_samples,
        n_iter=n_iter,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )
    super(DatasetShuffleSplit, self).__init__(dataset, splitter, **kwargs)
def fit_model(X, y):
    """Tune ``min_samples_split`` of a decision-tree classifier by grid search.

    Twelve integer candidates between 30 and 150 are scored on accuracy over
    ten shuffled 80/20 splits.

    :returns: the best estimator found by the search.
    """
    candidates = np.linspace(30, 150, 12).astype(int)
    search = GridSearchCV(
        DecisionTreeClassifier(),
        {'min_samples_split': list(candidates)},
        scoring='accuracy',
        cv=ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2,
                        random_state=0),
    )
    return search.fit(X, y).best_estimator_
def model_complexity(X, y):
    """Plot a validation curve of a decision-tree regressor over ``max_depth``.

    Training and validation r2 scores are computed for depths 1 to 10 over
    ten shuffled 80/20 splits and drawn with a band of one standard
    deviation around each mean.
    """
    # Ten cross-validation sets for training and testing.
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # max_depth values 1 through 10.
    max_depth = np.arange(1, 11)

    train_scores, test_scores = curves.validation_curve(
        DecisionTreeRegressor(), X, y, param_name="max_depth",
        param_range=max_depth, cv=cv, scoring='r2')

    # Mean and standard deviation across splits, used for the bands.
    series = (
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         'r', 'Training Score'),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
         'g', 'Validation Score'),
    )

    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    # Lines first, then bands, keeping the original artist order.
    for centre, _, colour, caption in series:
        pl.plot(max_depth, centre, 'o-', color=colour, label=caption)
    for centre, spread, colour, _ in series:
        pl.fill_between(max_depth, centre - spread, centre + spread,
                        alpha=0.15, color=colour)

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
def logistic(self, ngram=(1, 1)):
    """Cross-validate a bag-of-words + logistic-regression pipeline.

    :param ngram: ``ngram_range`` passed to ``CountVectorizer``.
    :returns: the result of ``cross_val_score`` with ``scoring='recall'``.
    """
    vectorizer = CountVectorizer(tokenizer=self.tokenize,
                                 stop_words=self.stop_words,
                                 ngram_range=ngram)
    model = Pipeline([('vect', vectorizer), ('clf', LogisticRegression())])

    cv = ShuffleSplit(self.len_row, random_state=self.random_state)
    # Sample weights are routed to the 'clf' step of the pipeline.
    fit_params = {'clf__sample_weight': self.weight}
    return cross_val_score(model, self.X, self.y, scoring='recall', cv=cv,
                           fit_params=fit_params)
def fit_model(X, y):
    """Grid-search ``max_depth`` (1-10) of a decision-tree regressor.

    Candidates are scored with ``performance_metric`` over ten shuffled
    80/20 splits of the data.

    :returns: the best estimator found by the search.
    """
    # Cross-validation sets drawn from the training data.
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20,
                           random_state=0)

    # Wrap 'performance_metric' as a scorer and search depths 1 to 10.
    search = GridSearchCV(
        DecisionTreeRegressor(),
        param_grid={'max_depth': list(range(1, 11))},
        scoring=make_scorer(performance_metric),
        cv=cv_sets,
    )

    # Fit the search on the data and hand back the optimal model.
    return search.fit(X, y).best_estimator_
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    """Return mean accuracy and mean AUC of logistic regression over splits.

    :param X: samples, indexable with an index array.
    :param Y: labels, indexable with an index array.
    :param iterNo: number of shuffled splits.
    :param test_percent: forwarded to ``ShuffleSplit`` as ``test_size``.
    :returns: tuple ``(mean accuracy, mean AUC)``.
    """
    splits = ShuffleSplit(len(Y), n_iter=iterNo, test_size=test_percent,
                          random_state=545510477)
    acc, auc = [], []
    for train_idx, test_idx in splits:
        predicted = models.logistic_regression_pred(
            X[train_idx], Y[train_idx], X[test_idx])
        expected = Y[test_idx]
        acc.append(accuracy_score(expected, predicted))
        auc.append(roc_auc_score(expected, predicted))
    return mean(acc), mean(auc)
def logistic_regression_classifier(train_x, train_y):
    """Grid-search a logistic regression and return the fitted search object.

    ``intercept_scaling`` in {1, 2, 3} and ``C`` in 1..19 are evaluated over
    ten shuffled 80/20 splits.

    :returns: the fitted ``GridSearchCV`` instance.
    """
    from sklearn.linear_model import LogisticRegression

    splitter = ShuffleSplit(len(train_x), n_iter=10, random_state=0,
                            test_size=0.2)
    search_space = {
        'intercept_scaling': [1, 2, 3],
        'C': list(range(1, 20)),
    }
    model = GridSearchCV(estimator=LogisticRegression(), cv=splitter,
                         param_grid=search_space)
    model.fit(train_x, train_y)
    return model
def splitFunc(target, optNum):
    """Return a cross-validation splitter chosen by ``optNum``.

    :param target: sized collection; only its length is used.
    :param optNum: a value above 1 selects shuffled k-fold with
        ``int(optNum)`` folds; a value below 1 selects a single shuffled
        split with ``optNum`` as the test size.
    :returns: a ``KFold`` or ``ShuffleSplit`` instance, or ``0`` after
        printing an error when ``optNum`` equals 1.
    """
    # do k-fold cross val
    if optNum > 1:
        return KFold(len(target), int(optNum), indices=False, shuffle=True)
    # do percent based train/test split
    if optNum < 1:
        return ShuffleSplit(n=len(target), n_iter=1, test_size=optNum,
                            indices=False)
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    # The 0 return value is kept for callers that test for it.
    print('Error, do not set opt num to 1!')
    return 0
def fit_model2(X, y):
    """Randomised search over ``max_depth`` (1-10) of a decision-tree regressor.

    Candidates are scored with ``performance_metric`` over ten shuffled
    80/20 splits.

    :returns: the best estimator found by the search.
    """
    search = RandomizedSearchCV(
        DecisionTreeRegressor(),
        {'max_depth': range(1, 11)},
        cv=ShuffleSplit(X.shape[0], n_iter=10, test_size=.20,
                        random_state=0),
        scoring=make_scorer(performance_metric),
    )
    fitted = search.fit(X, y)
    return fitted.best_estimator_
def RFcross_hq(X, y):
    """Score every feature column on its own with a random-forest regressor.

    Each column is cross-validated (r2) over three shuffled 70/30 splits.

    :param X: 2-D array of features; columns are scored one at a time.
    :param y: regression targets.
    :returns: list with the mean score of each column, rounded to three
        decimals, in column order.
    """
    ### RF cross validation =====================
    from sklearn.cross_validation import cross_val_score, ShuffleSplit
    from sklearn.ensemble import RandomForestRegressor
    from math import log

    n_rows, n_cols = X.shape[0], X.shape[1]
    # Forest size and depth derived from the data shape, with floors of
    # 100 trees and depth 5.
    forest = RandomForestRegressor(
        n_estimators=max(int(log(n_rows)) + 1, 100),
        max_depth=max(int(log(n_cols)) + 1, 5),
    )

    column_scores = []
    for col in range(n_cols):
        single_column = X[:, col:col + 1]
        fold_scores = cross_val_score(forest, single_column, y, scoring="r2",
                                      cv=ShuffleSplit(len(X), 3, .3))
        column_scores.append(round(np.mean(fold_scores), 3))
    return column_scores
def grid_search_model(clf_factory, X, Y, stop_words):
    """Grid-search vectoriser and classifier settings, scored by F1.

    :param clf_factory: callable taking ``stop_words`` and returning a
        pipeline with ``vect`` and ``clf`` steps.
    :param X: training samples.
    :param Y: training labels.
    :param stop_words: forwarded to ``clf_factory``.
    :returns: the best estimator found by the search.
    """
    on_off = [False, True]
    param_grid = {
        'vect__min_df': [1, 2],
        'vect__smooth_idf': on_off,
        'vect__use_idf': on_off,
        'vect__sublinear_tf': on_off,
        'vect__binary': on_off,
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }
    splitter = ShuffleSplit(n=len(X), test_size=0.3, random_state=0)

    grid_search = GridSearchCV(clf_factory(stop_words),
                               param_grid=param_grid,
                               cv=splitter,
                               scoring=make_scorer(f1_score),
                               verbose=1)
    grid_search.fit(X, Y)
    return grid_search.best_estimator_
def grid_search_model(clf_factory, X, y):
    """Grid-search vectoriser and classifier settings, scored by F1.

    :param clf_factory: zero-argument callable returning a pipeline with
        ``vect`` and ``clf`` steps.
    :param X: training samples.
    :param y: training labels.
    :returns: tuple ``(best_estimator_, best_score_, best_params_)``.
    """
    on_off = [True, False]
    param_grid = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__smooth_idf': [False, True],
        'vect__stop_words': [None, "english"],
        'vect__use_idf': on_off,
        'vect__sublinear_tf': on_off,
        'vect__binary': on_off,
        'clf__alpha': [0, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1],
    }
    splitter = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                            indices=True, random_state=0)

    search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=splitter,
                          score_func=f1_score, verbose=10)
    search.fit(X, y)
    return search.best_estimator_, search.best_score_, search.best_params_
def grow_single(args):
    """Fit a ``CC2`` estimator on a random split and wrap it via ``ac_factory``.

    :param args: tuple ``(rand_int, labeled_data, parent)``: the split seed,
        an ``(X, y)`` pair, and an object providing ``split_r`` (held-out
        size) and ``base_clf_class``.
    :returns: the estimator built by ``ac_factory``, or ``None`` when either
        part of the split holds a single class or ``ac_factory`` returns
        ``None``.
    """
    rand_int, labeled_data, parent = args
    X, y = labeled_data

    # In this portion of the script, we call the set used to train a
    # classifier 'train set', and the set used to estimate the
    # classification probability 'calib set'.
    ss = ShuffleSplit(y.size, n_iter=1, test_size=parent.split_r,
                      random_state=rand_int)
    # Built-in next() is portable; iterator.next() is Python 2-only.
    train_index, calib_index = next(iter(ss))
    train_set = X_train, y_train = X[train_index], y[train_index]
    calib_set = X_calib, y_calib = X[calib_index], y[calib_index]

    if len(set(y_train)) == 1 or len(set(y_calib)) == 1:
        # extreme case.
        return None

    prev_estimator = CC2(parent.base_clf_class)
    prev_estimator.fit(train_set)

    # ac_factory returns None when no estimator could be built; that None is
    # passed straight through to the caller.
    return ac_factory(prev_estimator, calib_set)
def exp23():
    """Train ``build_model2`` on a 60/40 split and print the held-out ROC AUC.

    Feature columns 170-247 are excluded from the 367 available columns. The
    model is fitted on two-column targets ``[1 - y, y]`` for seven epochs and
    scored on the held-out rows.
    """
    (x_enrollment_train, x_normal_enrollment_train, x_enrollment_test,
     x_normal_enrollment_test, y_train, enrollment_id_df,
     sample_weight_df) = load_data()

    skf = ShuffleSplit(y_train.shape[0], 1, 0.4)
    for train_index, test_index in skf:
        # Other exclusion ranges tried earlier:
        #   170-208 + 287-302, and 170-286 + 287-302 + 303-366.
        reject_features = list(range(170, 248))
        selected_features = list(set(range(367)) - set(reject_features))
        print('nb_feature:', len(selected_features))

        features = x_normal_enrollment_train[:, selected_features]
        fit_labels = y_train[train_index]
        y_train2 = np.vstack([1 - fit_labels, fit_labels]).T

        model = build_model2()
        model.fit(features[train_index], y_train2, nb_epoch=7)

        held_out = features[test_index]
        if hasattr(model, 'predict_proba'):
            predicts_cv = model.predict_proba(held_out)
        else:
            predicts_cv = model.decision_function(held_out)

        # Choose the column to rank by: column 1 of a two-column output,
        # column 0 of any other 2-D output, or the raw output otherwise.
        if len(predicts_cv.shape) == 2:
            column = 1 if predicts_cv.shape[1] == 2 else 0
            ranking = predicts_cv[:, column]
        else:
            ranking = predicts_cv
        roc = roc_auc_score(y_train[test_index], ranking)
        print('roc_auc_score of cv on test %f' % roc)
def train_model(clf_factory, X, Y, name):
    """Evaluate a fresh classifier on one shuffled 70/30 split.

    Prints mean and standard deviation of the test accuracy and of the
    per-label area under the precision-recall curve.

    :param clf_factory: zero-argument callable returning a classifier with
        ``fit``, ``score``, ``predict`` and ``predict_proba``.
    :param X: samples, indexable with an index array.
    :param Y: labels; each unique value is also used as a column index into
        ``predict_proba``, so labels are assumed to be 0..k-1 integers —
        TODO confirm.
    :param name: not used by this function; kept for interface compatibility.
    :returns: tuple ``(mean train error, mean test error, confusion
        matrices as an array)``.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_auc_values = []
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # The probabilities do not depend on the label, so compute them once
        # per split instead of once per label.
        proba = clf.predict_proba(X_test)
        for label in labels:
            # One-vs-rest ground truth for this label.
            y_label_test = np.asarray(y_test == label, dtype=int)
            precision, recall, _ = precision_recall_curve(
                y_label_test, proba[:, label])
            pr_auc_values.append(auc(recall, precision))

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_auc_values), np.std(pr_auc_values))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def fit_model(X, y):
    """Grid-search ``max_depth`` (1-10) of a decision-tree regressor.

    Candidates are scored with ``performance_metric`` over ten shuffled
    80/20 splits drawn with a fixed seed.

    :returns: the best estimator found by the search.
    """
    # Generate cross-validation sets for the training data.
    cv_sets = ShuffleSplit(
        X.shape[0],        # total number of elements
        n_iter=10,         # number of times to shuffle and split
        test_size=0.2,
        random_state=123,
    )

    depth_grid = dict(max_depth=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    scorer = make_scorer(performance_metric)
    # The scorer is passed positionally, exactly as before.
    grid = GridSearchCV(DecisionTreeRegressor(), depth_grid, scorer,
                        cv=cv_sets)

    # Find the best parameters of the estimator.
    grid = grid.fit(X, y)
    return grid.best_estimator_
def cross_validation(self):
    """Cross-validate a gradient-boosting regressor and store the scores.

    The institute latitude/longitude columns are removed first. Scores from
    three shuffled 70/30 splits, computed with ``self.rmse_scorer()``, are
    stored in ``self.test_scores``.
    """
    self.remove_columns(['institute_latitude', 'institute_longitude'])

    splitter = ShuffleSplit(self.X.shape[0], n_iter=3, test_size=0.3,
                            random_state=0)
    # n_jobs=1: poor machine
    self.test_scores = cross_val_score(
        GradientBoostingRegressor(),
        self.X,
        self.y,
        cv=splitter,
        scoring=self.rmse_scorer(),
        n_jobs=1,
    )
def split_arrays(self, n, test_fraction = 0.1 ):
    """Return index arrays for one shuffled train/test split of ``n`` items.

    :param n: number of items to split.
    :param test_fraction: forwarded to ``ShuffleSplit`` as ``test_size``.
    :returns: tuple ``(train_ix, test_ix)``.
    """
    # NOTE(review): ``n_iterations`` is the older spelling of ``n_iter``;
    # left untouched because the targeted scikit-learn version is not
    # visible here.
    shfSplt = ShuffleSplit(n=n, n_iterations=1, test_size=test_fraction)
    # next(iter(...)) is portable; iterator.next() is Python 2-only.
    train_ix, test_ix = next(iter(shfSplt))
    return train_ix, test_ix
from pybrain.structure.modules import SoftmaxLayer, TanhLayer
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError

from lib import dao, viz

# Load and preprocess the dataset.
#ds = dao.load_ads()
ds = dao.load_credit()
#ds.sanitize(strategy='impute_mean', scale=True)
ds.onehot()
ds.scale_zmuv()

X, y = ds.sample(class_balance=None)
#X, y = ds.select_features(technique='extra_trees')

# One shuffled 80/20 split with a fixed seed. The public iterator with the
# built-in next() replaces the private _iter_indices() and the Python 2-only
# .next() method.
ssp = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.2, random_state=5557)
train_idxs, test_idxs = next(iter(ssp))

# Drop the last index of any odd-length part so both parts have even length.
train_idxs = train_idxs[:-1] if len(train_idxs) % 2 != 0 else train_idxs
test_idxs = test_idxs[:-1] if len(test_idxs) % 2 != 0 else test_idxs

# Build the pybrain datasets sample by sample.
train = ClassificationDataSet(X.shape[1], 1, nb_classes=2,
                              class_labels=ds._class_names)
for i in train_idxs:
    train.addSample(X[i], [y[i]])

test = ClassificationDataSet(X.shape[1], 1, nb_classes=2,
                             class_labels=ds._class_names)
for i in test_idxs:
    test.addSample(X[i], [y[i]])

train._convertToOneOfMany()
test._convertToOneOfMany()