df_equal = pd.concat([df_equal, df_subset], axis=0)
species_key_df = df_all[['Species', 'Species_code']].drop_duplicates()

# Feature matrix and target column taken from the assembled frame.
X_columns = ['leaf length', 'leaf width', 'widest point', 'total veins']
y_columns = ['Species']
X = df_equal[X_columns].values
y = df_equal[y_columns].values

# k-nearest-neighbour settings; weights[0] selects uniform weighting.
n_neighbors = 10
weights = ['uniform', 'distance']
weight = weights[0]

# Ten random 90/10 train/test partitions (unseeded, so runs differ).
ss = ShuffleSplit(n_splits=10, test_size=0.1)
for train_index, test_index in ss.split(X):
    # Slice this partition out of the full arrays.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fit a fresh classifier; y is a single-column 2-D array, hence ravel().
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weight)
    clf.fit(X_train, y_train.ravel())

    # Predictions on the held-out rows.
    output = clf.predict(X_test)

    # Report the held-out accuracy for this partition.
    score = clf.score(X_test, y_test)
    print("Score: {:.2%}".format(score))
def FitModel(cnnc, A, Y, T, FN):
    """Fit ``cnnc`` on a random split, report accuracy, then refit on all data.

    Parameters
    ----------
    cnnc : model exposing ``fit``, ``predict`` and ``SetMaxIter``.
    A : array of inputs, indexed by row.
    Y : array of training targets aligned with ``A``.
    T : array of target strings aligned with ``A``.
    FN : iterable of per-row names, used only when printing samples.

    Returns
    -------
    The same ``cnnc`` object after the final fit on all rows.
    """
    print('Fitting model...')
    train_rows, test_rows = next(ShuffleSplit(n_splits=1).split(A))

    # First fit uses only the training part of the split.
    cnnc.fit(A[train_rows], Y[train_rows])

    # Predict every row, 32 index chunks at a time, and stack the results.
    row_chunks = np.array_split(np.arange(A.shape[0]), 32)
    predicted_rows = np.vstack([cnnc.predict(A[chunk]) for chunk in row_chunks])

    # Each prediction row is joined into a single string.
    predicted = np.array([''.join(row) for row in predicted_rows])

    # Accuracy on the train part and on the held-out part.
    train_acc = SAcc(predicted[train_rows], T[train_rows])
    test_acc = SAcc(predicted[test_rows], T[test_rows])
    print('Train: ' + str(train_acc))
    print('Test: ' + str(test_acc))

    # Print roughly 1% of the rows as a spot check.
    for guess, truth, row_name in zip(predicted, T, FN):
        if np.random.rand() <= 0.99:
            continue
        print(row_name + ': ' + truth + ' -> ' + guess)

    print('Fitting with CV data...')
    # Final fit on every row, with the iteration limit set to 4.
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
def main():
    """Extract (column 0, column ``idx``) pairs from a TSV and write CV folds.

    Rows of ``fname`` whose ``--key-idx`` column equals ``--key`` are kept.
    For each of ``--folds`` shuffle splits (80/20, seed 42) a directory
    ``res/ryan_splits/<lang>-10fold/<fold>`` is created holding
    ``train.uniq``, ``dev.uniq`` and ``test.uniq``; the 20% held-out part
    is dealt alternately into dev and test.
    """
    import argparse
    import os
    from io import open as uopen

    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('idx', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()

    # Close the input file as soon as it has been read.
    with uopen(args.fname, encoding='utf-8') as fh:
        lines = [x.strip().split(u'\t') for x in fh]
    to_extract = [(x[0], x[args.idx]) for x in lines
                  if x[args.key_idx] == args.key]

    if args.shuffle:
        from random import shuffle
        shuffle(to_extract)

    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        fold_dir = 'res/ryan_splits/{}-10fold/{}'.format(args.lang, i)
        # os.makedirs replaces distutils' mkpath (distutils is gone in 3.12);
        # the isdir guard keeps reruns from failing on an existing directory.
        if not os.path.isdir(fold_dir):
            os.makedirs(fold_dir)

        # The context manager closes (and flushes) all three files per fold.
        with uopen('{}/train.uniq'.format(fold_dir), mode='w',
                   encoding='utf-8') as train_fh, \
                uopen('{}/dev.uniq'.format(fold_dir), mode='w',
                      encoding='utf-8') as dev_fh, \
                uopen('{}/test.uniq'.format(fold_dir), mode='w',
                      encoding='utf-8') as test_fh:
            for idx in train_indices:
                train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0],
                                                  to_extract[idx][1]))
            for j, idx in enumerate(test_indices):
                # Even positions go to dev, odd positions to test.
                out_fh = dev_fh if j % 2 == 0 else test_fh
                out_fh.write(u'{}\t{}\n'.format(to_extract[idx][0],
                                                to_extract[idx][1]))
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` over 10 shuffle splits and report summary scores.

    Parameters
    ----------
    clf : classifier exposing ``fit``, ``score`` and ``predict_proba``.
        It is refitted in place on every split.
    X, Y : arrays indexable by an integer index array.
    name : label passed to ``plot_pr`` and ``log_false_positives``.
    plot : when true, plot the precision/recall curve of the split with the
        median PR-AUC and log that split's false positives.

    Returns
    -------
    (mean train error, mean test error) over the splits.

    Prints mean/std of the test scores and of the PR-AUC values.
    """
    from copy import deepcopy

    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    # Per-split (fitted classifier copy, X_test, y_test); only needed for
    # the median lookup when plotting.
    fold_snapshots = []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # clf is refitted in place on every split, so storing the object
            # itself would leave every entry pointing at the last fit.
            fold_snapshots.append((deepcopy(clf), X_test, y_test))

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        median = np.argsort(pr_scores)[len(pr_scores) // 2]
        # NOTE(review): `phase` is not defined in this function; it must be
        # provided by the enclosing module.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
        # Use the classifier and test data of the median split itself.
        median_clf, median_X_test, median_y_test = fold_snapshots[median]
        log_false_positives(median_clf, median_X_test, median_y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors)
def run():
    """Train one network per shuffle split and write one submission per split.

    A split is skipped when its submission file already exists; training is
    skipped when its checkpoint file already exists. After all splits have
    been processed, ``ensemble_predictions()`` is called.
    """
    # Load data set; the target is modelled as log(y + 200).
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Build the model once and remember its untrained weights.
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for fold, (fit_rows, valid_rows) in enumerate(splitter.split(X_train), start=1):
        print("Working on {}/{} ...".format(fold, CROSS_VALIDATION_NUM))

        optimal_weights_path = "/tmp/Optimal_Weights_{}.h5".format(fold)
        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(fold))

        # Nothing to do when this split's submission is already on disk.
        if os.path.isfile(submission_file_path):
            continue

        checkpoint_exists = os.path.isfile(optimal_weights_path)
        if not checkpoint_exists:
            # Restart from the untrained weights for every split.
            model.set_weights(vanilla_weights)

            training_callbacks = [
                EarlyStopping(monitor="val_actual_mae", patience=EARLYSTOPPING_PATIENCE),
                ModelCheckpoint(optimal_weights_path, monitor="val_loss", save_best_only=True),
            ]
            model.fit(
                X_train[fit_rows],
                Y_train[fit_rows],
                batch_size=TRAIN_BATCH_SIZE,
                nb_epoch=MAXIMUM_EPOCH_NUM,
                validation_data=(X_train[valid_rows], Y_train[valid_rows]),
                callbacks=training_callbacks,
                verbose=2,
            )

        # Restore the checkpointed weights before predicting.
        model.load_weights(optimal_weights_path)

        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)

        # Undo the log transform and save the submission.
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Combine the per-split submissions.
    ensemble_predictions()

    print("All done!")
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    """Evaluate freshly built classifiers over 10 shuffle splits.

    Parameters
    ----------
    clf_factory : zero-argument callable returning a new classifier that
        exposes ``fit``, ``score`` and ``predict_proba``.
    X, Y : arrays indexable by an integer index array.
    name : label passed to ``plot_pr``.
    plot : when true, plot the precision/recall curve of the split with the
        median PR-AUC.

    Returns
    -------
    (mean train error, mean test error) over the splits.

    Prints mean/std of the test scores and of the PR-AUC values.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

    train_errors, test_errors = [], []
    scores, pr_scores = [], []
    precisions, recalls, thresholds = [], [], []

    for fit_rows, held_out_rows in splitter.split(X):
        X_fit, y_fit = X[fit_rows], Y[fit_rows]
        X_held, y_held = X[held_out_rows], Y[held_out_rows]

        # A new classifier per split, so no state leaks between splits.
        model = clf_factory()
        model.fit(X_fit, y_fit)

        fit_accuracy = model.score(X_fit, y_fit)
        held_accuracy = model.score(X_held, y_held)
        train_errors.append(1 - fit_accuracy)
        test_errors.append(1 - held_accuracy)
        scores.append(held_accuracy)

        # Precision/recall curve from the second probability column.
        positive_proba = model.predict_proba(X_held)[:, 1]
        precision, recall, pr_thresholds = precision_recall_curve(
            y_held, positive_proba)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    # Index of the split whose PR-AUC sits in the middle of the ranking.
    ranking = np.argsort(pr_scores)
    median = ranking[int(len(pr_scores) / 2)]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def fit_models(imps, X, Y, all_props, props=None, labels=None, n_splits=5,
               clf_args={'n_estimators':25, 'max_features':'auto',
                         'random_state':0}):
    """Fit one random forest per property, imputation and shuffle split.

    Parameters
    ----------
    imps : iterable of keys into ``X`` and ``Y``.
    X, Y : mappings from key to 2-D array; both must contain ``'missing'``.
        Training data comes from ``X[imp]``/``Y[imp]``; test targets are
        always read from ``Y['missing']``.
    all_props : list of property names, one per column of ``Y``.
    props : subset of ``all_props`` to fit; defaults to all of them.
    labels : optional group labels; when given, ``GroupShuffleSplit`` is
        used instead of ``ShuffleSplit``.
    n_splits : number of shuffle splits.
    clf_args : keyword arguments for ``RandomForestClassifier``. A value
        that is a dict is looked up per property. (Never mutated here.)

    Returns
    -------
    rs : dict of masked arrays (n_props, n_splits), correlation between
        predicted and true test labels.
    feature_importances : dict of masked arrays (n_props, n_features,
        n_splits).
    ys : dict of masked arrays (n_props, n_splits, n_test), true labels.
    ps : dict of masked arrays (n_props, n_splits, n_test), second-column
        class probabilities (0.0 when the forest saw a single class).
    """
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_features = X['missing'].shape[1]  # Number of features.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2

    # The model_selection splitters take `n_splits`; `n_iter` belonged to
    # the old cross_validation API and is rejected by these classes.
    splitter_cls = ShuffleSplit if labels is None else GroupShuffleSplit
    shuffle_split = splitter_cls(n_splits=n_splits, test_size=test_size,
                                 random_state=0)
    # random_state is a fixed int, so every call to split() yields the same
    # partitions; generate them once and reuse them everywhere.
    splits = list(shuffle_split.split(range(n_obs), groups=labels))
    n_test_samples = max(len(test) for _, test in splits)

    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples))
          for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples))
          for imp in imps}
    feature_importances = {imp:np.ma.zeros((n_props,n_features,n_splits))
                           for imp in imps}

    for n_prop,prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k,(train,test) in enumerate(splits):
                X_train,X_test = X[imp][train],X[imp][test]
                Y_train,Y_test = Y[imp][train,j],Y['missing'][test,j]

                # Resolve per-property overrides.
                clf_args_ = {key:(value[prop] if isinstance(value, dict)
                                  else value)
                             for key,value in clf_args.items()}
                if clf_args_['max_features'] not in [None, 'auto']:
                    # Never ask for more features than exist.
                    clf_args_['max_features'] = min(X_train.shape[1],
                                                    clf_args_['max_features'])
                rfc = RandomForestClassifier(**clf_args_)
                rfc.fit(X_train,Y_train)
                Y_predict = rfc.predict(X_test)
                probs = rfc.predict_proba(X_test)

                if probs.shape[1]<2 and probs.mean()==1.0:
                    # Only one class was seen in training: there is no
                    # second probability column, record 0.0 instead.
                    n_filled = len(probs)
                    ps[imp][n_prop,k,:n_filled] = 0.0
                else:
                    n_filled = len(probs[:,1])
                    ps[imp][n_prop,k,:n_filled] = probs[:,1]
                ys[imp][n_prop,k,:n_filled] = Y_test
                rs[imp][n_prop,k] = np.ma.corrcoef(Y_predict,Y_test)[0,1]
                feature_importances[imp][n_prop,:,k] = rfc.feature_importances_
    return rs,feature_importances,ys,ps
def fit_models_mc(imps, X, Y, all_props, props=None, labels=None, n_splits=5,
                  clf_args={'n_estimators':25, 'max_features':'auto',
                            'random_state':0}):
    """Fit a one-vs-rest random forest on all target columns at once.

    Parameters
    ----------
    imps : iterable of keys into ``X`` and ``Y``.
    X, Y : mappings from key to 2-D array; both must contain ``'missing'``.
        Training data comes from ``X[imp]``/``Y[imp]``; test targets are
        always read from ``Y['missing']``.
    all_props : list of property names.
    props : properties to report on; defaults to ``all_props``. Only its
        length is used here (it sizes the result arrays).
    labels : optional group labels; when given, ``GroupShuffleSplit`` is
        used instead of ``ShuffleSplit``.
    n_splits : number of shuffle splits.
    clf_args : keyword arguments for ``RandomForestClassifier``.
        (Never mutated here.)

    Returns
    -------
    rs : dict of masked arrays (n_props, n_splits), per-column correlation
        between predicted and true test labels.
    feature_importances : always ``None`` in this variant.
    ys : dict of masked arrays (n_props, n_splits, n_test), true labels.
    ps : dict of masked arrays (n_props, n_splits, n_test), predicted
        probabilities.
    """
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0]  # Number of observations.
    n_props = len(props)  # Number of properties to predict.
    test_size = 0.2

    # The model_selection splitters take `n_splits` (not `n_iter`) and are
    # not iterable themselves; partitions come from .split().
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                                     random_state=0)
    else:
        # GroupShuffleSplit is the model_selection name of the former
        # LabelShuffleSplit; it is the class fit_models uses as well.
        from sklearn.model_selection import GroupShuffleSplit
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size,
                                          random_state=0)
    # random_state is a fixed int, so the partitions are identical on every
    # call to split(); generate them once.
    splits = list(shuffle_split.split(range(n_obs), groups=labels))
    n_test_samples = max(len(test) for _, test in splits)

    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples))
          for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples))
          for imp in imps}
    feature_importances = None

    for imp in imps:
        for k,(train,test) in enumerate(splits):
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]

            # NOTE(review): `prop` is not defined in this function, so a
            # dict-valued entry in clf_args only works if the enclosing
            # module defines `prop`; otherwise it raises NameError.
            clf_args_ = {key:(value if type(value) is not dict
                              else value[prop])
                         for key,value in clf_args.items()}
            if clf_args_['max_features'] not in [None, 'auto']:
                # Never ask for more features than exist.
                clf_args_['max_features'] = min(X_train.shape[1],
                                                clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)
            probs = onevsrest.predict_proba(X_test)

            if probs.shape[1]<2 and probs.mean()==1.0:
                n_filled = len(probs)
                ps[imp][:,k,:n_filled] = 0.0
            else:
                n_filled = len(probs[:,1])
                ps[imp][:,k,:n_filled] = probs.T
            ys[imp][:,k,:n_filled] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],
                                              Y_test[:,i])[0,1]
    return rs,feature_importances,ys,ps
def run():
    """Train one XGBoost regressor per shuffle split and write a submission.

    A split is skipped when its submission file already exists. After all
    splits have been processed, ``ensemble_predictions()`` is called.
    """
    # Load data set; the target is modelled as log(y + 200).
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    def actual_mae(y_predicted, y_true):
        # Evaluation metric on exponentiated values; y_true exposes its
        # labels through get_label().
        error = mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))
        return ("actual_mae", error)

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for fold, (fit_rows, valid_rows) in enumerate(splitter.split(X_train), start=1):
        print("Working on {}/{} ...".format(fold, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(fold))

        # Nothing to do when this split's submission is already on disk.
        if os.path.isfile(submission_file_path):
            continue

        # The split number doubles as the model seed.
        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=fold,
            nthread=-1,
        )
        validation_pair = (X_train[valid_rows], Y_train[valid_rows])
        model.fit(
            X_train[fit_rows],
            Y_train[fit_rows],
            eval_set=[validation_pair],
            eval_metric=actual_mae,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=True,
        )

        Y_test = model.predict(X_test)

        # Undo the log transform and save the submission.
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Combine the per-split submissions.
    ensemble_predictions()

    print("All done!")
def TestPerformance(self, df = None):
    """Print the score of ``self.R`` on all rows and on one random split.

    Parameters
    ----------
    df : optional data frame. When omitted, ``self.D`` is used; otherwise a
        copy of ``df`` is passed through ``self.S.transform``.

    Three scores are printed: all rows ('C-V'), the held-out part of a
    single shuffle split ('Tst') and its training part ('Trn').
    """
    # Pick the stored frame or transform a copy of the supplied one.
    D = self.D if df is None else self.S.transform(df.copy())

    # Feature matrix and target values (column names are not needed here).
    A = self._ExtractFeat(D)
    y, _ = self._ExtractTarg(D)

    # One random partition of the rows.
    trn, tst = next(ShuffleSplit(n_splits = 1).split(A))
    score_all = self.R.score(A, y)
    score_test = self.R.score(A[tst], y[tst])
    score_train = self.R.score(A[trn], y[trn])
    print('C-V:\t' + str(score_all) + '\nTst:\t' + str(score_test)
          + '\nTrn:\t' + str(score_train))
def test_safe_split_with_precomputed_kernel():
    """``_safe_split`` must slice a precomputed kernel consistently.

    Splitting the kernel matrix ``K = X X^T`` with a precomputed-kernel SVC
    has to give the same sub-kernel as computing it from the split ``X``,
    and the targets returned for both estimators have to agree.
    """
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    # `iris` is read from the enclosing module.
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = next(cv.split(X))

    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))
    # The targets must not depend on which estimator did the split.
    assert_array_almost_equal(y_tr, y_tr2)

    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
    assert_array_almost_equal(y_te, y_te2)
def run():
    """Train one GBMRegressor per shuffle split and write a submission each.

    A split is skipped when its submission file already exists. After all
    splits have been processed, ``ensemble_predictions()`` is called.
    """
    # Load data set; the target is modelled as log(y + 200).
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for fold, (fit_rows, valid_rows) in enumerate(splitter.split(X_train), start=1):
        print("Working on {}/{} ...".format(fold, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(fold))

        # Nothing to do when this split's submission is already on disk.
        if os.path.isfile(submission_file_path):
            continue

        # The split number doubles as the feature/bagging seed.
        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=fold,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=fold,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1,
        )
        validation_pair = (X_train[valid_rows], Y_train[valid_rows])
        model.fit(X_train[fit_rows], Y_train[fit_rows], test_data=[validation_pair])

        Y_test = model.predict(X_test)

        # Undo the log transform and save the submission.
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Combine the per-split submissions.
    ensemble_predictions()

    print("All done!")
def plot_shuffle_split():
    """Draw which points each ShuffleSplit iteration trains and tests on.

    Ten data points are split four times with ``train_size=5`` and
    ``test_size=2``. Each iteration is one row of boxes: test points are
    grey, and points that were not selected have no hatch.
    """
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1

    ss = ShuffleSplit(n_splits=n_iter, train_size=5, test_size=2,
                      random_state=43)
    # mask[i, j]: 0 = not selected, 1 = train, 2 = test.
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(n_samples))):
        mask[i, train] = 1
        mask[i, test] = 2

    # One column of boxes per data point.
    for i in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # The bar positions are passed positionally: the first parameter of
        # barh was renamed from `bottom` to `y`, and the old keyword is
        # rejected by current matplotlib.
        boxes = axes.barh(range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6,
                          color=colors, hatch="//", edgecolor='k',
                          align='edge')
        # not selected has no hatch
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state: the handles come from the last
    # column of boxes drawn above.
    plt.legend([boxes[1], boxes[0], boxes[2]],
               ["Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
def test_safe_split_with_precomputed_kernel():
    """``_safe_split`` must slice a precomputed kernel consistently.

    The sub-kernel obtained by splitting ``K = X X^T`` has to match the one
    computed from the split ``X``, and both estimators must get the same
    targets, for the train part as well as for the test part.
    """
    plain_svc = SVC()
    kernel_svc = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    # Only the first partition of the splitter is needed.
    splitter = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = next(splitter.split(X))

    # Train part: kernel between training rows.
    X_train, y_train = _safe_split(plain_svc, X, y, train)
    K_train, y_train_from_kernel = _safe_split(kernel_svc, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train_from_kernel)

    # Test part: kernel between test rows and training rows.
    X_test, y_test = _safe_split(plain_svc, X, y, test, train)
    K_test, y_test_from_kernel = _safe_split(kernel_svc, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test_from_kernel)
datetime.datetime.now().strftime('%Y-%m-%d_%H' '-%M-%S'))) os.makedirs(output_dir) random_state = check_random_state(0) mem = Memory(cachedir=expanduser("~/cache"), verbose=10) X_csr = mem.cache(fetch_ml_10m)(expanduser('~/data/own/ml-10M100K'), remove_empty=True) permutation = random_state.permutation(X_csr.shape[0]) X_csr = X_csr[permutation] X, y = array_to_fm_format(X_csr) uniform_split = ShuffleSplit(n_iter=4, test_size=.25, random_state=random_state) fm_decoder = FMDecoder(n_samples=X_csr.shape[0], n_features=X_csr.shape[1]) base_estimator = BaseRecommender(fm_decoder) convex_fm = ConvexFM(fit_linear=True, alpha=0, max_rank=20, beta=1, verbose=100) soft_imputer = SoftImputer(fm_decoder, alpha=.001, n_components=10, max_iter=100, random_state=None) dl_rec = DLRecommender(fm_decoder, n_components=50, batch_size=10,
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups. In that case
        the train/test sizes are computed over the number of unique groups,
        not over the number of samples.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no array is given, if ``labels`` is None while shuffle is
        'group' or 'stratified', or if ``shuffle`` is not a supported value.
    TypeError
        If an unknown keyword option is passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    # Anything left over is a keyword this function does not know.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        # Group splitting partitions groups, so sizes are counted in groups.
        uniques = np.unique(labels)
        n_samples = uniques.size
    elif shuffle == 'stratified' and labels is None:
        # Fail early with a clear message, mirroring the 'group' check,
        # instead of an obscure error from inside the splitter.
        raise ValueError("When shuffle='stratified', "
                         "labels should not be None!")

    n_train, n_test = _validate_shuffle_split(n_samples, test_size,
                                              train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        # No shuffling: leading rows train, the following rows test.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    # Interleave train/test pieces: [a_train, a_test, b_train, b_test, ...]
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))
from sklearn import metrics

# Per-class scores first (average=None), then their macro averages.
for _label, _scorer, _average in (
        ('Precision: \t{}', metrics.precision_score, None),
        ('Recall: \t{}', metrics.recall_score, None),
        ('F1: \t\t{}', metrics.f1_score, None),
        ('Macro Precision: \t{}', metrics.precision_score, 'macro'),
        ('Macro Recall: \t\t{}', metrics.recall_score, 'macro'),
        ('Macro F1: \t\t{}', metrics.f1_score, 'macro')):
    print(_label.format(_scorer(y_test, predicted, average=_average)))

# Shuffle-split cross-validation
print('-------------------------------- Shuffle Split ---------------------------------')
total_score = 0
runs = 0

from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
for train, test in ss.split(tweets, target):
    # Slice this partition out of the corpus and the targets.
    _tweet_array = np.array(tweets)
    X_train, y_train = _tweet_array[train], target[train]
    X_test, y_test = np.array(tweets)[test], target[test]

    # Bag of uni-/bi-grams -> L1-normalised term frequencies -> extra trees.
    pipeline = Pipeline([
        ('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(norm='l1', use_idf=False)),
        ('clf', ExtraTreesClassifier(random_state=0, n_estimators=10,
                                     class_weight='auto')),
    ])
    pipeline = pipeline.fit(X_train, y_train)
    predicted = pipeline.predict(X_test)

    print('Accuracy: {}'.format(accuracy_score(y_test, predicted)))
    print(metrics.classification_report(y_test, predicted))
# Read epochs (training uses only the 1 s - 2 s window).
# Testing will be done with a running classifier.
epochs = Epochs(raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                baseline=None, preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

###############################################################################
# Classification with linear discriminant analysis

# Monte-Carlo cross-validation generator (reduces variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Classifier pieces: CSP feature extraction followed by LDA.
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Chain them in a scikit-learn Pipeline and score with cross_val_score.
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

# Chance level is the share of the more frequent label.
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f"
      % (np.mean(scores), class_balance))
from sklearn.model_selection import ShuffleSplit
import numpy as np

# Four two-feature samples with alternating targets.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])

# Three random partitions, each holding out 25% of the rows.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# The next expression builds an equivalent splitter and discards it.
ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)

for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

# Same again, but with an explicit 50% train share as well.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)
for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)
@pandaize def cross_val_score_pd(estimator, X, y, **kwargs): return model_selection.cross_val_score(estimator, X, y, **kwargs) ## ----------------------------------------------------------------- ## load Patel data ## ----------------------------------------------------------------- def readTab(file): return pd.read_csv(file, sep="\t", header=0, index_col=0) x = readTab("rnaseq/GSE57872/GSE57872_DataMatrixMapped.tsv.gz").transpose() y = x.BRCA1 x0 = x[ x.columns[x.columns != "BRCA1"] ] cvSched = ShuffleSplit(n_splits=10, test_size=0.1, random_state=123) corPVals = colcor(x0, y)['p'] corQVals = bhfdr(corPVals) corQVals.sort_values(inplace=False).head() plt.close() ax = plt.subplot(111) x.plot.scatter(x="CDK1", y="BRCA1", ax=ax) ## ----------------------------------------------------------------- ## unregularized linear regression ## ----------------------------------------------------------------- nFeats = [2, 5, 10, 20, 50, 100, 200, 500, 1000] brca1Modelers = OrderedDict([
def train(working, max_samples, duration, rate, batch_size, epochs,
          epoch_size, validation_size, early_stopping, reduce_lr, seed):
    '''Build the model, train it from generators and checkpoint the best one.

    The pump is unpickled from ``OUTPUT_PATH/pump.pkl``, the index in
    ``index_train.json`` is split 75/25 into train and validation rows, the
    serialized model spec goes to ``OUTPUT_PATH/model_spec.pkl`` and the best
    weights (by ``val_loss``) to ``OUTPUT_PATH/model.h5``.

    Parameters
    ----------
    working : str
        directory that contains the experiment data (h5)

    max_samples : int
        Maximum number of samples per streamer

    duration : float
        Duration of training patches

    rate : int
        Poisson rate for pescador

    batch_size : int
        Size of batches

    epochs : int
        Maximum number of epochs

    epoch_size : int
        Number of batches per epoch

    validation_size : int
        Number of validation batches

    early_stopping : int
        Number of epochs before early stopping

    reduce_lr : int
        Number of epochs before reducing learning rate

    seed : int
        Random seed
    '''
    # Load the pump
    pump_path = os.path.join(OUTPUT_PATH, 'pump.pkl')
    with open(pump_path, 'rb') as fd:
        pump = pickle.load(fd)

    # Sampler and model are both derived from the pump
    sampler = make_sampler(max_samples, duration, pump, seed)
    model, inputs, outputs = construct_model(pump)

    # Load the training index and carve out a validation part
    idx_train_ = pd.read_json('index_train.json')
    splitter_tv = ShuffleSplit(n_splits=1, test_size=0.25,
                               random_state=seed)
    train_rows, val_rows = next(splitter_tv.split(idx_train_))
    idx_train = idx_train_.iloc[train_rows]
    idx_val = idx_train_.iloc[val_rows]

    # Training stream: augmented, Poisson-rate `rate`
    train_stream = data_generator(working,
                                  idx_train['id'].values, sampler, epoch_size,
                                  augment=True,
                                  lam=rate,
                                  batch_size=batch_size,
                                  revive=True,
                                  random_state=seed)
    gen_train = keras_tuples(train_stream(), inputs=inputs, outputs=outputs)

    # Validation stream: not augmented, one streamer slot per validation row
    val_stream = data_generator(working,
                                idx_val['id'].values, sampler, len(idx_val),
                                augment=False,
                                batch_size=batch_size,
                                revive=True,
                                random_state=seed)
    gen_val = keras_tuples(val_stream(), inputs=inputs, outputs=outputs)

    # Both outputs share the same loss and metric
    loss = {'beat': 'binary_crossentropy',
            'downbeat': 'binary_crossentropy'}
    metrics = {'beat': 'accuracy', 'downbeat': 'accuracy'}
    monitor = 'val_loss'

    model.compile(K.optimizers.Adam(), loss=loss, metrics=metrics)

    # Store the model spec next to the weights
    model_spec = K.utils.serialize_keras_object(model)
    spec_path = os.path.join(OUTPUT_PATH, 'model_spec.pkl')
    with open(spec_path, 'wb') as fd:
        pickle.dump(model_spec, fd)

    weight_path = os.path.join(OUTPUT_PATH, 'model.h5')

    # Callbacks: checkpoint best weights, decay the learning rate, stop early
    cb = [
        K.callbacks.ModelCheckpoint(weight_path,
                                    save_best_only=True,
                                    verbose=1,
                                    monitor=monitor),
        K.callbacks.ReduceLROnPlateau(patience=reduce_lr,
                                      verbose=1,
                                      monitor=monitor),
        K.callbacks.EarlyStopping(patience=early_stopping,
                                  verbose=1,
                                  monitor=monitor),
    ]

    # Fit the model
    model.fit_generator(gen_train, epoch_size, epochs,
                        validation_data=gen_val,
                        validation_steps=validation_size,
                        callbacks=cb)
optimal.predict(data)

# k-fold validation
# k-fold is a type of cross-validation where the data are divided into k
# bins. For each experiment, one of the k bins is the test set and the
# remaining k-1 bins are the training set. Run k separate experiments and
# average all k test results.
# This technique tests different parts of the data to prevent overfitting,
# i.e. it prevents grid search from returning a parameter set that is
# optimized for one specific training set but not overall.
from sklearn.model_selection import KFold
cv_set = KFold(n_splits=10)
# Iterate the KFold splitter that was just built. (The loop previously used
# `cv_sets`, which is only assigned further down.)
for train_index, test_index in cv_set.split(X):
    print("%s %s" % (train_index, test_index))

# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the
# 'cv_sets' variable). It creates 10 ('n_splits') shuffled sets, and for
# each shuffle 20% ('test_size') of the data is used as the validation set.
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))

from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# Pipelining
# Sequentially apply a list of transforms and a final estimator.
# Intermediate steps of the pipeline must be 'transforms', that is, they
# must implement fit and transform methods. The final estimator only needs
# to implement fit.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters.
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
print('Misclassified test samples: %d' % (y_test != y_pred_test).sum())

# Accuracy on each of the three data partitions.
print('Training Accuracy: %.2f' % svm.score(X_train_centered, y_train))
print('Validation Accuracy: %.2f' % svm.score(X_val_centered, y_val))
print('Test Accuracy: %.2f' % svm.score(X_test_centered, y_test))

# Weighted precision and recall on the test partition.
more_scores = precision_recall_fscore_support(y_test, y_pred_test,
                                              average='weighted')
print('Precision: ', more_scores[0])
print('Recall: ', more_scores[1])

# Ten shuffle splits, each holding out 11% of the training set (train_temp)
# for validation. 11%, not 10%, because the validation split is being used
# instead of the test split.
cv = ShuffleSplit(n_splits=10, test_size=0.11, random_state=0)

# Learning curves over ten training-set sizes from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=svm,
    X=X_train_temp_centered,
    y=y_train_temp,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    n_jobs=1,
)

# Mean and spread across the splits (axis 1) for each training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = plt.figure()
def test_shufflesplit_reproducible():
    """A seeded ShuffleSplit yields the same train indices on every pass."""
    # `X` is read from the enclosing module.
    ss = ShuffleSplit(random_state=21)
    first_pass = [train for train, _ in ss.split(X)]
    second_pass = [train for train, _ in ss.split(X)]
    assert_array_equal(first_pass, second_pass)
labels = epochs.events[:, -1]
evoked = epochs.average()

###############################################################################
# Decoding in sensor space using a linear SVM

from sklearn.svm import SVC  # noqa
from sklearn.model_selection import ShuffleSplit  # noqa
from mne.decoding import CSP  # noqa

n_components = 3  # pick some components
svc = SVC(C=1, kernel='linear')
csp = CSP(n_components=n_components, norm_trace=False)

# Monte-Carlo cross-validation generator (reduces variance):
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = []
epochs_data = epochs.get_data()

for train_idx, test_idx in cv.split(labels):
    y_train = labels[train_idx]
    y_test = labels[test_idx]

    # CSP is fitted on the training epochs only, then applied to the rest.
    X_train = csp.fit_transform(epochs_data[train_idx], y_train)
    X_test = csp.transform(epochs_data[test_idx])

    # Fit the classifier and keep the held-out score.
    svc.fit(X_train, y_train)
    scores.append(svc.score(X_test, y_test))

# Printing the results
def check_fit_idempotent(name, estimator_orig):
    """Check that ``est.fit(X)`` is the same as ``est.fit(X).fit(X)``.

    Ideally we would check that the estimated parameters during training
    (e.g. coefs_) are the same, but having a universal comparison function
    for those attributes is difficult and full of edge cases. So instead we
    check that predict(), predict_proba(), decision_function() and
    transform() return the same results.

    Parameters
    ----------
    name : str
        Estimator name, used in the skip message.
    estimator_orig : estimator
        Estimator to check; it is cloned, not modified.

    Raises
    ------
    SkipTest
        If the estimator is tagged ``non_deterministic``.
    """
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]
    rng = np.random.RandomState(0)

    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    if 'warm_start' in estimator.get_params():
        estimator.set_params(warm_start=False)

    X, _ = _create_small_ts_dataset()
    X = X.reshape((X.shape[0], X.shape[1]))
    X = pairwise_estimator_convert_X(X, estimator)
    # Size the random targets from X itself so that X and y always have the
    # same number of rows, whatever the dataset helper returns.
    n_samples = X.shape[0]
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(low=0, high=2, size=n_samples)

    train, test = next(ShuffleSplit(test_size=.2, random_state=rng).split(X))
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Fit for the first time
    estimator.fit(X_train, y_train)

    result = {method: getattr(estimator, method)(X_test)
              for method in check_methods
              if hasattr(estimator, method)}

    # Fit again, after resetting the random state so both fits start equal
    set_random_state(estimator)
    estimator.fit(X_train, y_train)

    for method in check_methods:
        if hasattr(estimator, method):
            new_result = getattr(estimator, method)(X_test)
            # Tolerance scales with the precision of the result dtype.
            if np.issubdtype(new_result.dtype, np.floating):
                tol = 2 * np.finfo(new_result.dtype).eps
            else:
                tol = 2 * np.finfo(np.float64).eps
            assert_allclose_dense_sparse(
                result[method], new_result,
                atol=max(tol, 1e-9), rtol=max(tol, 1e-7),
                err_msg="Idempotency check failed for method {}".format(
                    method))