def __call__(self, X, y):
    """Build the cross-validation folds for (X, y), then cross validate.

    The split strategy is selected from the instance configuration:

    * if ``n_folds`` is not None, stratified folds are generated with
      ``StratifiedKFold``;
    * otherwise, if ``n_class_samples`` is not None, ``n_tests`` splits are
      produced by ``split_dataset`` using ``n_class_samples`` training
      samples per class and ``n_test_samples`` cross-validation samples
      per class.

    A scalar ``n_class_samples`` / ``n_test_samples`` is expanded to one
    integer entry per class and stored back on the instance. When neither
    option is configured ``self.folds`` is left as it was.

    Assumes that each datapoint is in a column of X.
    """
    n_classes = len(set(y))
    if self.n_folds is not None:
        # generate the folds
        self.folds = StratifiedKFold(y, n_folds=self.n_folds, shuffle=False,
                                     random_state=None)
    elif self.n_class_samples is not None:
        # The per-class counts do not depend on the split being drawn, so
        # they are normalised once instead of on every iteration.
        if not isinstance(self.n_class_samples, list):
            self.n_class_samples = (
                np.ones(n_classes) * self.n_class_samples).astype(int)
        if self.n_test_samples is not None:
            self.n_test_samples = (
                np.ones(n_classes) * self.n_test_samples).astype(int)

        self.folds = []
        for _ in range(self.n_tests):
            data_idx = split_dataset(self.n_class_samples,
                                     self.n_test_samples, y)
            # data_idx[0] holds the training indices, data_idx[1] the
            # test indices
            self.folds.append((data_idx[0], data_idx[1]))
    self.cross_validate(X, y)
class classifier():
    """An abstract class that models a classifier.

    Subclasses implement ``train`` and ``predict``. The base class builds
    the cross-validation folds, optionally searches ``param_grid`` and
    reports the accuracy obtained on the folds.
    """
    # Python 2 style metaclass declaration, kept as in the original.
    __metaclass__ = abc.ABCMeta

    def __init__(self, param_grid=None, n_folds=None, n_class_samples=None,
                 n_test_samples=None, n_tests=1, name="classifier"):
        """Store the cross-validation configuration.

        param_grid      -- grid handed to ``ParameterGrid``; None disables
                           the parameter search
        n_folds         -- number of stratified folds, or None
        n_class_samples -- the number of training samples per class
        n_test_samples  -- the number of validation or test samples per
                           class
        n_tests         -- number of random splits drawn when
                           ``n_class_samples`` is used
        name            -- label used in the progress messages
        """
        self.name = name
        self.param_grid = param_grid
        self.best_param_set = None
        self.n_folds = n_folds
        # the number of validation or test samples per class
        self.n_test_samples = n_test_samples
        # the number of training samples per class
        self.n_class_samples = n_class_samples
        self.n_tests = n_tests

    def fit(self, X, y):
        """Alias for calling the instance: build the folds and cross validate."""
        self.__call__(X, y)

    def __call__(self, X, y):
        """Build the cross-validation folds for (X, y), then cross validate.

        The split strategy is selected from the instance configuration:

        * if ``n_folds`` is not None, stratified folds are generated with
          ``StratifiedKFold``;
        * otherwise, if ``n_class_samples`` is not None, ``n_tests`` splits
          are produced by ``split_dataset`` using ``n_class_samples``
          training samples per class and ``n_test_samples``
          cross-validation samples per class.

        A scalar ``n_class_samples`` / ``n_test_samples`` is expanded to one
        integer entry per class and stored back on the instance. When
        neither option is configured ``self.folds`` is left as it was.

        Assumes that each datapoint is in a column of X.
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate the folds
            self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=False, random_state=None)
        elif self.n_class_samples is not None:
            # The per-class counts do not depend on the split being drawn,
            # so they are normalised once instead of on every iteration.
            if not isinstance(self.n_class_samples, list):
                self.n_class_samples = (
                    np.ones(n_classes) * self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (
                    np.ones(n_classes) * self.n_test_samples).astype(int)

            self.folds = []
            for _ in range(self.n_tests):
                data_idx = split_dataset(self.n_class_samples,
                                         self.n_test_samples, y)
                # data_idx[0] holds the training indices, data_idx[1] the
                # test indices
                self.folds.append((data_idx[0], data_idx[1]))
        self.cross_validate(X, y)

    def cross_validate(self, X, y):
        """Evaluate the classifier on the folds, searching ``param_grid``.

        With a parameter grid, every parameter set is scored with
        ``evaluate`` and the best-scoring one is stored in
        ``self.best_param_set``. Without a grid the classifier is evaluated
        once with its default parameters.
        """
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("fitting {} to the training set".format(self.name))
        if self.param_grid is None:
            self.evaluate(X, y)
            return

        param_sets = list(ParameterGrid(self.param_grid))
        param_scores = []
        for param_set in param_sets:
            print("--------------")
            print("training the classifier...")
            print("parameter set:")
            for k, v in param_set.items():
                print("{}:{}".format(k, v))
            param_scores.append(self.evaluate(X, y, param_set=param_set))

        p = np.argmax(np.array(param_scores))
        self.best_param_set = param_sets[p]
        print("best parameter set %s" % (self.best_param_set,))
        print("best score: %s" % (param_scores[p],))

    def evaluate(self, X, y, param_set=None):
        """Return the mean accuracy over ``self.folds``.

        The classifier is trained with the parameters in <param_set> on the
        training columns of each fold and scored with ``class_accuracy`` on
        the test columns.
        """
        cv_scores = []
        for train_index, test_index in self.folds:
            # datapoints are stored column-wise
            X_train, X_test = X[:, train_index], X[:, test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.train(X_train, y_train, param_set=param_set)
            y_pred = np.array(self.predict(X_test))
            cv_scores.append(class_accuracy(y_pred, y_test))
            # NOTE(review): reported once per fold; the original nesting of
            # this message was not recoverable from the source - confirm.
            print("average class accuracy: %s"
                  % (avg_class_accuracy(y_pred, y_test),))
        avg_cv_score = np.mean(cv_scores)
        print("accuracy: %s" % (avg_cv_score,))
        return avg_cv_score

    @abc.abstractmethod
    def train(self, X_train, y_train, param_set=None):
        """train the classifier"""
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X_test):
        """predict labels in X_test"""
        raise NotImplementedError
# larger model
def create_larger():
    """Build and compile the larger network.

    Layers, in order: Dense(60) on 60 inputs with relu, Dense(30) with
    relu, Dense(1) with sigmoid. The model is compiled with binary
    cross-entropy loss, the adam optimizer and the accuracy metric.
    """
    network = Sequential()
    stack = (
        Dense(60, input_dim=60, init='normal', activation='relu'),
        Dense(30, init='normal', activation='relu'),
        Dense(1, init='normal', activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)
    network.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])
    return network


numpy.random.seed(seed)
# Standardise the inputs, then fit the Keras model wrapped as an estimator.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_larger, nb_epoch=100,
                            batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True,
                        random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
def split_dataframe(df):
    """Split *df* using the first of five stratified folds.

    The folds are stratified on the values of the ``OpenStatus`` column.
    Only the first (train, test) index pair is consumed.

    Returns a tuple ``(train_rows, test_rows)`` obtained with ``df.take``.
    """
    kf = StratifiedKFold(df["OpenStatus"].values, 5)
    # next(iter(...)) works on Python 2 and 3; the ``.next()`` method only
    # exists on Python 2 iterators.
    train, test = next(iter(kf))
    return df.take(train), df.take(test)
from time import time
from sklearn.cross_validation import StratifiedKFold
from feature_creation import idx, df_reduced_test
import pandas as pd

# Timestamp taken before the model-selection work starts.
start = time()
# Scorer built from f1_score; passed to the grid search below.
# NOTE(review): make_scorer, f1_score, GridSearchCV, the two classifiers,
# y_train and df_reduced_train are not imported/defined in this excerpt.
f1_scorer = make_scorer(f1_score)
# Grid for the AdaBoost ensemble; the ``base_estimator__`` keys address
# parameters of the wrapped decision tree.
parameters = [{
    'n_estimators': [50, 100, 500, 1000, 2500],
    'base_estimator__criterion': ["gini", "entropy"],
    'base_estimator__splitter': ["best", "random"],
}]
# Five shuffled stratified folds over the training labels.
skf = StratifiedKFold(y_train, n_folds=5, shuffle=True)
for train_index, test_index in skf:
    # print(("TRAIN:", train_index, "TEST:", test_index))
    # Rows are selected positionally from the frame, labels by index.
    X_train_skf, y_train_skf = df_reduced_train.iloc[train_index], y_train[
        train_index]
    X_test_skf, y_test_skf = df_reduced_train.iloc[test_index], y_train[
        test_index]
    # NOTE(review): the nesting of the three statements below inside the
    # fold loop is inferred; the source formatting was lost - confirm.
    dtc = DecisionTreeClassifier(max_features="auto", class_weight="balanced",
                                 max_depth=None)
    ab = AdaBoostClassifier(base_estimator=dtc, algorithm='SAMME')
    gs = GridSearchCV(ab, param_grid=parameters, scoring=f1_scorer)
NFOLDS_OUTER = 4 # 4 datasets NFOLDS_INNER = 5 site = np.load("/neurospin/brainomics/2016_schizConnect/analysis/all_studies+VIP/Freesurfer/all_subjects/data/site.npy") shutil.copy(INPUT_DATA_X, WD) shutil.copy(INPUT_DATA_y, WD) shutil.copy(INPUT_MASK_PATH, WD) ############################################################################# ## Create config file y = np.load(INPUT_DATA_y) cv_outer = [[tr, te] for tr,te in StratifiedKFold(y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)] cv_outer[0][0] = np.transpose(np.where(site != 1)).ravel() cv_outer[0][1] = np.transpose(np.where(site == 1)).ravel() #TEST ON COBRE cv_outer[1][0] = np.transpose(np.where(site != 2)).ravel() cv_outer[1][1] = np.transpose(np.where(site == 2)).ravel() # TEST ON NMORPHch cv_outer[2][0] = np.transpose(np.where(site != 3)).ravel() cv_outer[2][1] = np.transpose(np.where(site == 3)).ravel() #TEST ON NUSDAST cv_outer[3][0] = np.transpose(np.where(site != 4)).ravel() cv_outer[3][1] = np.transpose(np.where(site == 4)).ravel() #TEST ON VIP import collections cv = collections.OrderedDict() for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
def crossValidateEvaluate():
    """Evaluate the naive Bayes mail classifier with stratified k-fold CV.

    For every fold the vocabulary is built from the training mails, the
    model is trained, its parameters are written to ``vocabularyList.txt``,
    ``pSpam.txt``, ``pWordsSpamicity.txt`` and ``pWordsHealthy.txt`` in the
    current directory, and the test mails are classified with
    ``naiveBayes.predict``. Accuracy, F1, recall and precision are
    collected per fold and printed, followed by the elapsed time.
    """
    beginTime = datetime.datetime.now()
    filename = './public/'
    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)
    skf = StratifiedKFold(classLables, k_fold_num)
    acc_per_fold = []
    f1_per_fold = []
    recall_per_fold = []
    precision_per_fold = []
    for train_index, test_index in skf:
        print("train_index->", train_index)
        print("test_index->", test_index)
        # Select the fold's mails and labels once instead of rebuilding the
        # same lists for every call below.
        trainWords = [mailWords[i] for i in train_index]
        trainLabels = [classLables[i] for i in train_index]
        testWords = [mailWords[i] for i in test_index]
        testLabels = [classLables[i] for i in test_index]

        preVocabularyList = naiveBayes.createVocabularyList(trainWords)
        # do wfo filter
        # NOTE(review): the filter result was immediately overwritten in
        # the original, so the unfiltered vocabulary is what gets used. The
        # call is kept in case wfoFilter has side effects - confirm intent.
        naiveBayes.wfoFilter(preVocabularyList, trainWords, trainLabels)
        vocabularyList = preVocabularyList
        print("length of vocabularyList", len(vocabularyList))
        # ``with`` closes the file even if a write fails.
        with open('vocabularyList.txt', 'w') as fw:
            fw.writelines(word + '\n' for word in vocabularyList)
        print("vocabularyList finished")

        trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
            vocabularyList, trainWords)
        print("trainMarkedWords finished")
        # change it to array
        trainMarkedWords = np.array(trainMarkedWords)
        print("data to matrix finished")

        # calculate the probability of each word given spam and ham,
        # P(wi|s) and P(wi|h)
        pWordsSpamicity, pWordsHealthy, pSpam = \
            naiveBayes.trainingNaiveBayes(trainMarkedWords, trainLabels)
        with open('pSpam.txt', 'w') as fpSpam:
            fpSpam.write(str(pSpam))
        np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
        np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')

        # The model files above are written before predicting; presumably
        # naiveBayes.predict reads them back - verify.
        predict = naiveBayes.predict(testWords)
        acc_per_fold.append(accuracy_score(testLabels, predict))
        f1_per_fold.append(f1_score(testLabels, predict))
        recall_per_fold.append(recall_score(testLabels, predict))
        precision_per_fold.append(precision_score(testLabels, predict))
        # running totals after each fold
        print("acc_per_fold:", acc_per_fold)
        print("f1_per_fold:", f1_per_fold)
        print("recall_per_fold:", recall_per_fold)
        print("precision_per_fold:", precision_per_fold)

    # final report
    print("acc_per_fold:", acc_per_fold)
    print("f1_per_fold:", f1_per_fold)
    print("recall_per_fold:", recall_per_fold)
    print("precision_per_fold:", precision_per_fold)
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - beginTime))
def _sample(self, X, y):
    """Resample the dataset.

    Each sample receives the cross-validated probability that the chosen
    estimator assigns to its true class. All samples of class
    ``self.min_c_`` are kept; the remaining samples are kept only when
    their probability reaches a percentile threshold derived from the
    requested ratio.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    idx_under : ndarray, shape (n_samples_new, )
        If `return_indices` is `True`, the indices of the samples which
        have been selected.

    Raises
    ------
    NotImplementedError
        If ``self.estimator`` is not a supported estimator name.
    """
    if self.estimator not in ESTIMATOR_KIND:
        raise NotImplementedError

    # Select the appropriate classifier; imports stay local so that only
    # the requested estimator's module is loaded.
    if self.estimator == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        estimator = KNeighborsClassifier(**self.kwargs)
    elif self.estimator == 'decision-tree':
        from sklearn.tree import DecisionTreeClassifier
        estimator = DecisionTreeClassifier(random_state=self.random_state,
                                           **self.kwargs)
    elif self.estimator == 'random-forest':
        from sklearn.ensemble import RandomForestClassifier
        estimator = RandomForestClassifier(random_state=self.random_state,
                                           **self.kwargs)
    elif self.estimator == 'adaboost':
        from sklearn.ensemble import AdaBoostClassifier
        estimator = AdaBoostClassifier(random_state=self.random_state,
                                       **self.kwargs)
    elif self.estimator == 'gradient-boosting':
        from sklearn.ensemble import GradientBoostingClassifier
        estimator = GradientBoostingClassifier(
            random_state=self.random_state, **self.kwargs)
    elif self.estimator == 'linear-svm':
        from sklearn.svm import SVC
        estimator = SVC(probability=True, random_state=self.random_state,
                        **self.kwargs)
    else:
        raise NotImplementedError

    # Create the different folds
    skf = StratifiedKFold(y, n_folds=self.cv, shuffle=False,
                          random_state=self.random_state)

    probabilities = np.zeros(y.shape[0], dtype=float)

    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        estimator.fit(X_train, y_train)

        probs = estimator.predict_proba(X_test)
        classes = estimator.classes_
        # For every test sample pick the probability of its true class.
        probabilities[test_index] = [
            probs[row, np.where(classes == label)[0][0]]
            for row, label in enumerate(y_test)
        ]

    # Compute the number of samples to keep
    if self.ratio == 'auto':
        num_samples = self.stats_c_[self.min_c_]
    else:
        num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

    # Find the percentile corresponding to the top num_samples.
    # float() forces true division: with integer counts, Python 2 integer
    # division would truncate the fraction to 0.
    kept_fraction = float(num_samples) / self.stats_c_[self.maj_c_]
    threshold = np.percentile(
        probabilities[y != self.min_c_],
        (1. - kept_fraction) * 100.)

    mask = np.logical_or(probabilities >= threshold, y == self.min_c_)

    # Sample the data
    X_resampled = X[mask]
    y_resampled = y[mask]

    self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

    # If we need to offer support for the indices
    if self.return_indices:
        idx_under = np.nonzero(mask)[0]
        return X_resampled, y_resampled, idx_under
    else:
        return X_resampled, y_resampled
import cPickle

from sklearn.cross_validation import StratifiedKFold

from relevance.config import config

if __name__ == "__main__":

    # load data
    with open(config.processed_train_data_path, "rb") as f:
        dfTrain = cPickle.load(f)

    # One slot per run; the same list is refilled and dumped once for each
    # stratification label.
    skf = [0] * config.n_runs
    for stratified_label, key in (("relevance", "median_relevance"),
                                  ("query", "qid")):
        for run in range(config.n_runs):
            # a different, reproducible seed for every run
            random_seed = 2018 + 1000 * (run + 1)
            skf[run] = StratifiedKFold(dfTrain[key], n_folds=config.n_folds,
                                       shuffle=True, random_state=random_seed)
            # NOTE(review): the first index array of each pair is bound to
            # validInd and the second to trainInd, exactly as in the
            # original - confirm this naming is intended.
            for fold, (validInd, trainInd) in enumerate(skf[run]):
                print("================================")
                print("Index for run: %s, fold: %s" % (run + 1, fold + 1))
                print("Train (num = %s)" % len(trainInd))
                print(trainInd[:10])
                print("Valid (num = %s)" % len(validInd))
                print(validInd[:10])
        # persist all runs for this stratification label
        out_path = "%s/stratifiedKFold.%s.pkl" % (config.data_folder,
                                                  stratified_label)
        with open(out_path, "wb") as f:
            cPickle.dump(skf, f, -1)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=Y, cv=10)
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
# The splitter comes from model_selection, like the other tools used here.
# The old sklearn.cross_validation module is deprecated since 0.18 and
# removed in 0.20.
# NOTE(review): any later code that calls StratifiedKFold with the old
# ``(y, n_folds=...)`` signature must be updated as well.
from sklearn.model_selection import StratifiedKFold
parameters = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': [200, 210, 240, 250],
    'criterion': ['gini', 'entropy']
}
# Five unshuffled stratified folds; GridSearchCV performs the actual split
# of (X_train, Y) through the splitter's ``split`` method.
cross_validation = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=cross_validation,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, Y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Predict again
y_pred = grid_search.predict(X_test)

# Prepare test results to submit
result = pd.DataFrame()
result['PassengerId'] = pd.read_csv('test.csv')['PassengerId']
# NOTE(review): the next line is the tail of a call whose beginning lies
# before this excerpt.
tmax, proj=False, picks=picks, baseline=None, preload=True, verbose=False)

# Create classification pipeline: Xdawn with 3 components, vectorisation,
# min-max scaling, then an L1-penalised logistic regression.
clf = make_pipeline(Xdawn(n_components=3),
                    Vectorizer(),
                    MinMaxScaler(),
                    LogisticRegression(penalty='l1'))

# Get the labels: last column of the events array
labels = epochs.events[:, -1]

# Cross validator: 10 shuffled stratified folds with a fixed seed
cv = StratifiedKFold(y=labels, n_folds=10, shuffle=True, random_state=42)

# Do cross-validation: each sample's prediction is written into preds at
# the positions of the fold in which it is a test sample
preds = np.empty(len(labels))
for train, test in cv:
    clf.fit(epochs[train], labels[train])
    preds[test] = clf.predict(epochs[test])

# Classification report
target_names = ['aud_l', 'aud_r', 'vis_l', 'vis_r']
report = classification_report(labels, preds, target_names=target_names)
print(report)

# Normalized confusion matrix: every row is divided by its row sum
cm = confusion_matrix(labels, preds)
cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
# Hold out 20% of the labelled data; the held-out part plays the role of
# the submission set below.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=111)

# X, y, X_submission = load_data.load()
X, y, X_submission = X_train.values, y_train.values, X_test.values

if shuffle:
    # apply one random permutation to samples and labels alike
    idx = np.random.permutation(y.size)
    X, y = X[idx], y[idx]

# materialise the folds so they can be iterated more than once
skf = list(StratifiedKFold(y, n_folds))

# base learners, in the order in which they are blended
clfs = [
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1,
                           criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                               max_depth=6, n_estimators=50),
]

print("Creating train and test sets for blending.")
def _estimator_and_grid(model):
    """Return ``(estimator, param_grid)`` for the given model key.

    Raises ValueError when the key is not one of the supported models.
    """
    if model == 'svm':
        return svm.SVC(), {
            'C': [0.01, 1, 10],
            'kernel': ['rbf', 'linear'],
            'shrinking': [False, True],
        }
    if model == 'cart':
        return tree.DecisionTreeClassifier(), {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    if model == 'rf':
        return ensemble.RandomForestClassifier(), {
            'n_estimators': [200, 500, 1000],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    if model == 'xgboost':
        return XGBClassifier(), {
            'booster': ['gbtree'],
            'max_depth': [5, 10, 20],
            'reg_lambda': [0, 1],
            'reg_alpha': [0, 1],
            'subsample': [0.5, 1],
        }
    if model == 'lr':
        return linear_model.LogisticRegression(), {
            'solver': ['liblinear', 'sag', 'saga'],
        }
    if model == 'cox':
        return CoxnetSurvivalAnalysis(), {
            'n_alphas': [50, 100, 200],
            'l1_ratio': [0.1, 0.5, 1],
        }
    if model == 'survSVM':
        return FastSurvivalSVM(), {
            'alpha': [0.5, 1],
            'rank_ratio': [0.5, 1],
            'max_iter': [20, 40, 80],
            'optimizer': ['rbtree', 'avltree'],
        }
    if model == 'gb':
        return GradientBoostingSurvivalAnalysis(), {
            'learning_rate': [0.1, 0.3],
            'n_estimators': [100, 200, 400],
            'max_depth': [3, 6, 12],
        }
    raise ValueError("unknown model %r" % (model,))


def RandomGridSearchRFC_Fixed(X,Y,splits, model, survival):
    """Grid-search the hyper-parameters of the selected model.

    Input:
        X: training set
        Y: labels of the training set; for survival models the first field
           of every record is used to stratify the folds
        splits: cross validation splits; currently unused, the search
                always runs on 2 stratified folds
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox',
               'survSVM', 'gb'
        survival: truthy for survival models; the search is then scored
                  with the ``CI`` scorer instead of 'roc_auc'
    Output:
        (best_params, search): the dictionary with the best parameters
        (use e.g. best_params['kernel']) and the fitted GridSearchCV
    Raises:
        ValueError: if ``model`` is not a supported key
    """
    start_svm = time.time()

    clf, tuned_parameters = _estimator_and_grid(model)

    if survival:
        # The original built this scorer but never handed it to the
        # search, which therefore always scored with 'roc_auc'.
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = StratifiedKFold(y_for_cv, n_folds=2)  # x-validation
    else:
        scoring = 'roc_auc'
        cv = StratifiedKFold(Y, n_folds=2)  # x-validation

    print (' ...performing x-validation')

    search = GridSearchCV(clf, tuned_parameters, scoring=scoring, cv=cv,
                          verbose=10)
    search.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ",end_svm - start_svm)

    return (search.best_params_, search)
dataset = dataset.shuffle() # Create a Classifier Service. # Classifier process starts using a default configuration. classifier = Classifier.run(Config()) # Prepare arrays to keep true/predicted labels to display a report later. true_labels = [] predicted_labels = [] # Run stratified K-fold validation. labels = list(dataset.get_labels()) if sklearn_version < 18: train_test_indices = StratifiedKFold(labels, n_folds=10) else: skf = StratifiedKFold(n_splits=10) train_test_indices = skf.split(labels, labels) for train_idx, test_idx in train_test_indices: # Clear the classifier (call `clear` RPC). classifier.clear() # Split the dataset to train/test dataset. (train_ds, test_ds) = (dataset[train_idx], dataset[test_idx]) # Train the classifier using train dataset. for (idx, label) in classifier.train(train_ds): # You can peek records being trained. #print('train[{0}]: (label: {1}) => {2}'.format(idx, label, train_ds[idx])) pass
def create_ROC(filename):
    """Plot cross-validated ROC curves for the SVM data in ``<filename>.pkl``.

    The pickle must contain a mapping with the keys ``'labels'`` and
    ``'data'``. The data is scaled, an SVC with probability estimates is
    fitted, and a ROC curve is drawn for each of 9 stratified folds
    together with the mean ROC curve. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    from scipy import interp
    from sklearn import preprocessing as pps, svm
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import StratifiedKFold, LeaveOneOut

    filepath = filename + '.pkl'

    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only use this on trusted pickles.
    with open(filepath, 'rb') as f:
        svm_data = pickle.load(f)
    labels = svm_data['labels']
    data = svm_data['data']

    scaler = pps.Scaler().fit(data)
    # Single-argument print(...) emits the same text on Python 2 and 3;
    # the spacing reproduces the output of the former print statements.
    print("Mean:  %s" % (scaler.mean_,))
    print("Std:  %s" % (scaler.std_,))
    data_scaled = scaler.transform(data)

    classifier = svm.SVC(probability=True)
    classifier.fit(data_scaled, labels)

    #print "Support Vectors: \r\n", classifier.support_vectors_
    print("SV's per class: \r\n%s" % (classifier.n_support_,))

    ###############################################################################
    ## Code below modified from http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py
    X, y = data_scaled, np.array(labels)
    n_samples, n_features = X.shape
    print("%s %s" % (n_samples, n_features))

    ###############################################################################
    # Classification and ROC analysis

    # Run classifier with crossvalidation and plot ROC curves
    cv = StratifiedKFold(y, k=9)

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, n_samples)

    plt.figure(2)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        # accumulate the fold's curve on the common FPR grid
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, '--', lw=1,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k-', lw=3,
             label='Mean ROC (area = %0.2f)' % mean_auc)

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    print("Finished!")
# NOTE(review): gmm, n, color and ax are not defined in this excerpt; the
# statements up to ax.add_artist are presumably the tail of an enclosing
# definition that starts before it - confirm and restore its indentation.
# Eigen-decomposition of the top-left 2x2 block of component n's covariance.
v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2])
# Unit vector along w[0]; its direction gives the ellipse orientation.
u = w[0] / np.linalg.norm(w[0])
angle = np.arctan2(u[1], u[0])
angle = 180 * angle / np.pi  # convert to degrees
v *= 9
# Ellipse centred on the first two coordinates of the component mean.
ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                          180 + angle, color=color)
ell.set_clip_box(ax.bbox)
ell.set_alpha(0.5)
ax.add_artist(ell)

iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
# NOTE(review): ``.next()`` exists on Python 2 iterators only.
train_index, test_index = skf.__iter__().next()

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

# one mixture component per class seen in the training labels
n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((x, GMM(n_components=n_classes, covariance_type=x))
                   for x in ["spherical", "diag", "tied", "full"])

n_classifiers = len(classifiers)
def reduce_number_instances(self, proportion=0.1):
    """Replace the stored data and target by a stratified subset.

    The targets are partitioned into ``1.0 / proportion`` stratified folds
    and the per-sample fold assignment is passed to ``separate_sets``
    together with fold index 0. The third and fourth values it returns
    become the new ``_data`` and ``_target``.
    """
    fold_count = 1.0 / proportion
    splitter = StratifiedKFold(self._target, n_folds=fold_count)
    fold_of_sample = splitter.test_folds
    _, _, kept_data, kept_target = self.separate_sets(
        self._data, self._target, 0, fold_of_sample)
    self._data = kept_data
    self._target = kept_target