Python StratifiedKFold Examples, sklearn.cross_validation.StratifiedKFold Python Examples

Example #1

0

Show file

File: classify.py Project: ektormak/Lyssandra

    def __call__(self, X, y):
        """
        Build the cross validation splits for the dataset X,y and then
        run cross validation on them.

        The splitting procedure depends on how the instance is configured:
        if n_folds is not None, stratified folds are generated;
        otherwise, if n_class_samples is not None, <n_tests> splits are
        requested from split_dataset using <n_class_samples> training
        samples per class and, when n_test_samples is not None,
        <n_test_samples> cross validation samples per class.
        If neither is set, self.folds is left untouched.
        Assumes that each datapoint is in a column of X.
        """
        n_classes = len(set(y))

        if self.n_folds is not None:
            # stratified folds over the labels
            self.folds = StratifiedKFold(y, shuffle=False, random_state=None,
                                         n_folds=self.n_folds)

        elif self.n_class_samples is not None:

            splits = []
            self.folds = splits
            for _ in range(self.n_tests):

                per_class = np.ones(n_classes)
                # broadcast a scalar count to one entry per class
                if type(self.n_class_samples) is not list:
                    self.n_class_samples = (per_class * self.n_class_samples).astype(int)
                if self.n_test_samples is not None:
                    self.n_test_samples = (per_class * self.n_test_samples).astype(int)

                data_idx = split_dataset(self.n_class_samples, self.n_test_samples, y)
                # first entry: training indices, second entry: test indices
                splits.append((data_idx[0], data_idx[1]))

        self.cross_validate(X, y)

Example #2

0

Show file

File: classify.py Project: ektormak/Lyssandra

class classifier(object):
    """
    an abstract class that models a classifier

    Subclasses implement `train` and `predict`; this base class builds the
    cross validation splits and, optionally, searches a parameter grid.
    The code runs unchanged on Python 2 and Python 3.
    """

    # NOTE: this is the Python 2 metaclass hook; Python 3 ignores it, so
    # abstract-method enforcement is only active under Python 2.
    __metaclass__ = abc.ABCMeta

    def __init__(self, param_grid=None, n_folds=None,
                 n_class_samples=None, n_test_samples=None, n_tests=1, name="classifier"):
        """
        param_grid: parameter grid handed to ParameterGrid, or None to skip
            the parameter search
        n_folds: number of stratified folds, or None
        n_class_samples: number of training samples per class, or None
        n_test_samples: number of validation or test samples per class, or None
        n_tests: number of splits requested when n_class_samples is used
        name: label used in the progress messages
        """
        self.name = name
        self.param_grid = param_grid
        self.best_param_set = None
        self.n_folds = n_folds
        # the number of validation or test samples per class
        self.n_test_samples = n_test_samples
        # the number of training samples per class
        self.n_class_samples = n_class_samples
        self.n_tests = n_tests

    def fit(self, X, y):
        """Alias for calling the instance: split the data and cross validate."""
        self.__call__(X, y)

    def __call__(self, X, y):
        """
        given a dataset X,y we split it, in order to do cross validation,
        according to the procedure explained below:
        if n_folds is not None, then we do cross validation
        based on stratified folds
        if n_class_samples is not None, then we do cross validation
        using only <n_class_samples> training samples per class
        if n_test_samples is not None, then we do cross validation
        using only <n_test_samples> cross validation samples per class
        if neither n_folds nor n_class_samples is set, self.folds is left
        untouched
        assumes that each datapoint is in a column of X
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate the folds
            self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=False, random_state=None)

        elif self.n_class_samples is not None:
            # Expand scalar per-class counts once; the expansion does not
            # depend on the split being generated.
            if type(self.n_class_samples) is not list:
                self.n_class_samples = (np.ones(n_classes) * self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (np.ones(n_classes) * self.n_test_samples).astype(int)

            self.folds = []
            for _ in range(self.n_tests):
                data_idx = split_dataset(self.n_class_samples, self.n_test_samples, y)
                train_idx = data_idx[0]
                test_idx = data_idx[1]
                self.folds.append((train_idx, test_idx))

        self.cross_validate(X, y)

    def cross_validate(self, X, y):
        """
        evaluate the classifier on self.folds; when a parameter grid is
        set, evaluate every parameter set and keep the best scoring one
        in self.best_param_set
        """
        print("fitting {} to the training set".format(self.name))
        if self.param_grid is None:
            self.evaluate(X, y)
            return

        param_sets = list(ParameterGrid(self.param_grid))
        param_scores = []
        for param_set in param_sets:

            print("--------------")
            print("training the classifier...")
            print("parameter set:")
            for k, v in param_set.items():
                print("{}:{}".format(k, v))

            param_scores.append(self.evaluate(X, y, param_set=param_set))
            # running best over the parameter sets evaluated so far
            p = np.argmax(np.array(param_scores))
            self.best_param_set = param_sets[p]
            # {!s} mirrors what the print statement did: str() of the value
            print("best parameter set {!s}".format(self.best_param_set))
            print("best score: {!s}".format(param_scores[p]))

    def evaluate(self, X, y, param_set=None):
        """
        evaluate the performance of the classifier
        trained with the parameters in <param_set>

        returns the mean of the per-fold class_accuracy values
        """
        cv_scores = []
        for train_index, test_index in self.folds:
            # datapoints are stored column-wise, hence the column indexing
            X_train, X_test = X[:, train_index], X[:, test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.train(X_train, y_train, param_set=param_set)

            y_pred = np.array(self.predict(X_test))
            cv_scores.append(class_accuracy(y_pred, y_test))
            print("average class accuracy: {!s}".format(
                avg_class_accuracy(y_pred, y_test)))

        avg_cv_score = np.mean(cv_scores)
        print("accuracy: {!s}".format(avg_cv_score))
        return avg_cv_score

    @abc.abstractmethod
    def train(self, X_train, y_train, param_set=None):
        """train the classifier"""
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X_test):
        """predict labels in X_test"""
        raise NotImplementedError

Example #3

0

Show file

File: 4 sonar returns with standardized data and larger network.py Project: SumanthReddyKaliki/Deep-Learning

# larger model
def create_larger():
    """Build and compile the three-layer network (60 -> 30 -> 1 units).

    Returns a compiled Sequential model with binary cross-entropy loss,
    the adam optimizer and accuracy as the reported metric.
    """
    network = Sequential()

    # hidden layers use relu, the single output unit uses a sigmoid
    network.add(Dense(60, input_dim=60, init='normal', activation='relu'))
    for units, activation in ((30, 'relu'), (1, 'sigmoid')):
        network.add(Dense(units, init='normal', activation=activation))

    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network


# Seed numpy's RNG before anything is built.
numpy.random.seed(seed)

# Two-step pipeline: standardize the inputs, then fit the Keras network.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_larger,
                            nb_epoch=100,
                            batch_size=5,
                            verbose=0)),
]
pipeline = Pipeline(estimators)

# 10 shuffled stratified folds over the encoded labels.
kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True,
                        random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# Mean and standard deviation of the fold scores, as percentages.
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Larger: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Example #4

0

Show file

File: common.py Project: kalaidin/stackoverflow

def split_dataframe(df):
    """Split *df* using the first of 5 folds stratified on "OpenStatus".

    Returns a ``(train_df, test_df)`` tuple built with ``df.take`` from the
    index arrays of the first fold yielded by ``StratifiedKFold``.
    """
    kf = StratifiedKFold(df["OpenStatus"].values, 5)
    # next(iter(...)) works on Python 2 and 3; the previous
    # kf.__iter__().next() only existed on Python 2 iterators.
    train, test = next(iter(kf))
    return df.take(train), df.take(test)

Example #5

0

Show file

from time import time
from sklearn.cross_validation import StratifiedKFold
from feature_creation import idx, df_reduced_test
import pandas as pd

# Wall-clock start time (its use is not visible in this excerpt).
start = time()

# Scorer wrapping f1_score, used by the grid search below.
f1_scorer = make_scorer(f1_score)

# Grid over the AdaBoost ensemble size and, through the `base_estimator__`
# prefix, over the criterion/splitter of the wrapped decision tree.
parameters = [{
    'n_estimators': [50, 100, 500, 1000, 2500],
    'base_estimator__criterion': ["gini", "entropy"],
    'base_estimator__splitter': ["best", "random"],
}]

# 5 shuffled stratified folds over y_train (no random_state is given).
skf = StratifiedKFold(y_train, n_folds=5, shuffle=True)

# NOTE(review): every iteration overwrites the same four names, so only the
# split of the last fold survives the loop — confirm this is intended.
for train_index, test_index in skf:
    # print(("TRAIN:", train_index, "TEST:", test_index))
    X_train_skf, y_train_skf = df_reduced_train.iloc[train_index], y_train[
        train_index]
    X_test_skf, y_test_skf = df_reduced_train.iloc[test_index], y_train[
        test_index]

# Base learner for the boosted ensemble.
dtc = DecisionTreeClassifier(max_features="auto",
                             class_weight="balanced",
                             max_depth=None)

ab = AdaBoostClassifier(base_estimator=dtc, algorithm='SAMME')

# NOTE(review): `skf` is not passed as `cv` here, so in the visible code the
# search uses GridSearchCV's default splitting — verify against the full file.
gs = GridSearchCV(ab, param_grid=parameters, scoring=f1_scorer)

Example #6

0

Show file

    NFOLDS_OUTER = 4 # 4 datasets
    NFOLDS_INNER = 5

    site = np.load("/neurospin/brainomics/2016_schizConnect/analysis/all_studies+VIP/Freesurfer/all_subjects/data/site.npy")



    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)

    cv_outer = [[tr, te] for tr,te in StratifiedKFold(y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    cv_outer[0][0] = np.transpose(np.where(site != 1)).ravel()
    cv_outer[0][1] = np.transpose(np.where(site == 1)).ravel()  #TEST ON COBRE

    cv_outer[1][0] = np.transpose(np.where(site != 2)).ravel()
    cv_outer[1][1] = np.transpose(np.where(site == 2)).ravel()   # TEST ON NMORPHch

    cv_outer[2][0] = np.transpose(np.where(site != 3)).ravel()
    cv_outer[2][1] = np.transpose(np.where(site == 3)).ravel()  #TEST ON NUSDAST

    cv_outer[3][0] = np.transpose(np.where(site != 4)).ravel()
    cv_outer[3][1] = np.transpose(np.where(site == 4)).ravel() #TEST ON VIP

    import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):

Example #7

0

Show file

File: test.py Project: zetaby/spam-filter

def crossValidateEvaluate():
    """Run stratified k-fold cross validation of the naive Bayes spam filter.

    For every fold the vocabulary and the trained probabilities are written
    to text files in the working directory ('vocabularyList.txt',
    'pSpam.txt', 'pWordsSpamicity.txt', 'pWordsHealthy.txt'), predictions
    are made for the held-out mails, and accuracy, F1, recall and precision
    are collected and printed. Returns None.
    """
    beginTime = datetime.datetime.now()
    filename = './public/'
    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)

    skf = StratifiedKFold(classLables, k_fold_num)
    acc_per_fold = []
    f1_per_fold = []
    recall_per_fold = []
    precision_per_fold = []

    for train_index, test_index in skf:
        print("train_index->", train_index)
        print("test_index->", test_index)
        preVocabularyList = naiveBayes.createVocabularyList(
            [mailWords[i] for i in train_index])
        # do wfo filter
        # NOTE(review): the filtered vocabulary was always overwritten by the
        # unfiltered one on the next line, so the filter result is unused.
        # The call is kept in case wfoFilter has side effects — confirm
        # whether the filter was meant to be disabled.
        naiveBayes.wfoFilter(
            preVocabularyList, [mailWords[i] for i in train_index],
            [classLables[i] for i in train_index])
        vocabularyList = preVocabularyList
        print("length of vocabularyList", len(vocabularyList))
        # `with` guarantees the file is flushed and closed even if a write fails
        with open('vocabularyList.txt', 'w') as fw:
            for word in vocabularyList:
                fw.write(word + '\n')
        print("vocabularyList finished")

        trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
            vocabularyList, [mailWords[i] for i in train_index])
        print("trainMarkedWords finished")

        # change it to array
        trainMarkedWords = np.array(trainMarkedWords)
        print("data to matrix finished")
        # calculate each probability of spam and ham P(wi/s)  p(wi/h)
        pWordsSpamicity, pWordsHealthy, pSpam = \
            naiveBayes.trainingNaiveBayes(trainMarkedWords, [classLables[i] for i in train_index])
        with open('pSpam.txt', 'w') as fpSpam:
            fpSpam.write(str(pSpam))

        np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
        np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')

        predict = naiveBayes.predict([mailWords[i] for i in test_index])
        # true labels of the held-out mails, built once for all four metrics
        test_labels = [classLables[i] for i in test_index]
        acc_per_fold.append(accuracy_score(test_labels, predict))
        f1_per_fold.append(f1_score(test_labels, predict))
        recall_per_fold.append(recall_score(test_labels, predict))
        precision_per_fold.append(precision_score(test_labels, predict))
        print("acc_per_fold:", acc_per_fold)
        print("f1_per_fold:", f1_per_fold)
        print("recall_per_fold:", recall_per_fold)
        print("precision_per_fold:", precision_per_fold)

    print("acc_per_fold:", acc_per_fold)
    print("f1_per_fold:", f1_per_fold)
    print("recall_per_fold:", recall_per_fold)
    print("precision_per_fold:", precision_per_fold)
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - beginTime))

Example #8

0

Show file

File: instance_hardness_threshold.py Project: yanshanjing/imbalanced-learn

    def _sample(self, X, y):
        """Under-sample the dataset using cross-validated class probabilities.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The rows of `X` that were kept.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray
            Only returned if `return_indices` is `True`: the indices of the
            kept samples (``np.nonzero`` of the selection mask).

        Raises
        ------
        NotImplementedError
            If `self.estimator` is not one of the supported kinds.

        """

        kind = self.estimator
        if kind not in ESTIMATOR_KIND:
            raise NotImplementedError

        # Build the classifier matching the requested kind; each import is
        # done lazily inside its own branch.
        if kind == 'knn':
            from sklearn.neighbors import KNeighborsClassifier
            estimator = KNeighborsClassifier(**self.kwargs)
        elif kind == 'decision-tree':
            from sklearn.tree import DecisionTreeClassifier
            estimator = DecisionTreeClassifier(
                random_state=self.random_state, **self.kwargs)
        elif kind == 'random-forest':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier(
                random_state=self.random_state, **self.kwargs)
        elif kind == 'adaboost':
            from sklearn.ensemble import AdaBoostClassifier
            estimator = AdaBoostClassifier(
                random_state=self.random_state, **self.kwargs)
        elif kind == 'gradient-boosting':
            from sklearn.ensemble import GradientBoostingClassifier
            estimator = GradientBoostingClassifier(
                random_state=self.random_state, **self.kwargs)
        elif kind == 'linear-svm':
            from sklearn.svm import SVC
            estimator = SVC(probability=True,
                            random_state=self.random_state,
                            **self.kwargs)
        else:
            raise NotImplementedError

        # Stratified folds over the labels
        skf = StratifiedKFold(y,
                              n_folds=self.cv,
                              shuffle=False,
                              random_state=self.random_state)

        # One slot per sample, filled while that sample is in a test fold
        probabilities = np.zeros(y.shape[0], dtype=float)

        for train_index, test_index in skf:
            estimator.fit(X[train_index], y[train_index])

            fold_labels = y[test_index]
            fold_probs = estimator.predict_proba(X[test_index])
            classes = estimator.classes_
            # keep, for each test sample, the probability predicted for the
            # sample's own label
            probabilities[test_index] = [
                fold_probs[row, np.where(classes == label)[0][0]]
                for row, label in enumerate(fold_labels)
            ]

        # Number of samples to target, derived from the minority class count
        minority_count = self.stats_c_[self.min_c_]
        if self.ratio == 'auto':
            num_samples = minority_count
        else:
            num_samples = int(minority_count / self.ratio)

        # Percentile corresponding to the top num_samples
        # NOTE(review): this expects true division; on Python 2 without
        # `from __future__ import division` the quotient would be truncated
        # — confirm against the module header.
        keep_fraction = 1. - (num_samples / self.stats_c_[self.maj_c_])
        threshold = np.percentile(probabilities[y != self.min_c_],
                                  keep_fraction * 100.)

        # Keep every minority-class sample plus any sample at or above the
        # threshold
        mask = np.logical_or(probabilities >= threshold, y == self.min_c_)

        X_resampled = X[mask]
        y_resampled = y[mask]

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        if not self.return_indices:
            return X_resampled, y_resampled

        # Indices of the selected samples
        idx_under = np.nonzero(mask)[0]
        return X_resampled, y_resampled, idx_under

Example #9

0

Show file

File: gen_kfold.py Project: zhouwubai/kaggle

import cPickle
from sklearn.cross_validation import StratifiedKFold
from relevance.config import config

if __name__ == "__main__":

    # Load the pickled training frame.
    # NOTE(review): cPickle.load executes whatever the file contains; only
    # use it on trusted data.
    with open(config.processed_train_data_path, "rb") as f:
        dfTrain = cPickle.load(f)

    # One slot per run; the same list is refilled and dumped for each label.
    skf = [0] * config.n_runs
    stratification_targets = (("relevance", "median_relevance"),
                              ("query", "qid"))
    for stratified_label, key in stratification_targets:
        for run in range(config.n_runs):
            # a distinct, reproducible seed for every run
            random_seed = 2018 + 1000 * (run + 1)
            skf[run] = StratifiedKFold(dfTrain[key],
                                       n_folds=config.n_folds,
                                       shuffle=True,
                                       random_state=random_seed)
            # NOTE(review): the first array of each pair is bound to
            # `validInd` and the second to `trainInd` — presumably
            # deliberate; confirm against how the folds are consumed.
            for fold, (validInd, trainInd) in enumerate(skf[run]):
                print("================================")
                print("Index for run: %s, fold: %s" % (run + 1, fold + 1))
                print("Train (num = %s)" % len(trainInd))
                print(trainInd[:10])
                print("Valid (num = %s)" % len(validInd))
                print(validInd[:10])

        # Persist all runs for this stratification label.
        out_path = "%s/stratifiedKFold.%s.pkl" % (config.data_folder,
                                                  stratified_label)
        with open(out_path, "wb") as f:
            cPickle.dump(skf, f, -1)

Example #10

0

Show file

File: random_forest_classification.py Project: vineethcv/Kaggle_titanic

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
# 10-fold cross validation of `classifier` (defined outside this excerpt).
accuracies = cross_val_score(estimator=classifier, X=X_train, y=Y, cv=10)
# NOTE(review): the two results below are computed and discarded; they only
# display something in an interactive session.
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
# NOTE(review): StratifiedKFold comes from the legacy `cross_validation`
# module while GridSearchCV comes from `model_selection`; here the fold
# object is handed to `cv` as an iterable of splits.
from sklearn.cross_validation import StratifiedKFold
# Candidate values searched exhaustively (5 * 4 * 2 combinations).
parameters = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': [200, 210, 240, 250],
    'criterion': ['gini', 'entropy']
}
# 5 stratified folds over the labels Y.
cross_validation = StratifiedKFold(Y, n_folds=5)
grid_search = GridSearchCV(classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=cross_validation,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, Y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Predict again, this time with the fitted grid search
y_pred = grid_search.predict(X_test)

# Prepare test results to submit: start from the PassengerId column of test.csv
result = pd.DataFrame()
result['PassengerId'] = pd.read_csv('test.csv')['PassengerId']

Example #11

0

Show file

                tmax,
                proj=False,
                picks=picks,
                baseline=None,
                preload=True,
                verbose=False)

# Create classification pipeline:
# Xdawn (3 components) -> Vectorizer -> MinMaxScaler -> L1 logistic regression
clf = make_pipeline(Xdawn(n_components=3), Vectorizer(), MinMaxScaler(),
                    LogisticRegression(penalty='l1'))

# Get the labels: last column of the epochs' events array
labels = epochs.events[:, -1]

# Cross validator: 10 shuffled stratified folds with a fixed seed
cv = StratifiedKFold(y=labels, n_folds=10, shuffle=True, random_state=42)

# Do cross-validation
# `preds` starts uninitialized (np.empty) and is filled fold by fold with
# the predictions for each fold's test indices.
preds = np.empty(len(labels))
for train, test in cv:
    clf.fit(epochs[train], labels[train])
    preds[test] = clf.predict(epochs[test])

# Classification report
target_names = ['aud_l', 'aud_r', 'vis_l', 'vis_r']
report = classification_report(labels, preds, target_names=target_names)
print(report)

# Normalized confusion matrix: each row is divided by its row sum
cm = confusion_matrix(labels, preds)
cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

Example #12

0

Show file

File: KmmtBlend.py Project: hyusak/KmmtML

    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=111)

    # X, y, X_submission = load_data.load()
    X = X_train.values
    y = y_train.values
    X_submission = X_test.values

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100,
                               n_jobs=-1,
                               criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05,
                                   subsample=0.5,
                                   max_depth=6,
                                   n_estimators=50)
    ]

    print("Creating train and test sets for blending.")

Example #13

0

Show file

def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    Grid-search the best set of parameters for the requested model.

    Input:
        X: training set
        Y: labels of training set; when `survival` is true, the first
           element of every entry of Y is used to stratify the folds
        splits: cross validation splits (currently unused; the search
                always runs on 2 stratified folds built here)
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox',
               'survSVM', 'gb'
        survival: if true, score with a scorer built from CI; otherwise
                  score with 'roc_auc'
    Output:
        (best_params, search): the dictionary of best parameters (to use:
        best_params['kernel']) and the fitted GridSearchCV object
    Raises:
        ValueError: if `model` is not one of the supported names
    """
    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()
        tuned_parameters = {
            'C': [0.01, 1, 10],
            'kernel': ['rbf', 'linear'],
            'shrinking': [False, True],
        }
    elif model == 'cart':
        clf = tree.DecisionTreeClassifier()
        tuned_parameters = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    elif model == 'rf':
        clf = ensemble.RandomForestClassifier()
        tuned_parameters = {
            'n_estimators': [200, 500, 1000],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    elif model == 'xgboost':
        clf = XGBClassifier()
        tuned_parameters = {
            'booster': ['gbtree'],
            'max_depth': [5, 10, 20],
            'reg_lambda': [0, 1],
            'reg_alpha': [0, 1],
            'subsample': [0.5, 1],
        }
    elif model == 'lr':
        clf = linear_model.LogisticRegression()
        tuned_parameters = {
            'solver': ['liblinear', 'sag', 'saga'],
        }
    elif model == 'cox':
        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': [50, 100, 200],
            'l1_ratio': [0.1, 0.5, 1],
        }
    elif model == 'survSVM':
        clf = FastSurvivalSVM()
        tuned_parameters = {
            'alpha': [0.5, 1],
            'rank_ratio': [0.5, 1],
            'max_iter': [20, 40, 80],
            'optimizer': ['rbtree', 'avltree'],
        }
    elif model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()
        tuned_parameters = {
            'learning_rate': [0.1, 0.3],
            'n_estimators': [100, 200, 400],
            'max_depth': [3, 6, 12],
        }
    else:
        # Previously an unknown name fell through every branch and failed
        # later with an unbound `clf`.
        raise ValueError("unknown model: %r" % (model,))

    if survival:
        # Previously this branch built `scorer` but the search then read an
        # undefined `scores` list; the scorer is now actually used.
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = StratifiedKFold(y_for_cv, n_folds=2)  # x-validation
    else:
        scoring = 'roc_auc'
        cv = StratifiedKFold(Y, n_folds=2)  # x-validation

    print ('  ...performing x-validation')

    search = GridSearchCV(clf, tuned_parameters, scoring=scoring, cv=cv,
                          verbose=10)
    search.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ",end_svm - start_svm)

    return (search.best_params_, search)

Example #14

0

Show file

File: classifier_kfold.py Project: jubatus/jubakit

# Shuffle the dataset before splitting it into folds.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
# The two branches cover the two StratifiedKFold APIs: the older one takes
# the labels and `n_folds` in the constructor and is itself iterable, the
# newer one takes `n_splits` and yields the splits from `split()`.
# `sklearn_version` is defined outside this excerpt.
if sklearn_version < 18:
    train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
    skf = StratifiedKFold(n_splits=10)
    # the labels are passed as both X and y; only the indices are used below
    train_test_indices = skf.split(labels, labels)

for train_idx, test_idx in train_test_indices:
  # Clear the classifier (call `clear` RPC).
  classifier.clear()

  # Split the dataset to train/test dataset.
  (train_ds, test_ds) = (dataset[train_idx], dataset[test_idx])

  # Train the classifier using train dataset.
  # The loop body is empty; iterating is what consumes the results that
  # `classifier.train` yields.
  for (idx, label) in classifier.train(train_ds):
    # You can peek records being trained.
    #print('train[{0}]: (label: {1}) => {2}'.format(idx, label, train_ds[idx]))
    pass

Example #15

0

Show file

def create_ROC(filename):
    """Plot cross-validated ROC curves for an SVM trained on pickled data.

    Loads ``<filename>.pkl``, which must unpickle to a mapping with the keys
    'labels' and 'data', scales the data, fits an SVC with probability
    estimates, then draws one ROC curve per fold of a 9-fold stratified
    cross validation plus the mean ROC curve, and shows the figure.

    NOTE(review): this is Python 2 code (print statements) written against
    an old scikit-learn API (`preprocessing.Scaler`, `StratifiedKFold(y, k=)`).
    NOTE(review): `pickle.load` executes whatever the file contains; only
    use it on trusted files.
    """
    from scipy import interp
    from sklearn import preprocessing as pps, svm
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import StratifiedKFold, LeaveOneOut

    filepath = filename + '.pkl'
    with open(filepath, 'rb') as f:
        svm_data = pickle.load(f)
    labels = svm_data['labels']
    data = svm_data['data']

    # Fit the scaler on the full dataset and report its statistics.
    scaler = pps.Scaler().fit(data)
    print "Mean: ", scaler.mean_
    print "Std: ", scaler.std_
    data_scaled = scaler.transform(data)

    # First fit on all the data, only to report the support vector counts;
    # the classifier is re-fitted on every fold below.
    classifier = svm.SVC(probability=True)
    classifier.fit(data_scaled, labels)

    #print "Support Vectors: \r\n", classifier.support_vectors_
    print "SV's per class: \r\n", classifier.n_support_

    ###############################################################################
    ## Code below modified from http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py
    X, y = data_scaled, np.array(labels)
    n_samples, n_features = X.shape
    print n_samples, n_features

    ###############################################################################
    # Classification and ROC analysis
    # Run classifier with crossvalidation and plot ROC curves
    cv = StratifiedKFold(y, k=9)

    # mean_tpr starts as a scalar and becomes an array on the first `+=`.
    mean_tpr = 0.0
    # Common FPR grid on which every fold's TPR is interpolated.
    mean_fpr = np.linspace(0, 1, n_samples)
    all_tpr = []
    plt.figure(2)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area the curve
        # (column 1 of the probabilities is used as the score)
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        # pin the accumulated curve to start at 0
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 '--',
                 lw=1,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    # Average the accumulated TPR over the folds and pin the end point to 1.
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr,
             mean_tpr,
             'k-',
             lw=3,
             label='Mean ROC (area = %0.2f)' % mean_auc)

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    print "Finished!"

Example #16

0

Show file

File: plot_gmm_classifier.py Project: njwilson/scikit-learn

        v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2])
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        v *= 9
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)


iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
# next(iter(...)) works on Python 2 and 3; the previous
# skf.__iter__().next() only existed on Python 2 iterators.
train_index, test_index = next(iter(skf))


X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

# Number of distinct classes present in the training labels.
n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances: one model per covariance
# type, each with one component per class.
classifiers = dict((x, GMM(n_components=n_classes, covariance_type=x)) for x in ["spherical", "diag", "tied", "full"])

n_classifiers = len(classifiers)

Example #17

0

Show file

 def reduce_number_instances(self, proportion=0.1):
     """Replace self._data / self._target with a stratified subset.

     Builds 1.0 / proportion stratified folds over the targets and keeps
     the last two values returned by `separate_sets` for fold 0.
     """
     test_folds = StratifiedKFold(self._target,
                                  n_folds=1.0 / proportion).test_folds
     # the first two returned values are discarded
     separated = self.separate_sets(self._data, self._target, 0, test_folds)
     _, _, self._data, self._target = separated