import pandas as pd
from sklearn import neighbors
from sklearn.model_selection import ShuffleSplit

df_equal = pd.concat([df_equal, df_subset], axis=0)
species_key_df = df_all[['Species', 'Species_code']].drop_duplicates()

# create arrays of required data
X_columns = ['leaf length', 'leaf width', 'widest point', 'total veins']
y_columns = ['Species']
X = df_equal[X_columns].values
y = df_equal[y_columns].values


# parameters of the model
n_neighbors = 10
weights = ['uniform', 'distance']
weight = weights[0]

ss = ShuffleSplit(n_splits=10, test_size=0.1)
for train_index, test_index in ss.split(X):
    # generate data from indices
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # fit the training data
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weight)
    clf.fit(X_train, y_train.ravel())

    # predict the test data
    output = clf.predict(X_test)

    # report results
    score = clf.score(X_test, y_test)
    print("Score: {:.2%}".format(score))
def FitModel(cnnc, A, Y, T, FN):
    print('Fitting model...')
    ss = ShuffleSplit(n_splits = 1)
    trn, tst = next(ss.split(A))
    #Fit the network
    cnnc.fit(A[trn], Y[trn])
    #The predictions as sequences of character indices
    YH = []
    for i in np.array_split(np.arange(A.shape[0]), 32): 
        YH.append(cnnc.predict(A[i]))
    YH = np.vstack(YH)
    #Convert from sequence of char indices to strings
    PS = np.array([''.join(YHi) for YHi in YH])
    #Compute the accuracy
    S1 = SAcc(PS[trn], T[trn])
    S2 = SAcc(PS[tst], T[tst])
    print('Train: ' + str(S1))
    print('Test: ' + str(S2))
    for PSi, Ti, FNi in zip(PS, T, FN):
        if np.random.rand() > 0.99: #Randomly select rows to print
            print(FNi + ': ' + Ti + ' -> ' + PSi)
    print('Fitting with CV data...')
    #Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
Example #3
def main():
    from io import open as uopen
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('idx', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()
    fh = uopen(args.fname, encoding='utf-8')
    lines = [x.strip().split(u'\t') for x in fh]
    to_extract = [(x[0], x[args.idx]) for x in lines if x[args.key_idx] == args.key]
    if args.shuffle:
        from random import shuffle
        shuffle(to_extract)
    from distutils.dir_util import mkpath
    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        mkpath('res/ryan_splits/{}-10fold/{}'.format(args.lang, i))
        train_fh, dev_fh, test_fh = (uopen('res/ryan_splits/{}-10fold/{}/train.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/dev.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/test.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     )
        for idx in train_indices:
            train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))

        for j, idx in enumerate(test_indices):
            if j % 2 == 0:
                dev_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
            else:
                test_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Initiate model
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
        cross_validation_iterator.split(X_train), start=1
    ):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        optimal_weights_path = "/tmp/Optimal_Weights_{}.h5".format(cross_validation_index)
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        if not os.path.isfile(optimal_weights_path):
            # Load the vanilla weights
            model.set_weights(vanilla_weights)

            # Perform the training procedure
            earlystopping_callback = EarlyStopping(monitor="val_actual_mae", patience=EARLYSTOPPING_PATIENCE)
            modelcheckpoint_callback = ModelCheckpoint(optimal_weights_path, monitor="val_loss", save_best_only=True)
            model.fit(
                X_train[train_index],
                Y_train[train_index],
                batch_size=TRAIN_BATCH_SIZE,
                nb_epoch=MAXIMUM_EPOCH_NUM,
                validation_data=(X_train[valid_index], Y_train[valid_index]),
                callbacks=[earlystopping_callback, modelcheckpoint_callback],
                verbose=2,
            )

        # Load the optimal weights
        model.load_weights(optimal_weights_path)

        # Perform the testing procedure
        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # old:http://scikit-learn.org/0.15/modules/generated/sklearn
    # .cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        # print('proba:', proba)
        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    # print('np.argsort(scores_to_sort):', np.argsort(scores_to_sort),len(scores_to_sort) / 2)
    median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Example #7
def fit_models(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.  
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs),groups=labels))[i][1]) \
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = {imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    for n_prop,prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),
                                                                groups=labels)):
                X_train,X_test = X[imp][train],X[imp][test]
                Y_train,Y_test = Y[imp][train,j],Y['missing'][test,j]
                clf_args_ = {key:(value if type(value) is not dict \
                             else value[prop])\
                             for key,value in clf_args.items()}
                if clf_args_['max_features'] not in [None, 'auto']:
                   clf_args_['max_features'] = min(X_train.shape[1],
                                                   clf_args_['max_features'])
                rfc = RandomForestClassifier(**clf_args_)
                #if Y_train.shape[1] == 1:
                #    Y_train = Y_train.ravel()
                rfc.fit(X_train,Y_train)
                Y_predict = rfc.predict(X_test)#.reshape(-1,n_props)
                probs = rfc.predict_proba(X_test)
                if probs.shape[1]<2 and probs.mean()==1.0:
                    n_test_samples = len(probs)
                    ps[imp][n_prop,k,:n_test_samples] = 0.0
                else:
                    n_test_samples = len(probs[:,1])
                    ps[imp][n_prop,k,:n_test_samples] = probs[:,1]
                ys[imp][n_prop,k,:n_test_samples] = Y_test
                rs[imp][n_prop,k] = np.ma.corrcoef(Y_predict,Y_test)[0,1]
                feature_importances[imp][n_prop,:,k] = rfc.feature_importances_
    return rs,feature_importances,ys,ps
Example #8
def fit_models_mc(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.  
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs),
                                                          groups=labels))[i][1])
                             for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = None#{imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]
            clf_args_ = {key:(value if type(value) is not dict \
                         else value[prop])\
                         for key,value in clf_args.items()}
            if clf_args_['max_features'] not in [None, 'auto']:
               clf_args_['max_features'] = min(X_train.shape[1],
                                               clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)#.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            if probs.shape[1]<2 and probs.mean()==1.0:
                n_test_samples = len(probs)
                ps[imp][:,k,:n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:,1])
                ps[imp][:,k,:n_test_samples] = probs.T
            ys[imp][:,k,:n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],Y_test[:,i])[0,1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs,feature_importances,ys,ps
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def TestPerformance(self, df=None):
    # If no dataframe is provided, use the currently learned one
    if df is None:
        D = self.D
    else:
        D = self.S.transform(df.copy())
    # Get features from the data frame
    A = self._ExtractFeat(D)
    # Get the target values and their corresponding column names
    y, _ = self._ExtractTarg(D)
    # Begin cross validation
    ss = ShuffleSplit(n_splits=1)
    for trn, tst in ss.split(A):
        s1 = self.R.score(A, y)
        s2 = self.R.score(A[tst], y[tst])
        s3 = self.R.score(A[trn], y[trn])
        print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]

    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))

    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)

        model.fit(X_train[train_index], Y_train[train_index], test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def plot_shuffle_split():
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_folds = 10
    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1

    ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(10))):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor='k', align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state
    plt.legend([boxes[1], boxes[0], boxes[2]], [
               "Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
                             datetime.datetime.now().strftime('%Y-%m-%d_%H'
                                                              '-%M-%S')))
os.makedirs(output_dir)

random_state = check_random_state(0)
mem = Memory(cachedir=expanduser("~/cache"), verbose=10)
X_csr = mem.cache(fetch_ml_10m)(expanduser('~/data/own/ml-10M100K'),
                               remove_empty=True)

permutation = random_state.permutation(X_csr.shape[0])

X_csr = X_csr[permutation]

X, y = array_to_fm_format(X_csr)

uniform_split = ShuffleSplit(n_splits=4,
                             test_size=.25, random_state=random_state)

fm_decoder = FMDecoder(n_samples=X_csr.shape[0], n_features=X_csr.shape[1])

base_estimator = BaseRecommender(fm_decoder)

convex_fm = ConvexFM(fit_linear=True, alpha=0, max_rank=20,
                     beta=1, verbose=100)

soft_imputer = SoftImputer(fm_decoder, alpha=.001, n_components=10,
                           max_iter=100,
                           random_state=None)

dl_rec = DLRecommender(fm_decoder,
                       n_components=50,
                       batch_size=10,
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_slit to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                    safe_indexing(a, test)) for a in arrays))
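
# A minimal usage sketch for the extended train_test_split defined above; the toy
# arrays and group ids below are illustrative assumptions, and only the
# shuffle='group' mode from the docstring is exercised.
import numpy as np

X_demo = np.arange(20).reshape(10, 2)
y_demo = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
groups_demo = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])

# shuffle='group' keeps whole groups together in either the train or the test part.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_demo, y_demo, test_size=0.25, random_state=0,
    shuffle='group', labels=groups_demo)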
from sklearn import metrics
print('Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average=None)))
print('Recall: \t{}'.format(metrics.recall_score(y_test, predicted, average=None)))
print('F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average=None)))

print('Macro Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average='macro')))
print('Macro Recall: \t\t{}'.format(metrics.recall_score(y_test, predicted, average='macro')))
print('Macro F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average='macro')))

# shuffle split cross-validation
print('-------------------------------- Shuffle Split ---------------------------------')
total_score = 0
runs = 0

from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
for train, test in ss.split(tweets, target):
    X_train = np.array(tweets)[train]
    y_train = target[train]

    X_test = np.array(tweets)[test]
    y_test = target[test]

    pipeline = Pipeline([('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer(norm='l1', use_idf=False)),
                         ('clf', ExtraTreesClassifier(random_state=0, n_estimators=10, class_weight='balanced'))])
    pipeline = pipeline.fit(X_train, y_train)

    predicted = pipeline.predict(X_test)
    print('Accuracy: {}'.format(accuracy_score(y_test, predicted)))
    print(metrics.classification_report(y_test, predicted))
# Read epochs (train will be done only between 1 and 2s)
# Testing will be done with a running classifier
epochs = Epochs(raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                baseline=None, preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

###############################################################################
# Classification with linear discriminant analysis

# Define a monte-carlo cross-validation generator (reduce variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Assemble a classifier
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Use scikit-learn Pipeline with cross_val_score function
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores),
                                                          class_balance))
from sklearn.model_selection import ShuffleSplit
import numpy as np


X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# Expected output: ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)

for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)
@pandaize
def cross_val_score_pd(estimator, X, y, **kwargs):
    return model_selection.cross_val_score(estimator, X, y, **kwargs)


## -----------------------------------------------------------------
## load Patel data
## -----------------------------------------------------------------
def readTab(file):
    return pd.read_csv(file, sep="\t", header=0, index_col=0)

x = readTab("rnaseq/GSE57872/GSE57872_DataMatrixMapped.tsv.gz").transpose()
y = x.BRCA1
x0 = x[ x.columns[x.columns != "BRCA1"] ]

cvSched = ShuffleSplit(n_splits=10, test_size=0.1, random_state=123)

corPVals = colcor(x0, y)['p']
corQVals = bhfdr(corPVals)
corQVals.sort_values(inplace=False).head()

plt.close()
ax = plt.subplot(111)
x.plot.scatter(x="CDK1", y="BRCA1", ax=ax)


## -----------------------------------------------------------------
## unregularized linear regression
## -----------------------------------------------------------------
nFeats = [2, 5, 10, 20, 50, 100, 200, 500, 1000]
brca1Modelers = OrderedDict([
Example #21
def train(working, max_samples, duration, rate,
          batch_size, epochs, epoch_size, validation_size,
          early_stopping, reduce_lr, seed):
    '''
    Parameters
    ----------
    working : str
        directory that contains the experiment data (h5)

    max_samples : int
        Maximum number of samples per streamer

    duration : float
        Duration of training patches

    batch_size : int
        Size of batches

    rate : int
        Poisson rate for pescador

    epochs : int
        Maximum number of epochs

    epoch_size : int
        Number of batches per epoch

    validation_size : int
        Number of validation batches

    early_stopping : int
        Number of epochs before early stopping

    reduce_lr : int
        Number of epochs before reducing learning rate

    seed : int
        Random seed
    '''

    # Load the pump
    with open(os.path.join(OUTPUT_PATH, 'pump.pkl'), 'rb') as fd:
        pump = pickle.load(fd)

    # Build the sampler
    sampler = make_sampler(max_samples, duration, pump, seed)

    # Build the model
    model, inputs, outputs = construct_model(pump)

    # Load the training data
    idx_train_ = pd.read_json('index_train.json')

    # Split the training data into train and validation
    splitter_tv = ShuffleSplit(n_splits=1, test_size=0.25,
                               random_state=seed)
    train, val = next(splitter_tv.split(idx_train_))

    idx_train = idx_train_.iloc[train]
    idx_val = idx_train_.iloc[val]

    gen_train = data_generator(working,
                               idx_train['id'].values, sampler, epoch_size,
                               augment=True,
                               lam=rate,
                               batch_size=batch_size,
                               revive=True,
                               random_state=seed)

    gen_train = keras_tuples(gen_train(), inputs=inputs, outputs=outputs)

    gen_val = data_generator(working,
                             idx_val['id'].values, sampler, len(idx_val),
                             augment=False,
                             batch_size=batch_size,
                             revive=True,
                             random_state=seed)

    gen_val = keras_tuples(gen_val(), inputs=inputs, outputs=outputs)

    loss = {'beat': 'binary_crossentropy',
            'downbeat': 'binary_crossentropy'}

    metrics = {'beat': 'accuracy', 'downbeat': 'accuracy'}

    monitor = 'val_loss'

    model.compile(K.optimizers.Adam(), loss=loss, metrics=metrics)

    # Store the model
    model_spec = K.utils.serialize_keras_object(model)
    with open(os.path.join(OUTPUT_PATH, 'model_spec.pkl'), 'wb') as fd:
        pickle.dump(model_spec, fd)

    # Construct the weight path
    weight_path = os.path.join(OUTPUT_PATH, 'model.h5')

    # Build the callbacks
    cb = []
    cb.append(K.callbacks.ModelCheckpoint(weight_path,
                                          save_best_only=True,
                                          verbose=1,
                                          monitor=monitor))

    cb.append(K.callbacks.ReduceLROnPlateau(patience=reduce_lr,
                                            verbose=1,
                                            monitor=monitor))

    cb.append(K.callbacks.EarlyStopping(patience=early_stopping,
                                        verbose=1,
                                        monitor=monitor))

    # Fit the model
    model.fit_generator(gen_train, epoch_size, epochs,
                        validation_data=gen_val,
                        validation_steps=validation_size,
                        callbacks=cb)
Example #22
optimal.predict(data)
    
#k-fold validation
# k-fold is a type of cross-validation where the data are divided into k bins. For each experiment, pick one of the k bins
# as the test set and use the remaining k-1 bins for training. Run k separate experiments and average the k test results.
# This technique tests on different parts of the data to limit overfitting,
# i.e. it prevents grid search from returning a parameter set that is optimized for one specific training set but not for the data overall.
from sklearn.model_selection import KFold
cv_sets = KFold(n_splits=10)
for train_index, test_index in cv_sets.split(X):
     print("%s %s" % (train_index, test_index))
# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the 'cv_sets' variable below).
# ShuffleSplit() will create 10 ('n_splits') shuffled sets, and for each shuffle, 20% ('test_size') of the data will be used as the validation set.
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 0)
for train_index, test_index in cv_sets.split(X):
     print("%s %s" % (train_index, test_index))
     
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# pipelining
#Sequentially apply a list of transforms and a final estimator. Intermediate steps 
#of the pipeline must be ‘transforms’, that is, they must implement fit and 
#transform methods. The final estimator only needs to implement fit.
#The purpose of the pipeline is to assemble several steps that can be 
#cross-validated together while setting different parameters.
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
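
# A minimal sketch of the pipelining idea described above: the feature selection step
# and the final estimator are cross-validated together. The make_classification data
# is a stand-in (an assumption), since the original snippet stops at the imports.
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=5, random_state=0)
pipe = Pipeline([('select', SelectKBest(f_classif, k=5)),
                 ('svc', svm.SVC(kernel='linear'))])
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
print(cross_val_score(pipe, X_demo, y_demo, cv=cv).mean())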
Example #23
print('Misclassified test samples: %d' % (y_test != y_pred_test).sum())
print('Training Accuracy: %.2f' % svm.score(X_train_centered, y_train))
print('Validation Accuracy: %.2f' % svm.score(X_val_centered, y_val))
print('Test Accuracy: %.2f' % svm.score(X_test_centered, y_test))

# Print out more performance metrics (Precision and Recall)
more_scores = precision_recall_fscore_support(y_test,
                                              y_pred_test,
                                              average='weighted')

print('Precision: ', more_scores[0])
print('Recall: ', more_scores[1])

# Define a 10-fold CV that holds out 11% of the training set (train_temp) for validation
# (11%, not 10%, because the validation split is taken from the training data rather than from the test split)
cv = ShuffleSplit(n_splits=10, test_size=0.11, random_state=0)

# Plot learning curves with 10-fold CV
train_sizes, train_scores, test_scores = learning_curve(
    estimator=svm,
    X=X_train_temp_centered,
    y=y_train_temp,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv,
    n_jobs=1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig = plt.figure()
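
# The snippet stops after creating the figure; below is a minimal sketch of how the
# curves computed above could be drawn (variable names come from the code above,
# the plotting choices themselves are assumptions).
plt.plot(train_sizes, train_mean, marker='o', label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15)
plt.plot(train_sizes, test_mean, marker='s', linestyle='--', label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15)
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()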
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = ShuffleSplit(random_state=21)
    assert_array_equal(list(a for a, b in ss.split(X)),
                       list(a for a, b in ss.split(X)))
labels = epochs.events[:, -1]
evoked = epochs.average()

###############################################################################
# Decoding in sensor space using a linear SVM

from sklearn.svm import SVC  # noqa
from sklearn.model_selection import ShuffleSplit  # noqa
from mne.decoding import CSP  # noqa

n_components = 3  # pick some components
svc = SVC(C=1, kernel='linear')
csp = CSP(n_components=n_components, norm_trace=False)

# Define a monte-carlo cross-validation generator (reduce variance):
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = []
epochs_data = epochs.get_data()

for train_idx, test_idx in cv.split(labels):
    y_train, y_test = labels[train_idx], labels[test_idx]

    X_train = csp.fit_transform(epochs_data[train_idx], y_train)
    X_test = csp.transform(epochs_data[test_idx])

    # fit classifier
    svc.fit(X_train, y_train)

    scores.append(svc.score(X_test, y_test))

# Printing the results
Example #26
def check_fit_idempotent(name, estimator_orig):
    # Check that est.fit(X) is the same as est.fit(X).fit(X). Ideally we would
    # check that the estimated parameters during training (e.g. coefs_) are
    # the same, but having a universal comparison function for those
    # attributes is difficult and full of edge cases. So instead we check that
    # predict(), predict_proba(), decision_function() and transform() return
    # the same results.

    check_methods = [
        "predict", "transform", "decision_function", "predict_proba"
    ]
    rng = np.random.RandomState(0)

    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    if 'warm_start' in estimator.get_params().keys():
        estimator.set_params(warm_start=False)

    n_samples = 100
    X, _ = _create_small_ts_dataset()
    X = X.reshape((X.shape[0], X.shape[1]))
    X = pairwise_estimator_convert_X(X, estimator)
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(low=0, high=2, size=n_samples)

    train, test = next(ShuffleSplit(test_size=.2, random_state=rng).split(X))
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Fit for the first time
    estimator.fit(X_train, y_train)

    result = {
        method: getattr(estimator, method)(X_test)
        for method in check_methods if hasattr(estimator, method)
    }

    # Fit again
    set_random_state(estimator)
    estimator.fit(X_train, y_train)

    for method in check_methods:
        if hasattr(estimator, method):
            new_result = getattr(estimator, method)(X_test)
            if np.issubdtype(new_result.dtype, np.floating):
                tol = 2 * np.finfo(new_result.dtype).eps
            else:
                tol = 2 * np.finfo(np.float64).eps
            assert_allclose_dense_sparse(
                result[method],
                new_result,
                atol=max(tol, 1e-9),
                rtol=max(tol, 1e-7),
                err_msg="Idempotency check failed for method {}".format(
                    method))