Example No. 1
def fit_models(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs),groups=labels))[i][1]) \
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = {imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    for n_prop,prop in enumerate(props):
        j = all_props.index(prop)
        print("Fitting model for %s..." % prop)
        for imp in imps:
            for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),
                                                                groups=labels)):
                X_train,X_test = X[imp][train],X[imp][test]
                Y_train,Y_test = Y[imp][train,j],Y['missing'][test,j]
                clf_args_ = {key:(value if type(value) is not dict \
                             else value[prop])\
                             for key,value in clf_args.items()}
                if clf_args_['max_features'] not in [None, 'auto']:
                   clf_args_['max_features'] = min(X_train.shape[1],
                                                   clf_args_['max_features'])
                rfc = RandomForestClassifier(**clf_args_)
                #if Y_train.shape[1] == 1:
                #    Y_train = Y_train.ravel()
                rfc.fit(X_train,Y_train)
                Y_predict = rfc.predict(X_test)#.reshape(-1,n_props)
                probs = rfc.predict_proba(X_test)
                if probs.shape[1]<2 and probs.mean()==1.0:
                    n_test_samples = len(probs)
                    ps[imp][n_prop,k,:n_test_samples] = 0.0
                else:
                    n_test_samples = len(probs[:,1])
                    ps[imp][n_prop,k,:n_test_samples] = probs[:,1]
                ys[imp][n_prop,k,:n_test_samples] = Y_test
                rs[imp][n_prop,k] = np.ma.corrcoef(Y_predict,Y_test)[0,1]
                feature_importances[imp][n_prop,:,k] = rfc.feature_importances_
    return rs,feature_importances,ys,ps
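A minimal, hypothetical usage sketch for fit_models above: X and Y are dicts keyed by imputation name, and a 'missing' entry is required since the function sizes its arrays from it and evaluates against Y['missing']. It assumes the module-level imports (numpy, ShuffleSplit, RandomForestClassifier) are in place; the shapes, key names, and clf_args values here are illustrative, not from the original source.
import numpy as np

rng = np.random.RandomState(0)
n_obs, n_features = 100, 8
all_props = ['prop_a', 'prop_b']
# One feature/target matrix per imputation strategy, plus the required 'missing' entry.
X = {'missing': rng.rand(n_obs, n_features),
     'mean': rng.rand(n_obs, n_features)}
Y = {'missing': rng.randint(0, 2, (n_obs, len(all_props))),
     'mean': rng.randint(0, 2, (n_obs, len(all_props)))}
# An integer max_features sidesteps the deprecated 'auto' default.
rs, importances, ys, ps = fit_models(['mean'], X, Y, all_props,
                                     clf_args={'n_estimators': 25,
                                               'max_features': 4,
                                               'random_state': 0})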
Example No. 2
def FitModel(cnnc, A, Y, T, FN):
    #A: input samples; Y: targets as sequences of character indices;
    #T: target strings; FN: file/sample names used when printing examples
    print('Fitting model...')
    ss = ShuffleSplit(n_splits = 1)
    trn, tst = next(ss.split(A))
    #Fit the network
    cnnc.fit(A[trn], Y[trn])
    #The predictions as sequences of character indices
    YH = []
    #Predict in 32 chunks to limit memory usage
    for i in np.array_split(np.arange(A.shape[0]), 32):
        YH.append(cnnc.predict(A[i]))
    YH = np.vstack(YH)
    #Convert from sequence of char indices to strings
    PS = np.array([''.join(YHi) for YHi in YH])
    #Compute the accuracy
    S1 = SAcc(PS[trn], T[trn])
    S2 = SAcc(PS[tst], T[tst])
    print('Train: ' + str(S1))
    print('Test: ' + str(S2))
    for PSi, Ti, FNi in zip(PS, T, FN):
        if np.random.rand() > 0.99: #Randomly select rows to print
            print(FNi + ': ' + Ti + ' -> ' + PSi)
    print('Fitting with CV data...')
    #Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
Example No. 3
def main():
    from io import open as uopen
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fname')
    parser.add_argument('idx', nargs='?', default=2, type=int)
    parser.add_argument('--key', default=u'V;1;SG;IND;PST;PFV')
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--folds', default=10, type=int)
    parser.add_argument('--lang', default='sp')
    parser.add_argument('--key-idx', default=3, type=int)
    args = parser.parse_args()
    fh = uopen(args.fname, encoding='utf-8')
    lines = [x.strip().split(u'\t') for x in fh]
    to_extract = [(x[0], x[args.idx]) for x in lines if x[args.key_idx] == args.key]
    if args.shuffle:
        from random import shuffle
        shuffle(to_extract)
    import os
    from sklearn.model_selection import ShuffleSplit
    rs = ShuffleSplit(n_splits=args.folds, test_size=0.2, random_state=42)
    for i, (train_indices, test_indices) in enumerate(rs.split(to_extract)):
        os.makedirs('res/ryan_splits/{}-10fold/{}'.format(args.lang, i), exist_ok=True)
        train_fh, dev_fh, test_fh = (uopen('res/ryan_splits/{}-10fold/{}/train.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/dev.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     uopen('res/ryan_splits/{}-10fold/{}/test.uniq'.format(args.lang, i), mode='w', encoding='utf-8'),
                                     )
        for idx in train_indices:
            train_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))

        for j, idx in enumerate(test_indices):
            if j % 2 == 0:
                dev_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
            else:
                test_fh.write(u'{}\t{}\n'.format(to_extract[idx][0], to_extract[idx][1]))
        for fh_out in (train_fh, dev_fh, test_fh):
            fh_out.close()
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    # Shifted-log transform of the target; inverted later via np.exp(...) - 200.
    Y_train = np.log(Y_train + 200)

    # Initiate model
    model = init_model(X_train.shape[1])
    vanilla_weights = model.get_weights()

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
        cross_validation_iterator.split(X_train), start=1
    ):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        optimal_weights_path = "/tmp/Optimal_Weights_{}.h5".format(cross_validation_index)
        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        if not os.path.isfile(optimal_weights_path):
            # Load the vanilla weights
            model.set_weights(vanilla_weights)

            # Perform the training procedure
            earlystopping_callback = EarlyStopping(monitor="val_actual_mae", patience=EARLYSTOPPING_PATIENCE)
            modelcheckpoint_callback = ModelCheckpoint(optimal_weights_path, monitor="val_loss", save_best_only=True)
            model.fit(
                X_train[train_index],
                Y_train[train_index],
                batch_size=TRAIN_BATCH_SIZE,
                nb_epoch=MAXIMUM_EPOCH_NUM,
                validation_data=(X_train[valid_index], Y_train[valid_index]),
                callbacks=[earlystopping_callback, modelcheckpoint_callback],
                verbose=2,
            )

        # Load the optimal weights
        model.load_weights(optimal_weights_path)

        # Perform the testing procedure
        Y_test = model.predict(X_test, batch_size=TEST_BATCH_SIZE, verbose=2)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    from copy import deepcopy
    clfs = []  # snapshots of each fitted fold (to later get the median)

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(deepcopy(clf))  # snapshot; appending clf directly would alias one refit object

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cv = ShuffleSplit(
    #     n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
    # old:http://scikit-learn.org/0.15/modules/generated/sklearn
    # .cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit
    cv = ShuffleSplit(
        n_splits=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)
        # print('proba:', proba)
        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    # print('np.argsort(scores_to_sort):', np.argsort(scores_to_sort),len(scores_to_sort) / 2)
    median = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Example No. 7
def fit_models_mc(imps, X, Y, all_props, props=None,
               labels=None, n_splits=5, 
               clf_args={'n_estimators':25, 
                         'max_features':'auto', 
                         'random_state':0}):
    if props is None:
        props = all_props
    n_obs = X['missing'].shape[0] # Number of observations.
    n_features = X['missing'].shape[1] # Number of features.
    n_props = len(props) # Number of properties to predict.  
    test_size = 0.2
    if labels is None:
        shuffle_split = ShuffleSplit(n_splits=n_splits,
                                     test_size=test_size, random_state=0)
    else:
        shuffle_split = GroupShuffleSplit(n_splits=n_splits,
                                          test_size=test_size, random_state=0)
    n_test_samples = np.max([len(list(shuffle_split.split(range(n_obs), groups=labels))[i][1])
                            for i in range(n_splits)])
    rs = {imp:np.ma.zeros((n_props,n_splits)) for imp in imps}
    ps = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    ys = {imp:np.ma.masked_all((n_props,n_splits,n_test_samples)) for imp in imps}
    feature_importances = None#{imp:np.ma.zeros((n_props,n_features,n_splits)) for imp in imps}
    cols = np.array([i for i in range(len(all_props)) if all_props[i] in props])
    for imp in imps:
        for k,(train,test) in enumerate(shuffle_split.split(range(n_obs),groups=labels)):
            #X_train,X_test = X[imp][train][:,cols],X[imp][test][:,cols]
            #Y_train,Y_test = Y[imp][train][:,cols],Y['missing'][test][:,cols]
            X_train,X_test = X[imp][train,:],X[imp][test,:]
            Y_train,Y_test = Y[imp][train,:],Y['missing'][test,:]
            # All properties are fit jointly here, so per-property dict values
            # (as in fit_models above) cannot be resolved; pass clf_args through.
            clf_args_ = dict(clf_args)
            if clf_args_['max_features'] not in [None, 'auto']:
               clf_args_['max_features'] = min(X_train.shape[1],
                                               clf_args_['max_features'])
            rfc = RandomForestClassifier(**clf_args_)
            onevsrest = OneVsRestClassifier(rfc)
            onevsrest.fit(X_train,Y_train)
            Y_predict = onevsrest.predict(X_test)#.reshape(-1,n_props)
            probs = onevsrest.predict_proba(X_test)
            if probs.shape[1]<2 and probs.mean()==1.0:
                n_test_samples = len(probs)
                ps[imp][:,k,:n_test_samples] = 0.0
            else:
                n_test_samples = len(probs[:,1])
                ps[imp][:,k,:n_test_samples] = probs.T
            ys[imp][:,k,:n_test_samples] = Y_test.T
            for i in range(n_props):
                rs[imp][i,k] = np.ma.corrcoef(Y_predict[:,i],Y_test[:,i])[0,1]
            #feature_importances[imp][n_prop,:,k] = onevsrest.feature_importances_
    return rs,feature_importances,ys,ps
    def TestPerformance(self, df=None):
        # If no dataframe is provided, use the currently learned one
        if df is None:
            D = self.D
        else:
            D = self.S.transform(df.copy())
        # Get features from the data frame
        A = self._ExtractFeat(D)
        # Get the target values and their corresponding column names
        y, _ = self._ExtractTarg(D)
        # Begin cross-validation
        ss = ShuffleSplit(n_splits=1)
        for trn, tst in ss.split(A):
            s1 = self.R.score(A, y)
            s2 = self.R.score(A[tst], y[tst])
            s3 = self.R.score(A[trn], y[trn])
            print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    tr, te = list(cv.split(X))[0]

    X_tr, y_tr = _safe_split(clf, X, y, tr)
    K_tr, y_tr2 = _safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))

    X_te, y_te = _safe_split(clf, X, y, te, tr)
    K_te, y_te2 = _safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)

        model.fit(X_train[train_index], Y_train[train_index], test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
def plot_shuffle_split():
    from sklearn.model_selection import ShuffleSplit
    plt.figure(figsize=(10, 2))
    plt.title("ShuffleSplit with 10 points"
              ", train_size=5, test_size=2, n_splits=4")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_folds = 10
    n_samples = 10
    n_iter = 4
    n_samples_per_fold = 1

    ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(ss.split(range(10))):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor='k', align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter) + .3)
    axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
    # legend hacked for this random state
    plt.legend([boxes[1], boxes[0], boxes[2]], [
               "Training set", "Test set", "Not selected"], loc=(1, .3))
    plt.tight_layout()
Example No. 13
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
Example No. 14
# k-fold cross-validation
# k-fold is a type of cross-validation where the data are divided into k bins. For each
# experiment, pick one of the k bins as the test set and use the remaining k-1 bins for
# training. Run k separate experiments and average all k test results. Testing on
# different parts of the data helps prevent overfitting, i.e. it keeps grid search from
# returning a parameter set tuned to one specific training set rather than to the data
# overall.
from sklearn.model_selection import KFold
cv_sets = KFold(n_splits=10)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))
# ShuffleSplit
# ShuffleSplit() is an alternative form of cross-validation (see the 'cv_sets' variable).
# It creates 10 ('n_splits') shuffled sets, and for each shuffle, 20% ('test_size') of
# the data is used as the validation set.
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
for train_index, test_index in cv_sets.split(X):
    print("%s %s" % (train_index, test_index))
     
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

# pipelining
# Sequentially apply a list of transforms and a final estimator. Intermediate steps
# of the pipeline must be 'transforms', that is, they must implement fit and
# transform methods. The final estimator only needs to implement fit.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters.
# (A minimal sketch follows the imports below.)
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
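A minimal sketch of the pipeline idea described above, wired from the imports just listed; the synthetic data, step names, and k=3 are illustrative.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Univariate feature selection followed by an SVM, cross-validated as one estimator.
X_demo, y_demo = samples_generator.make_classification(
    n_samples=100, n_features=20, n_informative=3, random_state=0)
anova_svm = Pipeline([('anova', SelectKBest(f_regression, k=3)),
                      ('svc', svm.SVC())])
print(cross_val_score(anova_svm, X_demo, y_demo, cv=5))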
Example No. 15
def train(working, max_samples, duration, rate,
          batch_size, epochs, epoch_size, validation_size,
          early_stopping, reduce_lr, seed):
    '''
    Parameters
    ----------
    working : str
        directory that contains the experiment data (h5)

    max_samples : int
        Maximum number of samples per streamer

    duration : float
        Duration of training patches

    rate : int
        Poisson rate for pescador

    batch_size : int
        Size of batches

    epochs : int
        Maximum number of epochs

    epoch_size : int
        Number of batches per epoch

    validation_size : int
        Number of validation batches

    early_stopping : int
        Number of epochs before early stopping

    reduce_lr : int
        Number of epochs before reducing learning rate

    seed : int
        Random seed
    '''

    # Load the pump
    with open(os.path.join(OUTPUT_PATH, 'pump.pkl'), 'rb') as fd:
        pump = pickle.load(fd)

    # Build the sampler
    sampler = make_sampler(max_samples, duration, pump, seed)

    # Build the model
    model, inputs, outputs = construct_model(pump)

    # Load the training data
    idx_train_ = pd.read_json('index_train.json')

    # Split the training data into train and validation
    splitter_tv = ShuffleSplit(n_splits=1, test_size=0.25,
                               random_state=seed)
    train, val = next(splitter_tv.split(idx_train_))

    idx_train = idx_train_.iloc[train]
    idx_val = idx_train_.iloc[val]

    gen_train = data_generator(working,
                               idx_train['id'].values, sampler, epoch_size,
                               augment=True,
                               lam=rate,
                               batch_size=batch_size,
                               revive=True,
                               random_state=seed)

    gen_train = keras_tuples(gen_train(), inputs=inputs, outputs=outputs)

    gen_val = data_generator(working,
                             idx_val['id'].values, sampler, len(idx_val),
                             augment=False,
                             batch_size=batch_size,
                             revive=True,
                             random_state=seed)

    gen_val = keras_tuples(gen_val(), inputs=inputs, outputs=outputs)

    loss = {'beat': 'binary_crossentropy',
            'downbeat': 'binary_crossentropy'}

    metrics = {'beat': 'accuracy', 'downbeat': 'accuracy'}

    monitor = 'val_loss'

    model.compile(K.optimizers.Adam(), loss=loss, metrics=metrics)

    # Store the model
    model_spec = K.utils.serialize_keras_object(model)
    with open(os.path.join(OUTPUT_PATH, 'model_spec.pkl'), 'wb') as fd:
        pickle.dump(model_spec, fd)

    # Construct the weight path
    weight_path = os.path.join(OUTPUT_PATH, 'model.h5')

    # Build the callbacks
    cb = []
    cb.append(K.callbacks.ModelCheckpoint(weight_path,
                                          save_best_only=True,
                                          verbose=1,
                                          monitor=monitor))

    cb.append(K.callbacks.ReduceLROnPlateau(patience=reduce_lr,
                                            verbose=1,
                                            monitor=monitor))

    cb.append(K.callbacks.EarlyStopping(patience=early_stopping,
                                        verbose=1,
                                        monitor=monitor))

    # Fit the model
    model.fit_generator(gen_train, epoch_size, epochs,
                        validation_data=gen_val,
                        validation_steps=validation_size,
                        callbacks=cb)
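A hypothetical invocation of train above; every value is illustrative, chosen only to match the types documented in the docstring, and the pickle/json artifacts under OUTPUT_PATH and the working directory are assumed to exist.
# Illustrative call only: assumes 'pump.pkl' and 'index_train.json' exist and
# that make_sampler, construct_model, data_generator, etc. are importable.
train(working='./working', max_samples=128, duration=8.0, rate=8,
      batch_size=32, epochs=100, epoch_size=512, validation_size=1024,
      early_stopping=20, reduce_lr=10, seed=0)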
Example No. 16
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import ShuffleSplit

iris = datasets.load_iris()
X = iris.data
y = iris.target

iris_ss = ShuffleSplit(train_size=0.6, test_size=0.4, random_state=0)
train_index, test_index = next(iris_ss.split(X))

X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]
clf = svm.SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
Example No. 17
# Decoding in sensor space using a linear SVM

from sklearn.svm import SVC  # noqa
from sklearn.model_selection import ShuffleSplit  # noqa
from mne.decoding import CSP  # noqa

n_components = 3  # pick some components
svc = SVC(C=1, kernel='linear')
csp = CSP(n_components=n_components, norm_trace=False)

# Define a monte-carlo cross-validation generator (reduce variance):
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores = []
epochs_data = epochs.get_data()

for train_idx, test_idx in cv.split(labels):
    y_train, y_test = labels[train_idx], labels[test_idx]

    X_train = csp.fit_transform(epochs_data[train_idx], y_train)
    X_test = csp.transform(epochs_data[test_idx])

    # fit classifier
    svc.fit(X_train, y_train)

    scores.append(svc.score(X_test, y_test))

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores),
                                                          class_balance))
Example No. 18
        ]
    else:
        num_stds_config = [num_stds]

    # only need to generate models once for all CVs
    all_models = model_all_training_graphs(train_sketches, train_targets,
                                           args['size'])

    num_cross_validation = 5
    # kf = KFold(n_splits=num_cross_validation)
    kf = ShuffleSplit(n_splits=num_cross_validation,
                      test_size=0.2,
                      random_state=0)
    print "We will perform " + str(
        num_cross_validation) + "-fold cross validation..."
    for benign_train, benign_validate in kf.split(train_targets):
        benign_validate_sketches, benign_validate_names = train_sketches[
            benign_validate], train_targets[benign_validate]
        kf_test_sketches = np.concatenate(
            (test_sketches, benign_validate_sketches), axis=0)
        kf_test_targets = np.concatenate((test_targets, benign_validate_names),
                                         axis=0)

        # Modeling (training)
        models = []
        for index in benign_train:
            models.append(all_models[index])

        print "We will attempt multiple cluster threshold configurations for the best results."
        print "Trying: mean/max distances with 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0 standard deviation(s)..."
        print "Best Configuration: "
Example No. 19
                proj=True,
                picks=picks,
                baseline=None,
                preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

###############################################################################
# Classification with linear discriminant analysis

# Define a monte-carlo cross-validation generator (reduce variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Assemble a classifier
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Use scikit-learn Pipeline with cross_val_score function
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" %
      (np.mean(scores), class_balance))
Example No. 20
        v1 = time.perf_counter()
        vecTime = v1 - v0

        # prepare output folder
        print()
        print(f"Threshold: {threshold}; Eps: {eps}")
        outputFile = f"{outDir}/FLAST___t{threshold}__eps{eps}.csv"
        with open(outputFile, "w") as fileOut:
            fileOut.write(
                "fold,numFlakyTrainSet,numNonFlakyTrainSet,numFlakyTestSet,numNonFlakyTestSet,vecTime,trainTime,testTime,avgPredTime,f-measure,precision,recall,accuracy,tp,fp,fn,tn\n"
            )

        kf = ShuffleSplit(n_splits=numKFold, test_size=testSize)
        successFold = 0
        for kFold, (trnIdx, tstIdx) in enumerate(
                kf.split(dataPointsList, dataLabelsList)):
            # data points vectorization
            v0 = time.perf_counter()
            dataPointsFlaky, dataPointsNonFlaky = flast.getDataPointsInfo(
                projectBasePath, projectName)
            dataPoints = dataPointsFlaky + dataPointsNonFlaky
            Z = flast.flastVectorization(dataPoints,
                                         reduceDim=reduceDim,
                                         dim=dim,
                                         eps=eps)
            dataPointsList = np.array(
                [Z[i].toarray() for i in range(Z.shape[0])])
            dataLabelsList = np.array([1] * len(dataPointsFlaky) +
                                      [0] * len(dataPointsNonFlaky))
            v1 = time.perf_counter()
            vecTime = v1 - v0
Example No. 21
# Read epochs (train will be done only between 1 and 2s)
# Testing will be done with a running classifier
epochs = Epochs(raw, events, event_id, tmin, tmax, proj=True, picks=picks,
                baseline=None, preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

###############################################################################
# Classification with linear discriminant analysis

# Define a monte-carlo cross-validation generator (reduce variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Assemble a classifier
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Use scikit-learn Pipeline with cross_val_score function
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores),
                                                          class_balance))
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = ShuffleSplit(random_state=21)
    assert_array_equal(list(a for a, b in ss.split(X)),
                       list(a for a, b in ss.split(X)))
Example No. 23
def run_model(X, y, outdir, **params):

    outfile = outdir + 'output_{}.csv'.format(params['model'])
    if not os.path.isfile(outfile):
        with open(outfile, 'w') as f:
            for param in params:
                f.write('{},'.format(param))
            f.write('Accuracy,AUPRC,AUROC,Fold\n')

    accuracies = []
    auprcs = []
    aurocs = []
    prc_fig = plt.figure()
    prc_ax = prc_fig.add_subplot(1, 1, 1)
    roc_fig = plt.figure()
    roc_ax = roc_fig.add_subplot(1, 1, 1)

    # prepare validation splits
    n_splits = 5
    test_size = 0.2
    splitter = ShuffleSplit(n_splits=n_splits,
                            test_size=test_size,
                            random_state=seed)

    fold = 1

    import time
    start = time.time()

    for train, test in splitter.split(X):
        print(time.time() - start)
        X_trn = X[train]
        y_trn = y[train]
        X_val = X[test]
        y_val = y[test]

        classifier = build_classifier(**params)
        classifier.fit(X_trn, y_trn)

        accuracy, auprc, auroc = analyze(classifier, X_val, y_val, prc_ax,
                                         roc_ax, **params)
        accuracies.append(accuracy)
        auprcs.append(auprc)
        aurocs.append(auroc)

        with open(outfile, 'a') as f:
            for param in params:
                f.write('{},'.format(params[param]))
            f.write('{},{},{},{}\n'.format(accuracy, auprc, auroc, fold))

        fold += 1

    n_folds = len(accuracies)
    avg_accuracy = sum(accuracies) / n_folds
    avg_auprc = sum(auprcs) / n_folds
    avg_auroc = sum(aurocs) / n_folds

    # Write average values to output file
    with open(outfile, 'a') as f:
        for param in params:
            f.write('{},'.format(params[param]))
        f.write('{},{},{},AVG\n'.format(
            avg_accuracy,
            avg_auprc,
            avg_auroc,
        ))

    name = ''
    for param in params:
        if isinstance(params[param], float):
            name = name + '_' + '{:.0g}'.format(params[param])
        else:
            name = name + '_' + str(params[param])

    # PRC figure
    prc_ax.set_xlabel('Recall')
    prc_ax.set_ylabel('Precision')
    prc_ax.set_xlim([0.0, 1.0])
    prc_ax.set_ylim([0.0, 1.05])
    prc_ax.set_title('PRC{}'.format(name))
    prc_fig.savefig(outdir + 'PRC{}'.format(name) + ".png")
    plt.close(prc_fig)

    # ROC figure
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_title('ROC{}'.format(name))
    roc_fig.savefig(outdir + 'ROC{}'.format(name) + ".png")
    plt.close(roc_fig)

    return classifier
Example No. 24
def train_model(model, num_split, seed, X, Y, neural_network=0):
    """
    Train input model and obtain averaged results.
    :param model: Input model for training.
    :param num_split: (int) Number of splits for averaging of performance metrics.
    :param seed: (int) Seed for random state.
    :param X: (numpy array) Training input.
    :param Y: (numpy array) Class label.
    :param neural_network: (bool) whether the model is a neural network model (keras) or conventional machine learning model (scikit-learn).
    :return: Trained model
    """
    from sklearn.model_selection import ShuffleSplit
    from keras.utils import to_categorical
    import numpy as np
    import time

    shuffle = ShuffleSplit(n_splits=num_split,
                           random_state=seed,
                           test_size=0.2)
    accuracy, precision, recall, f1 = 0.0, 0.0, 0.0, 0.0
    fitting_time = 0.0
    i = 0

    for train_idx, test_idx in shuffle.split(X):
        i += 1
        print("== Split %s ==" % i)
        start = time.perf_counter()

        x_train, x_test, y_train, y_test = X[train_idx], X[test_idx], Y[
            train_idx], Y[test_idx]

        ml = model

        if neural_network:
            y_train_categorical = to_categorical(y_train)
            ml.fit(x_train,
                   y_train_categorical,
                   epochs=1,
                   batch_size=32,
                   verbose=0)
        else:
            ml.fit(x_train, y_train)

        pred_train = ml.predict(x_train)
        pred_test = ml.predict(x_test)

        if neural_network:
            pred_train = pred_train.argmax(axis=1)
            pred_test = pred_test.argmax(axis=1)

        pred_train = np.around(pred_train.flatten())
        pred_test = np.around(pred_test.flatten())

        end = time.perf_counter()

        acc_, prec_, rec_, f1_ = performance_metrics(y_train, y_test,
                                                     pred_train, pred_test)

        accuracy += float(acc_) / num_split
        precision += float(prec_) / num_split
        recall += float(rec_) / num_split
        f1 += float(f1_) / num_split

        fitting_time += (end - start) / num_split
        print("Fitting time", (end - start), "\n")

    print("===== Average results over %s splits =====" % num_split)
    print("Accuracy : %f" % accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1)
    print("Average time taken: %f" % fitting_time)
    print("==========================================")

    return ml
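A minimal, hypothetical usage sketch for train_model above with a scikit-learn estimator (neural_network=0). It assumes keras is installed (the function imports it unconditionally) and that performance_metrics is defined in the surrounding module; the data is synthetic and illustrative.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 10)           # 200 samples, 10 features
Y_demo = rng.randint(0, 2, 200)      # binary class labels
trained = train_model(RandomForestClassifier(n_estimators=50, random_state=0),
                      num_split=5, seed=0, X=X_demo, Y=Y_demo)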
Example No. 25
def get_cv(X, y):
    cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=57)
    return cv.split(X)
Example No. 26
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_slit to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                    safe_indexing(a, test)) for a in arrays))
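A minimal usage sketch of the extended train_test_split above, showing the group-aware mode; it assumes the module-level imports the function relies on (indexable, check_array, the ShuffleSplit variants, safe_indexing, and so on) are in place, and the toy data is illustrative.
import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])
# With shuffle='group', whole groups (here 2 of the 5) go into the test split.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, shuffle='group',
                                          labels=groups, test_size=0.25,
                                          random_state=0)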
Example No. 27
column_q = ['ct_dst_sport_ltm','tcprtt','dwin','ct_src_dport_ltm',
'ct_dst_src_ltm','ct_dst_ltm','smean','dmean','dtcpb',]
# data_x_xgboost = pd.DataFrame(data2,columns=column_q)
# data_x = pd.get_dummies(data_x_xgboost)
data_x = pd.get_dummies(data2)
data_two = pd.concat([data_x, data_ytwo], axis=1)
data_five = pd.concat([data_x, data_yfive], axis=1)
# scaler_2 = MinMaxScaler(feature_range=(0, 1))  # automatically converts dtype to float64
# data_two = scaler_2.fit_transform(data_two)
# index_train = np.arange(175341)
# np.random.shuffle(index_train)
data_train = np.array(data_five.iloc[:175341, :])
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=1, test_size=0.3, random_state=1)

for train_1,train_2 in rs.split(data_train):
    train_70 = data_train[train_1,:]
    train_30 = data_train[train_2,:]
train_70_x = train_70[:,:-1]
train_70_y = train_70[:,-1]
train_30_x = train_30[:,:-1]
train_30_y = train_30[:,-1]
x_test = np.array(data_five.iloc[175341:, :-1])
y_test = np.array(data_five.iloc[175341:, -1])
# x_train = x_train[index_train]
# y_train = y_train[index_train]

scaler_2 = MinMaxScaler(feature_range=(0, 1))  # automatically converts dtype to float64
train_70_x = scaler_2.fit_transform(train_70_x)
train_30_x = scaler_2.transform(train_30_x)
x_test = scaler_2.transform(x_test)
dl_rec = DLRecommender(fm_decoder,
                       n_components=50,
                       batch_size=10,
                       n_epochs=1,
                       alpha=10e-8,
                       learning_rate=.75,
                       memory=mem,
                       l1_ratio=0.,
                       random_state=0)

dl_cv = GridSearchCV(dl_rec,
                     param_grid={'alpha': np.logspace(-4, 0, 5)},
                     cv=KFold(
                         shuffle=False,
                         n_splits=3),
                     error_score=-1000,
                     n_jobs=15,
                     refit=False,
                     verbose=10)
estimators = [dl_cv]

scores = Parallel(n_jobs=1, verbose=10)(
    delayed(single_run)(X, y, estimator, train, test,
                        estimator_idx,
                        split_idx,
                        output_dir=output_dir,
                        )
    for split_idx, (train, test) in enumerate(
        uniform_split.split(X, y))
    for estimator_idx, estimator in enumerate(estimators))
Example No. 29
species_key_df = df_all[['Species', 'Species_code']].drop_duplicates()

# create arrays of required data
X_columns = ['leaf length', 'leaf width', 'widest point', 'total veins']
y_columns = ['Species']
X = df_equal[X_columns].values
y = df_equal[y_columns].values


# parameters of the model
n_neighbors = 10
weights = ['uniform', 'distance']
weight = weights[0]

ss = ShuffleSplit(n_splits=10, test_size=0.1)
for train_index, test_index in ss.split(X):
    # generate data from indices
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # fit the training data
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weight)
    clf.fit(X_train, y_train.ravel())

    # predict the test data
    output = clf.predict(X_test)

    # report results
    score = clf.score(X_test, y_test)
    print("Score: {:.2%}".format(score))

Example No. 30
    def testStacking(self):
        svc_c = [.1, 1]
        svc_kernel = ['linear']

        cv_outer = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
        # cv_inner = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
        #cv_outer = KFold(n_splits=3, random_state=3)
        cv_inner = KFold(n_splits=3)  # random_state has no effect without shuffle=True

        sources = [np.arange(0, self.surface.shape[1]),np.arange(self.surface.shape[1], self.surface.shape[1] + self.thickness.shape[1])]

        ##################################################################################
        # SET UP HYPERPIPES
        ##################################################################################

        # surface pipe
        surface_pipe = Hyperpipe('surface_pipe', optimizer='grid_search',
                                 metrics=['accuracy'],
                                 inner_cv=cv_inner, verbose=1)

        surface_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})
        # use source filter to select data for stacked hyperpipes
        surface_pipe.filter_element = SourceFilter(sources[0])

        # thickness pipe
        thickness_pipe = Hyperpipe('thickness_pipe', optimizer='grid_search',
                                   metrics=['accuracy'],
                                   inner_cv=cv_inner, verbose=1)

        thickness_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})
        # use source filter to select data for stacked hyperpipes
        thickness_pipe.filter_element = SourceFilter(sources[1])

        # Mother Pipe
        mother = Hyperpipe('mother', optimizer='grid_search',
                           metrics=['accuracy'],
                           inner_cv=cv_inner,
                           outer_cv=cv_outer,
                           eval_final_performance=True, verbose=1)

        mother += PipelineStacking('multiple_sources', [surface_pipe, thickness_pipe], voting=False)
        mother += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})

        mother.fit(self.X, self.y)
        final_score_photon = mother.result_tree.get_best_config_performance_test_set(0).metrics['accuracy']

        ##################################################################################
        # SKLEARN
        ##################################################################################

        for train1, test in cv_outer.split(self.X):
            X_train1 = self.X[train1]
            X_test = self.X[test]
            y_train1 = self.y[train1]
            y_test = self.y[test]

            results_outer = {'C': [], 'kernel': [], 'val1_score': []}
            done_source_optimization = False
            for c_outer in svc_c:
                for kernel_outer in svc_kernel:
                    results_outer['C'].extend([c_outer])
                    results_outer['kernel'].extend([kernel_outer])

                    print('C Outer:', c_outer, 'Kernel Outer:', kernel_outer, '\n')
                    results_val1 = []
                    for train2, val1 in cv_inner.split(X_train1):
                        X_train2 = X_train1[train2]
                        X_val1 = X_train1[val1]
                        y_train2 = y_train1[train2]
                        y_val1 = y_train1[val1]

                        if done_source_optimization is not True:
                            source_predictions_train2 = list()
                            source_predictions_val1 = list()
                            best_inner_config = []

                            for source in range(2):
                                results_source = {'C': list(), 'kernel': list(),
                                                  'test_score': list(), 'test_predictions': list()}

                                for c_inner in svc_c:
                                    for kernel_inner in svc_kernel:
                                        results_source['C'].append(c_inner)
                                        results_source['kernel'].append(kernel_inner)
                                        print('Source {} C:{} Kernel:{}\n'.format(source, c_inner, kernel_inner))

                                        results_source_folds = list()
                                        for train3, val2 in cv_inner.split(X_train2):
                                            X_train3 = X_train2[train3][:, sources[source]]
                                            X_val2 = X_train2[val2][:, sources[source]]
                                            y_train3 = y_train2[train3]
                                            y_val2 = y_train2[val2]

                                            svc_source = SVC(kernel=kernel_inner, C=c_inner)
                                            svc_source.fit(X_train3, y_train3)
                                            results_source_folds.append(svc_source.score(X_val2, y_val2))
                                        results_source['test_score'].append(np.mean(results_source_folds))

                                best_inner_config_id = np.argmax(results_source['test_score'])
                                best_inner_config.append({'C': results_source['C'][best_inner_config_id],
                                                          'kernel': results_source['kernel'][best_inner_config_id]})
                                print('Optimum config for source {}: {}'.format(source, best_inner_config[-1]))
                                print('Now fitting optimum source pipe...')
                                svc_source_opt = SVC(C=best_inner_config[-1]['C'],
                                                     kernel=best_inner_config[-1]['kernel'])
                                svc_source_opt.fit(X_train2[:, sources[source]], y_train2)
                                source_predictions_train2.append(svc_source_opt.predict(X_train2[:, sources[source]]))
                                source_predictions_val1.append(svc_source_opt.predict(X_val1[:, sources[source]]))
                            done_source_optimization = True
                        else:
                            print('Skipping optimization of sources')
                        print('Now fit 2nd level classifier with C={} and kernel={}'.format(c_outer, kernel_outer))
                        svc_meta = SVC(C=c_outer, kernel=kernel_outer)
                        svc_meta.fit(np.transpose(np.asarray(source_predictions_train2)), y_train2)
                        results_val1.append(svc_meta.score(np.transpose(np.asarray(source_predictions_val1)), y_val1))
                    results_outer['val1_score'].append(np.mean(results_val1))
            best_outer_config_id = np.argmax(results_outer['val1_score'])
            best_outer_config = {'C': results_outer['C'][best_outer_config_id],
                                 'kernel': results_outer['kernel'][best_outer_config_id]}
            print('Optimum config for meta classifier: {}'.format(best_outer_config))
            print('Now fitting optimum meta pipe...')
            print('...with source config for source 1: {} and source 2: {}'.format(best_inner_config[0],
                                                                                   best_inner_config[1]))
            svc_meta_opt = SVC(C=best_outer_config['C'], kernel=best_outer_config['kernel'])
            svc_source_1_opt = SVC(C=best_inner_config[0]['C'], kernel=best_inner_config[0]['kernel'])
            svc_source_2_opt = SVC(C=best_inner_config[1]['C'], kernel=best_inner_config[1]['kernel'])
            svc_source_1_opt.fit(X_train1[:, sources[0]], y_train1)
            svc_source_2_opt.fit(X_train1[:, sources[1]], y_train1)
            pred_source1_train1 = svc_source_1_opt.predict(X_train1[:, sources[0]])
            pred_source2_train1 = svc_source_2_opt.predict(X_train1[:, sources[1]])
            svc_meta_opt.fit(np.transpose(np.asarray([pred_source1_train1, pred_source2_train1])), y_train1)

            # get test performance
            pred_source1_test = svc_source_1_opt.predict(X_test[:, sources[0]])
            pred_source2_test = svc_source_2_opt.predict(X_test[:, sources[1]])
            final_score = svc_meta_opt.score(np.transpose(np.asarray([pred_source1_test, pred_source2_test])), y_test)
            print('Final test performance: {}'.format(final_score))


        self.assertEqual(final_score, final_score_photon)
print('Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average=None)))
print('Recall: \t{}'.format(metrics.recall_score(y_test, predicted, average=None)))
print('F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average=None)))

print('Macro Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average='macro')))
print('Macro Recall: \t\t{}'.format(metrics.recall_score(y_test, predicted, average='macro')))
print('Macro F1: \t\t{}'.format(metrics.f1_score(y_test, predicted, average='macro')))

# shuffle split
print('-------------------------------- Shuffle Split ---------------------------------')
total_score = 0
runs = 0

from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
for train, test in ss.split(tweets, target):
    X_train = np.array(tweets)[train]
    y_train = target[train]

    X_test = np.array(tweets)[test]
    y_test = target[test]

    pipeline = Pipeline([('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
                         ('tfidf', TfidfTransformer(norm='l1', use_idf=False)),
                         ('clf', ExtraTreesClassifier(random_state=0, n_estimators=10,
                                                      class_weight='balanced'))])  # 'balanced' replaces the removed 'auto' option
    pipeline = pipeline.fit(X_train, y_train)

    predicted = pipeline.predict(X_test)
    print('Accuracy: {}'.format(accuracy_score(y_test, predicted)))
    print(metrics.classification_report(y_test, predicted))
    print('Macro Precision: \t{}'.format(metrics.precision_score(y_test, predicted, average='macro')))
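The loop above can be condensed with cross_val_score, which clones and refits the pipeline on every split; a sketch reusing the same pipeline, tweets and target:

from sklearn.model_selection import cross_val_score

ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10)
scores = cross_val_score(pipeline, np.array(tweets), target, cv=ss)
print('Mean accuracy: {} (+/- {})'.format(scores.mean(), scores.std()))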
import glob
import os.path as op

import numpy as np
from mne import Epochs, find_events, pick_types
from mne.io import read_raw_edf
from sklearn.model_selection import ShuffleSplit, train_test_split


def load_data(training=False):

    tmin, tmax = -1., 4.1

    raw_edf = []
    X = []
    y = []
    X_train =[]
    X_test = []
    y_train =[]
    y_test =[]
    stim_code = dict([(32766,1),(769,2), (770,3), (771,5), (772,4),(783,6),(276,7),(277,8),(768,9),
                      (1023,10),(1072,11)])
    if training:
        path = op.join('data_i2r', 'BCI_IV_2a', 'TrainingSet')
    else:
        path = op.join('data_i2r', 'BCI_IV_2a', 'TestingSet')
    #directories = os.listdir(path)
    #for data_folder in directories:
    file_list = glob.glob(path + '/*.gdf')
    print(file_list)
    raw_files = [read_raw_edf(raw_fname, preload=True, stim_channel='auto')
                 for raw_fname in file_list]
    raw_edf.extend(raw_files)
    #events = find_events(raw, shortest_event=0, stim_channel='STI 014')
    sampling_frequency = 250


    for edf_raw in raw_edf:
        event_id = dict()
        events = find_events(edf_raw, shortest_event=0, stim_channel='STI 014')
        events_from_edf = []
        sampling_frequency = edf_raw._raw_extras[0]['max_samp']
        original_event = edf_raw.find_edf_events()
        annot_list = zip(original_event[1], original_event[4], original_event[2])
        events_from_edf.extend(annot_list)
        events_from_edf = np.array(events_from_edf)
        events_arr = np.zeros(events_from_edf.shape, dtype=int)
        for i, i_event in enumerate(events_from_edf):
            # convert the event onset time (seconds) into a sample index
            index = int(float(i_event[0]) * sampling_frequency)
            events_arr[i, :] = index, 0, stim_code[int(i_event[2])]

        # strip channel names of "." characters
        edf_raw.rename_channels(lambda x: x.strip('.'))
        # create the event dictionary based on the events present in this file
        events_in_edf = [event[2] for event in events_arr[:]]
        if 2 in events_in_edf:
            event_id['LEFT_HAND'] = 2
        if 3 in events_in_edf:
            event_id['RIGHT_HAND'] = 3
        # if (events_in_edf.__contains__(4)):
        #     event_id['FEET'] = 4
        # if (events_in_edf.__contains__(5)):
        #     event_id['IDLE'] = 5
        # Apply band-pass filter
        edf_raw.filter(4., 40., fir_design='firwin', skip_by_annotation='edge')   # 4-40Hz



        picks = pick_types(edf_raw.info, meg=False, eeg=True, stim=False, eog=False,
                           exclude='bads')
        print(events_arr[:10])

        # Read epochs (training uses only the 0.5-2.5 s window;
        # testing will be done with a running classifier)
        print(edf_raw)
        if event_id:

            epochs = Epochs(edf_raw, events_arr, event_id, tmin, tmax, proj=True, picks=picks,
                    baseline=None, preload=True)
            tmaximum = 2.5
            tminimum = 0.5
            epochs_train = []
            # slide a 2 s window forward in 0.1 s steps to augment the data
            while tmaximum < 4.1:
                epochs_train.append(epochs.copy().crop(tmin=tminimum, tmax=tmaximum))
                tminimum = tminimum + 0.1
                tmaximum = tmaximum + 0.1

            labels = [epochs_from_train.events[:, -1] - 2 for epochs_from_train in epochs_train]
            labels_array = np.array(labels)
            epochs_data = epochs.get_data()
            epochs_data_train = [epochs_from_train.get_data() for epochs_from_train in epochs_train]
            epochs_array_train = np.array(epochs_data_train)
            #split data into training and testing set

            cv = ShuffleSplit(10, test_size=0.2, random_state=42)
            if len(epochs_data_train) != len(labels_array):
                print("Something is not right")
            else:
                for i in range(len(epochs_array_train)):
                    X.extend(epochs_array_train[i])
                    y.extend(labels_array[i])

                    # for train_idx, test_idx in cv_split:
                    #     #X_train, X_test = epochs_array_train[train_idx],epochs_array_train[test_idx]
                    #     #y_train, y_test = labels[train_idx], labels[test_idx]
                    #     X_train.append(epochs_array_train[i][train_idx])
                    #     X_test.append(epochs_array_train[i][test_idx])
                    #     y_train.append(labels_array[i][train_idx])
                    #     y_test.append(labels_array[i][test_idx])
    # cv_split = cv.split(X)  # unused; train_test_split below does the splitting
    X = np.array(X)
    y = np.array(y)
    # for train_idx, test_idx in cv_split:
    #     X_train.append(X[train_idx])
    #     X_test.append(X[test_idx])
    #     y_train.append(y[train_idx])
    #     y_test.append(y[test_idx])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return np.array(X_train),np.array(y_train),np.array(X_test),np.array(y_test)
    #return X,y

# if __name__ == '__main__':
#     data_directory = 'data_i2r';
#     #user = '******'
#     (X_train,y_train,X_test,y_test)=load_data(training=True)
#     print ("train data size is " + str(X_train.size))
#     print ("test data size is  "+ str(X_test.size))
Esempio n. 33
0
import pickle
import random
import time
from collections import defaultdict
from multiprocessing import Lock, Process

from sklearn.model_selection import ShuffleSplit


def run_eval(dataset, iterations):
    suffixes = ['Zafar et al.', 'Adaboost', 'AdaFair', 'SMOTEBoost']

    if dataset == "compass-gender":
        X, y, sa_index, p_Group, x_control = load_compas("sex")
    elif dataset == "compass-race":
        X, y, sa_index, p_Group, x_control = load_compas("race")
    elif dataset == "adult-gender":
        X, y, sa_index, p_Group, x_control = load_adult("sex")
    elif dataset == "adult-race":
        X, y, sa_index, p_Group, x_control = load_adult("race")
    elif dataset == "bank":
        X, y, sa_index, p_Group, x_control = load_bank()
    elif dataset == "kdd":
        X, y, sa_index, p_Group, x_control = load_kdd()

    else:
        exit(1)
    create_temp_files(dataset, suffixes)

    # init parameters for zafar method (default settings)
    tau = 3.0
    mu = 1.2
    cons_type = 4
    sensitive_attrs = list(x_control.keys())  # list() so the keys can be indexed below
    loss_function = "logreg"
    EPS = 1e-6
    sensitive_attrs_to_cov_thresh = {
        sensitive_attrs[0]: {
            0: {
                0: 0,
                1: 0
            },
            1: {
                0: 0,
                1: 0
            },
            2: {
                0: 0,
                1: 0
            }
        }
    }
    cons_params = {
        "cons_type": cons_type,
        "tau": tau,
        "mu": mu,
        "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh
    }

    threads = []
    mutex = [Lock() for _ in range(8)]

    random.seed(int(time.time()))

    for _ in range(iterations):

        sss = ShuffleSplit(n_splits=1, test_size=0.5)
        for train_index, test_index in sss.split(X, y):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for proc in range(0, 4):
                if proc != 2:
                    time.sleep(1)
                    continue

                if proc > 0:
                    threads.append(
                        Process(target=train_classifier,
                                args=(X_train, X_test, y_train, y_test,
                                      sa_index, p_Group,
                                      dataset + suffixes[proc], mutex[proc],
                                      proc, 200, 1)))

                elif proc == 0:
                    temp_x_control_train = defaultdict(list)
                    temp_x_control_test = defaultdict(list)

                    temp_x_control_train[sensitive_attrs[0]] = x_control[
                        sensitive_attrs[0]][train_index]
                    temp_x_control_test[sensitive_attrs[0]] = x_control[
                        sensitive_attrs[0]][test_index]

                    x_zafar_train, y_zafar_train, x_control_train = ut.conversion(
                        X[train_index], y[train_index],
                        dict(temp_x_control_train), 1)

                    x_zafar_test, y_zafar_test, x_control_test = ut.conversion(
                        X[test_index], y[test_index],
                        dict(temp_x_control_test), 1)

                    threads.append(
                        Process(target=train_zafar,
                                args=(x_zafar_train, y_zafar_train,
                                      x_control_train, x_zafar_test,
                                      y_zafar_test, x_control_test,
                                      cons_params, loss_function, EPS,
                                      dataset + suffixes[proc], mutex[proc],
                                      sensitive_attrs)))

    for process in threads:
        process.start()

    for process in threads:
        process.join()

    threads = []

    results = []
    for suffix in suffixes:
        infile = open(dataset + suffix, 'rb')
        temp_buffer = pickle.load(infile)
        results.append(temp_buffer.performance)
        infile.close()

    plot_my_results(results, suffixes, "Images/" + dataset, dataset)
    delete_temp_files(dataset, suffixes)
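run_eval relies on a start-all/join-all Process pattern, with a Lock per method guarding its shared results file. A minimal self-contained sketch of that pattern:

from multiprocessing import Lock, Process

def worker(name, lock):
    with lock:  # serialize access to shared state (here just stdout)
        print('worker {} done'.format(name))

if __name__ == '__main__':
    lock = Lock()
    procs = [Process(target=worker, args=(i, lock)) for i in range(4)]
    for p in procs:
        p.start()  # launch every worker first...
    for p in procs:
        p.join()   # ...then wait for all of them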
Esempio n. 34
0
    # test
    y_pred = grid_search_cv.predict(X_test)
    print(accuracy_score(y_test, y_pred))  # 0.8695

    # 8: grow a forest

    # sub-data set
    n_trees = 1000
    n_instances = 100

    mini_sets = []

    # draw 1000 random subsets of X_train, 100 training samples each
    rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
    # each split keeps n_instances samples on the training side; the rest are left out
    for mini_train_index, mini_test_index in rs.split(X_train):
        X_mini_train = X_train[mini_train_index]
        y_mini_train = y_train[mini_train_index]
        mini_sets.append((X_mini_train, y_mini_train))

    # train tree models in forest
    forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]  # clone copies hyperparameters, not fitted state

    # fit and predict: train 1000 tree models with X_mini_train and predict
    accuracy_scores = []
    for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
        tree.fit(X_mini_train, y_mini_train)
        y_pred = tree.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
    print(np.mean(accuracy_scores))  # mean accuracy of the 1000 individual trees: 0.8054494999999999
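A natural follow-up, not part of the snippet above: combine the 1000 trees by majority vote instead of averaging their individual accuracies; the voted ensemble typically scores higher than the per-tree mean.

from scipy.stats import mode

Y_pred = np.array([tree.predict(X_test) for tree in forest])  # shape (n_trees, n_test)
y_pred_majority = mode(Y_pred, axis=0).mode.ravel()           # most frequent label per test sample
print(accuracy_score(y_test, y_pred_majority))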
from sklearn.model_selection import ShuffleSplit
import numpy as np


X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
print(rs.get_n_splits(X))  # 3
print(rs)
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                     random_state=0)

for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)
from sklearn.preprocessing import MinMaxScaler

myMinMaxScaler = MinMaxScaler()
Xsc = myMinMaxScaler.fit_transform(Xraw)

from sklearn.decomposition import PCA

mypca = PCA(n_components=10)
#X = mypca.fit_transform(Xsc)
X = Xsc

from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=1, test_size=0.2)

for train_index, test_index in rs.split(X):
    Xtrain = X[train_index, :]
    Xtest = X[test_index, :]
    ytrain = y[train_index]
    ytest = y[test_index]

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

myclf = GaussianNB()

myclf.fit(Xtrain, ytrain)
ypred = myclf.predict(Xtest)
print(confusion_matrix(ytest, ypred))
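A single shuffle split gives a noisy estimate; a small extension is to average over repeated splits with cross_val_score, reusing the X and y from above:

from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=10, test_size=0.2)
print(cross_val_score(GaussianNB(), X, y, cv=cv).mean())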

from sklearn.model_selection import cross_val_score

## -----------------------------------------------------------------
## Unregularized linear regression
## (head reconstructed: assumed to mirror the L2-regularized pipeline
##  below, with a plain LinearRegression as the regressor)
## -----------------------------------------------------------------
brca1Modelers = OrderedDict([
    (n, pipeline.Pipeline([
        ('featsel', feature_selection.SelectKBest(
                feature_selection.f_regression, k=n)),
        ('regressor', linear_model.LinearRegression())
    ]))
    for n in nFeats
])
            
brca1Model20 = copy.deepcopy(brca1Modelers[20]).fit(x0, y)
brca1Preds = brca1Model20.predict(x0)
stats.pearsonr(brca1Preds, y)[0]

brca1Model1000 = copy.deepcopy(brca1Modelers[1000]).fit(x0, y)
brca1Preds = brca1Model1000.predict(x0)
stats.pearsonr(brca1Preds, y)[0]

cvR2s_unreg = Series(OrderedDict([
    (n, np.mean(cross_val_score_pd(copy.deepcopy(brca1Modelers[n]),
                                   X = x0,
                                   y = y,
                                   cv = cvSched.split(x0))))
    for n in nFeats
]))
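
With the scores in hand, the best feature count is just the index of the maximum cross-validated R^2 (assuming Series here is pandas.Series):

best_n = cvR2s_unreg.idxmax()
print(best_n, cvR2s_unreg[best_n])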


## -----------------------------------------------------------------
## L2-regularized linear regression
## -----------------------------------------------------------------
brca1Modelers2 = OrderedDict([
    (n, pipeline.Pipeline([
        ('featsel', feature_selection.SelectKBest(
                feature_selection.f_regression, k=n)),
        ('regressor', linear_model.Ridge(
                alpha=len(y)*(1.5 + 0.034*n)))
    ]))
    for n in nFeats
])
Esempio n. 38
0
import numpy as np
from scipy.stats import expon
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import (RandomizedSearchCV, ShuffleSplit,
                                     train_test_split)


def train_model(clf, param_grid, X, Y):
    '''Train and evaluate the model clf.

    Selects the best configuration of clf by optimizing on validation data,
    then evaluates its performance on the held-out test data.

    input - clf: the model to train
            param_grid: a dict of hyperparameter distributions to search over
            X: features
            Y: labels

    output - a dict holding the best estimator (trained model), the confusion
             matrix from classifying the test data, and derived metrics
    '''
    
    #First, partition into train and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    n_iter = 5
    # If the number of possible parameter combinations is smaller than the
    # preferred number of iterations, cap n_iter at that count. The count is
    # effectively unbounded when any argument is a continuous distribution
    # such as expon(), so those contribute a large stand-in value (100).
    n_iter = min(n_iter, np.prod([
        100 if type(xs) == type(expon())
        else len(xs)
        for xs in param_grid.values()
    ]))
    
    # Perform a randomized search for the best parameters on the training data.
    # Cross-validation selects the parameters, so the training data is itself
    # split into a new training set and a validation set, n_splits times.
    cvv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv = cvv.split(X_train)
    #cv = KFold(n=len(X), n_folds=10)
    random_grid_search = RandomizedSearchCV(
        clf, 
        param_distributions=param_grid,
        cv=cv, 
        scoring='f1', 
        n_iter=n_iter, #DEBUG 1 
        random_state=5,
        refit=True,
        verbose=10,
        n_jobs=-1 # modify
    )
    
    '''Randomized search is used instead; we have limited computing power.
    grid_search = GridSearchCV(
        clf,
        param_grid=param_grid,
        cv=cv,
        scoring='f1', #accuracy/f1/f1_weighted all give same result?
        verbose=10,
        n_jobs=-1
    )
    grid_search.fit(X_train, Y_train)
    '''
    random_grid_search.fit(X_train, Y_train)
    
    #Evaluate the best model on the test data
    Y_test_predicted = random_grid_search.best_estimator_.predict(X_test)
    Y_test_predicted_prob = random_grid_search.best_estimator_.predict_proba(X_test)[:, 1]

    confusion = confusion_matrix(Y_test, Y_test_predicted)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]

    #Calculate recall (sensitivity) from confusion matrix
    sensitivity = TP / float(TP + FN)
    
    #Calculate specificity from confusion matrix
    specificity = TN / float(TN + FP)

    #Calculate accuracy
    accuracy = (TP + TN) / float(confusion.sum())
    
    #Calculate axes of ROC curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_test_predicted_prob)
    
    #Area under the ROC curve
    auc = roc_auc_score(Y_test, Y_test_predicted_prob)

    return {
        'conf_matrix':confusion, 
        'accuracy':accuracy, 
        'sensitivity':sensitivity,
        'specificity':specificity,
        'auc':auc,
        'params':random_grid_search.best_params_,
        'model':random_grid_search.best_estimator_,
        'roc':{'fpr':fpr,'tpr':tpr,'thresholds':thresholds}
    }
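
A hedged usage sketch for train_model; the estimator and grid are illustrative only (predict_proba requires probability=True on SVC, and scoring='f1' assumes binary labels in Y):

from scipy.stats import expon
from sklearn.svm import SVC

outcome = train_model(SVC(probability=True), {'C': expon(scale=1.0)}, X, Y)
print(outcome['accuracy'], outcome['auc'], outcome['params'])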