Example #1
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
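
Examples #1 and #10 both call a calculate_accuracy helper that is not shown in this listing. A minimal sketch consistent with how it is used (threshold the squared L2 distance, then report TPR, FPR, and accuracy) might look like the following; the exact definition lives outside these snippets:

import numpy as np

def calculate_accuracy(threshold, dist, actual_issame):
    # Pairs closer than the threshold are predicted to be the same identity.
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame),
                               np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc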
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2, 2])
        from sklearn.ensemble import IsolationForest
        for train_index, test_index in inputData:
            clf = IsolationForest(random_state=666, contamination=0.07)
            X_train, X_test = self.features[train_index], self.features[
                test_index]

            #X_train, X_test = np.abs(X_train), np.abs(X_test)
            # print(X_train)
            y_train, y_test = self.labels[train_index], self.labels[test_index]

            X_train, X_test = X_train[:, :], X_test[:, :]
            y_train, y_test = np.array(y_train), np.array(y_test)

            clf.fit(X_train, y_train)
            pred_train = clf.predict(X_train)
            y_train = list(y_train)
            cm = confusion_matrix(y_train, pred_train, labels=[-1, 1])
            print("第", count, "轮训练集里的表现\n", cm)
            pred = clf.predict(X_test)
            y_test = list(map(lambda x: x[0], y_test))
            print(y_test)
            #print(pred)
            cm = confusion_matrix(y_test, pred, labels=[-1, 1])
            totalCM = totalCM + np.array(cm)
            print("第", count, "轮测试集里的表现\n", cm)
            # print(X_test)
            print("####################################")
            count += 1
            print("混淆矩阵的和是\n", totalCM, "准确率是",
                  (totalCM[0][0] + totalCM[1][1]) / (sum(sum(totalCM))))
Example #3
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
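
calculate_val relies on a calculate_val_far helper that is also not shown. A plausible sketch, assuming val is the acceptance rate on genuine pairs and far the acceptance rate on impostor pairs at the given distance threshold:

import numpy as np

def calculate_val_far(threshold, dist, actual_issame):
    # Accept a pair when its distance falls below the threshold.
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    val = float(true_accept) / float(n_same)    # validation rate on genuine pairs
    far = float(false_accept) / float(n_diff)   # false accept rate on impostor pairs
    return val, far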
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2,2])
        for train_index, test_index in inputData:
            X_train, X_test = self.features[train_index], self.features[test_index]

            #X_train, X_test = np.abs(X_train), np.abs(X_test)
            # print(X_train)
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            num_feature = len(X_train[0])
            clf = deepLearning.LSTMClassifier(2, num_feature, learning_rate=1e-3,
                                              layer_num=2, hidden_size=50, timestep_size=60)
            clf.initGraph()
            clf.initOneHotEncoder4Y(y_train)

            def scala(x):
                res = []
                for i in range(len(x)):
                    temp = []
                    for n in x[i]:
                        v = 1 if n > 0 else 0
                        temp.append(v)
                    res.append(temp)
                return np.array(res)

           # X_train, X_test = aaa(X_train, len(y_train)), aaa(X_test, len(y_test))
            X_train,X_test = X_train[:,:], X_test[:,:]
            X_train, X_test  = scala(X_train), scala(X_test)
            y_train, y_test = np.array(y_train), np.array(y_test)

            for i in range(5000):
                print("这是第", count, "折,第", i, "轮训练。", len(y_train) )
                step = 500
                for j in range(0, len(y_train), step):

                    batch_ys = clf.oneHotEncode(y_train[j: j+ step,:])
                    batch_xs = np.array(X_train[j: j+ step,:]).astype(np.float32)
                    clf.fit(batch_xs, batch_ys)
                    #clf.fit(batch_xs + random.uniform(0,0.01), batch_ys)
                print("训练集")
                batch_ys = clf.oneHotEncode(y_train)
                batch_xs = np.array(X_train).astype(np.float32)
                clf.test(batch_xs, batch_ys)

                print("测试机")
                batch_ys = clf.oneHotEncode(y_test)
                batch_xs = np.array(X_test).astype(np.float32)
                y_pre = clf.test(batch_xs, batch_ys)
                y_pre = list(map(lambda x: 1 if x[0]<x[1] else 0, y_pre))
                y_real = list(map(lambda x:x[0], y_test))
                cm = confusion_matrix(y_real, y_pre, labels=[0, 1])
                #print(y_pre)
                #print(y_real)
                print(cm)
                print("*************************")
            break
def trainPatient (X, y): # y=0 for interictal, y=1 for preictal

    # CALLED BY: __main__()

    # given the data for a patient, train a classifier

    # do K-fold splitting on training data, where K is 5
    k_fold = KFold(n_splits=5)
    dataSplitIndices = k_fold.split(X) # use example: [svc.fit(X[train], y[train]) for train, test in k_fold.split(X)]
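
The function ends here in the listing. Following its own inline usage example, a hypothetical continuation (the SVC choice is an assumption, not part of the original) could be:

from sklearn.model_selection import KFold
from sklearn.svm import SVC
import numpy as np

def trainPatient_sketch(X, y):
    # Hypothetical completion: fit a classifier per fold and average the scores.
    k_fold = KFold(n_splits=5)
    scores = []
    for train, test in k_fold.split(X):
        svc = SVC()  # assumed model; the original never names one
        svc.fit(X[train], y[train])
        scores.append(svc.score(X[test], y[test]))
    return np.mean(scores)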
    def k_fold(self, X, y):
        X = np.array(X)
        y = np.array(y)
        kf = KFold(n_splits=3, shuffle=True, random_state=1)

        data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

        # Note: this demo splits the toy `data` list, not the X/y passed in.
        for train_indices, test_indices in kf.split(data):
            print('Train: %s | test: %s' % (train_indices, test_indices))
def get_image_paths_and_labels_headcap(images_path, usage, nfold, ifold):

    image_paths = []
    labels = []
    idx_train_all = []
    idx_test_all = []
    image_paths_final = []
    labels_final = []


    folders = os.listdir(images_path)
    folders.sort()
    for fold in folders:
        if not os.path.isdir(os.path.join(images_path, fold)):
            continue
        img_path_folder = glob.glob(os.path.join(images_path, fold, '*.png'))
        img_path_folder.sort()
        image_paths += img_path_folder
        label_txt = glob.glob(os.path.join(images_path, fold, '*.txt'))[0]
        with open(label_txt, 'r') as f:
            for line in f.readlines():
                line = line.replace('\r\n','\n')
                #print ('%s   %s'%(fold, line))
                labels.append(int(line[-2:-1]))

    # folds = KFold(n=len(labels_flat), n_folds=nrof_folds, shuffle=True)
    #folds = KFold(n=len(labels), n_folds=10, shuffle=False) ## Before the version of sklearn 0.20
    kf = KFold(n_splits=nfold, shuffle=False) ## After the version of sklearn 0.20

    i = 0
    #for idx_train, idx_test in folds: ## Before sklearn 0.20
    for idx_train, idx_test in kf.split(labels):  ## After sklearn 0.20
        idx_train_all.append([])
        idx_train_all[i].append(idx_train)
        idx_test_all.append([])
        idx_test_all[i].append(idx_test)
        #print('train:', idx_train, 'test', idx_test)
        i += 1

    idx_train = idx_train_all[ifold][0]
    idx_test = idx_test_all[ifold][0]

    if usage == 'Training':
        for idx in idx_train:
            #idx_train.append(idx)
            image_paths_final.append(image_paths[idx])
            labels_final.append(labels[idx])

    if usage == 'Test':
        for idx in idx_test:
            #idx_test.append(idx)
            image_paths_final.append(image_paths[idx])
            labels_final.append(labels[idx])

    nrof_classes = len(set(labels_final))
    return image_paths_final, labels_final, usage, nrof_classes
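
A hedged usage sketch of the function above (the directory path is illustrative; the body implies one subfolder per class containing PNGs plus a label .txt):

# Hypothetical call: training paths and labels for fold 0 of a 10-fold split.
paths, labels, usage, nrof_classes = get_image_paths_and_labels_headcap(
    '/data/headcap_images', 'Training', nfold=10, ifold=0)
print('%d images across %d classes' % (len(paths), nrof_classes))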
Example #8
def run_kfold_on_model(df: DataFrame, exclude_cols: Iterable[str], target_col: str, model, lossfun):
    X, Y = make_dataset(df, exclude_cols, target_col)
    kf = KFold(n_splits=10, shuffle=True)
    losses = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        losses.append(lossfun(Y_test, pred))
    return losses
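
run_kfold_on_model assumes a make_dataset helper that is not shown; a minimal sketch consistent with the call signature (column handling here is an assumption):

import numpy as np
from typing import Iterable, Tuple
from pandas import DataFrame

def make_dataset(df: DataFrame, exclude_cols: Iterable[str],
                 target_col: str) -> Tuple[np.ndarray, np.ndarray]:
    # Keep every column that is neither excluded nor the target as a feature.
    excluded = set(exclude_cols)
    feature_cols = [c for c in df.columns if c not in excluded and c != target_col]
    return df[feature_cols].to_numpy(), df[target_col].to_numpy()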
Example #9
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2,2])
        for train_index, test_index in inputData:
            clf = StackingClassifier()
            clf.setBaseModels({"DT": DecisionTreeClassifier(),
                'KNN': KNeighborsClassifier(n_neighbors=20),
                 "LR": LogisticRegression(max_iter=1000, solver='lbfgs', C=100),
            'mlp': MLPClassifier(hidden_layer_sizes=(100,)),
            "gbdt": GradientBoostingClassifier(n_estimators=20)
                                                      })
            clf.setMetaModel(MLPClassifier(hidden_layer_sizes=(100,)))
            X_train, X_test = self.features[train_index], self.features[test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            def scala(x):
                res = []
                for i in range(len(x)):
                    temp = []
                    for n in x[i]:
                        v = 1 if n>0 else 0
                        temp.append(v)
                    res.append(temp)
                return np.array(res)

            X_train,X_test = X_train[:,:], X_test[:,:]
            X_train, X_test  = scala(X_train), scala(X_test)
            y_train, y_test = np.array(y_train), np.array(y_test)
            inputMap = {"DT": X_train, 'KNN': X_train, 'mlp': X_train, "gbdt": X_train, 'LR': X_train}
            clf.fit(inputMap, y_train)


            pred_train = clf.predict(inputMap)
            y_train = list(y_train)
            cm = confusion_matrix(y_train, pred_train)
            print("第", count, "轮训练集里的表现\n", cm)
            inputMap = {"DT": X_test, 'KNN': X_test, 'mlp': X_test, "gbdt": X_test, 'LR': X_test}

            pred = clf.predict(inputMap)
            y_test = list(y_test)
            cm = confusion_matrix(y_test, pred, labels=[0, 1])
            totalCM = totalCM + np.array(cm)
            print("第", count, "轮测试集里的表现\n", cm)
            # print(X_test)
            print("####################################")
            count += 1
            print("混淆矩阵的和是\n", totalCM, "准确率是",
                  (totalCM[0][0] + totalCM[1][1]) / (sum(sum(totalCM))))
Example #10
def calculate_roc(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  nrof_folds=10,
                  distance_metric=0,
                  subtract_mean=False):
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate(
                [embeddings1[train_set], embeddings2[train_set]]),
                           axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean,
                        distance_metric)

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(
                threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,
                 threshold_idx], fprs[fold_idx,
                                      threshold_idx], _ = calculate_accuracy(
                                          threshold, dist[test_set],
                                          actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set],
            actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
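
Examples #10 and #11 also call a distance helper selected by distance_metric. A sketch that matches how it is used (0 for squared Euclidean distance, 1 presumably for an angular distance derived from cosine similarity):

import math
import numpy as np

def distance(embeddings1, embeddings2, distance_metric=0):
    if distance_metric == 0:
        # Squared Euclidean distance per pair.
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff), 1)
    elif distance_metric == 1:
        # Angular distance derived from cosine similarity.
        dot = np.sum(np.multiply(embeddings1, embeddings2), axis=1)
        norm = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
        dist = np.arccos(dot / norm) / math.pi
    else:
        raise ValueError('Undefined distance metric %d' % distance_metric)
    return dist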
Example #11
def calculate_val(thresholds,
                  embeddings1,
                  embeddings2,
                  actual_issame,
                  far_target,
                  nrof_folds=10,
                  distance_metric=0,
                  subtract_mean=False):
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate(
                [embeddings1[train_set], embeddings2[train_set]]),
                           axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean,
                        distance_metric)

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(
                threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(
            threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
Example #12
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2, 2])
        for train_index, test_index in inputData:
            # clf = DecisionTreeClassifier(max_depth=10)
            # clf = RandomForestClassifier(n_estimators=10, max_depth=5,random_state=666)
            # clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10))
            # clf = MLPClassifier(max_iter=200, hidden_layer_sizes=(200, 20))
            # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)
            # clf = GradientBoostingClassifier(n_estimators=20)
            # clf = SVC(C=0.8)
            clfList = [['decisionTree', DecisionTreeClassifier()],
                       ['mlp', MLPClassifier()],
                       ['KNN', KNeighborsClassifier(n_neighbors=10)]]
            clf = VotingClassifier(clfList, voting='hard')
            X_train, X_test = self.features[train_index], self.features[
                test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            # featureProcessor = PCA(n_components=20)
            # LDA allows at most n_classes - 1 components; the 'm'/'f' labels give 1.
            featureProcessor = LinearDiscriminantAnalysis(n_components=1)
            # featureProcessor = SelectKBest(chi2, k=20)#.fit(features, labels)
            featureProcessor.fit(X_train, y_train)
            X_train = featureProcessor.transform(X_train)
            X_test = featureProcessor.transform(X_test)

            clf.fit(X_train, y_train)
            pred_train = clf.predict(X_train)
            y_train = list(y_train)
            cm = confusion_matrix(y_train, pred_train)
            print("第", count, "轮训练集里的表现\n", cm)
            pred = clf.predict(X_test)
            y_test = list(y_test)
            cm = confusion_matrix(y_test, pred, labels=['m', 'f'])
            totalCM = totalCM + np.array(cm)
            print("第", count, "轮测试集里的表现\n", cm)
            res = list(
                map(lambda x: y_test[x] + '_' + pred[x], range(len(pred))))
            print(res)
            # print(X_test)
            print("####################################")
            count += 1
        print("混淆矩阵的和是\n", totalCM, "准确率是",
              (totalCM[0][0] + totalCM[1][1]) / (sum(sum(totalCM))))
Example #13
def keras_eval(X_train_cv, y, X_test, test_ids, folds=5, nbags=5, nepochs=55):
    np.random.seed(123)    
    start_time = timer(None)
    
    # set up KFold that matches xgb.cv number of folds
    cv_pred = np.zeros((X_train_cv.shape[0], folds, nbags))
    test_pred = np.zeros((X_test.shape[0], folds, nbags))
    cv_score = np.zeros((folds, nbags))
    kf = KFold(n_splits=folds, shuffle=True, random_state=0)
    for i, (train_index, cv_index) in enumerate(kf.split(X_train_cv)):
        X_train, X_cv = X_train_cv.iloc[train_index,:], X_train_cv.iloc[cv_index,:]
        y_train, y_cv = y[train_index], y[cv_index]

        ## train models
        for j in range(nbags):
            model = nn_model()
            model.fit_generator(
                generator = batch_generator(X_train, y_train, 128, True),
                nb_epoch = nepochs,
                samples_per_epoch = X_train.shape[0],
                verbose = 0)
            cv_pred[cv_index,i,j] = model.predict_generator(
                generator = batch_generatorp(X_cv, 800, False),
                val_samples = X_cv.shape[0])[:,0]
            test_pred[:,i,j] = model.predict_generator(
                generator = batch_generatorp(X_test, 800, False),
                val_samples = X_test.shape[0])[:,0]
            cv_score[i,j] = mean_absolute_error(y_cv, cv_pred[cv_index,i,j])
            print('Fold {}, Bag {} - MAE: {}'.format(i, j, cv_score[i,j]))
        print(' Fold {} - MAE: {}\n'.format(i, cv_score.mean(1)[i]))
    score = mean_absolute_error(y, cv_pred.mean(2).mean(1))
    print('Total - MAE: {}'.format(score))
    timer(start_time)
    
    print("#\n Writing results")
    result = pd.DataFrame({'id': test_ids, 'loss': test_pred.mean(2).mean(1)})
    result = result.set_index("id")
    
    now = datetime.now()

    sub_file = 'submission_{}fold-{}bag-average-keras-{}-{}.csv.gz'.format(
        folds, nbags, score, now.strftime("%Y-%m-%d-%H-%M"))
    print("\n Writing submission: {}".format(sub_file))
    result.to_csv(sub_file, index=True, index_label='id', compression='gzip')
Example #14
def Stacking(model,train_x,train_y, test,n_splits=5, random_state=None, shuffle=False):
# only return the predictions of the model, no target
# average the test predictions over the per-fold models

    df_kf_valid=np.zeros((train_x.shape[0],))
    df_kf_test =np.zeros((test.shape[0], n_splits))

    kf=KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    for i, (train_index, valid_index) in enumerate(kf.split(train_x)):
        # kf.split yields positional indices, so use iloc (reindex is label-based).
        kf_train_x = train_x.iloc[train_index]
        kf_train_y = train_y.iloc[train_index]
        kf_valid_x = train_x.iloc[valid_index]

        model.train(kf_train_x, kf_train_y)

        df_kf_valid[valid_index] = model.predict(kf_valid_x)
        df_kf_test[:,i]= model.predict(test)
    df_test = np.mean(df_kf_test, axis=1)
    # each returned array is (n_samples, 1)
    return df_kf_valid.reshape(-1, 1), df_test.reshape(-1, 1)
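
To exercise Stacking, any object exposing the train/predict interface it expects will do; a small illustrative wrapper on synthetic data (the names here are ours, not from the original):

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

class RidgeWrapper:
    # Adapter exposing the train/predict methods Stacking calls.
    def __init__(self):
        self.clf = Ridge()

    def train(self, x, y):
        self.clf.fit(x, y)

    def predict(self, x):
        return self.clf.predict(x)

rng = np.random.default_rng(0)
train_x = pd.DataFrame(rng.normal(size=(100, 3)))
train_y = pd.Series(rng.normal(size=100))
test = pd.DataFrame(rng.normal(size=(20, 3)))
oof_valid, test_mean = Stacking(RidgeWrapper(), train_x, train_y, test, n_splits=5)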
           "param_3",
           "price+",
           "item_seq_number+",
           ]




from sklearn.model_selection import KFold

kf = KFold(n_splits=10, random_state=42, shuffle=True)
numIter = 0
rmse_sume = 0.
numLimit = 1 # 5

for train_index, valid_index in kf.split(y):
    numIter += 1

    # Only model the first numLimit folds; skip the rest.
    if numIter > numLimit:
        continue

    print("Modeling Stage ...")

    X_train, X_valid = X.tocsr()[train_index], X.tocsr()[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    gc.collect()
    lgbm_params = {
            "tree_method": "feature",
            "num_threads": 11, # 3
Example #16
precision_p = 0  # running totals across folds
recall_p = 0

Loop_n = 1  # number of outer iterations
fold_n = 10  # number of folds for K-fold cross-validation

for i in range(0, Loop_n):
    train = shuffle(data_train)
    x_columns = [x for x in train.columns if x not in [label, cardcol]]
    X = train[x_columns]
    y = train[label]

    X = np.array(X)
    y = np.array(y)
    kf = KFold(n_splits=fold_n)
    kf.get_n_splits(X)  # returns the number of folds (fold_n = 10 here)
    for train_index, test_index in kf.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        rf_model = RandomForestClassifier(oob_score=True, random_state=10)
        time1 = time.time()
        rf_model.fit(X_train, y_train)
        time2 = time.time()
        print "rf_model used time: %f sec" % (time2 - time1)  #时间 second

        pred_test = rf_model.predict(X_test)
        temp_m = confusion_matrix(y_test, pred_test)
        precision_p = precision_p + float(temp_m[1][1]) / float(
            (temp_m[0][1] + temp_m[1][1]))
        recall_p = recall_p + float(temp_m[1][1]) / float(
Example #17
train['new2'] = train['V2'] + train['V3'] + train['V4']

test['new2'] = test['V2'] + test['V3'] + test['V4']

train_x = train.drop(['target'], axis=1)

Y = train['target']

import lightgbm as lgb
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=2)
p = []
test_err = 0
res = np.zeros((test.shape[0], 5))
for k, (train_index, test_index) in enumerate(kf.split(train_x)):
    x, test_x = train_x.iloc[train_index], train_x.iloc[test_index]
    y, test_y = Y.iloc[train_index], Y.iloc[test_index]
    lgb_model = lgb.LGBMRegressor(
        boosting_type='gbdt',
        max_depth=-1,
        learning_rate=0.01,
        n_estimators=5000,
        objective='regression',
    )

    lgb_model.fit(
        x,
        y,
        eval_set=[(x, y), (test_x, test_y)],
        eval_names=['Train', 'Test'],
Example #18
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
# Viz lib
import matplotlib
if os.environ.get('DISPLAY', '') == '':
    print('no display found. Using non-interactive Agg backend')
    matplotlib.use('agg')
import seaborn as sns
import matplotlib.pyplot as plt

kf = KFold(n_splits=5, shuffle=True)
if useKfold:
    for idx, (train_index, test_index) in enumerate(kf.split(X_tr)):
        print('+++++ CV at fold number ', idx)
        X_train, X_test = X_tr[train_index], X_tr[test_index]
        y_train, y_test = y_tr[train_index], y_tr[test_index]
        X_tr_lgb = lgb.Dataset(X_train,
                               label=y_train,
                               feature_name=feature_names,
                               categorical_feature=cat_cols)
        X_va_lgb = lgb.Dataset(X_test,
                               label=y_test,
                               feature_name=feature_names,
                               categorical_feature=cat_cols,
                               reference=X_tr_lgb)
        model = lgb.train(parameters,
                          X_tr_lgb,
                          valid_sets=[X_tr_lgb, X_va_lgb],
def get_image_paths_and_labels_hand(images_path, labelfile, nfold, ifold):

    image_paths = []
    labels = []
    idx_train_all = []
    idx_test_all = []
    image_paths_final = []
    labels_final = []
    image_paths_final_test = []
    labels_final_test = []

    datal = pandas.read_excel(labelfile)
    labels_all = datal['PersonID'].values
    labels_frm = datal['Frame'].values
    labels_frm_list = labels_frm.tolist()
    labels_all_list = labels_all.tolist()

    image_paths = glob.glob(os.path.join(images_path, '*.png'))
    image_paths.sort()
    for imgfile in image_paths:
        strtmp = str.split(imgfile,'/')[-1]
        strtmp = str.split(strtmp, '_')[0]
        framenum = int(strtmp[5:])

        idx = labels_frm_list.index(framenum)
        labels.append(labels_all_list[idx])


    # folds = KFold(n=len(labels_flat), n_folds=nrof_folds, shuffle=True)
    if sklearn.__version__ < '0.20':
        folds = KFold(n=len(labels), n_folds=nfold, shuffle=True) ## Before the version of sklearn 0.20
    else:
        kf = KFold(n_splits=nfold, shuffle=True) ## After the version of sklearn 0.20

    i = 0

    if sklearn.__version__ < '0.20':
        for idx_train, idx_test in folds: ## Before sklearn 0.20
            idx_train_all.append([])
            idx_train_all[i].append(idx_train)
            idx_test_all.append([])
            idx_test_all[i].append(idx_test)
            # print('train:', idx_train, 'test', idx_test)
            i += 1
    else:
        for idx_train, idx_test in kf.split(labels):  ## After sklearn 0.20
            idx_train_all.append([])
            idx_train_all[i].append(idx_train)
            idx_test_all.append([])
            idx_test_all[i].append(idx_test)
            #print('train:', idx_train, 'test', idx_test)
            i += 1

    idx_train = idx_train_all[ifold][0]
    idx_test = idx_test_all[ifold][0]


    for idx in idx_train:
        #idx_train.append(idx)
        image_paths_final.append(image_paths[idx])
        labels_final.append(labels[idx])


    for idx in idx_test:
        #idx_test.append(idx)
        image_paths_final_test.append(image_paths[idx])
        labels_final_test.append(labels[idx])

    nrof_classes = len(set(labels_final))
    nrof_classes_test = len(set(labels_final_test))

    return image_paths_final, labels_final, nrof_classes, image_paths_final_test, labels_final_test, nrof_classes_test
Example #20
                         num_leaves=85,
                         max_depth=15,
                         learning_rate=0.003,
                         n_estimators=3677,
                         subsample_for_bin=400000,
                         objective="binary",
                         min_split_gain=0.0,
                         min_child_weight=0.01,
                         min_child_samples=50,
                         subsample=0.8,
                         subsample_freq=1,
                         colsample_bytree=0.7,
                         reg_alpha=5.0,
                         reg_lambda=0.0,
                         silent=True)

kf = KFold(n_splits=5)
for n_fold, (train_index, test_index) in enumerate(kf.split(train_X)):
    print(n_fold)
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    model_1.fit(X_train, y_train)
    #prediction = model_1.predict_proba(X_test)
    #train_score.append(prediction[:,1])
    oof_train[test_index] = model_1.predict_proba(X_test)[:, 1]
    oof_test_skf[n_fold, :] = model_1.predict_proba(test_X)[:, 1]

oof_test[:] = oof_test_skf.mean(axis=0)
te['buy'] = oof_test
tr['buy'] = oof_train
import numpy as np
from sklearn.model_selection import KFold

# Toy data set modeled off of the Iris data set (as arrays so the fold
# indices below can be used directly)
data = np.array([[3.0, 1.2, 4.5],
                 [2.8, 2.0, 5.6],
                 [1.4, 2.2, 5.2],
                 [2.5, 1.5, 6.3],
                 [3.1, 1.7, 5.7]])

# Toy target set modeled off of the Iris target set
target = np.array([0, 0, 1, 2, 1])

# The number of splits I want (n_splits cannot exceed the 5 samples above)
n = 5

# Get the KFolder, telling it the number of ways you want it to be split and that you want the data to be
# selected randomly.
kf = KFold(n_splits=n, shuffle=True)

# Store what your classifier returns in a list. There are alternate ways of doing this step
predictions = []

# I put in print statements just so that you could see what it was doing
for train_index, test_index in kf.split(data):
    print(train_index) # See the list. Be the list.
    print(test_index) # And do the same here
    print(data[train_index]) # Proof that it gets those indexes
    print(data[test_index]) # Notice that it collects different ones
    # collect all of your predictions. This is optional, depending on what else you are doing.
    predictions.append(classifier(data[train_index], # I don't know what your classifier does, but mine takes in all of these
                                  data[test_index], 
                                  target[train_index], # Notice that both target and data get indexed. This is important!
                                  target[test_index]))
Example #22
#Model Score
print("The coefficient of determination for the Random Forest model is: %.4f" %
      iris_rf.score(irisX, irisY))

# # K- Fold Cross Validation

# In[6]:

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

x = irisX
y = irisY
kf = KFold(n_splits=5, random_state=None, shuffle=True)
kf.get_n_splits(x)
for train_i, test_i in kf.split(x):
    print("TRAIN:", train_i, "TEST:", test_i)
    X_train, X_test = x[train_i], x[test_i]
    y_train, y_test = y[train_i], y[test_i]

# # 2. KFold Score
# We use cross-validation to better estimate the test error and gauge the accuracy of our model. It is preferred over a single validation set because it avoids shrinking the training data too much, which would raise the error.

# In[7]:

#K- Fold Score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

iris_dtree.fit(X_train, y_train)
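
The cell is cut off after the fit; presumably it goes on to score the tree with cross_val_score. A self-contained sketch of that step (using load_iris so it runs on its own):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
dtree = DecisionTreeClassifier()
scores = cross_val_score(dtree, iris.data, iris.target, cv=5)
print("Mean 5-fold CV accuracy: %.4f" % scores.mean())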
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2, 2])
        # self.features = np.array(self.features)
        # self.labels = np.array(self.labels)
        for train_index, test_index in inputData:
            # clf = DecisionTreeClassifier(max_depth=10)
            # clf = RandomForestClassifier(n_estimators=30, max_depth=6,random_state=666)
            # clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10), n_estimators=20)
            clf = MLPClassifier(hidden_layer_sizes=(500, ))
            # clf = LogisticRegression(max_iter=1000, solver='lbfgs', C=100)
            # clf = GradientBoostingClassifier(n_estimators=20)
            # clf = SVC(C=0.8)
            # clfList = [
            #     ["AD", AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10), n_estimators=20)],
            #     ["gbdt", GradientBoostingClassifier(n_estimators=20)],
            #     ["LR", LogisticRegression(max_iter=1000, solver='lbfgs', C=100)],
            #     ['desisionTree', DecisionTreeClassifier()],
            #            ['mlp', MLPClassifier(hidden_layer_sizes=(200, 100))],
            #            ['KNN', KNeighborsClassifier(n_neighbors=50)]]
            # clf = VotingClassifier(clfList, voting='hard')
            X_train, X_test = self.features[train_index], self.features[
                test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            # featureProcessor = PCA(n_components=20)
            print(len(X_train), 'asd', len(y_train))

            # print(X_train,'asd', y_train)
            def aaa(X, L):
                res = []
                for i in range(L):
                    line = X[i]
                    tres = []
                    for n in line:
                        tres.append(n)
                    res.append(tres)
                return np.array(res)

            X_train, X_test = aaa(X_train,
                                  len(y_train)), aaa(X_test, len(y_test))
            X_train, X_test = X_train[:, :], X_test[:, :]
            y_train, y_test = np.array(y_train), np.array(y_test)
            # featureProcessor = PCA(n_components=10)

            # LDA allows at most n_classes - 1 components; binary labels give 1.
            featureProcessor = LinearDiscriminantAnalysis(
                n_components=1).fit(X_train, y_train)
            # # featureProcessor = SelectKBest(chi2, k=20)#.fit(features, labels)
            # featureProcessor.fit(X_train, y_train)
            X_train = featureProcessor.transform(X_train)
            X_test = featureProcessor.transform(X_test)

            clf.fit(X_train, y_train)
            pred_train = clf.predict(X_train)
            y_train = list(y_train)
            cm = confusion_matrix(y_train, pred_train)
            print("第", count, "轮训练集里的表现\n", cm)
            pred = clf.predict(X_test)
            y_test = list(y_test)
            cm = confusion_matrix(y_test, pred, labels=[0, 1])
            totalCM = totalCM + np.array(cm)
            print("第", count, "轮测试集里的表现\n", cm)
            # print(X_test)
            print("####################################")
            count += 1
        print("混淆矩阵的和是\n", totalCM, "准确率是",
              (totalCM[0][0] + totalCM[1][1]) / (sum(sum(totalCM))))
                  random_state=12345)

f, axes = plt.subplots(1, 2, figsize=(10, 5))

axes[0].scatter(X[y == 0, 0], X[y == 0, 1], color='blue', s=2, label='y=0')
axes[0].scatter(X[y != 0, 0], X[y != 0, 1], color='red', s=2, label='y=1')
axes[0].set_xlabel('X[:,0]')
axes[0].set_ylabel('X[:,1]')
axes[0].legend(loc='lower left', fontsize='small')

k_fold = KFold(n_splits=FOLDS, shuffle=True, random_state=12345)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=12345)

y_real = []
y_proba = []
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision, recall, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    lab = 'Fold %d AUC=%.4f' % (i + 1, auc(recall, precision))
    axes[1].step(recall, precision, label=lab)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = numpy.concatenate(y_real)
y_proba = numpy.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))
axes[1].step(recall, precision, label=lab, lw=2, color='black')
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2, 2])
        for train_index, test_index in inputData:
            X_train, X_test = self.features[train_index], self.features[
                test_index]

            #X_train, X_test = np.abs(X_train), np.abs(X_test)
            # print(X_train)
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            num_feature = len(X_train[0])
            clf = deepLearning.LSTMClassifier(2,
                                              num_feature,
                                              learning_rate=1e-3,
                                              layer_num=2,
                                              hidden_size=50,
                                              timestep_size=350)
            clf.initGraph()
            clf.initOneHotEncoder4Y(y_train)

            def aaa(X, L):
                res = []
                for i in range(L):
                    line = X[i]
                    tres = []
                    for n in line:
                        tres.append(n)
                    res.append(tres)
                return np.array(res)

            def scala(data):
                res = []
                for i in range(len(data)):
                    res.append(data[i] / (0.0000001 + sum(data[i])))
                return np.array(res)

            X_train, X_test = aaa(X_train,
                                  len(y_train)), aaa(X_test, len(y_test))
            from sklearn.preprocessing import RobustScaler, StandardScaler
            #norm_x = StandardScaler().fit(X_train)
            #X_train, X_test = norm_x.transform(X_train), norm_x.transform(X_test)
            X_train, X_test = X_train[:, :], X_test[:, :]
            X_train, X_test = scala(X_train), scala(X_test)

            y_train, y_test = np.array(y_train), np.array(y_test)

            for i in range(500):
                print("这是第", count, "折,第", i, "轮训练。", len(y_train))
                for j in range(0, len(y_train), 200):

                    batch_ys = clf.oneHotEncode(y_train[j:j + 200, :])
                    batch_xs = np.array(X_train[j:j + 200, :]).astype(
                        np.float32)
                    clf.fit(batch_xs, batch_ys)
                print("训练集")
                batch_ys = clf.oneHotEncode(y_train)
                batch_xs = np.array(X_train).astype(np.float32)
                clf.test(batch_xs, batch_ys)
                print("测试机")
                batch_ys = clf.oneHotEncode(y_test)
                batch_xs = np.array(X_test).astype(np.float32)
                clf.test(batch_xs, batch_ys)
                print("*************************")
            break
Example #26
#
# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

NFOLDS = 5
SEED = 42

if os.path.exists("../tmp/oof_index.dat"):
    with open("../tmp/oof_index.dat", "rb") as f:
        kfolds = dill.load(f)
else:
    dftrain_tmp = pd.read_csv("../input/train.csv")
    fold = KFold(n_splits=5, shuffle=True, random_state=1234)
    kfolds = list(fold.split(dftrain_tmp))
    with open("../tmp/oof_index.dat", "wb") as f:
        dill.dump(kfolds, f)
    del dftrain_tmp; gc.collect()

print("Creating Ridge Features...")
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        if(seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
Example #27
from sklearn.multiclass import OneVsRestClassifier
plt.style.use('ggplot')

X, y = make_classification(n_samples=500, random_state=100, flip_y=0.3)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
clf = LinearDiscriminantAnalysis()
pipe = Pipeline([('scaler', StandardScaler()), ('clf', clf)])

tprs = []
aucs = []
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']

for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    fpr, tpr, _ = roc_curve(y[test], y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    #plt.plot(fpr, tpr, lw=1, alpha=0.6, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc), c = colors[i])
    tpr = np.interp(base_fpr, fpr, tpr)  # scipy.interp was just an alias for np.interp
    tpr[0] = 0.0
    tprs.append(tpr)

tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
std = tprs.std(axis=0)

mean_auc = auc(base_fpr, mean_tprs)
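
The snippet stops right after computing mean_auc; a natural (assumed) continuation draws the averaged ROC with a one-standard-deviation band:

# Hypothetical continuation, reusing base_fpr/mean_tprs/std/mean_auc from above.
plt.plot(base_fpr, mean_tprs, 'b', label='Mean ROC (AUC = %0.2f)' % mean_auc)
plt.fill_between(base_fpr, np.maximum(mean_tprs - std, 0),
                 np.minimum(mean_tprs + std, 1), color='grey', alpha=0.3)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()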
    def crossValidation(self):
        kf = KFold(n_splits=10)
        inputData = kf.split(self.features)
        count = 1
        totalCM = np.zeros([2, 2])
        for train_index, test_index in inputData:
            # clf = DecisionTreeClassifier(max_depth=10)
            clf = RandomForestClassifier(n_estimators=50,
                                         max_depth=4,
                                         random_state=666)
            #clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10), n_estimators=20)
            #clf = MLPClassifier(hidden_layer_sizes=(200,))
            # clf = LogisticRegression(max_iter=2000, solver='lbfgs', C=100)
            #clf = GradientBoostingClassifier(n_estimators=50)
            # clf = SVC(C=0.8)
            clfList = [
                ["AD", AdaBoostClassifier(
                    base_estimator=DecisionTreeClassifier(max_depth=10),
                    n_estimators=20)],
                ["gbdt", GradientBoostingClassifier(n_estimators=20)],
                ["LR", LogisticRegression(max_iter=1000, solver='lbfgs', C=100)],
                ['decisionTree', DecisionTreeClassifier()],
                ['mlp', MLPClassifier(hidden_layer_sizes=(100,))],
                ['KNN', KNeighborsClassifier(n_neighbors=20)]]
            clf = VotingClassifier(clfList, voting='hard')
            X_train, X_test = self.features[train_index], self.features[
                test_index]

            #X_train, X_test = np.abs(X_train), np.abs(X_test)
            # print(X_train)
            y_train, y_test = self.labels[train_index], self.labels[test_index]

            def scala(x):
                res = []
                for i in range(len(x)):
                    res.append(x[i, :] / (0.0000001 + np.median(x[i, :])))
                return np.array(res)

            X_train, X_test = X_train[:, :], X_test[:, :]
            X_train, X_test = scala(X_train), scala(X_test)
            y_train, y_test = np.array(y_train), np.array(y_test)

            clf.fit(X_train, y_train)
            pred_train = clf.predict(X_train)
            y_train = list(y_train)
            cm = confusion_matrix(y_train, pred_train)
            print("第", count, "轮训练集里的表现\n", cm)
            pred = clf.predict(X_test)
            y_test = list(y_test)
            cm = confusion_matrix(y_test, pred, labels=[0, 1])
            totalCM = totalCM + np.array(cm)
            print("第", count, "轮测试集里的表现\n", cm)
            # print(X_test)
            print("####################################")
            count += 1
            print("混淆矩阵的和是\n", totalCM, "准确率是",
                  (totalCM[0][0] + totalCM[1][1]) / (sum(sum(totalCM))))