def _get_fold_generator(target_values):
    if params.stratified_cv:
        cv = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        cv.get_n_splits(target_values)
        fold_generator = cv.split(target_values, target_values)
    else:
        cv = KFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        fold_generator = cv.split(target_values)
    return fold_generator
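# Usage sketch (hypothetical): `params` and `cfg` stand in for the configuration
# objects the helper above expects; the values below are illustrative only.
import numpy as np
from types import SimpleNamespace
from sklearn.model_selection import KFold, StratifiedKFold

params = SimpleNamespace(stratified_cv=True, n_cv_splits=5)
cfg = SimpleNamespace(RANDOM_SEED=42)
target_values = np.array([0] * 50 + [1] * 50)
for fold, (train_idx, val_idx) in enumerate(_get_fold_generator(target_values)):
    print(fold, len(train_idx), len(val_idx))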
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, labels=None, expected_n_iter=5)
def Kfold(dataset, k, shuffle=False, stratify=False):
    """
    Envelop function for folding operation
    """
    # remove class labels
    data = dataset[0]
    if stratify:
        kf = StratifiedKFold(k, shuffle)
        return kf.split(dataset[0], dataset[1])

    kf = KFold(k, shuffle)
    return kf.split(data)
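# Usage sketch (hypothetical data): `dataset` is assumed to be a (features, labels)
# pair, matching how the wrapper above indexes dataset[0] and dataset[1].
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold  # needed by the wrapper above

dataset = (np.random.rand(20, 3), np.array([0, 1] * 10))
for train_idx, test_idx in Kfold(dataset, k=5, shuffle=True, stratify=True):
    print(len(train_idx), len(test_idx))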
Example #4
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)

    # Check that errors are raised if all n_labels for individual
    # classes are less than n_splits.
    y = np.array([3, 3, -1, -1, 2])

    assert_raises(ValueError, next, skf_3.split(X2, y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    error_string = ("k-fold cross-validation requires at least one"
                    " train/test split")
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 0)
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 1)

    # When n_splits is not an integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
Example #5
def cv_score(X, y, n_epochs = 10, n_folds=10, random_state=1999):
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i)) for i in range(n_folds)]
    folds = kf.split(X, y)
    #iteratively train epochs
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        i_fold = 0
        for itrain, itest in kfsplit:
            print('Fold ', i_fold)
            train = X[itrain,:]
            test = X[itest,:]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain, nb_epoch=1, 
                                               validation_split=None, batch_size=64,
                                               patience=1)

            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch

            # predict on oof
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score
            i_fold += 1
    return scores, val_scores, best_epochs
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 2])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_iter=3)

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)

    # When n_folds is not an integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_folds=4, shuffle=None)
Example #7
    def stratified_cross_validate(self, k):
        attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
        labels = np.append(self.training_labels, self.testing_labels, axis=0)

        all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])

        #print("all data : %s" % all_data)
        #print("")

        np.random.shuffle(all_data)

        X = all_data[:, :-1]
        y = all_data[:, -1]
        print(X.shape, y.shape)
        skf = StratifiedKFold(n_splits=2)
        print(skf.get_n_splits(X, y))
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            yield (X_train, y_train, X_test, y_test)

        #print("shuffled data : %s" % all_data)
        #print("")

        for i in range(k):
            split = len(all_data) // k  # integer fold size so the slices index correctly
            #print("split : %s" % split)

            test_data = all_data[i * split:(i + 1) * split, :]
            train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)

            train_input, train_output = train_data[:, :-1], train_data[:, -1]
            test_input, test_output = test_data[:, :-1], test_data[:, -1]

            yield (train_input, train_output, test_input, test_output)
Example #8
def test_datasets(dataset_names):
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)

    def separate_sets(x, y, test_fold_id, test_folds):
        x_test = x[test_folds == test_fold_id, :]
        y_test = y[test_folds == test_fold_id]

        x_train = x[test_folds != test_fold_id, :]
        y_train = y[test_folds != test_fold_id]
        return [x_train, y_train, x_test, y_test]

    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        test_fold = 0
        for train_idx, test_idx in skf.split(X=dataset.data, y=dataset.target):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]

            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100*np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
            test_fold += 1
    return accuracies
def cv(X_train, y_train):

    kfold = StratifiedKFold(n_splits=5, shuffle=True)

    scores_f = []
    scores_p = []
    scores_r = []


    for train, test in kfold.split(X_train, y_train):

        model = TargetEnsembler(features)
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["PCL_Strict3"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["PCL_Strict3"])
        model.fit(X_train_cv, y_train_cv)

        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
Example #10
def get_cv_results(design, data, cv_splits=10):
  test_df, unit_onehot, unit_x = data
  cv_results = []
  for i in range(design.shape[0]):
    lambda_int, lambda_x = design[i, :]
    val_losses = []
    for rep in range(3): # Almost like bootstrap. Reshuffling
      
      cv_val_losses = []
      skf = StratifiedKFold(n_splits=10, shuffle=True)
      for train_index, test_index in skf.split(unit_x, test_df['unit']):
         re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                 .01, .0001, .92)

         X_train = [test_df["x"][train_index], unit_onehot[train_index],
                    unit_x[train_index]]
         X_test = [test_df["x"][test_index], unit_onehot[test_index],
                    unit_x[test_index]]

         y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
         h = re_model.fit(X_train, y_train,
                          epochs = 15000, batch_size = 450,
                          validation_data = (X_test, y_test),
                          callbacks = callbacks, verbose = 0)
         cv_val_losses.append(np.min(h.history['val_loss']))

      val_losses.append(np.mean(cv_val_losses))
    cv_results.append(np.mean(val_losses)) 
  return cv_results
def classify(X,y, clf,**para):
    # y = profile["Loss"].as_matrix()
    # X = profile[features].as_matrix()

    kf = KFold(n_splits=10)
    skf = StratifiedKFold(n_splits=6)

    # print(**para)
    classifier = clf(**para)
    name = str(classifier).split("(")[0]


    # dt = tree.DecisionTreeClassifier(min_samples_split=min_split, max_depth=max_dep)
    print("{0} has been established with {1}".format(name, para))
    # lr = LogisticRegression(penalty='l1')

    for train_index, test_index in skf.split(X, y):
        #     print("TRAIN:",train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print("10-fold Score is: {0}".format(score))

    return classifier,y_test, y_pred
Example #12
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
Example #13
    def split_data(self, X, y, stratified = True, bad_chess = False):
        if bad_chess:
            n_points = int(X.shape[0] / self.nodes)
            for node in range(self.nodes):
                start_slice = node * n_points
                final_slice = start_slice + n_points
                dx = X[start_slice:final_slice]
                dy = y[start_slice:final_slice]

                frame_dx = pd.DataFrame(dx)
                frame_dy = pd.DataFrame(dy)

                file_data  = datas_path.joinpath('data_' + str(node) + '.csv')
                file_class = datas_path.joinpath('class_' + str(node) + '.csv')
                frame_dx.to_csv(file_data, index = False)
                frame_dy.to_csv(file_class, index = False)
        else:
            node = 0
            if stratified:
                skf  = StratifiedKFold(n_splits = self.nodes)
            else:
                skf  = KFold(n_splits = self.nodes, shuffle = True, random_state = 17)
            for splited_index in skf.split(X, y):
                new_X = pd.DataFrame(X[splited_index[1]])
                new_y = pd.DataFrame(y[splited_index[1]])

                X_path = datas_path.joinpath("data_" + str(node) + ".csv")
                y_path = datas_path.joinpath("class_" + str(node) + ".csv")
                new_X.to_csv(X_path, index = False)
                new_y.to_csv(y_path, index = False)
                node += 1
def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle, random_state=random_seed)
    folds.get_n_splits(X_train,y)
    #return stacking_proba for train set
    train_stacking_proba=np.zeros((X_train.shape[0],np.unique(y).shape[0]))
    score=0
    for i,(train_index, validate_index) in enumerate(folds.split(X_train, y)):
        # print(str(clf_name)+" folds:"+str(i+1)+"/"+str(nfolds))
        X_train_fold=X_train[train_index,:]
        y_train_fold=y[train_index]
        X_validate_fold=X_train[validate_index,:]
        y_validate_fold=y[validate_index]
        clf.fit(X_train_fold,y_train_fold)
        fold_preds=clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index,:]=fold_preds
        #validation
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score=len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        # print('validate '+metric+":"+str(fold_score))
        score+=fold_score
    score/=nfolds
    #return stacking_proba for test set
    clf.fit(X_train,y)
    test_stacking_proba=clf.predict_proba(X_test)

    if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba
        train_stacking_proba=train_stacking_proba[:,1]
        test_stacking_proba=test_stacking_proba[:,1]
    if return_score:
        return train_stacking_proba,test_stacking_proba,score
    else:
        return train_stacking_proba,test_stacking_proba
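# Usage sketch (hypothetical data and model): the out-of-fold probabilities for the
# training set and the full-fit probabilities for the test set would typically feed a
# second-level (meta) model in a stacking ensemble.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold  # needed by stacking_proba above

X_all, y_all = make_classification(n_samples=250, random_state=0)
X_tr, y_tr, X_te = X_all[:200], y_all[:200], X_all[200:]
train_meta, test_meta, acc = stacking_proba(LogisticRegression(max_iter=1000), X_tr, y_tr, X_te,
                                            nfolds=5, return_score=True, clf_name='lr')
print(train_meta.shape, test_meta.shape, round(acc, 3))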
Example #15
def split(dependent, independent, n_folds):
  skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
  for train_indices, test_indices in skf.split(dependent, independent):
    train_x = dependent[train_indices]
    train_y = independent[train_indices]
    test_x = dependent[test_indices]
    test_y = independent[test_indices]
    yield train_x, train_y, test_x, test_y
Example #16
def test_ovr_multinomial_iris():
    # Test that OvR and multinomial are correct using the iris dataset.
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # The cv indices from stratified kfold (where stratification is done based
    # on the fine-grained iris classes, i.e, before the classes 0 and 1 are
    # conflated) is used for both clf and clf1
    n_cv = 2
    cv = StratifiedKFold(n_cv)
    precomputed_folds = list(cv.split(train, target))

    # Train clf on the original dataset where classes 0 and 1 are separated
    clf = LogisticRegressionCV(cv=precomputed_folds)
    clf.fit(train, target)

    # Conflate classes 0 and 1 and train clf1 on this modified dataset
    clf1 = LogisticRegressionCV(cv=precomputed_folds)
    target_copy = target.copy()
    target_copy[target_copy == 0] = 1
    clf1.fit(train, target_copy)

    # Ensure that what OvR learns for class2 is same regardless of whether
    # classes 0 and 1 are separated or not
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Test the shape of various attributes.
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1))
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, n_cv, 10))

    # Test that for the iris data multinomial gives a better accuracy than OvR
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        max_iter = 2000 if solver in ['sag', 'saga'] else 15
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=1e-5 if solver in ['sag', 'saga'] else 1e-2,
            cv=2)
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # Test attributes of LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10,
                                                      n_features + 1))
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, n_cv, 10))
 def create_validation_split(self, n_folds=5, stratified=False):
     self.folds = n_folds
     if Path("cv_splits/train_cv_fold_0").is_file() is False:
         if stratified:
             skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
             idx = 0
             for train_index, test_index in skf.split(self.df_train[[self.id_colname]], self.df_train[[self.target_colname]]):
                 self.df_train[[self.id_colname]].loc[train_index, :].to_csv('cv_splits/train_cv_fold_{}'.format(idx), index=False)
                 self.df_train[[self.id_colname]].loc[test_index, :].to_csv('cv_splits/test_cv_fold_{}'.format(idx), index=False)
                 idx += 1
         else:
             skf = KFold(n_splits=n_folds, random_state=42, shuffle=True)
             idx = 0
             for train_index, test_index in skf.split(self.df_train[[self.id_colname]]):
                 self.df_train[[self.id_colname]].loc[train_index, :].to_csv('cv_splits/train_cv_fold_{}'.format(idx), index=False)
                 self.df_train[[self.id_colname]].loc[test_index, :].to_csv('cv_splits/test_cv_fold_{}'.format(idx), index=False)
                 idx += 1
         gc.collect()
def gen_folds(X, y, n_folds=5, random_state=0):
    from sklearn.model_selection import StratifiedKFold

    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)

    folds = kf.split(X, y)
    # iteratively train epochs
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    return kfsplit
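# Usage sketch (hypothetical data): the precomputed fold list returned by gen_folds can
# be iterated repeatedly, e.g. once per training epoch, without re-splitting.
import numpy as np
X = np.random.rand(100, 4)
y = np.array([0, 1] * 50)
folds = gen_folds(X, y, n_folds=5)
for epoch in range(3):
    for itrain, itest in folds:
        pass  # fit/evaluate a model on X[itrain] / X[itest] here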
Example #19
def categorical_average(variable, y, pred_0, feature_name):
    def calculate_average(sub1, sub2):
        s = pd.DataFrame(data = {
                                 variable: sub1.groupby(variable, as_index = False).count()[variable],                              
                                 'sumy': sub1.groupby(variable, as_index = False).sum()['y'],
                                 'avgY': sub1.groupby(variable, as_index = False).mean()['y'],
                                 'cnt': sub1.groupby(variable, as_index = False).count()['y']
                                 })
                                 
        tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable) 
        del tmp['index']                       
        tmp.loc[pd.isnull(tmp['cnt']), 'cnt'] = 0.0
        tmp.loc[pd.isnull(tmp['sumy']), 'sumy'] = 0.0

        def compute_beta(row):
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + exp((cnt - k) / f))
            
        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis = 1)
            
        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                   axis = 1)
                                   
        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                   axis = 1)
    
        return tmp['adj_avg'].ravel()
     
    #cv for training set 
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999 
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].ravel()):
        sub = pd.DataFrame(data = {variable: X_train[variable],
                                   'y': X_train[y],
                                   'pred_0': X_train[pred_0]})
            
        sub1 = sub.iloc[train_index]        
        sub2 = sub.iloc[cv_index]
        
        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)
    
    #for test set
    sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                'y': X_train[y],
                                'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                'y': X_test[y],
                                'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)                               
def stratifiedCV(X, y, n_splits = 6):

    skf = StratifiedKFold(n_splits=n_splits)

    for train_index, test_index in skf.split(X, y):
        #     print("TRAIN:",train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        yield X_train, y_train, X_test, y_test
Example #21
    def cv_stats(self):
        """Perform cross-validation for model evaluation.
        
        Returns
        -------
        (list[int], list[int], list[float], list)
            Tuple containing four lists of the same size:
                true labels
                predicted labels
                prediction probabilities
                significant features for each prediction
        """
        if 'y_true' in self._cache:
            return self._cache['y_true'], self._cache['y_pred'], self._cache['y_prob'], self._cache['sigfeatures']
        
        X = self._fe.X
        y = self._fe.y
        
        kf = StratifiedKFold(n_splits=10, shuffle=True)
        y_true, y_pred, y_prob = [], [], []
        sigfeatures = []

        order_indices = []

        for train_index, test_index in kf.split(X, y):
            order_indices.extend(test_index)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = self.get_new_classifier()
            clf.fit(X_train, y_train)

            pred = clf.predict(X_test)
            prob = clf.predict_proba(X_test)
            prob = np.choose(pred, prob.T)
            
            for predy in pred:
                sigfeatures.append(get_sig_features(predy, clf.coef_, 20))
            
            y_true.extend(y_test)
            y_pred.extend(pred)
            y_prob.extend(prob)
        # reorder the results so they match the order of original data
        y_true = [v for i, v in sorted(zip(order_indices, y_true))]
        y_pred = [v for i, v in sorted(zip(order_indices, y_pred))]
        y_prob = [v for i, v in sorted(zip(order_indices, y_prob))]
        assert list(y_true) == list(y)
        
        # cache the results
        self._cache['y_true'] = y_true
        self._cache['y_pred'] = y_pred
        self._cache['y_prob'] = y_prob
        self._cache['sigfeatures'] = sigfeatures
        
        return (y_true, y_pred, y_prob, sigfeatures) 
def get_cross_validated_confusion_matrix(data, label, estimator, index, nfolds=10):
    # nfolds = get_least_class(label)
    skf = StratifiedKFold(n_splits=nfolds)
    con_matrix = np.zeros((len(np.unique(label)), len(np.unique(label))))
    for train_index, test_index in skf.split(data, label):
        train_data, test_data = data[train_index], data[test_index]
        train_label, test_label = np.array(label)[train_index], np.array(label)[test_index]
        estimator.train_matrix(train_data, train_label)
        pred_label = estimator.predict(test_data)
        con_matrix = con_matrix + confusion_matrix(test_label, pred_label, labels = index)
    return con_matrix
def runCrossValidation(train, RFfile):

	train_tracks = []
	for feature in train:
		if feature[0] != 0.:
			train_tracks.append(feature)
	train_tracks = np.array(train_tracks)
	# Gets parameter values for training data
	trainArr = train_tracks[:,1:]
	# Gets class label of all training data
	trainRes = train_tracks[:,0]

	# Convert all NaNs to 0 for RF to work properly
	trainArr = np.nan_to_num(trainArr)
	trainRes = np.nan_to_num(trainRes)

	# Load the classifier
	rf = joblib.load(RFfile)

	# Stratified KFolds cross validation
	cv = StratifiedKFold(n_splits = 5)

	precision   = []
	accuracy    = []
	sensitivity = []
	matthews    = []
	r2          = []
	f1          = []
	auroc       = []
	cm          = [[0, 0], [0, 0]]

	for train_index, test_index in cv.split(trainArr, trainRes):
	    probas_     = rf.fit(trainArr[train_index], trainRes[train_index]).predict_proba(trainArr[test_index])
	    classes     = rf.fit(trainArr[train_index], trainRes[train_index]).predict(trainArr[test_index])
	   # r2          = np.append(r2, (r2_score(trainRes[test_index], probas_[:, 1])))
	    precision   = np.append(precision, (precision_score(trainRes[test_index], classes)))
	   # auroc       = np.append(auroc, (roc_auc_score(trainRes[test_index], classes)))
	    accuracy    = np.append(accuracy, (accuracy_score(trainRes[test_index], classes)))
	    sensitivity = np.append(sensitivity, (recall_score(trainRes[test_index], classes)))
	    f1          = np.append(f1, (f1_score(trainRes[test_index], classes)))
	   # matthews    = np.append(matthews, (matthews_corrcoef(trainRes[test_index], classes)))
	    #cma         = np.add(cma, (confusion_matrix(trainRes[test_index], classes)))

	# cma         = np.array(cma)
	# r2          = np.array(r2)
	precision   = np.array(precision)
	accuracy    = np.array(accuracy)
	sensitivity = np.array(sensitivity)
	f1          = np.array(f1)
	# auroc       = np.array(auroc)
	# matthews    = np.array(matthews)

	return accuracy, precision, sensitivity, f1
Example #24
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """

    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index, open(os.path.join(output_folder, dataset_name + '.json'), 'w'), indent=2
    )

    jvm.stop()
    warnings.filterwarnings('default')
Example #25
def rmseCvMean(model, X, y, cv=5, random_state=41):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    scr = 0
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scr += rmse(y_test, pred)
        print('\t', rmse(y_test, pred))
    return scr/cv
Example #26
def run_cv_evaluation(data, n_folds, nlu_config):
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from collections import defaultdict
    # type: (List[rasa_nlu.training_data.Message], int, RasaNLUConfig) -> Dict[Text, List[float]]
    """Stratified cross validation on data

    :param data: list of rasa_nlu.training_data.Message objects
    :param n_folds: integer, number of cv folds
    :param nlu_config: nlu config file
    :return: dictionary with key, list structure, where each entry in list
              corresponds to the relevant result for one fold

    """
    trainer = Trainer(nlu_config)
    results = defaultdict(list)

    y_true = [e.get("intent") for e in data]

    skf = StratifiedKFold(n_splits=n_folds, random_state=11, shuffle=True)
    counter = 1
    logger.info("Evaluation started")
    for train_index, test_index in skf.split(data, y_true):

        train = [data[i] for i in train_index]
        test = [data[i] for i in test_index]

        logger.debug("Fold: {}".format(counter))
        logger.debug("Training ...")
        trainer.train(TrainingData(training_examples=train))
        model_directory = trainer.persist("projects/")  # Returns the directory the model is stored in

        logger.debug("Evaluation ...")
        interpreter = Interpreter.load(model_directory, nlu_config)
        test_y = [e.get("intent") for e in test]

        preds = []
        for e in test:
            res = interpreter.parse(e.text)
            if res.get('intent'):
                preds.append(res['intent'].get('name'))
            else:
                preds.append(None)

        # compute fold metrics
        results["Accuracy"].append(metrics.accuracy_score(test_y, preds))
        results["F1-score"].append(metrics.f1_score(test_y, preds, average='weighted'))
        results["Precision"] = metrics.precision_score(test_y, preds, average='weighted')

        # increase fold counter
        counter += 1

    return dict(results)
Example #27
def cross_validation(sgd_clf, x_train, y_train):
    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in skfolds.split(x_train, y_train): #40000, 20000
        clone_clf = clone(sgd_clf)
        x_train_folds = x_train[train_index]
        y_train_folds = y_train[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
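# Usage sketch (hypothetical data): manual stratified cross-validation of an SGD
# classifier with the helper above; x_train / y_train stand in for the caller's arrays.
import numpy as np
from sklearn.base import clone                        # needed by cross_validation above
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold   # needed by cross_validation above

x_train = np.random.rand(300, 5)
y_train = np.array([0, 1, 2] * 100)
cross_validation(SGDClassifier(random_state=42), x_train, y_train)  # prints per-fold accuracy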
Example #28
 def run_cv_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001, power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9, beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
     
     # use k-fold cross validation
     
     # we need to standardize the data for the KNN learner
     pipe_clf = Pipeline([ ('scl', StandardScaler() ),
                           ('clf', MLPClassifier(alpha=alpha,
                                                 batch_size=batch_size,
                                                 learning_rate_init=learning_rate_init,
                                                 power_t=power_t,
                                                 max_iter=max_iter,
                                                 momentum=momentum,
                                                 beta_1=beta_1,
                                                 beta_2=beta_2,
                                                 hidden_layer_sizes=hidden_layer_sizes))])
 
     # resample the test data without replacement. This means that each data point is part of a test and
     # training set only once. (paraphrased from Raschka p.176). In Stratified KFold, the features are
     # evenly distributed such that each test and training set is an accurate representation of the whole
     # this is the 0.17 version
     #kfold = StratifiedKFold(y=self.y_train, n_folds=self.cv, random_state=0)
     
     # this is the 0.18dev version
     skf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=0)
     
     # do the cross validation
     train_scores = []
     test_scores = []
     #for k, (train, test) in enumerate(kfold):
     for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)):
         
         # run the learning algorithm
         pipe_clf.fit(self.x_train[train], self.y_train[train])
         train_score = pipe_clf.score(self.x_train[test], self.y_train[test])
         train_scores.append(train_score)
         test_score = pipe_clf.score(self.x_test, self.y_test)
         test_scores.append(test_score)
         print('Fold:', k+1, ', Training score:', train_score, ', Test score:', test_score)
     
     train_score = np.mean(train_scores)
     print('Training score is', train_score)
     
     test_score = np.mean(test_scores)
     print('Test score is', test_score)
     
     if do_plot:
         self.__plot_learning_curve(pipe_clf)
         
     return train_score, test_score  
def evaluate_classifier(clf, features, labels):
    """ 
        Evaluates the classifier using StratifiedKFold cross validation. The 
            precision and recall scores are used to evaluate the algorithm's 
            performance.
        
        clf = classifier
        features = features list as returned by the targetFeatureSplit script
        labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import StratifiedKFold
    
    ### Use StratifiedKFold cross validation with 10 folds
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    
    precision = []
    recall = []
    count = 0

    ### Split the features and labels into training and testing sets.
    for train_index, test_index in skf.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []

        for i in train_index:
            features_train.append(features[i])
            labels_train.append(labels[i])
                
        for j in test_index:
            features_test.append(features[j])
            labels_test.append(labels[j])
    
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        
        precision.append(precision_score(labels_test, pred))
        recall.append(recall_score(labels_test, pred))
        
        count += 1
    
    print(clf)
    print("Folds:", count)
    print("Average Precision:", sum(precision) / count)
    print("Average Recall:", sum(recall) / count)
    print("")
Example #30
def CrossVal(estimator, X, y,procsessor=None,cv=3,times=10,random_state=0,imb=False):
    """
    交叉验证
    
    estimator:
        模型
    
    X:
        数据集X部分
    
    y:
        数据集的label
    
    procsessor:
        预处理器,其实就是做特征选择
    
    cv:
        做cv折交叉验证
    
    times:
        重复times次交叉验证
        
    random_state:
        随机数种子
    
    imb:
        是否使用SMOTE使得正负样本数平衡
    
    """
    
    res=[]
    for t in range(times):
        skf=StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state+t)
        indices=list(skf.split(X=X,y=y))        
        for k in indices:
            x_train,y_train,x_test,y_test=X[k[0]],y[k[0]],X[k[1]],y[k[1]]              
            if(imb==True):
                n,p=__lableCount(y_train)
                rus=RandomUnderSampler(random_state=random_state+t)
                x_train,y_train=rus.fit_sample(x_train,y_train)         
            if(procsessor is not None):
                procsessor.fit(x_train,y_train)
                x_train,y_train=procsessor.transform(x_train,y_train)
                x_test,y_test=procsessor.transform(x_test,y_test)
            estimator.fit(x_train,y_train)
            res.append(Metrics.Score(estimator,x_test,y_test))                
    res=np.array(res)
    return res
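# Usage sketch (hypothetical): `Metrics.Score` in CrossVal above is a project-specific
# helper, so a minimal stand-in scorer is stubbed here; imb and procsessor are left at
# their defaults so the resampling/preprocessing paths are not exercised.
import numpy as np
from types import SimpleNamespace
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold   # needed by CrossVal above

Metrics = SimpleNamespace(Score=lambda est, X, y: est.score(X, y))  # stand-in scorer
X = np.random.rand(120, 6)
y = np.array([0, 1] * 60)
scores = CrossVal(RandomForestClassifier(n_estimators=50), X, y, cv=3, times=2)
print(scores.mean())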
Example #31
import csv
f = open('data.csv', 'w')
writer = csv.writer(f, lineterminator="\n")
writer.writerow(data)
f.close()
f = open('label.csv', 'w')
writer = csv.writer(f, lineterminator="\n")
writer.writerow(data_label)
f.close()
"""generate a SVM classifier"""
classifier = svm.SVC()
"""train cross validation"""
valid_score = []
kfold = StratifiedKFold(n_splits=5, shuffle=False)  # random_state only applies when shuffle=True
count = 0
for train_index, valid_index in kfold.split(np.array([0] * 3000),
                                            np.array([0] * 3000)):
    print('<<<<<COUNT>>>>> ' + str(count))
    classifier.fit(train[train_index], train_label[train_index])

    predicted = classifier.predict(train[valid_index])
    confus = metrics.confusion_matrix(train_label[valid_index], predicted)
    acc = (confus[0][0] + confus[1][1]) / sum(sum(confus))
    valid_score.extend([acc])
    count = count + 1
print("valid: %.2f%% (+/- %.2f%%)" %
      (np.mean(valid_score) * 100, np.std(valid_score) * 100))
# train model: classifier.fit(data: n_samples x n_features, target: n_samples x n_labels)
"""test model"""
expected = test_label
predicted = classifier.predict(test)
confus = metrics.confusion_matrix(expected, predicted)
Example #32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        help='Provide the dataset name')
    parser.add_argument('--crossvalidation',
                        default=False,
                        action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--gridsearch',
                        default=False,
                        action='store_true',
                        help='Enable grid search')
    parser.add_argument('--sinkhorn',
                        default=False,
                        action='store_true',
                        help='Use sinkhorn approximation')
    parser.add_argument('--h',
                        type=int,
                        required=False,
                        default=2,
                        help="(Max) number of WL iterations")
    parser.add_argument('--type', type=str, default='continuous')

    args = parser.parse_args()
    dataset = args.dataset
    h = args.h
    sinkhorn = args.sinkhorn
    typ = args.type
    if typ != 'discrete' and typ != 'continuous' and typ != 'both':
        print('Type error!')
        exit(-1)
    print(f'Generating results for {dataset}...')
    #---------------------------------
    # Setup
    #---------------------------------
    # Start by making directories for intermediate and final files
    data_path = 'data'
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)

    for path in [output_path, results_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    #---------------------------------
    # Embeddings
    #---------------------------------
    # Load the data and generate the embeddings
    # embedding_type = 'continuous' # if dataset == 'ENZYMES' else 'discrete'
    # print(f'Generating {embedding_type} embeddings for {dataset}.')
    node_labels, node_features, adj_mat, n_nodes, edge_features, y = load_continuous_graphs(
        dataset)
    if typ != 'discrete':
        label_sequences_continuous = compute_wl_embeddings_continuous(
            node_features, adj_mat, edge_features, n_nodes, h)
    if typ != 'continuous':
        label_sequences_discrete = compute_wl_embeddings_discrete(
            adj_mat, node_labels, h)

    # Save embeddings to output folder
    # out_name = f'{dataset}_wl_{embedding_type}_embeddings_h{h}.npy'
    # np.save(os.path.join(output_path, out_name), label_sequences)
    # print(f'Embeddings for {dataset} computed, saved to {os.path.join(output_path, out_name)}.')
    print()

    #---------------------------------
    # Wasserstein & Kernel computations
    #---------------------------------
    # Run Wasserstein distance computation
    print('Computing the Wasserstein distances...')
    if typ != 'discrete':
        wasserstein_distances_continuous = compute_wasserstein_distance(
            label_sequences_continuous, h, sinkhorn=sinkhorn, discrete=False)
    if typ != 'continuous':
        wasserstein_distances_discrete = compute_wasserstein_distance(
            label_sequences_discrete, h, sinkhorn=sinkhorn, discrete=True)

    if typ == 'discrete':
        wasserstein_distances = wasserstein_distances_discrete
    elif typ == 'continuous':
        wasserstein_distances = wasserstein_distances_continuous
    elif typ == 'both':
        wasserstein_distances = []
        for h in range(len(wasserstein_distances_discrete)):
            M = wasserstein_distances_continuous[
                h] * wasserstein_distances_discrete[h]
            wasserstein_distances.append(M)
    else:
        print('Type error!')
        exit(-1)
    print('Wasserstein distances computation done')
    print()

    # Transform to Kernel
    # Here the flags come into play
    if args.gridsearch:
        # Gammas in exp(-gamma*M):
        gammas = np.logspace(-4, 1, num=6)
        # iterate over the iterations too
        hs = range(h)
        param_grid = [{'C': np.logspace(-3, 3, num=7)}]
    else:
        gammas = [0.001]
        hs = [h]

    kernel_matrices = []
    kernel_params = []
    for i, current_h in enumerate(hs):
        # Generate the full list of kernel matrices from which to select
        M = wasserstein_distances[current_h]
        for g in gammas:
            K = np.exp(-g * M)
            kernel_matrices.append(K)
            kernel_params.append((current_h, g))

    # Check for no hyperparam:
    if not args.gridsearch:
        assert len(kernel_matrices) == 1
    print('Kernel matrices computed.')
    print()

    #---------------------------------
    # Classification
    #---------------------------------
    # Run hyperparameter search if needed
    print(
        f'Running SVMs, crossvalidation: {args.crossvalidation}, gridsearch: {args.gridsearch}.'
    )

    cv_scores = []
    for cv_time in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        # np.random.seed(42)

        cv = StratifiedKFold(n_splits=10, shuffle=True)
        # Hyperparam logging
        best_C = []
        best_h = []
        best_gamma = []

        for train_index, test_index in cv.split(kernel_matrices[0], y):
            K_train = [K[train_index][:, train_index] for K in kernel_matrices]
            K_test = [K[test_index][:, train_index] for K in kernel_matrices]
            y_train, y_test = y[train_index], y[test_index]

            # Gridsearch
            if args.gridsearch:
                gs, best_params = custom_grid_search_cv(
                    SVC(kernel='precomputed'),
                    param_grid,
                    K_train,
                    y_train,
                    cv=5)
                # Store best params
                C_ = best_params['params']['C']
                h_, gamma_ = kernel_params[best_params['K_idx']]
                y_pred = gs.predict(K_test[best_params['K_idx']])
            else:
                gs = SVC(C=100, kernel='precomputed').fit(K_train[0], y_train)
                y_pred = gs.predict(K_test[0])
                h_, gamma_, C_ = h, gammas[0], 100
            best_C.append(C_)
            best_h.append(h_)
            best_gamma.append(gamma_)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            if not args.crossvalidation:
                break

        #---------------------------------
        # Printing and logging
        #---------------------------------
        if args.crossvalidation:
            print('Mean 10-fold accuracy {}: {:2.2f} +- {:2.2f} %'.format(
                cv_time,
                np.mean(accuracy_scores) * 100,
                np.std(accuracy_scores) * 100))
        else:
            print('Final accuracy: {:2.3f} %'.format(np.mean(accuracy_scores)))
        cv_scores.append(np.mean(accuracy_scores))

        # Save to file
        # if args.crossvalidation or args.gridsearch:
        #     extension = ''
        #     if args.crossvalidation:
        #         extension += '_crossvalidation'
        #     if args.gridsearch:
        #         extension += '_gridsearch'
        #     results_filename = os.path.join(results_path, f'results_{dataset}'+extension+'.csv')
        #     n_splits = 10 if args.crossvalidation else 1
        #     pd.DataFrame(np.array([best_h, best_C, best_gamma, accuracy_scores]).T,
        #             columns=[['h', 'C', 'gamma', 'accuracy']],
        #             index=['fold_id{}'.format(i) for i in range(n_splits)]).to_csv(results_filename)
        #     print(f'Results saved in {results_filename}.')
        # else:
        #     print('No results saved to file as --crossvalidation or --gridsearch were not selected.')
    print('Mean 10-times 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
        np.mean(cv_scores) * 100,
        np.std(cv_scores) * 100))
Example #33
def load_model_data(dataset_name,
                    k_fold=5,
                    dataset_autobalance=False,
                    print_dataset_info=True):
    '''

	:param dataset_name: name of the dataset to use
	:param k_fold: the number of folds to split the dataset
	:param dataset_autobalance: whether to balance the dataset by class distribution if it is too skewed.
	:param print_dataset_info: whether to print information on the dataset
	:return:
	'''
    print(
        'load_data.py load_model_data(): Unserialising pickled dataset into Graph objects'
    )

    # Perform unserialisation
    graph_list, graph_labels_mapping_dict, node_labels_mapping_dict, node_label_flag, node_feature_flag =\
     unserialize_pickle(dataset_name)

    # Count the number of labels, and form a graph label list for kfold split later
    label_count_list = [0 for _ in range(len(graph_labels_mapping_dict))]
    graph_labels = []
    for graph in graph_list:
        label_count_list[graph.label] += 1
        graph_labels.append(graph.label)

    # If the dataset is too imbalanced, perform balancing operation using under-sampling
    if dataset_autobalance and len(label_count_list) == 2:
        balance_ratio = min(label_count_list[0] / label_count_list[1],
                            label_count_list[1] / label_count_list[0])
        ideal_balance_ratio = 0.5

        if balance_ratio < ideal_balance_ratio:
            print(
                "load_data.py: Dataset is too imbalanced at %s, restoring to at least %s now."
                % (str(round(balance_ratio, 3)), str(ideal_balance_ratio)))
            # Split the graph list by class so the majority class can be under-sampled
            graph_split = [[g for g in graph_list if g.label == 0],
                           [g for g in graph_list if g.label == 1]]
            if label_count_list[0] > label_count_list[1]:
                endslice = round(
                    len(graph_split[1]) / ideal_balance_ratio -
                    len(graph_split[1]))
                graph_list = graph_split[0][:endslice] + graph_split[1]
                graph_labels = [1 for _ in range(endslice)
                                ] + [0 for _ in range(len(graph_split[1]))]
            else:
                endslice = round(
                    len(graph_split[0]) / ideal_balance_ratio -
                    len(graph_split[0]))
                graph_list = graph_split[1][:endslice] + graph_split[0]
                graph_labels = [1 for _ in range(endslice)
                                ] + [0 for _ in range(len(graph_split[0]))]

        # Recalculate label_count_list again:
        label_count_list = [0 for _ in range(len(label_count_list))]
        for label in graph_labels:
            label_count_list[label] += 1

    # Set useful dataset features into a dictionary to be passed to main later
    dataset_features = {}
    dataset_features['name'] = dataset_name
    dataset_features['num_class'] = len(graph_labels_mapping_dict)
    dataset_features['label_dict'] = graph_labels_mapping_dict
    dataset_features['have_node_labels'] = node_label_flag
    dataset_features['have_node_attributions'] = node_feature_flag
    dataset_features['node_dict'] = node_labels_mapping_dict
    dataset_features['feat_dim'] = len(node_labels_mapping_dict)
    dataset_features['edge_feat_dim'] = 0
    graph_sizes_list = [graph.number_of_nodes for graph in graph_list]
    dataset_features['max_num_nodes'] = max(graph_sizes_list)
    dataset_features['avg_num_nodes'] = round(
        sum(graph_sizes_list) / len(graph_sizes_list))
    dataset_features['graph_sizes_list'] = graph_sizes_list

    if node_feature_flag == True:
        dataset_features['attr_dim'] = graph_list[0].node_features.shape[1]
    else:
        dataset_features['attr_dim'] = 0

    # If verbose on dataset features
    if print_dataset_info:
        # Get class distribution of graphs
        class_distribution_dict = {}
        inverse_graph_label_dict = {
            v: k
            for k, v in graph_labels_mapping_dict.items()
        }
        inverse_node_label_dict = {
            v: k
            for k, v in node_labels_mapping_dict.items()
        }

        for i in range(len(label_count_list)):
            class_distribution_dict[
                inverse_graph_label_dict[i]] = label_count_list[i]

        # Get node statistics
        unique_node_labels_count_list = []
        unique_node_features_per_graph_count_list = []
        unique_node_features_per_node_count_list = []
        node_labels_count_dict = {}

        if graph.node_labels is not None:
            for graph in graph_list:
                unique_node_labels_count_list.append(
                    len(graph.unique_node_labels))
                for node_label in graph.node_labels:
                    original_node_label = inverse_node_label_dict[node_label]
                    if original_node_label not in node_labels_count_dict.keys(
                    ):
                        node_labels_count_dict[original_node_label] = 1
                    else:
                        node_labels_count_dict[original_node_label] += 1

        if graph.node_features is not None:
            for graph in graph_list:
                sum_node_features_in_graph = [
                    sum(x) for x in zip(*graph.node_features)
                ]
                unique_node_features_per_graph_count_list.append(
                    sum([_ > 0 for _ in sum_node_features_in_graph]))
                for node_feature in graph.node_features:
                    unique_node_features_per_node_count_list.append(
                        sum([_ > 0 for _ in node_feature]))

        # Get Edge statistics
        edge_count_list = []
        for graph in graph_list:
            edge_count_list.append(len(graph.edge_pairs) / 2)

        # Build verbose message
        dataset_features_string = "==== Dataset Information ====\n"
        dataset_features_string += "== General Information == \n"
        dataset_features_string += "Number of graphs: " + str(
            len(graph_list)) + "\n"
        dataset_features_string += "Number of classes: " + str(
            dataset_features['num_class']) + "\n"
        dataset_features_string += "Class distribution: \n"

        for key in sorted(class_distribution_dict.keys()):
            dataset_features_string += '{}:{} '.format(
                key, class_distribution_dict[key])

        dataset_features_string += "\n\n"
        dataset_features_string += "== Node information== \n"
        dataset_features_string += "Average number of nodes: " + str(
            dataset_features['avg_num_nodes']) + "\n"
        dataset_features_string += "Average number of edges (undirected): " + \
                 str(round(sum(edge_count_list)/len(graph_list))) + "\n"
        dataset_features_string += "Max number of nodes: " + str(
            dataset_features['max_num_nodes']) + "\n"

        if graph.node_labels is not None:
            dataset_features_string += "Number of distinct node labels: " + str(
                len(node_labels_count_dict)) + "\n"
            dataset_features_string += "Average number of distinct node labels: " + \
                     str(round(sum(unique_node_labels_count_list)/len(graph_list))) + "\n"
            dataset_features_string += "Node labels distribution: " + "\n"

            for node_label in sorted(node_labels_count_dict.keys()):
                dataset_features_string += '{}:{} '.format(
                    node_label, node_labels_count_dict[node_label])

        if graph.node_features is not None:
            dataset_features_string += "Average number of distinct node features per graph: " + \
                    str(round(sum(unique_node_features_per_graph_count_list)/len(graph_list))) + "\n"

            dataset_features_string += "Average number of distinct node features per node: " + \
                    str(round(sum(unique_node_features_per_node_count_list)/
                     len(unique_node_features_per_node_count_list))) + "\n"
        dataset_features_string += "\n"

        dataset_features["dataset_info"] = dataset_features_string
        print(dataset_features_string)

    # If no test number is specified, use stratified KFold sampling for train test split
    stratified_KFold = StratifiedKFold(n_splits=k_fold,
                                       shuffle=True,
                                       random_state=None)
    i = 0

    train_graphs = []
    test_graphs = []
    for train_index, test_index in stratified_KFold.split(
            graph_list, graph_labels):
        train_graphs.append([graph_list[i] for i in train_index])
        test_graphs.append([graph_list[i] for i in test_index])

    return train_graphs, test_graphs, dataset_features
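# A minimal toy sketch (not the graph loader above; graph_list/graph_labels here are
# placeholders) showing why stratified splitting is used for the train/test folds:
# every test fold keeps roughly the same class ratio as the full label set.
import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedKFold

graph_labels = np.array([0] * 80 + [1] * 20)   # imbalanced toy labels (80/20)
graph_list = list(range(len(graph_labels)))    # stand-ins for graph objects

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in skf.split(graph_list, graph_labels):
    train_graphs = [graph_list[i] for i in train_index]
    test_graphs = [graph_list[i] for i in test_index]
    print(Counter(graph_labels[test_index]))   # each fold ~ Counter({0: 16, 1: 4})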
Beispiel #34
0
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """
	LightGBM with KFold or Stratified KFold
	Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
	:param df:
	:param num_folds:
	:param stratified:
	:param debug:
	:return:
	"""
    # Divide in training / validation and testing data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create array and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_PREV', 'index']
    ]
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, -1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, -1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = clf.feature_importances_
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df])
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print("Full AUC score %.6f" % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR',
                 'TARGET']].to_csv(os.path.join('./submission/',
                                                submission_file_name),
                                   index=False)

    # Display feature importance
    display_importances(feature_importance_df)

    return feature_importance_df
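# kfold_lightgbm above follows the common out-of-fold (OOF) pattern: validation
# predictions are written into oof_preds at their original row positions and the
# test predictions are averaged over the folds. A self-contained sketch of that
# pattern with toy data and a plain sklearn classifier (not the Home Credit setup):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=500, random_state=0)
X_test, _ = make_classification(n_samples=100, random_state=1)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
oof_preds = np.zeros(len(y))        # one out-of-fold prediction per training row
sub_preds = np.zeros(len(X_test))   # fold-averaged predictions for the test set

for train_idx, valid_idx in folds.split(X, y):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X[train_idx], y[train_idx])
    oof_preds[valid_idx] = clf.predict_proba(X[valid_idx])[:, 1]
    sub_preds += clf.predict_proba(X_test)[:, 1] / folds.n_splits

print("OOF AUC: %.4f" % roc_auc_score(y, oof_preds))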
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TSVFILE = cfg.values.tsvfile
    #
    # early_stopping = EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    # early_stopping_patience : how many epochs to wait without improvement?
    # early_stopping_threshold : by how much must the metric improve to count?

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    # model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config = ElectraConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/" + TSVFILE)
    whole_label = whole_df['label'].values
    # tokenizer_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Tokenizer')
    # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,  # output directory
        save_total_limit=cfg.values.train_args.
        save_total_limit,  # number of total save model.
        save_steps=cfg.values.train_args.save_steps,  # model saving step.
        num_train_epochs=cfg.values.train_args.
        num_epochs,  # total number of training epochs
        learning_rate=cfg.values.train_args.lr,  # learning_rate
        fp16=True,
        per_device_train_batch_size=cfg.values.train_args.
        train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.
        eval_batch_size,  # batch size for evaluation
        warmup_steps=cfg.values.train_args.
        warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=cfg.values.train_args.
        weight_decay,  # strength of weight decay
        logging_dir=cfg.values.train_args.
        logging_dir,  # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,  # log saving step.
        evaluation_strategy=cfg.values.train_args.
        evaluation_strategy,  # evaluation strategy to adopt during training
        dataloader_num_workers=4,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        greater_is_better=True,
        metric_for_best_model=cfg.values.train_args.metric_for_best_model,
        # lr_scheduler_type='get_cosine_with_hard_restarts_schedule_with_warmup'
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=cfg.values.train_args.eval_steps,  # evaluation step.
        load_best_model_at_end=cfg.values.train_args.load_best_model_at_end)

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            val_df = whole_df.iloc[val_idx]

            tokenized_train = tokenized_dataset(train_df, tokenizer)
            tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            # model_module = getattr(import_module('transformers'), cfg.values.model_arc + 'ForSequenceClassification')
            # model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            model = ElectraForSequenceClassification.from_pretrained(
                MODEL_NAME, config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            trainer = Trainer(
                model=
                model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics  # define metrics function
            )
            k += 1
            # train model
            trainer.train()
            if cfg.values.val_args.fold_break:
                break

    else:
        cpprint('=' * 20 + f'START TRAINING' + '=' * 20)

        train_df, val_df = train_test_split(
            whole_df,
            test_size=cfg.values.val_args.test_size,
            random_state=SEED)

        tokenized_train = tokenized_dataset(train_df, tokenizer)
        tokenized_val = tokenized_dataset(val_df, tokenizer)

        RE_train_dataset = RE_Dataset(tokenized_train,
                                      train_df['label'].values)
        RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

        # model_module = getattr(import_module('transformers'), cfg.values.model_arc + 'ForSequenceClassification')
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, config=model_config)
        model.parameters
        model.to(device)

        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_val_dataset,  # evaluation dataset
            compute_metrics=compute_metrics,  # define metrics function
        )

        # train model
        trainer.train()
Beispiel #36
0
def save_validationlist(root='.'):
    # list up filenames of valid data
    # totalfiles = glob.glob(os.path.join(root,"test_20??_withUPID","*.dcm"))
    # filenames = glob.glob(os.path.join(root,"test_20??_withUPID","*_[0-3]_[0-3].dcm"))
    data_dir = ["final_dcm", "final_crop"][0]
    logger.info('[' * 10 + ' ' * 20 + 'START ANALYSIS' + ' ' * 20 + ']' * 10)
    filenames = glob.glob(
        os.path.join(root, data_dir,
                     "*" + (".dcm" if data_dir == 'final_dcm' else '.jpg')))
    logger.info(f'No. of total datasets : {len(filenames)} patients')  # 6516

    rmfn = glob.glob(
        os.path.join(root, data_dir, "*_x_x" +
                     (".dcm" if data_dir == 'final_dcm' else '.jpg')))
    if len(rmfn) > 0:
        logger.info('  x_x.dcm  :')
        logger.info(rmfn)
        for fn in rmfn:
            filenames.remove(fn)
    logger.info(
        f'No. of valid datasets : {len(filenames)} patients (excluded x_x.dcm )'
    )  #2980 (20.10.7 ver)

    cvdf = prepare_metatable(filenames)

    n_folds = 10
    plen = len(filenames)
    logger.info(f'----- Split patients for {n_folds} Cross-validation')

    skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
    for ii, (train_pindex, test_pindex) in enumerate(
            skf.split(range(plen), cvdf['left_label'])):
        # record fold index
        cvdf.at[test_pindex, 'FOLD'] = ii
        cvdf[f'FOLD{ii}_testset'] = 0
        cvdf.at[test_pindex, f'FOLD{ii}_testset'] = 1

    # save metadata
    filelist_dir = os.path.join(root, 'inputlist')
    os.makedirs(filelist_dir, exist_ok=True)

    cvdf.to_csv(os.path.join(filelist_dir, "input_metadata_table.csv"),
                index=False)
    cvdf[['index',
          'filename']].to_csv(os.path.join(filelist_dir,
                                           "input_filenames_total.csv"),
                              index=False)
    for i in range(n_folds):
        cvdf.loc[cvdf[f'FOLD{i}_testset'] == 1,
                 'filename'].to_csv(os.path.join(
                     filelist_dir, f"input_filenames_fold{i}.csv"),
                                    index=False)

    # statistics
    logger.info('----- Data statistics by fold')

    logger.info(cvdf['FOLD'].value_counts())

    labelfreq_left = pd.crosstab(cvdf['FOLD'],
                                 cvdf['left_label'],
                                 margins=True)
    labelfreq_left_ratio = pd.crosstab(cvdf['FOLD'],
                                       cvdf['left_label'],
                                       margins=True,
                                       normalize='index')
    labelfreq_right = pd.crosstab(cvdf['FOLD'],
                                  cvdf['right_label'],
                                  margins=True)
    labelfreq_right_ratio = pd.crosstab(cvdf['FOLD'],
                                        cvdf['right_label'],
                                        margins=True,
                                        normalize='index')

    labelfreq = pd.concat([labelfreq_left, labelfreq_right],
                          axis=1,
                          keys=['left_sinus', 'right_sinus'],
                          names=[' ', 'label'])
    labelfreq_ratio = pd.concat([labelfreq_left_ratio, labelfreq_right_ratio],
                                axis=1,
                                keys=['left_sinus', 'right_sinus'],
                                names=[' ', 'label (ratio)'])

    labelfreq.to_csv(os.path.join(filelist_dir, f"label_freq_byfold.csv"))
    labelfreq_ratio.to_csv(os.path.join(filelist_dir,
                                        f"label_freq_ratio_byfold.csv"),
                           float_format='%.2f')
    logger.info(f'----- Label frequency by fold')
    logger.info(labelfreq)
    logger.info(f'----- Label frequency (ratio) by fold')
    logger.info(labelfreq_ratio)
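# save_validationlist records a per-row fold id in a 'FOLD' column and then uses
# pd.crosstab to check label balance. A hypothetical minimal version of that
# bookkeeping with toy labels and no file I/O:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

cvdf = pd.DataFrame({'left_label': np.random.RandomState(0).randint(0, 3, size=60)})
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for ii, (_, test_pindex) in enumerate(skf.split(np.zeros(len(cvdf)), cvdf['left_label'])):
    cvdf.loc[test_pindex, 'FOLD'] = ii            # fold id per row
    cvdf[f'FOLD{ii}_testset'] = 0
    cvdf.loc[test_pindex, f'FOLD{ii}_testset'] = 1

# every fold should hold roughly the same label mix
print(pd.crosstab(cvdf['FOLD'], cvdf['left_label'], normalize='index'))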
df = pd.DataFrame(data=X_reduced, columns=features_name)
df["label"] = y
g = sns.PairGrid(df, hue='label')
g.map(sns.scatterplot)
plt.show()

x_data = df.iloc[:, 0:-1]
y_data = df["label"]

C = 1.0  #SVM regularization parameter
kf = StratifiedKFold(n_splits=20, shuffle=True)

clfs = []
scores = []

for i, (train_index, test_index) in enumerate(kf.split(x_data, y_data)):
    #row 1,4,7,8,10,11,15.... -> training
    #row 2,3,... ->testing

    X_train, X_test = x_data.iloc[train_index], x_data.iloc[test_index]
    Y_train, Y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    clf = svm.SVC(kernel='linear', C=C, probability=True)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    print(score)
    clfs.append(clf)
    scores.append(score)

best_accuracy = np.argsort(scores)[::-1][0]  # index of the best-scoring fold
clf = clfs[best_accuracy]  # keep the classifier trained on that fold
def train(train_path, tokenizer_path):
    print('import data...')
    maxlen = 1024
    X, label, Y = text2sequence(train_path, tokenizer_path, maxlen)
    num_class = len(set(label))
    print('data import finished!')
    tokenizer = pickle.load(open(tokenizer_path, 'rb'))
    num_words = len(tokenizer.word_index) + 1
    print('prepare training data and validation data using k_fold')
    seed = 0
    k = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    # split the dataset for 10-fold cross-validation

    cw_1 = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}  # ignore class imbalance
    cw_2 = {
        0: 0.348709,
        1: 3.457910,
        2: 1.451396,
        3: 2.116922,
        4: 17.358700,
        5: 0.404727,
        6: 3.370635,
        7: 1.167362
    }  # weight per class = 1 / (8 * class frequency)
    class_weight = [cw_1, cw_2]  # treat both weight schemes as equally important
    # Testing on a 100-document dataset showed that training without class_weight
    # works better than with it, and that using class_weight works better than using cw_2 alone

    print('create lstm model...')
    model = Sequential()
    model.add(Embedding(num_words, 128, input_length=maxlen))
    model.add(Dropout(0.5))
    model.add(LSTM(64, recurrent_dropout=0.5))
    model.add(Dropout(0.5))
    model.add(Dense(num_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())

    k_fold_cv_loss = []
    k_fold_cv_acc = []

    dt = datetime.now()
    d = dt.date()
    h = dt.time().hour
    m = dt.time().minute
    time_str = '{}_{}{}'.format(d, h, m)

    mckpt = ModelCheckpoint('model/best-lstm_weights_{}.h5'.format(time_str),
                            monitor='val_loss',
                            mode='auto',
                            verbose=1,
                            save_best_only=True,
                            save_weights_only=True,
                            period=1)
    rlstp = EarlyStopping(monitor='val_loss', patience=3)
    tb = TensorBoard(log_dir='./logs',
                     embeddings_freq=1,
                     write_images=1,
                     histogram_freq=1,
                     batch_size=32)
    turn = 1
    for train, valid in k_fold.split(X, label):
        print('the {} turn training...'.format(turn))
        turn += 1
        model.fit(X[train],
                  Y[train],
                  validation_data=(X[valid], Y[valid]),
                  class_weight=None,
                  callbacks=[mckpt],
                  verbose=2,
                  epochs=11,
                  batch_size=32)
        # Evaluate model
        loss, acc = model.evaluate(X[valid],
                                   Y[valid],
                                   verbose=0,
                                   batch_size=32)
        k_fold_cv_loss.append(loss)
        k_fold_cv_acc.append(acc)

    print("Model loss: {:0.6f}".format(np.mean(k_fold_cv_loss)))
    print("Model Accuracy: {:0.6f}%".format(np.mean(k_fold_cv_acc) * 100))

    # Save model
    model.save_weights('model/lstm_weights_{}.h5'.format(time_str))
    model.save('model/lstm_model_{}.h5'.format(time_str))
    with open('model/lstm_model_{}.json'.format(time_str), 'w') as outfile:
        outfile.write(model.to_json())
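# The hard-coded cw_2 dictionary above encodes inverse-frequency class weights,
# i.e. 1 / (n_classes * class frequency). A small sketch of deriving such weights
# from the labels themselves (the label array is a toy placeholder):
import numpy as np

label = np.array([0, 0, 0, 0, 1, 2, 2, 3, 3, 3])    # toy labels
classes, counts = np.unique(label, return_counts=True)
freq = counts / counts.sum()
class_weight = {int(c): 1.0 / (len(classes) * f) for c, f in zip(classes, freq)}
print(class_weight)
# sklearn offers the same idea built in:
# from sklearn.utils.class_weight import compute_class_weight
# compute_class_weight('balanced', classes=classes, y=label)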
Beispiel #39
0
def main():
    parser = argparse.ArgumentParser()
    # IO-specific
    parser.add_argument(
        '-f',
        '--fcs',
        required=True,
        help='file specifying the FCS file names and corresponding labels')
    parser.add_argument(
        '-m',
        '--markers',
        required=True,
        help='file specifying the names of markers to be used for analysis')
    parser.add_argument('-i',
                        '--indir',
                        default='./',
                        help='directory where input FCS files are located')
    parser.add_argument('-o',
                        '--outdir',
                        default='output',
                        help='directory where output will be generated')
    parser.add_argument('-p',
                        '--plot',
                        action='store_true',
                        default=True,
                        help='whether to plot results ')
    parser.add_argument('--export_selected_cells',
                        action='store_true',
                        default=False,
                        help='whether to export selected cell populations')
    parser.add_argument('--export_csv',
                        action='store_true',
                        default=False,
                        help='whether to export network weights as csv files')
    parser.add_argument('-l',
                        '--load_results',
                        action='store_true',
                        default=False,
                        help='whether to load precomputed results')

    # data preprocessing
    parser.add_argument('--train_perc',
                        type=float,
                        default=0.75,
                        help='percentage of samples to be used for training')
    parser.add_argument('--arcsinh',
                        dest='arcsinh',
                        action='store_true',
                        help='preprocess the data with arcsinh')
    parser.add_argument('--no_arcsinh',
                        dest='arcsinh',
                        action='store_false',
                        help='do not preprocess the data with arcsinh')
    parser.set_defaults(arcsinh=True)
    parser.add_argument('--cofactor',
                        type=int,
                        default=5,
                        help='cofactor for the arcsinh transform')
    parser.add_argument(
        '--scale',
        dest='scale',
        action='store_true',
        help='z-transform features (mean=0, std=1) prior to training')
    parser.add_argument(
        '--no_scale',
        dest='scale',
        action='store_false',
        help='do not z-transform features (mean=0, std=1) prior to training')
    parser.set_defaults(scale=True)
    parser.add_argument(
        '--quant_normed',
        action='store_true',
        default=False,
        help=
        'only use this option if the input data already lies in the [0, 1] interval, e.g. after quantile normalization'
    )

    # multi-cell input specific
    parser.add_argument('--ncell',
                        type=int,
                        help='number of cells per multi-cell input',
                        default=200)
    parser.add_argument('--nsubset',
                        type=int,
                        help='number of multi-cell inputs',
                        default=1000)
    parser.add_argument(
        '--per_sample',
        action='store_true',
        default=False,
        help='whether nsubset refers to each class or each sample')
    parser.add_argument(
        '--subset_selection',
        choices=['random', 'outlier'],
        default='random',
        help='generate random or outlier-enriched multi-cell inputs')

    # neural network specific
    parser.add_argument(
        '--maxpool_percentages',
        nargs='+',
        type=float,
        help=
        'list of choices (percentage of multi-cell input) for top-k max pooling',
        default=[0.01, 1, 5, 20, 100])
    parser.add_argument('--nfilter_choice',
                        nargs='+',
                        type=int,
                        help='list of choices for number of filters',
                        default=range(3, 10))
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.005,
        help='learning rate for the Adam optimization algorithm')
    parser.add_argument('--coeff_l1',
                        type=float,
                        default=0,
                        help='coefficient for L1 weight regularization')
    parser.add_argument('--coeff_l2',
                        type=float,
                        default=0.0001,
                        help='coefficient for L2 weight regularization')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=20,
                        help='maximum number of iterations through the data')
    parser.add_argument('--patience',
                        type=int,
                        default=5,
                        help='number of epochs before early stopping')

    # analysis specific
    parser.add_argument('--seed', type=int, default=1234, help='random seed')
    parser.add_argument(
        '--nrun',
        type=int,
        default=15,
        help='number of neural network configurations to try (should be >= 3)')
    parser.add_argument(
        '--regression',
        action='store_true',
        default=False,
        help='whether it is a regression problem (default is classification)')
    parser.add_argument(
        '--dendrogram_cutoff',
        type=float,
        default=.4,
        help='cutoff for hierarchical clustering of filter weights')
    parser.add_argument('--accur_thres', type=float, default=.9,
                        help='keep filters from models achieving at least this accuracy ' \
                             ' (or at least from the best 3 models)')
    parser.add_argument('-v',
                        '--verbose',
                        type=int,
                        choices=[0, 1],
                        default=1,
                        help='output verbosity')

    # plot specific
    parser.add_argument(
        '--filter_diff_thres',
        type=float,
        default=0.2,
        help='threshold that defines which filters are discriminative')
    parser.add_argument(
        '--filter_response_thres',
        type=float,
        default=0,
        help='threshold that defines the selected cell population per filter')
    parser.add_argument('--stat_test', choices=[None, 'ttest', 'mannwhitneyu'],
                        help='statistical test for comparing cell population frequencies of two ' \
                             'groups of samples')
    parser.add_argument('--group_a',
                        default='group A',
                        help='name of the first class')
    parser.add_argument('--group_b',
                        default='group B',
                        help='name of the second class')
    parser.add_argument('--group_names',
                        nargs='+',
                        default=None,
                        help='list of class names')
    parser.add_argument('--tsne_ncell',
                        type=int,
                        help='number of cells to include in t-SNE maps',
                        default=10000)
    args = parser.parse_args()

    # read in the data
    fcs_info = np.array(pd.read_csv(args.fcs, sep=','))
    marker_names = list(pd.read_csv(args.markers, sep=',').columns)
    # if the samples have already been pre-processed via quantile normalization
    # we should not perform arcsinh transformation
    if args.quant_normed:
        args.arcsinh = False
    samples, phenotypes = get_data(args.indir, fcs_info, marker_names,
                                   args.arcsinh, args.cofactor)

    # generate training/validation sets
    np.random.seed(args.seed)
    val_perc = 1 - args.train_perc
    n_splits = int(1. / val_perc)
    # stratified CV for classification problems
    if not args.regression:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    # simple CV for regression problems
    else:
        skf = KFold(n_splits=n_splits, shuffle=True)
    train, val = next(skf.split(np.zeros((len(phenotypes), 1)), phenotypes))
    train_samples = [samples[i] for i in train]
    valid_samples = [samples[i] for i in val]
    train_phenotypes = [phenotypes[i] for i in train]
    valid_phenotypes = [phenotypes[i] for i in val]

    print('\nSamples used for model training:')
    for i in train:
        print(fcs_info[i])
    print('\nSamples used for validation:')
    for i in val:
        print(fcs_info[i])
    print()

    # always generate multi-cell inputs on a per-sample basis for regression
    if args.regression:
        args.per_sample = True

    if not args.load_results:
        # run CellCnn
        model = CellCnn(ncell=args.ncell,
                        nsubset=args.nsubset,
                        per_sample=args.per_sample,
                        subset_selection=args.subset_selection,
                        scale=args.scale,
                        quant_normed=args.quant_normed,
                        maxpool_percentages=args.maxpool_percentages,
                        nfilter_choice=args.nfilter_choice,
                        nrun=args.nrun,
                        regression=args.regression,
                        learning_rate=args.learning_rate,
                        coeff_l1=args.coeff_l1,
                        coeff_l2=args.coeff_l2,
                        max_epochs=args.max_epochs,
                        patience=args.patience,
                        dendrogram_cutoff=args.dendrogram_cutoff,
                        accur_thres=args.accur_thres,
                        verbose=args.verbose)
        model.fit(train_samples=train_samples,
                  train_phenotypes=train_phenotypes,
                  valid_samples=valid_samples,
                  valid_phenotypes=valid_phenotypes,
                  outdir=args.outdir)
        # save results for subsequent analysis
        results = model.results
        pickle.dump(results, open(os.path.join(args.outdir, 'results.pkl'),
                                  'wb'))
    else:
        results = pickle.load(
            open(os.path.join(args.outdir, 'results.pkl'), 'rb'))

    if args.export_csv:
        save_results(results, args.outdir, marker_names)

    # plot results
    if args.plot or args.export_selected_cells:
        plotdir = os.path.join(args.outdir, 'plots')
        plot_filters(results, marker_names,
                     os.path.join(plotdir, 'filter_plots'))
        _v = discriminative_filters(results,
                                    os.path.join(plotdir, 'filter_plots'),
                                    filter_diff_thres=args.filter_diff_thres,
                                    show_filters=True)
        filter_info = plot_results(
            results,
            train_samples,
            train_phenotypes,
            marker_names,
            os.path.join(plotdir, 'training_plots'),
            filter_diff_thres=args.filter_diff_thres,
            filter_response_thres=args.filter_response_thres,
            stat_test=args.stat_test,
            group_a=args.group_a,
            group_b=args.group_b,
            group_names=args.group_names,
            tsne_ncell=args.tsne_ncell,
            regression=args.regression,
            show_filters=False)
        _v = plot_results(results,
                          valid_samples,
                          valid_phenotypes,
                          marker_names,
                          os.path.join(plotdir, 'validation_plots'),
                          filter_diff_thres=args.filter_diff_thres,
                          filter_response_thres=args.filter_response_thres,
                          stat_test=args.stat_test,
                          group_a=args.group_a,
                          group_b=args.group_b,
                          group_names=args.group_names,
                          tsne_ncell=args.tsne_ncell,
                          regression=args.regression,
                          show_filters=False)
        if args.export_selected_cells:
            csv_dir = os.path.join(args.outdir, 'selected_cells')
            mkdir_p(csv_dir)
            nfilter = len(filter_info)
            sample_names = [
                name.split('.fcs')[0] for name in list(fcs_info[:, 0])
            ]
            # for each sample
            for x, x_name in zip(samples, sample_names):
                flags = np.zeros((x.shape[0], 2 * nfilter))
                columns = []
                # for each filter
                for i, (filter_idx, thres) in enumerate(filter_info):
                    flags[:, 2 * i:2 * (i + 1)] = get_selected_cells(
                        results['selected_filters'][filter_idx], x,
                        results['scaler'], thres, True)
                    columns += [
                        'filter_%d_continuous' % filter_idx,
                        'filter_%d_binary' % filter_idx
                    ]
                df = pd.DataFrame(flags, columns=columns)
                df.to_csv(os.path.join(csv_dir,
                                       x_name + '_selected_cells.csv'),
                          index=False)
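# The CellCnn example above asks StratifiedKFold for a single stratified
# train/validation split by calling next() on the split generator; n_splits
# controls the validation fraction. A minimal sketch of that idiom (toy labels):
import numpy as np
from sklearn.model_selection import StratifiedKFold

phenotypes = [0] * 15 + [1] * 5            # toy phenotype labels
train_perc = 0.75
n_splits = int(1. / (1 - train_perc))      # 4 splits -> roughly 25% validation

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
train, val = next(skf.split(np.zeros((len(phenotypes), 1)), phenotypes))
print(len(train), len(val))                # ~15 train / 5 validation, stratified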
    if (args.debug):
        print("len(X_train) : ", len(X_train))
        print("len(y_train) : ", len(y_train))
        print("len(y_pred_val) : ", len(y_pred_val))

    #===========================================
    # Processing with k-fold CV
    #===========================================
    # Evaluate with k-fold cross-validation, splitting the training set into train and validation folds
    kf = StratifiedKFold(n_splits=args.n_splits,
                         shuffle=True,
                         random_state=args.seed)

    y_preds = []
    for fold_id, (train_index,
                  valid_index) in enumerate(kf.split(X_train, y_train)):
        #--------------------
        # Split the dataset
        #--------------------
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[
            valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[
            valid_index]

        #--------------------
        # Define the model
        #--------------------
        model = KerasResNetClassifier(n_channles=len(X_train.columns))

        #--------------------
        # Train the model
Beispiel #41
0
X = breast_cancer.iloc[:, :9]
Y = breast_cancer["Class"]
labels = pd.unique(Y)

Y = Y.replace("'recurrence-events'", 1)
Y = Y.replace("'no-recurrence-events'", 0)

X1 = preprocessData(X)

clf = DecisionTreeClassifier(min_samples_leaf=15)
rclf = RandomForestClassifier()

skf = StratifiedKFold(n_splits=10)

acc = []
for train_index, test_index in skf.split(X1, Y):
    #print("TRAIN:", train_index, "\nTEST:", test_index)
    X_train = X1.iloc[train_index, :]
    X_test = X1.iloc[test_index, :]
    y_train = Y.iloc[train_index]
    y_test = Y.iloc[test_index]

    clf = clf.fit(X_train, y_train)
    rclf = rclf.fit(X_train, y_train)
    acc1 = clf.score(X_test, y_test) * 100.0
    acc.append(acc1)
    #acc2 = rclf.score(X_test,y_test)

    print('The accuracy of CART was: {}'.format(acc1))
    #print('The accuracy of Random Forest was: {}'.format(acc2))
    #print(clf.decision_path)
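# The loop above accumulates per-fold accuracies by hand; cross_val_score with an
# explicit StratifiedKFold gives the same kind of numbers in one call. A sketch using
# sklearn's built-in breast cancer data as a stand-in for the preprocessed frame above:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_bc, y_bc = load_breast_cancer(return_X_y=True)
scores = cross_val_score(DecisionTreeClassifier(min_samples_leaf=15),
                         X_bc, y_bc, cv=StratifiedKFold(n_splits=10))
print(np.round(scores * 100, 1), scores.mean() * 100)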
Beispiel #42
0
def extract_feature_siamese_lstm_manDist_char():
    feature_name = 'dl_siamese_lstm_manDist_char'
    embedding_char_matrix_file_path = 'train_all_char_embedding_matrix.pickle'
    nb_filter = 300
    filter_width = [4, 3]

    y_train_oofp = np.zeros((len(y_train), 1), dtype='float64')

    y_test_oofp = np.zeros((len(X_test_s1), 1), dtype='float64')

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=44)
    for fold_num, (ix_train,
                   ix_val) in enumerate(kfold.split(X_train_s1, y_train)):

        # pick out the positive samples to be added
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = y_train[ix_train][train_true_mask]

        # add them: augment by swapping the order of positive sentence pairs
        X_add_train_fold_s1 = np.vstack(
            [X_train_s1[ix_train], X_train_true_s2])
        X_add_train_fold_s2 = np.vstack(
            [X_train_s2[ix_train], X_train_true_s1])
        y_add_train_fold = np.concatenate([y_train[ix_train], y_train_true])

        val_true_mask = y_train[ix_val] == 1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]

        # do the same augmentation for the validation fold
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])

        print('start train fold {} of {} ......'.format((fold_num + 1), 5))
        # build the model
        model = create_abcnn_model(embedding_matrix, nb_filter, filter_width)
        # train the model
        model_checkpoint_path = project.trained_model_dir + 'dl_abcnn_model{}.h5'.format(
            fold_num)
        model.fit(x=[X_add_train_fold_s1, X_add_train_fold_s2],
                  y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1,
                                    X_add_val_fold_s2], y_add_val_fold),
                  batch_size=512,
                  epochs=30,
                  verbose=1,
                  class_weight={
                      0: 1,
                      1: 2
                  },
                  callbacks=[
                      EarlyStopping(monitor='val_loss',
                                    min_delta=0.005,
                                    patience=5,
                                    verbose=1,
                                    mode='auto'),
                      ModelCheckpoint(model_checkpoint_path,
                                      monitor='val_loss',
                                      save_best_only=True,
                                      save_weights_only=False,
                                      verbose=1)
                  ])
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model, X_train_s1[ix_val],
                                       X_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()
    model_path = project.trained_model_dir + 'dl_abcnn_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={
                            'fbeta_score': fbeta_score,
                            'precision': precision,
                            'recall': recall
                        })
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)

    col_names = ['{}_{}'.format(feature_name, index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names,
                                    feature_name)
Beispiel #43
0
class KNN(object):
    def __init__(self, clf, train, test, lr_features, targets, cv, model_name):
        """
        the construction method
        :param clf: classifier model_lr
        :param train: train data - dataframe
        :param test: test data - dataframe
        :param lr_features: features for LogitReg
        :param targets: y columns - list
        :param cv: number of cv - int
        :param model_name: name of the model - string
        """
        self.clf = clf
        self.train = train
        self.test = test
        self.lr_features = lr_features
        self.targets = targets
        self.cv = StratifiedKFold(n_splits=cv)
        self.model_name = model_name
        self.train_X = train.loc[:, lr_features].values
        self.train_y = train.loc[:, targets].values

    # split feature and target
    def split_X_y(self, df, X_cols, y_cols):
        return df.loc[:, X_cols].values, df.loc[:, y_cols].values

    # function for sampling and spliting features and target
    def get_data(self):
        # undersampling according to length of ice data
        sample_size = len(self.train.loc[lambda df: df[self.targets] == 1, :])
        # unbalanced sampling between ice data and normal data
        train_sample = self.train.loc[
            lambda df: df[self.targets] == 1, :].append(
                self.train.loc[lambda df: df[self.targets] == 0, :].sample(
                    sample_size - int(sample_size * 0.9)))
        # split features and target
        train_sample_X, train_sample_y = self.split_X_y(
            train_sample, self.lr_features, self.targets)
        test_X, test_y = self.split_X_y(self.test, self.lr_features,
                                        self.targets)
        return train_sample_X, train_sample_y, test_X, test_y

    def train_model(self):
        """
        train and estimate model
        draw ROC curves of train and test
        :return: train_y, train_pred_y, test_y, test_pred_y
        """
        train_sample_X, train_sample_y, test_X, test_y = self.get_data(
        )  # generate formed train and test data
        cv_data = self.cv.split(
            train_sample_X,
            train_sample_y)  # split train data to train and validation data

        tprs = []  # list for saving TP rates in each cv
        aucs = []  # list for saving aucs in each cv
        mean_fpr = np.linspace(0, 1, 100)  # mean FP rates
        fig, ax = plt.subplots()  # initialize plt
        for i, (train,
                valid) in enumerate(cv_data):  # 5 fold training of model_lr
            self.clf.fit(train_sample_X[train],
                         train_sample_y[train])  # fit model using train data
            # plot ROC
            viz = metrics.plot_roc_curve(self.clf,
                                         train_sample_X[valid],
                                         train_sample_y[valid],
                                         name='ROC fold {}'.format(i),
                                         alpha=0.3,
                                         lw=1,
                                         ax=ax)
            interp_tpr = interp(mean_fpr, viz.fpr,
                                viz.tpr)  # get TP rates and do interp
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)  # add new interp_tpr to trprs list
            aucs.append(viz.roc_auc)  # add viz.roc_auc to aucs list
        # plot ROC of test data
        metrics.plot_roc_curve(self.clf,
                               test_X,
                               test_y,
                               name='ROC test',
                               alpha=0.8,
                               lw=1,
                               color='green',
                               ax=ax)
        ax.plot([0, 1], [0, 1],
                linestyle='--',
                lw=2,
                color='r',
                label='Chance',
                alpha=.8)
        # draw mean auc of 5 cv train
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr,
                mean_tpr,
                color='b',
                label=r'Mean ROC of Train (AUC = %0.2f $\pm$ %0.2f)' %
                (mean_auc, std_auc),
                lw=2,
                alpha=.8)
        # draw confident interval
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)  # get upper bound
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)  # get lower bound
        ax.fill_between(mean_fpr,
                        tprs_lower,
                        tprs_upper,
                        color='grey',
                        alpha=.2,
                        label=r'$\pm$ 1 std. dev.')
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="ROC Curve")
        ax.legend(loc="lower right")
        plt.savefig('res_fig_knn/' + self.model_name + '.png')
        print(
            r'5 Cross Validation Mean AUC: %0.2f, Standard Deviation is %0.2f'
            % (mean_auc, std_auc))
        # train with all train data and compute the train and test accuracy respectively
        start = datetime.datetime.now()  # record start time
        self.clf.fit(train_sample_X, train_sample_y)  # fit all train data
        test_pred_y = self.clf.predict(test_X)  # predict test data
        end = datetime.datetime.now()  # record end time
        print('Fit Time:')  # calculate time cost
        print(end - start)
        dump(self.clf,
             'model_knn/' + self.model_name + '.joblib')  # save trained model
        train_acc = self.clf.score(self.train_X,
                                   self.train_y)  # calculate train accuracy
        test_acc = self.clf.score(test_X, test_y)  # calculate test accuracy
        print('Train Accuracy is %0.2f, Test Accuracy is %0.2f' %
              (train_acc, test_acc))
        train_pred_y = self.clf.predict(self.train_X)  # train data prediction
        return self.train_y, train_pred_y, test_y, test_pred_y
Beispiel #44
0
if (kfold==1):
    X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, train_size=0.7, random_state=42)
    # save
    dirname2 = '%s/result_%02d' % (dirname, 0)
    if not os.path.exists(dirname2):
        os.mkdir(dirname2)
    df1 = pd.concat([X_train, Y_train], axis = 1)
    df2 = pd.concat([X_test , Y_test ], axis = 1)
    trfile = '%s/%s_train_%02d.csv' % (dirname2, prefix, 0)
    tefile = '%s/%s_test_%02d.csv' % (dirname2, prefix, 0)
    df1.to_csv(trfile, header=None, index=False)
    df2.to_csv(tefile, header=None, index=False)

else:
    k_fold = StratifiedKFold(n_splits=kfold, random_state=42,shuffle=True)
    cv = k_fold.split(X_data,Y_data)
    
    t = 0
    for train_index, test_index in cv:
        print('KFold = ', t)
        X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
        Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]
    
        # save
        dirname2 = '%s/result_%02d' % (dirname, t)
        if not os.path.exists(dirname2):
            os.mkdir(dirname2)
        df1 = pd.concat([X_train, Y_train], axis = 1)
        df2 = pd.concat([X_test , Y_test ], axis = 1)
        trfile = '%s/%s_train_%02d.csv' % (dirname2, prefix, t)
        tefile = '%s/%s_test_%02d.csv' % (dirname2, prefix, t)
        df1.to_csv(trfile, header=None, index=False)
        df2.to_csv(tefile, header=None, index=False)
        t += 1
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=8,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
            )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 1000, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        train_df['Prediction'] = oof_preds
        train_df.to_csv("kernel02_train.csv", index=False)
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    return feature_importance_df
y = [(category) for (rev, category) in documents]
random.shuffle(featuresets)

skf = StratifiedKFold(n_splits=10)

# blank lists to store predicted values and actual values
predicted_y = []
expected_y = []

# partition data
training_set = []
testing_set = []
file = open("output_kfold.txt", "w")
file.close()

for train_index, test_index in skf.split(x, y):
    file = open("output_kfold.txt", "a")
    file.write(str(train_index) + str(test_index))
    # specific ".loc" syntax for working with dataframes
    # x_train, x_test = x[train_index], x[test_index]
    #y_train, y_test = y[train_index], y[test_index]
    # rebuild the partitions each fold so examples do not accumulate across folds
    training_set = [featuresets[i] for i in train_index]
    testing_set = [featuresets[i] for i in test_index]

    k_folds_f.KF(training_set, testing_set, file)
    #accuracy = metrics.accuracy_score(expected_y, predicted_y)
    #print("Accuracy: " + accuracy.__str__())
    #print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
    #classifier.show_most_informative_features(15)
#y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()

#CountVectorizer
#Take the sum of all accuracies of the 10 folds
sumsvc=0
sumdtc=0
sumrfc=0
sumlr=0
t=0

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
skf.get_n_splits(x, y)
print("SKF on count vectorizer...")
for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index] 


    classifiers=[
        (SVC(kernel = 'rbf', random_state = 0),"SVC"),
        (DecisionTreeClassifier(random_state = 0),"DTC"),
        (LogisticRegression(),"LR"),
        (RandomForestClassifier(n_estimators=80, max_depth=100,random_state=0),"RFC"),
    ]
    
    #Accuracy scores of different models
    score_ , names = [] , []
    for model,name in classifiers:
        model.fit(x_train, y_train)
    y_train = np.asarray(labels)
    logger.info('Number of Training Examples: {}'.format(X_train.shape))
    logger.info('Number of Labels: {}'.format(y_train.shape))

    # Train model
    logger.info('Training model...')
    clf = MultinomialNB()
    model = clf.fit(X_train, y_train)
    logger.info('Training Accuracy: {}'.format(model.score(X_train, y_train)))

    # K-fold Cross Validation (stratified)
    logger.info('Cross validating...')
    skf = StratifiedKFold(n_splits=5)
    test_prec_scores = []
    test_rec_scores = []
    for train_index, test_index in skf.split(X_train, y_train):
        # train
        X_train_val, X_test_val = X_train[train_index], X_train[test_index]
        y_train_val, y_test_val = y_train[train_index], y_train[test_index]
        clf = MultinomialNB()
        model = clf.fit(X_train_val, y_train_val)
        # test
        test_prec_score = precision_score(y_test_val,
                                          model.predict(X_test_val))
        test_rec_score = recall_score(y_test_val, model.predict(X_test_val))
        # update scores
        test_prec_scores.append(test_prec_score)
        test_rec_scores.append(test_rec_score)

    # Mean precision and recall
    logger.info('Average Test Precision: {}'.format(np.mean(test_prec_scores)))
Beispiel #49
0
def KNN(distances,
        labels,
        k=1,
        metrics=['Recall', 'Precision', 'F1_Score', 'AUC']):
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)

    skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
    neigh = KNeighborsClassifier(n_neighbors=k,
                                 metric='precomputed',
                                 weights='distance')

    pred_list = []
    pred_prob_list = []
    labels_list = []
    for train_idx, test_idx in skf.split(distances, labels):
        distances_train = distances[train_idx, :]
        distances_train = distances_train[:, train_idx]

        distances_test = distances[test_idx, :]
        distances_test = distances_test[:, train_idx]

        labels_train = labels[train_idx]
        labels_test = labels[test_idx]

        neigh.fit(distances_train, labels_train)
        pred = neigh.predict(distances_test)
        pred_prob = neigh.predict_proba(distances_test)

        labels_list.extend(labels_test)
        pred_list.extend(pred)
        pred_prob_list.extend(pred_prob)

    pred = np.asarray(pred_list)
    pred_prob = np.asarray(pred_prob_list)
    labels = np.asarray(labels_list)

    OH = OneHotEncoder(sparse=False)
    labels = OH.fit_transform(labels.reshape(-1, 1))
    pred = OH.transform(pred.reshape(-1, 1))

    metric = []
    value = []
    classes = []
    k_list = []
    for ii, c in enumerate(lb.classes_):
        if 'Recall' in metrics:
            value.append(recall_score(y_true=labels[:, ii], y_pred=pred[:,
                                                                        ii]))
            metric.append('Recall')
            classes.append(c)
            k_list.append(k)
        if 'Precision' in metrics:
            value.append(
                precision_score(y_true=labels[:, ii], y_pred=pred[:, ii]))
            metric.append('Precision')
            classes.append(c)
            k_list.append(k)
        if 'F1_Score' in metrics:
            value.append(f1_score(y_true=labels[:, ii], y_pred=pred[:, ii]))
            metric.append('F1_Score')
            classes.append(c)
            k_list.append(k)
        if 'AUC' in metrics:
            value.append(roc_auc_score(labels[:, ii], pred_prob[:, ii]))
            metric.append('AUC')
            classes.append(c)
            k_list.append(k)

    return classes, metric, value, k_list
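# With metric='precomputed', KNeighborsClassifier expects a (n_queries, n_train)
# distance matrix, which is why the function above slices rows by the fold indices
# and columns by the training indices. A toy sketch of that slicing with np.ix_:
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
X = rng.rand(30, 4)
labels = np.array([0] * 15 + [1] * 15)
distances = pairwise_distances(X)                      # full (n, n) distance matrix

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
neigh = KNeighborsClassifier(n_neighbors=1, metric='precomputed')
for train_idx, test_idx in skf.split(distances, labels):
    d_train = distances[np.ix_(train_idx, train_idx)]  # (n_train, n_train)
    d_test = distances[np.ix_(test_idx, train_idx)]    # (n_test, n_train)
    neigh.fit(d_train, labels[train_idx])
    print(neigh.score(d_test, labels[test_idx]))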
Beispiel #50
0
    elif first_scan >= threshold:
        if second_scan == 0:
            threshhold_data.append(i)
            starting_labels.append(malware)
            ending_labels.append(benign)
        elif second_scan >= threshold:
            threshhold_data.append(i)
            starting_labels.append(malware)
            ending_labels.append(malware)

skf = StratifiedKFold(n_splits=splits)
skf.get_n_splits(threshhold_data, ending_labels)
print("stratified")

for train_indexs, test_indexs in skf.split(threshhold_data, ending_labels):
    test_set = []
    train_set = []

    for i in train_indexs:
        train_set.append(threshhold_data[i])
    for i in test_indexs:
        test_set.append(threshhold_data[i])

    xTrain, yTrain, yExpect, day_one = build_matrix(train_set, test_set)
    print("built matrix")
    if method == 'var':
        for u_a in alpha_values:
            for l_a in alpha_values:
                #checking upload
                variant(xTrain, yTrain, l_a, l_a, u_a, u_a, kern, kNN, g=gamma)
Beispiel #51
0
# **Validation Strategy: Stratified KFold**

# In[ ]:

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=59)

# In[ ]:

predicted = np.zeros((test.shape[0], 9))
measured = np.zeros((data.shape[0]))
score = 0

# In[ ]:

for times, (trn_idx, val_idx) in enumerate(
        folds.split(data.values, target['surface'].values)):
    model = RandomForestClassifier(n_estimators=500, n_jobs=-1)
    #model = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, n_jobs=-1)
    model.fit(data.iloc[trn_idx], target['surface'][trn_idx])
    measured[val_idx] = model.predict(data.iloc[val_idx])
    predicted += model.predict_proba(test) / folds.n_splits
    score += model.score(data.iloc[val_idx], target['surface'][val_idx])
    print("Fold: {} score: {}".format(
        times, model.score(data.iloc[val_idx], target['surface'][val_idx])))

    importances = model.feature_importances_
    indices = np.argsort(importances)
    features = data.columns

    if model.score(data.iloc[val_idx], target['surface'][val_idx]) > 0.92000:
        hm = 30
Beispiel #52
0
        Dense(16,
              activation='relu',
              kernel_initializer='he_uniform',
              use_bias=True))
    model.add(Dense(1, input_shape=(16, ), activation='linear', use_bias=True))

    print(model.summary())

    if learn:
        model.compile(loss='mean_absolute_error',
                      optimizer='adam',
                      metrics=['mae'])

        maes = []

        for train, val in kfold.split(X_train, y_train):
            # Fit the model
            history = model.fit(X_train[train],
                                y_train[train],
                                epochs=150,
                                batch_size=8,
                                verbose=0)
            # evaluate the model
            scores = model.evaluate(X_train[val], y_train[val], verbose=1)
            print("%s: %.2f" % (model.metrics_names[1], scores[1]))
            maes.append(scores[1])

            # ========== PART 3 ============ #
            # Code to be used only to print nice plots

            if plot:
Beispiel #53
0
lr_end = 1e-5
result = []


# learning-rate schedule
schedule = lambda epoch: LR_schedule(epoch, 5, lr_start=lr_start, lr_end=lr_end, c=c)
lr_schedule_obj = LearningRateScheduler(schedule=schedule)
# stochastic weight averaging (SWA) object
swa_obj = SWA(swa_start, update)
# training-history logging object
history = LossHistory()


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
i = 1
for train_index, test_index in skf.split(X_all, Y_all):
    # random seed setup
    print("Starting CV round {0}".format(i))
    model = LSTM_model()
    model.compile(optimizer=RMSprop(rho=0.9, epsilon=1e-06, clipnorm=0, clipvalue=1),
                  loss=weight_categorical_crossentropy, metrics=['accuracy'])

    X_train = X_all[train_index]
    X_test = X_all[test_index]
    m = Y_all[train_index]
    Y_train = np_utils.to_categorical(Y_all[train_index], 4)
    Y_test = np_utils.to_categorical(Y_all[test_index], 4)

    print("校验类比比例")
    print("类别1: ", len(m[m == 0]) / len(m))
    print("类别2: ", len(m[m == 1]) / len(m))
Beispiel #54
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores.append(f1_score(y_true = y_test, y_pred = model.predict(x_test),
                            pos_label = '2A')*100)
f1_scores


#StratifiedKFold
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index], np.array(normalised_train_df)[test_index]
    y_train, y_test = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores.append(f1_score(y_true = y_test, y_pred = model.predict(x_test),
                            pos_label = '2A')*100)
f1_scores
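# A more compact equivalent of the manual StratifiedKFold loop above (a sketch, not part
# of the original script): cross_val_score with an f1 scorer fixed to the '2A' positive
# label produces equivalent per-fold scores.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_2a = make_scorer(f1_score, pos_label='2A')
cv_f1 = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced,
                        cv=skf, scoring=f1_2a) * 100
print(cv_f1, cv_f1.mean())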


#LeaveOneOut
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, scoring='f1_macro')
average_score = scores.mean() * 100
average_score
Beispiel #55
def train_cross_validation_model(model,
                                 X,
                                 y,
                                 output_folder,
                                 splits,
                                 resolution,
                                 batch_size=20,
                                 epochs=50):

    skf = StratifiedKFold(n_splits=splits, random_state=42, shuffle=True)
    params = {
        'batch_size': batch_size,
        'input_shape': (229, 229, 3),
        'size': (229, 229),
        'shuffle': True
    }
    split = 1

    print(y)

    for train_idx, test_idx in skf.split(X, y):

        print('Starting Split : %02d' % split)
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = to_categorical(y[train_idx]), to_categorical(
            y[test_idx])

        output_log = os.path.join(output_folder, 'split_%03d' % split)
        weights_best, logs_dir = logging_configuration(output_log)
        save_split(X_train, X_test, y_train, y_test, output_log)
        ds = Dataflow(X_train, y_train, size=(229, 229))
        dsm = MultiProcessRunner(ds, num_prefetch=batch_size, num_proc=5)
        ds1 = BatchData(dsm, batch_size)
        train_gen = gen(ds1)

        callbacks_list = get_callbacks(weights_best, logs_dir)
        if True:  # generator-based training path; the tf.data branch in the else below is currently disabled
            History = model.fit_generator(train_gen,
                                          callbacks=callbacks_list,
                                          epochs=epochs,
                                          steps_per_epoch=len(y_train))

        else:
            train_data = get_tf_dataset(filenames=X_train, labels=y_train)
            validation_data = get_tf_dataset(filenames=X_test, labels=y_test)
            History = model.fit(train_data,
                                callbacks=callbacks_list,
                                epochs=epochs,
                                steps_per_epoch=len(y) / 100)
        X_test_img = np.array(
            [cv2.resize(cv2.imread(im), (224, 224)) for im in X_test],
            dtype=np.float16)
        y_pred = model.predict_classes(X_test_img)
        test = np.argmax(y_test, axis=1)
        report = classification_report(test, y_pred, output_dict=True)
        df = pd.DataFrame(report).transpose()
        print(report)
        Historydf = pd.DataFrame(History.history)
        history_file = os.path.join(output_log, 'history.csv')
        Historydf.to_csv(history_file)
        report_file = os.path.join(output_log, 'report.csv')
        df.to_csv(report_file)
        split += 1
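# Compatibility note (not part of the original function): predict_classes() was removed
# from Keras models in newer TensorFlow/Keras releases; an equivalent for the evaluation
# step above is an argmax over predict(). A minimal sketch assuming the same `model`
# and `X_test_img`:
import numpy as np

y_pred = np.argmax(model.predict(X_test_img), axis=1)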
Beispiel #56
def extract_feature_siamese_lstm_manDist():
    # parameter setup
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_manDist'
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 40
    num_folds = 5
    batch_size = 512

    # load the embedding matrix
    embedding_matrix = project.load(project.aux_dir +
                                    embedding_matrix_file_path)

    # load the input data
    X_train_s1 = project.load(project.preprocessed_data_dir +
                              's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir +
                              's2_train_ids_pad.pickle')

    X_test_s1 = project.load(project.preprocessed_data_dir +
                             's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir +
                             's2_test_ids_pad.pickle')

    # y_0.6_train.pickle stores a plain Python list
    y_train = np.array(
        project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_val = np.array(project.load(project.features_dir + 'y_0.4_test.pickle'))

    # define the model params
    model_param = {
        'lstm_units': 50,
        'lstm_dropout_rate': 0.,
        'lstm_re_dropout_rate': 0.,
        'desen_dropout_rate': 0.75,
        'num_dense': 128
    }
    # model_checkpoint_path = project.temp_dir + 'fold-checkpoint-'+feature_name + '.h5'
    kfold = StratifiedKFold(n_splits=num_folds,
                            shuffle=True,
                            random_state=RANOD_SEED)
    # holds the final out-of-fold and test predictions
    y_train_oofp = np.zeros((len(y_train), 2), dtype='float64')
    y_test_oofp = np.zeros((len(X_test_s1), 2), dtype='float64')

    train_y = to_categorical(y_train, 2)
    val_y = to_categorical(y_val, 2)

    for fold_num, (ix_train,
                   ix_val) in enumerate(kfold.split(X_train_s1, y_train)):

        # select the positive (label == 1) samples to augment with
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = train_y[ix_train][train_true_mask]

        # augment the training fold with the swapped sentence pairs
        X_add_train_fold_s1 = np.vstack(
            [X_train_s1[ix_train], X_train_true_s2])
        X_add_train_fold_s2 = np.vstack(
            [X_train_s2[ix_train], X_train_true_s1])
        y_add_train_fold = np.concatenate([train_y[ix_train], y_train_true])

        val_true_mask = y_train[ix_val] == 1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = train_y[ix_val][val_true_mask]

        # augment the validation fold with the swapped sentence pairs
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([train_y[ix_val], y_val_true])

        print('start train fold {} of {} ......'.format(fold_num + 1, num_folds))
        # build the model
        model = create_siamese_lstm_ManDistance_model(embedding_matrix,
                                                      model_param)
        # train the model
        model_checkpoint_path = project.trained_model_dir + 'dl_siamese_lstm_manDist_model{}.h5'.format(
            fold_num)
        model.fit(x=[X_add_train_fold_s1, X_add_train_fold_s2],
                  y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1,
                                    X_add_val_fold_s2], y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  verbose=1,
                  class_weight={
                      0: 1,
                      1: 2
                  },
                  callbacks=[
                      EarlyStopping(monitor='val_loss',
                                    min_delta=0.005,
                                    patience=5,
                                    verbose=1,
                                    mode='auto'),
                      ModelCheckpoint(model_checkpoint_path,
                                      monitor='val_loss',
                                      save_best_only=True,
                                      save_weights_only=False,
                                      verbose=1)
                  ])
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model, X_train_s1[ix_val],
                                       X_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()

    # save feature

    model_path = project.trained_model_dir + 'dl_siamese_lstm_manDist_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={
                            'ManDist': ManDist,
                            'fbeta_score': fbeta_score,
                            'precision': precision,
                            'recall': recall
                        })
    y_test_oofp = predict(model0, X_test_s1, X_test_s2)
    col_names = ['{}_{}'.format(feature_name, index) for index in range(2)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names,
                                    feature_name)
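# The predict() helper used above is not defined in this snippet; a plausible minimal
# version (an assumption, not the author's code) for a two-output softmax siamese model
# that takes the two padded id matrices as inputs could be:
def predict(model, s1, s2, batch_size=1024):
    # returns an (n_samples, 2) array of class probabilities
    return model.predict([s1, s2], batch_size=batch_size, verbose=1)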
Beispiel #57
n_estimators = [100]
max_leaf_nodes = [2, 3, 4, 6, 8, 10]
learning_rate = [0.1]
min_samples_leaf = [1]
subsample = [0.1, 0.2, 0.25, 0.5, 0.75, 1.0]

gbtree = GradientBoostingRegressor()
pipeline = Pipeline([('standardize', StandardScaler()), ('gbr', gbtree)])

param_grid = dict(gbr__n_estimators=n_estimators,
                  gbr__max_leaf_nodes=max_leaf_nodes,
                  gbr__learning_rate=learning_rate,
                  gbr__subsample=subsample,
                  gbr__min_samples_leaf=min_samples_leaf)
metrics = ['neg_mean_squared_error']

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
grid_gbr = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        scoring=metrics,
                        cv=kfold.split(X, df.iloc[:, 2].values),
                        return_train_score=True,
                        refit='neg_mean_squared_error')
results_gbr = grid_gbr.fit(X, y)

# Save model and results
df_results = pd.DataFrame(results_gbr.cv_results_)
df_results.to_csv(root_path / 'results' / 'results_gbr_metrics.csv',
                  index=False)

joblib.dump(grid_gbr.best_estimator_, root_path / 'models' / 'grid_gbr.pkl')
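# Side note (a suggestion, not part of the original script): kfold.split(...) yields a
# one-shot generator; materialising the splits keeps the exact same folds reusable
# (e.g. for refitting or inspecting fold membership), and GridSearchCV accepts the
# resulting list of (train, test) index pairs directly.
cv_splits = list(kfold.split(X, df.iloc[:, 2].values))
grid_gbr = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        scoring=metrics,
                        cv=cv_splits,
                        return_train_score=True,
                        refit='neg_mean_squared_error')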
Beispiel #58
def main():

    # fix seed for train reproduction
    seed_everything(args.SEED)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("\n device", device)


    # TODO dataset loading
    train_df = pd.read_csv('/DATA/trainset-for_user.csv', header=None)
    train_df = train_df.dropna().reset_index(drop=True)
    test_df = pd.read_csv('/DATA/testset-for_user.csv', header=None)
    print('train_df shape : ', train_df.shape)

    train_df = create_str_feature(train_df)
    test_df = create_str_feature(test_df)
    
    train_df['patient_label'] = train_df['patient'] + '_' + train_df['label']
    train_df['count'] = train_df['patient_label'].map(train_df['patient_label'].value_counts())

    print(train_df.head())
    print(train_df.isnull().sum())
    from sklearn.model_selection import train_test_split

    train_df['image_path'] = [os.path.join('/DATA', train_df['patient'][i], train_df['image'][i]) for i in range(train_df.shape[0])]
    labels = train_df['label'].map({'Wake':0, 'N1':1, 'N2':2, 'N3':3, 'REM':4}).values
    str_train_df = train_df[['time', 'user_count', 'user_max', 'user_min']].values
    str_test_df = test_df[['time', 'user_count', 'user_max', 'user_min']].values

    print('meta max value: ', str_train_df.max(), str_test_df.max(), 'meta shape: ', str_train_df.shape, str_test_df.shape)

    skf_labels = train_df['patient'] + '_' + train_df['label']

    unique_idx = train_df[train_df['count']==1].index
    non_unique_idx = train_df[train_df['count']>1].index
    trn_idx, val_idx, trn_labels, val_labels = train_test_split(non_unique_idx, labels[non_unique_idx],
                                                                test_size=0.05,
                                                                random_state=0,
                                                                shuffle=True,
                                                                stratify=skf_labels[non_unique_idx])

    # valid set define
    trn_image_paths = train_df.loc[trn_idx, 'image_path'].values
    val_image_paths = train_df.loc[val_idx, 'image_path'].values

    # struture data define
    trn_str_data = str_train_df[trn_idx, :]
    val_str_data = str_train_df[val_idx, :]

    print('\n')
    print('95:5 train, valid split : ', len(trn_image_paths), len(trn_labels), len(val_image_paths), len(val_labels), trn_str_data.shape, val_str_data.shape)
    print('\n')
    print(trn_image_paths[:5], trn_labels[:5])
    print(val_image_paths[:5], val_labels[:5])

    valid_transforms = create_val_transforms(args, args.input_size)
    if args.DEBUG:
        valid_dataset = SleepDataset(args, val_image_paths[:100], val_str_data, val_labels[:100], valid_transforms, is_test=False)
    else:
        valid_dataset = SleepDataset(args, val_image_paths, val_str_data, val_labels, valid_transforms, is_test=False)
    valid_loader = DataLoader(dataset=valid_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, pin_memory=True)

    trn_skf_labels = (train_df.loc[trn_idx, 'patient'] + train_df.loc[trn_idx, 'label']).values
    print('skf labels head : ', trn_skf_labels[:5])

    if args.DEBUG:
        print('\n#################################### DEBUG MODE')
    else:
        print('\n################################### MAIN MODE')
        print(trn_image_paths.shape, trn_labels.shape, trn_skf_labels.shape)

    # train set define
    train_dataset_dict = {}
    skf = StratifiedKFold(n_splits=args.n_folds, shuffle=True, random_state=args.SEED)
    nsplits = [val_idx for _, val_idx in skf.split(trn_image_paths, trn_skf_labels)]
    print(nsplits)
    #np.save('nsplits.npy', nsplits)

    #print('\nload nsplits')
    #nsplits = np.load('nsplits.npy', allow_pickle=True)
    #print(nsplits)

    for idx, val_idx in enumerate(nsplits):#trn_skf_labels

        sub_img_paths = np.array(trn_image_paths)[val_idx]
        sub_labels = np.array(trn_labels)[val_idx]
        sub_meta = np.array(trn_str_data)[val_idx]
        if args.DEBUG:
            sub_img_paths = sub_img_paths[:200]
            sub_labels = sub_labels[:200]
            sub_meta = sub_meta[:200]

        if idx==1 or idx==6:
            sub_img_paths = np.concatenate([sub_img_paths, train_df.loc[unique_idx, 'image_path'].values])
            sub_labels = np.concatenate([sub_labels, labels[unique_idx]])
            sub_meta = np.concatenate([sub_meta, str_train_df[unique_idx]])

        train_transforms = create_train_transforms(args, args.input_size)
        #train_dataset = SleepDataset(args, sub_img_paths, sub_labels, train_transforms, use_masking=True, is_test=False)
        train_dataset_dict[idx] = [args, sub_img_paths, sub_meta, sub_labels, train_transforms]
        print(f'train dataset complete {idx}/{args.n_folds}, ')

    print("numberr of train datasets: ", len(train_dataset_dict))

    # define model
    model = build_model(args, device)

    # optimizer definition
    optimizer = build_optimizer(args, model)
    #scheduler = build_scheduler(args, optimizer, len(train_loader))
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 9)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=1, after_scheduler=scheduler_cosine)

    if args.label_smoothing:
        criterion = LabelSmoothingLoss(classes=args.num_classes, smoothing=args.label_smoothing_ratio)
    else:
        criterion = nn.CrossEntropyLoss()

    trn_cfg = {'train_datasets':train_dataset_dict,
                    'valid_loader':valid_loader,
                    'model':model,
                    'criterion':criterion,
                    'optimizer':optimizer,
                    'scheduler':scheduler,
                    'device':device,
                    'fold_num':0,
                    }

    train(args, trn_cfg)
Beispiel #59
# In[180]:


randomForestModel.fit(train_predictor,train_response)


# In[102]:


#K-Fold cross validation
cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []
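# compute_roc_auc() is not defined in this fragment; a plausible definition (an
# assumption, not the author's code), closing over randomForestModel, train_dataset and
# train_response as the calls below suggest, could be:
from sklearn.metrics import auc, roc_curve

def compute_roc_auc(index):
    y_predict = randomForestModel.predict_proba(train_dataset.iloc[index])[:, 1]
    fpr, tpr, _ = roc_curve(train_response.iloc[index], y_predict)
    return fpr, tpr, auc(fpr, tpr)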
    
for (train, test), i in zip(cv.split(train_dataset,train_response), range(10)):
    randomForestModel.fit(train_dataset.iloc[train], train_response.iloc[train])
    _, _, auc_score_train = compute_roc_auc(train)
    fpr, tpr, auc_score = compute_roc_auc(test)
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])


# In[ ]:


#fit chaid analysis
Beispiel #60
    model = Model([input_layer, input2], output_coverage)
    return model


preds = []
ids = []
preds_test = []
ids_test = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_count = 0
data = np.array(train_df.images.map(upsample).tolist()).reshape(
    -1, img_size_target, img_size_target, 1)
labels = train_df.coverage

is_train = True
for train_idx, val_idx in skf.split(data, train_df.coverage_class):
    if is_train and model_count != int(sys.argv[1]):
        model_count += 1
        continue
    model = build_model(10)
    model.compile(loss='mse', optimizer="adam", metrics=["accuracy", "mse"])
    ids_train, ids_valid,x_train, x_valid, y_train, y_valid, depth_train, depth_test = \
        train_df.index.values[train_idx],train_df.index.values[val_idx], \
        data[train_idx], data[val_idx], \
        labels[train_idx], labels[val_idx], \
        train_df.z.values[train_idx],train_df.z.values[val_idx]
    depth_train = np.array(list(map(lambda x: math.log(x + 1, 10), depth_train)))
    depth_test = np.array(list(map(lambda x: math.log(x + 1, 10), depth_test)))
    sample_weight = [1.0] * len(x_train)
    x_train_org = x_train.copy()
    y_train_org = y_train.copy()