Example #1
import numpy as np
from sklearn.model_selection import StratifiedKFold

def stacking_proba(clf, X_train, y, X_test, nfolds=5, random_seed=2017, return_score=False,
                   shuffle=True, metric='acc', clf_name='Unknown'):
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle, random_state=random_seed)
    folds.get_n_splits(X_train,y)
    #return stacking_proba for train set
    train_stacking_proba=np.zeros((X_train.shape[0],np.unique(y).shape[0]))
    score=0
    for i,(train_index, validate_index) in enumerate(folds.split(X_train, y)):
        # print(str(clf_name)+" folds:"+str(i+1)+"/"+str(nfolds))
        X_train_fold=X_train[train_index,:]
        y_train_fold=y[train_index]
        X_validate_fold=X_train[validate_index,:]
        y_validate_fold=y[validate_index]
        clf.fit(X_train_fold,y_train_fold)
        fold_preds=clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index,:]=fold_preds
        #validation
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score=len(np.nonzero(y_validate_fold - fold_preds_a == 0)[0]) / len(y_validate_fold)
        # print('validate '+metric+":"+str(fold_score))
        score+=fold_score
    score/=nfolds
    #return stacking_proba for test set
    clf.fit(X_train,y)
    test_stacking_proba=clf.predict_proba(X_test)

    if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba
        train_stacking_proba=train_stacking_proba[:,1]
        test_stacking_proba=test_stacking_proba[:,1]
    if return_score:
        return train_stacking_proba,test_stacking_proba,score
    else:
        return train_stacking_proba,test_stacking_proba
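A minimal usage sketch for the function above, assuming scikit-learn and numpy; the synthetic data and the LogisticRegression base learner are illustrative, not part of the original snippet:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Hypothetical driver for stacking_proba (not from the original source)
X, y = make_classification(n_samples=200, n_classes=3, n_informative=6, random_state=0)
X_tr, X_te, y_tr = X[:150], X[150:], y[:150]
train_meta, test_meta, acc = stacking_proba(
    LogisticRegression(max_iter=1000), X_tr, y_tr, X_te,
    nfolds=5, return_score=True, clf_name='logreg')
# train_meta: out-of-fold class probabilities for the training rows;
# test_meta: probabilities from a model refit on the full training set.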
Example #2
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed, even
    # though not every class is necessarily represented on each side of
    # the split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    assert_raises(ValueError, StratifiedKFold, 0)
    assert_raises(ValueError, StratifiedKFold, 1)

    # When n_splits is not an integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
Example #3
    def __init__(self, fm_decoder, n_iter=5, n_folds=3,
                 random_state=None):
        self.fm_decoder = fm_decoder
        StratifiedKFold.__init__(
            self,
            n_splits=n_folds,
            random_state=random_state)
Example #4
def test_datasets(dataset_names):
    from sklearn.svm import SVC
    data = Data(dataset_names=dataset_names)

    def separate_sets(x, y, test_fold_id, test_folds):
        x_test = x[test_folds == test_fold_id, :]
        y_test = y[test_folds == test_fold_id]

        x_train = x[test_folds != test_fold_id, :]
        y_train = y[test_folds != test_fold_id]
        return [x_train, y_train, x_test, y_test]

    n_folds = 2
    accuracies = {}
    for name, dataset in data.datasets.items():
        dataset.print_summary()
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
        accuracies[name] = np.zeros(n_folds)
        test_fold = 0
        for train_idx, test_idx in skf.split(X=dataset.data, y=dataset.target):
            x_train, y_train = dataset.data[train_idx], dataset.target[train_idx]
            x_test, y_test = dataset.data[test_idx], dataset.target[test_idx]

            svc = SVC(C=1.0, kernel='rbf', degree=1, tol=0.01)
            svc.fit(x_train, y_train)
            prediction = svc.predict(x_test)
            accuracies[name][test_fold] = 100*np.mean((prediction == y_test))
            print("Acc = {0:.2f}%".format(accuracies[name][test_fold]))
            test_fold += 1
    return accuracies
Example #5
    def stratified_cross_validate(self, k):
        attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
        labels = np.append(self.training_labels, self.testing_labels, axis=0)

        all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])

        #print("all data : %s" % all_data)
        #print("")

        np.random.shuffle(all_data)

        X = all_data[:, :-1]
        y = all_data[:, -1]
        print(X.shape, y.shape)
        skf = StratifiedKFold(n_splits=k)
        print(skf.get_n_splits(X, y))
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            yield (X_train, y_train, X_test, y_test)

        #print("shuffled data : %s" % all_data)
        #print("")

        for i in range(k):
            split = len(all_data) // k
            #print("split : %s" % split)

            test_data = all_data[i * split:(i + 1) * split, :]
            train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)

            train_input, train_output = train_data[:, :-1], train_data[:, -1]
            test_input, test_output = test_data[:, :-1], test_data[:, -1]

            yield (train_input, train_output, test_input, test_output)
Example #6
def cv_score(X, y, n_epochs = 10, n_folds=10, random_state=1999):
    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
    scores = np.zeros((n_folds, n_epochs))
    val_scores = np.zeros((n_folds, n_epochs))
    best_epochs = np.zeros(n_folds)
    clfs = [KerasWrapper(num_features=X.shape[1], label='keras_{}'.format(i)) for i in range(n_folds)]
    folds = kf.split(X, y)
    #iteratively train epochs
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    for i in range(n_epochs):
        print('=============Epoch {}================'.format(i))
        i_fold = 0
        for itrain, itest in kfsplit:
            print('Fold ', i_fold)
            train = X[itrain,:]
            test = X[itest,:]
            ytrain, ytest = y[itrain], y[itest]
            clf, score, num_epoch = clfs[i_fold].fit(train, ytrain, nb_epoch=1, 
                                               validation_split=None, batch_size=64,
                                               patience=1)

            print('score: {}'.format(score))
            scores[i_fold, i] = score
            best_epochs[i_fold] = num_epoch

            # predict on oof
            pred = clf.predict_proba(test)
            val_score = log_loss(ytest, pred)
            print('Validation score: ', val_score)
            val_scores[i_fold, i] = val_score
            i_fold += 1
    return scores, val_scores, best_epochs
Example #7
def get_cv_results(design, data, cv_splits=10):
  test_df, unit_onehot, unit_x = data
  cv_results = []
  for i in range(design.shape[0]):
    lambda_int, lambda_x = design[i, :]
    val_losses = []
    for rep in range(3): # Almost like bootstrap. Reshuffling
      
      cv_val_losses = []
      skf = StratifiedKFold(n_splits=cv_splits, shuffle=True)
      for train_index, test_index in skf.split(unit_x, test_df['unit']):
         re_model = create_model(unit_onehot.shape[1], lambda_int, lambda_x,
                                 .01, .0001, .92)

         X_train = [test_df["x"][train_index], unit_onehot[train_index],
                    unit_x[train_index]]
         X_test = [test_df["x"][test_index], unit_onehot[test_index],
                    unit_x[test_index]]

         y_train, y_test = test_df["y"][train_index], test_df["y"][test_index]
         h = re_model.fit(X_train, y_train,
                          epochs = 15000, batch_size = 450,
                          validation_data = (X_test, y_test),
                          callbacks = callbacks, verbose = 0)
         cv_val_losses.append(np.min(h.history['val_loss']))

      val_losses.append(np.mean(cv_val_losses))
    cv_results.append(np.mean(val_losses)) 
  return cv_results
Example #8
    def split_data(self, X, y, stratified = True, bad_chess = False):
        if bad_chess:
            n_points = int(X.shape[0] / self.nodes)
            for node in range(self.nodes):
                start_slice = node * n_points
                final_slice = start_slice + n_points
                dx = X[start_slice:final_slice]
                dy = y[start_slice:final_slice]

                frame_dx = pd.DataFrame(dx)
                frame_dy = pd.DataFrame(dy)

                file_data  = datas_path.joinpath('data_' + str(node) + '.csv')
                file_class = datas_path.joinpath('class_' + str(node) + '.csv')
                frame_dx.to_csv(file_data, index = False)
                frame_dy.to_csv(file_class, index = False)
        else:
            node = 0
            if stratified:
                skf  = StratifiedKFold(n_splits = self.nodes)
            else:
                skf  = KFold(n_splits = self.nodes, shuffle = True, random_state = 17)
            for split_index in skf.split(X, y):
                new_X = pd.DataFrame(X[split_index[1]])
                new_y = pd.DataFrame(y[split_index[1]])

                X_path = datas_path.joinpath("data_" + str(node) + ".csv")
                y_path = datas_path.joinpath("class_" + str(node) + ".csv")
                new_X.to_csv(X_path, index = False)
                new_y.to_csv(y_path, index = False)
                node += 1
Example #9
def cv(X_train, y_train):

    kfold = StratifiedKFold(n_splits=5, shuffle=True)

    scores_f = []
    scores_p = []
    scores_r = []


    for train, test in kfold.split(X_train, y_train):

        model = TargetEnsembler(features)
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["PCL_Strict3"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["PCL_Strict3"])
        model.fit(X_train_cv, y_train_cv)

        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
Example #10
def classify(X,y, clf,**para):
    # y = profile["Loss"].as_matrix()
    # X = profile[features].as_matrix()

    skf = StratifiedKFold(n_splits=6)

    # print(**para)
    classifier = clf(**para)
    name = str(classifier).split("(")[0]


    # dt = tree.DecisionTreeClassifier(min_samples_split=min_split, max_depth=max_dep)
    print("{0} has been established with {1}".format(name, para))
    # lr = LogisticRegression(penalty='l1')

    for train_index, test_index in skf.split(X, y):
        #     print("TRAIN:",train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print("10-fold Score is: {0}".format(score))

    return classifier,y_test, y_pred
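A quick, hypothetical way to drive `classify`; `accuracy_score`, `KFold` and `StratifiedKFold` are assumed to be imported at module level in the original:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Hypothetical call: pass the classifier *class* plus its keyword parameters
X_demo = np.random.RandomState(5).randn(90, 4)
y_demo = np.array([0, 1, 2] * 30)
model, y_test, y_pred = classify(X_demo, y_demo, DecisionTreeClassifier, max_depth=3)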
Example #11
def test_grid_search_correct_score_results():
    # test that correct scores are used
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        expected_keys = (("mean_test_score", "rank_test_score") +
                         tuple("split%d_test_score" % cv_i
                               for cv_i in range(n_splits)))
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            cv_scores = np.array(
                list(grid_search.cv_results_['split%d_test_score'
                                             % s][candidate_i]
                     for s in range(n_splits)))
            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
Example #12
def split(dependent, independent, n_folds):
  skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
  for train_indices, test_indices in skf.split(dependent, independent):
    train_x = dependent[train_indices]
    train_y = independent[train_indices]
    test_x = dependent[test_indices]
    test_y = independent[test_indices]
    yield train_x, train_y, test_x, test_y
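A usage sketch for the generator above; note that the snippet's parameter names are inverted, with `dependent` receiving the features and `independent` the labels. `RANDOM_STATE` is a module-level constant in the original, so it is defined explicitly here:

import numpy as np

RANDOM_STATE = 0  # assumed module-level constant
X = np.random.RandomState(0).randn(40, 3)
y = np.array([0, 1] * 20)
for train_x, train_y, test_x, test_y in split(X, y, n_folds=4):
    print(train_x.shape, test_x.shape)  # (30, 3) (10, 3) on every fold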
Example #13
def test_ovr_multinomial_iris():
    # Test that OvR and multinomial are correct using the iris dataset.
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # The cv indices from stratified kfold (where stratification is done based
    # on the fine-grained iris classes, i.e., before the classes 0 and 1 are
    # conflated) are used for both clf and clf1
    n_cv = 2
    cv = StratifiedKFold(n_cv)
    precomputed_folds = list(cv.split(train, target))

    # Train clf on the original dataset where classes 0 and 1 are separated
    clf = LogisticRegressionCV(cv=precomputed_folds)
    clf.fit(train, target)

    # Conflate classes 0 and 1 and train clf1 on this modified dataset
    clf1 = LogisticRegressionCV(cv=precomputed_folds)
    target_copy = target.copy()
    target_copy[target_copy == 0] = 1
    clf1.fit(train, target_copy)

    # Ensure that what OvR learns for class2 is same regardless of whether
    # classes 0 and 1 are separated or not
    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Test the shape of various attributes.
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1))
    assert_equal(clf.Cs_.shape, (10,))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, n_cv, 10))

    # Test that for the iris data multinomial gives a better accuracy than OvR
    for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']:
        max_iter = 2000 if solver in ['sag', 'saga'] else 15
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=max_iter,
            random_state=42, tol=1e-5 if solver in ['sag', 'saga'] else 1e-2,
            cv=2)
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # Test attributes of LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10,
                                                      n_features + 1))
        assert_equal(clf_multi.Cs_.shape, (10,))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, n_cv, 10))
Example #14
def gen_folds(X, y, n_folds=5, random_state=0):
    from sklearn.model_selection import StratifiedKFold

    kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)

    folds = kf.split(X, y)
    # materialize the fold indices so they can be reused
    kfsplit = [(itrain, itest) for itrain, itest in folds]
    return kfsplit
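A short sketch of why materializing the folds is useful: the same index pairs can be reused across several models so their scores stay comparable. The data and names below are illustrative:

import numpy as np

X = np.random.RandomState(1).randn(100, 4)
y = np.array([0, 1] * 50)
folds = gen_folds(X, y, n_folds=5, random_state=0)
for itrain, itest in folds:
    pass  # fit/evaluate any model on X[itrain] / X[itest]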
Example #15
def categorical_average(variable, y, pred_0, feature_name):
    def calculate_average(sub1, sub2):
        s = pd.DataFrame(data = {
                                 variable: sub1.groupby(variable, as_index = False).count()[variable],                              
                                 'sumy': sub1.groupby(variable, as_index = False).sum()['y'],
                                 'avgY': sub1.groupby(variable, as_index = False).mean()['y'],
                                 'cnt': sub1.groupby(variable, as_index = False).count()['y']
                                 })
                                 
        tmp = sub2.merge(s.reset_index(), how='left', left_on=variable, right_on=variable) 
        del tmp['index']                       
        tmp.loc[pd.isnull(tmp['cnt']), 'cnt'] = 0.0
        tmp.loc[pd.isnull(tmp['sumy']), 'sumy'] = 0.0

        def compute_beta(row):
            cnt = row['cnt'] if row['cnt'] < 200 else float('inf')
            return 1.0 / (g + exp((cnt - k) / f))
            
        if lambda_val is not None:
            tmp['beta'] = lambda_val
        else:
            tmp['beta'] = tmp.apply(compute_beta, axis = 1)
            
        tmp['adj_avg'] = tmp.apply(lambda row: (1.0 - row['beta']) * row['avgY'] + row['beta'] * row['pred_0'],
                                   axis = 1)
                                   
        tmp.loc[pd.isnull(tmp['avgY']), 'avgY'] = tmp.loc[pd.isnull(tmp['avgY']), 'pred_0']
        tmp.loc[pd.isnull(tmp['adj_avg']), 'adj_avg'] = tmp.loc[pd.isnull(tmp['adj_avg']), 'pred_0']
        tmp['random'] = np.random.uniform(size = len(tmp))
        tmp['adj_avg'] = tmp.apply(lambda row: row['adj_avg'] *(1 + (row['random'] - 0.5) * r_k),
                                   axis = 1)
    
        return tmp['adj_avg'].ravel()
     
    #cv for training set 
    k_fold = StratifiedKFold(5)
    X_train[feature_name] = -999 
    for (train_index, cv_index) in k_fold.split(np.zeros(len(X_train)),
                                                X_train['interest_level'].ravel()):
        sub = pd.DataFrame(data = {variable: X_train[variable],
                                   'y': X_train[y],
                                   'pred_0': X_train[pred_0]})
            
        sub1 = sub.iloc[train_index]        
        sub2 = sub.iloc[cv_index]
        
        X_train.loc[cv_index, feature_name] = calculate_average(sub1, sub2)
    
    #for test set
    sub1 = pd.DataFrame(data = {variable: X_train[variable],
                                'y': X_train[y],
                                'pred_0': X_train[pred_0]})
    sub2 = pd.DataFrame(data = {variable: X_test[variable],
                                'y': X_test[y],
                                'pred_0': X_test[pred_0]})
    X_test.loc[:, feature_name] = calculate_average(sub1, sub2)                               
Example #16
def stratifiedCV(X, y, n_splits = 6):

    skf = StratifiedKFold(n_splits=n_splits)

    for train_index, test_index in skf.split(X, y):
        #     print("TRAIN:",train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        yield X_train, y_train, X_test, y_test
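A hypothetical consumer of the generator above, accumulating one accuracy value per fold:

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

X = np.random.RandomState(2).randn(60, 5)
y = np.array([0, 1, 2] * 20)
scores = []
for X_tr, y_tr, X_te, y_te in stratifiedCV(X, y, n_splits=6):
    clf = DecisionTreeClassifier(random_state=0).fit(X_tr, y_tr)
    scores.append(accuracy_score(y_te, clf.predict(X_te)))
print(np.mean(scores))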
Example #17
    def cv_stats(self):
        """Perform cross-validation for model evaluation.

        Returns
        -------
        (list[int], list[int], list[float], list)
            Tuple containing four lists of the same size:
                true labels
                predicted labels
                prediction probabilities
                significant features for each prediction
        """
        if 'y_true' in self._cache:
            return self._cache['y_true'], self._cache['y_pred'], self._cache['y_prob'], self._cache['sigfeatures']
        
        X = self._fe.X
        y = self._fe.y
        
        kf = StratifiedKFold(n_splits=10, shuffle=True)
        y_true, y_pred, y_prob = [], [], []
        sigfeatures = []

        order_indices = []

        for train_index, test_index in kf.split(X, y):
            order_indices.extend(test_index)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = self.get_new_classifier()
            clf.fit(X_train, y_train)

            pred = clf.predict(X_test)
            prob = clf.predict_proba(X_test)
            prob = np.choose(pred, prob.T)
            
            for predy in pred:
                sigfeatures.append(get_sig_features(predy, clf.coef_, 20))
            
            y_true.extend(y_test)
            y_pred.extend(pred)
            y_prob.extend(prob)
        # reorder the results so they match the order of original data
        y_true = [v for i, v in sorted(zip(order_indices, y_true))]
        y_pred = [v for i, v in sorted(zip(order_indices, y_pred))]
        y_prob = [v for i, v in sorted(zip(order_indices, y_prob))]
        assert list(y_true) == list(y)
        
        # cache the results
        self._cache['y_true'] = y_true
        self._cache['y_pred'] = y_pred
        self._cache['y_prob'] = y_prob
        self._cache['sigfeatures'] = sigfeatures
        
        return (y_true, y_pred, y_prob, sigfeatures) 
Example #18
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, labels=None, expected_n_iter=5)
Example #19
def get_cross_validated_confusion_matrix(data, label, estimator, index, nfolds=10):
    # nfolds = get_least_class(label)
    skf = StratifiedKFold(n_splits=nfolds)
    con_matrix = np.zeros((len(np.unique(label)), len(np.unique(label))))
    for train_index, test_index in skf.split(data, label):
        train_data, test_data = data[train_index], data[test_index]
        train_label, test_label = np.array(label)[train_index], np.array(label)[test_index]
        estimator.train_matrix(train_data, train_label)
        pred_label = estimator.predict(test_data)
        con_matrix = con_matrix + confusion_matrix(test_label, pred_label, labels = index)
    return con_matrix
Example #20
def run_cv_evaluation(data, n_folds, nlu_config):
    # type: (List[rasa_nlu.training_data.Message], int, RasaNLUConfig) -> Dict[Text, List[float]]
    """Stratified cross validation on data

    :param data: list of rasa_nlu.training_data.Message objects
    :param n_folds: integer, number of cv folds
    :param nlu_config: nlu config file
    :return: dictionary with key, list structure, where each entry in list
             corresponds to the relevant result for one fold
    """
    from sklearn import metrics
    from sklearn.model_selection import StratifiedKFold
    from collections import defaultdict
    trainer = Trainer(nlu_config)
    results = defaultdict(list)

    y_true = [e.get("intent") for e in data]

    skf = StratifiedKFold(n_splits=n_folds, random_state=11, shuffle=True)
    counter = 1
    logger.info("Evaluation started")
    for train_index, test_index in skf.split(data, y_true):

        train = [data[i] for i in train_index]
        test = [data[i] for i in test_index]

        logger.debug("Fold: {}".format(counter))
        logger.debug("Training ...")
        trainer.train(TrainingData(training_examples=train))
        model_directory = trainer.persist("projects/")  # Returns the directory the model is stored in

        logger.debug("Evaluation ...")
        interpreter = Interpreter.load(model_directory, nlu_config)
        test_y = [e.get("intent") for e in test]

        preds = []
        for e in test:
            res = interpreter.parse(e.text)
            if res.get('intent'):
                preds.append(res['intent'].get('name'))
            else:
                preds.append(None)

        # compute fold metrics
        results["Accuracy"].append(metrics.accuracy_score(test_y, preds))
        results["F1-score"].append(metrics.f1_score(test_y, preds, average='weighted'))
        results["Precision"] = metrics.precision_score(test_y, preds, average='weighted')

        # increase fold counter
        counter += 1

    return dict(results)
Example #21
def runCrossValidation(train, RFfile):

	train_tracks = []
	for feature in train:
		if feature[0] != 0.:
			train_tracks.append(feature)
	train_tracks = np.array(train_tracks)
	# Gets parameter values for training data
	trainArr = train_tracks[:,1:]
	# Gets class label of all training data
	trainRes = train_tracks[:,0]

	# Convert all NaNs to 0 for RF to work properly
	trainArr = np.nan_to_num(trainArr)
	trainRes = np.nan_to_num(trainRes)

	# Load the classifier
	rf = joblib.load(RFfile)

	# Stratified KFolds cross validation
	cv = StratifiedKFold(n_splits = 5)

	precision   = []
	accuracy    = []
	sensitivity = []
	matthews    = []
	r2          = []
	f1          = []
	auroc       = []
	cm          = [[0, 0], [0, 0]]

	for train_index, test_index in cv.split(trainArr, trainRes):
	    # Fit once per fold and reuse the fitted model for both outputs
	    rf.fit(trainArr[train_index], trainRes[train_index])
	    probas_     = rf.predict_proba(trainArr[test_index])
	    classes     = rf.predict(trainArr[test_index])
	   # r2          = np.append(r2, (r2_score(trainRes[test_index], probas_[:, 1])))
	    precision   = np.append(precision, (precision_score(trainRes[test_index], classes)))
	   # auroc       = np.append(auroc, (roc_auc_score(trainRes[test_index], classes)))
	    accuracy    = np.append(accuracy, (accuracy_score(trainRes[test_index], classes)))
	    sensitivity = np.append(sensitivity, (recall_score(trainRes[test_index], classes)))
	    f1          = np.append(f1, (f1_score(trainRes[test_index], classes)))
	   # matthews    = np.append(matthews, (matthews_corrcoef(trainRes[test_index], classes)))
	    #cma         = np.add(cma, (confusion_matrix(trainRes[test_index], classes)))

	# cma         = np.array(cma)
	# r2          = np.array(r2)
	precision   = np.array(precision)
	accuracy    = np.array(accuracy)
	sensitivity = np.array(sensitivity)
	f1          = np.array(f1)
	# auroc       = np.array(auroc)
	# matthews    = np.array(matthews)

	return accuracy, precision, sensitivity, f1
Example #22
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """

    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index, open(os.path.join(output_folder, dataset_name + '.json'), 'w'), indent=2
    )

    jvm.stop()
    warnings.filterwarnings('default')
Example #23
def Kfold(dataset, k, shuffle=False, stratify=False):
    """
    Envelope function for folding operation
    """
    # separate features from labels
    data = dataset[0]
    if stratify:
        kf = StratifiedKFold(k, shuffle=shuffle)
        return kf.split(dataset[0], dataset[1])

    kf = KFold(k, shuffle=shuffle)
    return kf.split(data)
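A minimal sketch of the expected input: `dataset` is a (features, labels) pair, and the return value is the raw index generator from scikit-learn:

import numpy as np

dataset = (np.arange(20).reshape(10, 2), np.array([0, 1] * 5))
for train_idx, test_idx in Kfold(dataset, k=5, shuffle=True, stratify=True):
    print(len(train_idx), len(test_idx))  # 8 2 on every fold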
Example #24
def rmseCvMean(model, X, y, cv=5, random_state=41):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    scr = 0
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        scr += rmse(y_test, pred)
        print('\t', rmse(y_test, pred))
    return scr/cv
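The snippet relies on an `rmse` helper that is not shown; a minimal stand-in (an assumption, not the original implementation) could be:

import numpy as np

def rmse(y_true, y_pred):
    # Hypothetical helper: root mean squared error
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))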
Example #25
def cross_validation(sgd_clf, x_train, y_train):
    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in skfolds.split(x_train, y_train): #40000, 20000
        clone_clf = clone(sgd_clf)
        x_train_folds = x_train[train_index]
        y_train_folds = y_train[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
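For comparison, the loop above is essentially what cross_val_score does internally; a self-contained equivalent on synthetic data (the names are illustrative):

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

X_demo = np.random.RandomState(0).randn(100, 3)
y_demo = np.array([0, 1] * 50)
scores = cross_val_score(SGDClassifier(random_state=42), X_demo, y_demo,
                         cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                         scoring="accuracy")
print(scores)  # one accuracy per fold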
Example #26
def test_kfold_valueerrors():
    X1 = np.array([[1, 2], [3, 4], [5, 6]])
    X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, next, KFold(4).split(X1))

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = np.array([3, 3, -1, -1, 3])

    skf_3 = StratifiedKFold(3)
    assert_warns_message(Warning, "The least populated class",
                         next, skf_3.split(X2, y))

    # Check that despite the warning the folds are still computed, even
    # though not every class is necessarily represented on each side of
    # the split
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        check_cv_coverage(skf_3, X2, y, labels=None, expected_n_splits=3)

    # Check that errors are raised if the number of members in each
    # class is less than n_splits.
    y = np.array([3, 3, -1, -1, 2])

    assert_raises(ValueError, next, skf_3.split(X2, y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, KFold, 0)
    assert_raises(ValueError, KFold, 1)
    error_string = ("k-fold cross-validation requires at least one"
                    " train/test split")
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 0)
    assert_raise_message(ValueError, error_string,
                         StratifiedKFold, 1)

    # When n_splits is not integer:
    assert_raises(ValueError, KFold, 1.5)
    assert_raises(ValueError, KFold, 2.0)
    assert_raises(ValueError, StratifiedKFold, 1.5)
    assert_raises(ValueError, StratifiedKFold, 2.0)

    # When shuffle is not a bool:
    assert_raises(TypeError, KFold, n_splits=4, shuffle=None)
Example #27
 def run_cv_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001, power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9, beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
     
     # use k-fold cross validation
     
     # we need to standardize the data for the MLP learner
     pipe_clf = Pipeline([ ('scl', StandardScaler() ),
                           ('clf', MLPClassifier(alpha=alpha,
                                                 batch_size=batch_size,
                                                 learning_rate_init=learning_rate_init,
                                                 power_t=power_t,
                                                 max_iter=max_iter,
                                                 momentum=momentum,
                                                 beta_1=beta_1,
                                                 beta_2=beta_2,
                                                 hidden_layer_sizes=hidden_layer_sizes))])
 
     # resample the test data without replacement. This means that each data point is part of a test and
     # training set only once (paraphrased from Raschka p.176). In Stratified KFold, the features are
     # evenly distributed such that each test and training set is an accurate representation of the whole.
     # this is the 0.17 version
     #kfold = StratifiedKFold(y=self.y_train, n_folds=self.cv, random_state=0)
     
     # this is the 0.18dev version
     skf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=0)
     
     # do the cross validation
     train_scores = []
     test_scores = []
     #for k, (train, test) in enumerate(kfold):
     for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)):
         
         # run the learning algorithm
         pipe_clf.fit(self.x_train[train], self.y_train[train])
         train_score = pipe_clf.score(self.x_train[test], self.y_train[test])
         train_scores.append(train_score)
         test_score = pipe_clf.score(self.x_test, self.y_test)
         test_scores.append(test_score)
         print('Fold:', k+1, ', Training score:', train_score, ', Test score:', test_score)
     
     train_score = np.mean(train_scores)
     print('Training score is', train_score)
     
     test_score = np.mean(test_scores)
     print('Test score is', test_score)
     
     if do_plot:
         self.__plot_learning_curve(pipe_clf)
         
     return train_score, test_score  
Example #28
def evaluate_classifier(clf, features, labels):
    """ 
        Evaluates the classifier using StratifiedKFold cross validation. The 
            precision and recall scores are used to evaluate the algorithm's 
            performance.
        
        clf = classifier
        features = features list as returned by the targetFeatureSplit script
        labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import StratifiedKFold
    
    ### Use StratifiedKFold cross validation with 10 folds
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    
    precision = []
    recall = []
    count = 0

    ### Split the features and labels into training and testing sets.
    for train_index, test_index in skf.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []

        for i in train_index:
            features_train.append(features[i])
            labels_train.append(labels[i])
                
        for j in test_index:
            features_test.append(features[j])
            labels_test.append(labels[j])
    
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        
        precision.append(precision_score(labels_test, pred))
        recall.append(recall_score(labels_test, pred))
        
        count += 1
    
    print(clf)
    print("Folds:", count)
    print("Average Precision:", sum(precision) / count)
    print("Average Recall:", sum(recall) / count)
    print("")
Example #29
def kfold_sklearn_model_train(train_X,
                              train_y,
                              parameters,
                              n_fold,
                              sklearn_model,
                              logger):

    best_auc = 0
    best_param = None

    for params in tqdm(list(ParameterGrid(parameters))):
        logger.info('params: {}'.format(params))

        auc_lst = []
        skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)
        for trn_i, val_i in skf.split(train_X, train_y):
            trn_x = train_X[trn_i]
            val_x = train_X[val_i]
            trn_y = train_y[trn_i]
            val_y = train_y[val_i]

            model = sklearn_model(**params)
            model.fit(trn_x, trn_y)

            pred = model.predict_proba(val_x)
            pred = np.array([p[1] for p in pred])

            fpr, tpr, thresholds = metrics.roc_curve(val_y, pred)
            auc = metrics.auc(fpr, tpr)
            logger.info('AUC: {}'.format(auc))

            auc_lst.append(auc)

        auc_avg = sum(auc_lst) / len(auc_lst)
        logger.info('AUC AVG: {}'.format(auc_avg))

        if best_auc < auc_avg:
            best_auc = auc_avg
            best_param = params

    logger.info('best params: {}'.format(best_param))
    logger.info('AUC: {}'.format(best_auc))

    logger.info('train by best parameters')
    model = sklearn_model(**best_param)
    model.fit(train_X, train_y)

    return model
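A hypothetical driver; the function additionally assumes tqdm, ParameterGrid and sklearn.metrics are imported in its own module, so only the call pattern is new here:

import logging
import numpy as np
from sklearn.ensemble import RandomForestClassifier

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("cv")
X_demo = np.random.RandomState(4).randn(120, 6)
y_demo = np.array([0, 1] * 60)
grid = {"n_estimators": [50, 100], "max_depth": [3, None]}
best_model = kfold_sklearn_model_train(X_demo, y_demo, grid, n_fold=3,
                                       sklearn_model=RandomForestClassifier,
                                       logger=logger)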
Example #30
def CrossVal(estimator, X, y, processor=None, cv=3, times=10, random_state=0, imb=False):
    """
    Repeated cross-validation.

    estimator:
        the model to evaluate

    X:
        the feature part of the dataset

    y:
        the labels of the dataset

    processor:
        optional preprocessor (in practice a feature selector) exposing fit/transform

    cv:
        number of folds per cross-validation

    times:
        how many times the cross-validation is repeated

    random_state:
        random seed

    imb:
        whether to rebalance the classes (the original comment mentions SMOTE,
        but the code undersamples with RandomUnderSampler)

    """
    
    res=[]
    for t in range(times):
        skf=StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state+t)
        indices=list(skf.split(X=X,y=y))        
        for k in indices:
            x_train,y_train,x_test,y_test=X[k[0]],y[k[0]],X[k[1]],y[k[1]]              
            if(imb==True):
                n,p=__lableCount(y_train)
                rus=RandomUnderSampler(random_state=random_state+t)
                x_train,y_train=rus.fit_sample(x_train,y_train)         
            if(processor is not None):
                processor.fit(x_train,y_train)
                x_train,y_train=processor.transform(x_train,y_train)
                x_test,y_test=processor.transform(x_test,y_test)
            estimator.fit(x_train,y_train)
            res.append(Metrics.Score(estimator,x_test,y_test))                
    res=np.array(res)
    return res
Example #31
'''
plt.subplot(2,2,2)
plt.imshow(tf.squeeze(resizing[311]), cmap = 'jet')
plt.title(f'{tf.squeeze(resizing[311]).shape}')

plt.tight_layout()
plt.show()
'''
# ---------------------------------------------------------------------

# Define the image augmentation; idg2 does no augmentation and only matches the input format
idg = ImageDataGenerator(height_shift_range=(-1, 1), width_shift_range=(-1, 1))
idg2 = ImageDataGenerator()

# Define the k-fold splitter
skf = StratifiedKFold(n_splits=40, random_state=42, shuffle=True)

# Define the callbacks
redu_lr = ReduceLROnPlateau(patience= 80, verbose=1, factor=0.5)
stop = EarlyStopping(monitor='val_loss', patience=160, verbose=1, mode='min')
mc = ModelCheckpoint(filepath= '../data/modelcheckpoint/dacon3/1st_01.h5', save_best_only=True, verbose=1)

result = 0 
nth = 0


# For loop: build + compile + train + evaluate the model

for train_index, valid_index in skf.split(train2, train['digit']) :
    x_train = train2[train_index]
    x_val = train2[valid_index]
Example #32
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric':'auc',
        'n_estimators': 10000,
        'learning_rate': 0.05,  
        'num_leaves': 34,  # we should let it be smaller than 2^(max_depth)
        'max_depth': 8, 
        'subsample': 0.8715623,  # Subsample ratio of the training instance. 
        'subsample_freq': 1,  # frequency of subsample, <=0 means disabled
        'colsample_bytree': 0.9497036,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 60,  
        'min_split_gain': 0.0222415,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0.041545473,  # L1 regularization term on weights
        'reg_lambda': 0.0735294,  # L2 regularization term on weights 
        'nthread': 8,
        'seed':42, 
        'verbose': -1,
    } 
    
    # lgb_params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'binary',
    #     'metric':'auc',
    #     'n_estimators': 10000,
    #     'learning_rate': 0.05, 
    #     'num_leaves': 15,  # we should let it be smaller than 2^(max_depth)
    #     #'max_depth': 8, 
    #     'subsample': 0.7225,  # Subsample ratio of the training instance. 
    #     #'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    #     'colsample_bytree': 0.8443,  # Subsample ratio of columns when constructing each tree.
    #     'min_child_weight': np.power(10, -1.7449),  
    #     'min_split_gain': np.power(10, 0.1397),  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
    #     'reg_alpha': np.power(10, -3.1527),  # L1 regularization term on weights
    #     'reg_lambda': np.power(10, 1.4779),  # L2 regularization term on weights 
    #     'nthread': 8,
    #     'random_state':42,
    #     'verbose': -1,
    # } 
    
    CV_score = pd.DataFrame()
    FOLDS = []
    SCORE = []
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        
        
        xgtrain = lgb.Dataset(train_x, label=train_y,
                          feature_name=feats
                          )
        xgvalid = lgb.Dataset(valid_x, label=valid_y,
                          feature_name=feats
                          )
        evals_results = {}
    
        clf = lgb.train(lgb_params, 
                         xgtrain, 
                         valid_sets=[xgvalid],  
                         valid_names=['valid'], 
                         evals_result=evals_results, 
                         num_boost_round=10000,
                         early_stopping_rounds=200,
                         verbose_eval=200)

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        
        FOLDS.append(str(n_fold + 1))
        SCORE.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
        
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
        
    FOLDS.append('Full AUC score')
    SCORE.append(roc_auc_score(train_df['TARGET'], oof_preds))
     
    CV_score['folds'] = FOLDS
    CV_score['score'] = SCORE
    
    CV_score.to_csv('CV_SCORE.csv', index=False)

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds / num_folds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
        
        pd.DataFrame(data=oof_preds, columns=['TARGET']).to_csv('lgb_baseline_val_oof.csv', index=False)
        
    display_importances(feature_importance_df)
    return feature_importance_df
Example #33
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn import metrics

# Load the breast cancer dataset
breast_cancer = load_breast_cancer()

X = breast_cancer.data
y = breast_cancer.target
sum_l=0
n=5
kf = StratifiedKFold(n_splits=n, random_state=None) 
auc_score=0
auc_score_2=0
for train_index, test_index in kf.split(X,y):
     
       # print("Train:", train_index, "Validation:",test_index)
      X_train, X_test = X[train_index], X[test_index] 
      y_train, y_test = y[train_index], y[test_index]
      clf = LogisticRegression()
      clf.fit(X_train, y_train)
      y_pred = clf.predict_proba(X_test)[:,1]
      y_class = clf.predict(X_test)
      sum_l += metrics.accuracy_score(y_class, y_test)
      fpr, tpr, _ = metrics.roc_curve( y_test, y_pred)
      
      auc_score += metrics.auc(fpr, tpr)
Example #34
y = y.astype(np.uint8)

xtrain, xtest, ytrain, ytest = X[:60000], X[60000:], y[:60000], y[60000:]

ytrain_5 = (ytrain == 5)
ytest_5 = (ytest == 5)

sgd = SGDClassifier(random_state=42)
sgd.fit(xtrain, ytrain_5)

## Measure accuracy using cross-validation
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfold.split(xtrain, ytrain_5):
    clone_sgd = clone(sgd)
    xtrain_folds = xtrain[train_index]
    ytrain_folds = ytrain_5[train_index]

    xtest_folds = xtrain[test_index]
    ytest_folds = ytrain_5[test_index]

    clone_sgd.fit(xtrain_folds, ytrain_folds)
    y_pred = clone_sgd.predict(xtest_folds)
    n_correct = sum(y_pred == ytest_folds)
    print(n_correct / len(y_pred))
print(cross_val_score(sgd, xtrain, ytrain_5, cv=3, scoring="accuracy"))
Example #35
    pipelines = {}
    pipelines['fgMDM-Coh'] = make_pipeline(
        FeatConn("Coh", s, "test"),
        FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))
    pipelines['fgMDM-PLV'] = make_pipeline(
        FeatConn("PLV", s, "test"),
        FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))
    pipelines['fgMDM-Cov'] = make_pipeline(
        FeatConn("Cov", s, "test"),
        FgMDM2(metric='logeuclid', tsupdate=True, n_jobs=n_jobs))
    estimators = [('cov', pipelines['fgMDM-Cov']),
                  ('coh', pipelines['fgMDM-Coh']),
                  ('plv', pipelines['fgMDM-PLV'])
                 ]
    final_estimator = RidgeClassifier(class_weight="balanced")
    cvkf = StratifiedKFold(n_splits=5, shuffle=True)
    scl = StackingClassifier(estimators=estimators,
            cv=cvkf, n_jobs=n_jobs, final_estimator=final_estimator,
            stack_method='predict_proba')
    pipelines['Ensemble'] = scl

    pipelines['Ensemble'].fit(X_train, y_train)
    y_pred = pipelines['Ensemble'].predict(X_test)
    y_pred = le.inverse_transform(y_pred)
    for i, yp in enumerate(y_pred):
        res = {"subject name": "P{:02d}".format(s+1),
               "trial index": i+1,
               "prediction": yp}
        all_pred.append(res)
df_pred = pd.DataFrame(all_pred)
Example #36
def train(infile):
    with gzip.open(infile, 'r') as file:
        data = np.genfromtxt(file, delimiter='\t', dtype=str)

    ## Split the data up into features and answers
    answers = []
    features = []
    for row in data[1:, ]:
        answers.append(row[1])
        features.append(row[2:])

    features = np.array(features)
    answers = np.array(answers)

    y_test_final = np.array([])
    predictions_final = np.array([])
    y_prob_final = np.ndarray(shape=(0, 2), dtype=int)

    ## First way to cross-validate; not very easy to scale
    #    scores = cross_val_score(mlp, features, answers, cv=10)
    #    print(scores)

    ## Second way: plain KFold. The problem is that it takes consecutive blocks, so we don't know whether the data ordering introduces correlation
    #    kf = KFold(n_splits=10)
    #    for train, test in kf.split(features):

    ## Third way is the shuffle split.
    #    ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    #    for train, test in ss.split(features):

    ## Fourth way is the stratified fold.
    skf = StratifiedKFold(n_splits=10)
    for train, test in skf.split(features, answers):
        print("Training: %s \n Test: %s" % (train, test))
        scaler = StandardScaler()
        X_train, X_test, y_train, y_test = features[train], features[
            test], answers[train], answers[test]

        ## This sets the size of the scaler object
        scaler.fit(X_train)

        ## The MLP is super sensitive to feature scaling, so it is highly recommended to scale your data.
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        ####### This is where we want to implement the Ensemble method ######
        predictions, y_prob = mlp(X_train, X_test, y_train)
        #        predictions, y_prob = rf(X_train, X_test, y_train)
        #        predictions, y_prob = naiveBayes(X_train, X_test, y_train)
        #        predictions, y_prob = kNearestNeighbor(X_train, X_test, y_train)
        #        predictions, y_prob = supportVM(X_train, X_test, y_train) ##Attention, this returns a y_prob of 0 because it doesn't work with the SVM
        #        predictions, y_prob = logisticRegression(X_train, X_test, y_train)

        ## This will show the confusion in a matrix that will tell how often we were correct
        y_test_final = np.concatenate([y_test_final, y_test])
        predictions_final = np.concatenate([predictions_final, predictions])
        y_prob_final = np.concatenate([y_prob_final, y_prob])

    print(confusion_matrix(y_test_final, predictions_final))
    print(classification_report(y_test_final, predictions_final))
    for i in range(len(y_prob_final)):
        print("Predicted value for item " + str(i + 1) + " : " +
              str(predictions_final[i]) + ", actual: " + str(y_test_final[i]))
        print("Probability : " + str(y_prob_final[i]))
Example #37
def train_kfold(log_dir,
                hparams,
                model_name,
                k,
                state,
                X,
                Y,
                X_test=None,
                Y_test=None,
                X_train_add=None,
                Y_train_add=None,
                batch_size=20,
                epochs=200):
    '''
    X: The training set
    X_test: The testing set
    X_train_add: The additional set for model training, used for the 2nd experiment in paper.
    '''
    ''' Training '''
    # Log
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    model_dir = os.path.join(log_dir, timestamp, 'models')
    os.makedirs(model_dir)
    hist_dir = os.path.join(log_dir, timestamp, 'history')
    os.makedirs(hist_dir)
    eval_dir = os.path.join(log_dir, timestamp, 'evaluate')
    os.makedirs(eval_dir)
    params_savename = os.path.join(log_dir, timestamp, 'params.json')
    summary_savename = os.path.join(log_dir, timestamp, 'summary.json')
    test_hist_savename = os.path.join(log_dir, timestamp, 'test.json')

    # Start training
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=state)
    fold = 1
    accs = []
    for train_index, val_index in kfold.split(X, Y):
        model, params = compile_model(model_name, hparams)
        if fold == 1:
            print(model.summary())
            print(params)
        print('\n' + '=' * 60 + ' Fold: ' + str(fold) + ' ' + '=' * 60 + '\n')
        # Callback functions
        model_savename = os.path.join(model_dir,
                                      'model{0}.h5'.format(str(fold)))
        hist_savename = os.path.join(hist_dir,
                                     'history{0}.json'.format(str(fold)))
        val_savename = os.path.join(eval_dir,
                                    'evaluate{0}.json'.format(str(fold)))
        cb_list = [
            callbacks.ModelCheckpoint(filepath=model_savename,
                                      monitor='val_acc',
                                      save_best_only=True),
            callbacks.EarlyStopping(
                monitor='acc',
                patience=6,
            )
        ]
        # Add new training sets
        try:
            if X_train_add.any() and Y_train_add.any():
                x_train = np.concatenate([X[train_index], X_train_add], axis=0)
                y_train = np.concatenate([Y[train_index], Y_train_add], axis=0)
                index = list(range(len(y_train)))
                random.seed(state + 1)
                random.shuffle(index)
                x_train = x_train[index]
                y_train = y_train[index]
        except AttributeError:
            x_train = X[train_index]
            y_train = Y[train_index]

        history = model.fit(x_train,
                            y_train,
                            validation_data=(X[val_index], Y[val_index]),
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=cb_list,
                            verbose=2)
        # Log
        hist_dict = history.history
        m = models.load_model(model_savename, custom_objects={'tf': tf})
        val_dict = evaluate_model(m, X[val_index], Y[val_index])

        accs.append(val_dict['accuracy'])
        log_to_json(hist_dict, hist_savename)
        log_to_json(val_dict, val_savename)

        fold += 1
        K.clear_session()
        print('Session cleared.')
    # Summary
    try:
        if X_test.any() and Y_test.any():
            model_path = os.path.join(
                model_dir, 'model{0}.h5'.format(accs.index(max(accs)) + 1))
            m = models.load_model(model_path, custom_objects={'tf': tf})
            test_dict = evaluate_model(m, X_test, Y_test)
            log_to_json(test_dict, test_hist_savename)
    except AttributeError:
        pass

    log_to_json(hparams, params_savename)
    summary = summary_kfold(eval_dir)
    print(summary)
    log_to_json(summary, summary_savename)
Example #38
C_vals = [
    0.0001, 0.001, 0.01, 0.1, 0.13, 0.2, .15, .25, .275, .33, 0.5, .66, 0.75,
    1.0, 2.5, 4.0, 4.5, 5.0, 5.1, 5.5, 6.0, 10.0, 100.0, 1000.0
]
penalties = ['l1', 'l2']

param = {
    'penalty': penalties,
    'C': C_vals,
}
grid = GridSearchCV(logreg,
                    param,
                    verbose=False,
                    cv=StratifiedKFold(n_splits=5,
                                       random_state=10,
                                       shuffle=True),
                    n_jobs=1,
                    scoring='accuracy')


grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)


#grid.best_estimator_.fit(X_train,y_train)
#predict=grid.best_estimator_.predict(X_test)
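
# A hedged follow-up: GridSearchCV defaults to refit=True, so the best
# estimator is already refitted on X_train and the commented lines above are
# redundant; a held-out split (if X_test/y_test exist) can be scored directly:
# print(grid.score(X_test, y_test))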
Ejemplo n.º 39
imputer = ColumnTransformer([('imputer_media', imputer_media, num_cols),
                             ('imputer_moda', imputer_moda, cat_cols)])

# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, num_cols),
                            ('scaler_moda', scaler_moda, cat_cols)])

# Create the Pipeline incorporating the ColumnTransformers
pipeline = Pipeline([('imputer', imputer), ('trans', trans),
                     ('scaler', scaler), ('trans2', trans)])

# HACK: problems with the pipeline. RFE and RFECV run 'check_X_y()' before calling the pipeline (which contains the imputer), so the data is transformed up front
X = pipeline.fit_transform(X)

# 5 stratified folds for the RFECV
skf = StratifiedKFold(n_splits=5)

# Dictionary mapping the RFE accuracy to an index
dict_1 = {}
# Dictionary mapping an index to the RFECV object
dict_2 = {}

time_prebucle = time.time()

# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c,
                   kernel=kernel,
                   class_weight=class_weight,
                   random_state=random_state)
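    # The loop body is truncated in this excerpt. A hedged sketch of how it
    # likely continues, given dict_1/dict_2 above (assumes a linear kernel,
    # since RFECV needs coef_ or feature_importances_ to rank features):
    from sklearn.feature_selection import RFECV
    rfecv_temp = RFECV(estimator=clf_temp, step=1, cv=skf, scoring='accuracy')
    rfecv_temp.fit(X, y)
    dict_1[rfecv_temp.score(X, y)] = i
    dict_2[i] = rfecv_temp
    print('C=%s done in %.1f s' % (c, time.time() - time_temp1))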
Ejemplo n.º 40
# build_fn is called below but only defined in comments in the original
# excerpt, so the definition is uncommented here; with weights=None, Keras
# MobileNet accepts the custom single-channel input shape.
def build_fn():
    model = MobileNet(
        include_top=True,
        input_shape=(64, 690, 1),
        classes=2,
        classifier_activation='softmax',
        pooling=None,
        weights=None,
    )
    return model

model = build_fn()
model.summary()

split = StratifiedKFold(n_splits=3, shuffle=True, random_state=10)

pred = []
pred_ = []

for train_idx, val_idx in split.split(train_x, train_y):
    x_train, y_train = train_x[train_idx], train_y[train_idx]
    x_val, y_val = train_x[val_idx], train_y[val_idx]

    model = build_fn()
    model.compile(optimizer=keras.optimizers.Adam(0.002),
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['acc'])

    history = model.fit(x=x_train, y=y_train, validation_data=(x_val, y_val), epochs=8)
    print("*******************************************************************")
Ejemplo n.º 41
# print(nX.shape)
# nX = SelectKBest(f_classif, k=10000).fit_transform(nX, ny)
# print(nX.shape)

# PCA
# print(nX.shape)
# nX = PCA(n_components=0.99, svd_solver="full").fit_transform(nX)
# print(nX.shape)

# Filter out NaN values
indices = np.array([not np.any(np.isnan(vec)) for vec in nX])
nX = nX[indices]
ny = ny[indices]

#Begin k-fold cross validation
kf = StratifiedKFold(n_splits=10)

#Optional - parameter evaluation using grid search
"""
param_grid = [
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'kernel': ['linear']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['sigmoid']},
    {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'degree': [2, 3], 'kernel': ['poly']},
]

grid = GridSearchCV(svm.SVC(class_weight="balanced"), param_grid=param_grid, cv=kf, scoring="f1_macro", verbose=10)
grid.fit(nX,ny)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
"""
numpy.random.seed(seed)
tf.random.set_seed(3)

df=pd.read_csv('./data/dataset/sonar.csv', header=None)

dataset=df.values
X=dataset[:,0:60].astype(float)
Y_obj=dataset[:,60]

e=LabelEncoder()
e.fit(Y_obj)
Y=e.transform(Y_obj)

n_fold=10

skf=StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

accuracy=[]

for train, test in skf.split(X, Y):
    model= Sequential()
    model.add(Dense(24, input_dim=60, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    model.fit(X[train], Y[train], epochs=100, batch_size=5)
    k_accuracy="%.4f" % (model.evaluate(X[test], Y[test])[1])
    accuracy.append(k_accuracy)


print("\n %.f fold_accuracy : " % n_fold, accuracy)
Ejemplo n.º 43
def smape_objective(preds, train_data):
    labels = train_data.get_label()
    grad = fgrad(preds, labels)
    hess = fhess(preds, labels)
    return grad, hess


def smape_error(preds, train_data):
    labels = train_data.get_label()
    return 'error', 100 * np.mean(
        np.fabs(preds - labels) / (preds + labels) * 2), False
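
# fgrad and fhess, used by smape_objective above, are not defined in this
# excerpt. SMAPE is not differentiable at preds == labels, so a smoothed
# gradient/Hessian pair is typically supplied; the sketch below is one common
# approximation (the names match the calls above, but the exact formulas the
# original author used are unknown):
def fgrad(preds, labels, eps=1e-6):
    # approximate gradient of 200 * |p - l| / (|p| + |l|) w.r.t. p
    denom = np.abs(preds) + np.abs(labels) + eps
    return 200.0 * np.sign(preds - labels) / denom

def fhess(preds, labels, eps=1e-6):
    # SMAPE is piecewise-linear in p, so the true Hessian is ~0 almost
    # everywhere; a small positive constant keeps the booster's updates stable
    return np.full_like(preds, 1e-3)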


from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
y_pred_lgb = np.zeros(len(X_test))

params = {
    'objective': 'regression',
    'num_leaves': 5,
    'learning_rate': 0.05,
    'n_estimators': 720,
    'max_bin': 55,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.2319,
    'feature_fraction_seed': 9,
    'bagging_seed': 9,
    'min_data_in_leaf': 6,
    'min_sum_hessian_in_leaf': 11
}  # closing brace added; the excerpt cut the dict off here
Ejemplo n.º 44
from comet_ml import Experiment

from pytorch_lightning.callbacks import ModelCheckpoint
from src.transforms import ImageTransform
from src.utils import summarize_submit

import warnings
warnings.filterwarnings('ignore')

# Config  ###########################
# Input Data
data_dir = './input'
# TTA
test_num = 20
# CV
cv = StratifiedKFold(n_splits=4, shuffle=True)


@hydra.main('config.yml')
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Random Seed
    seed_everything(cfg.train.seed)

    # Model  ####################################################################
    net = ENet(model_name=cfg.train.model_name)
    transform = ImageTransform(img_size=cfg.data.img_size)

    # Comet.ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
Ejemplo n.º 45
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(svm,
                                                   X,
                                                   y,
                                                   n_permutations=30,
                                                   cv=cv,
                                                   scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_group, _, pvalue_group = permutation_test_score(svm,
                                                          X,
                                                          y,
                                                          n_permutations=30,
                                                          cv=cv,
                                                          scoring="accuracy",
                                                          groups=np.ones(
                                                              y.size),
                                                          random_state=0)
    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_group, _, pvalue_group = permutation_test_score(svm_sparse,
                                                          X_sparse,
                                                          y,
                                                          n_permutations=30,
                                                          cv=cv_sparse,
                                                          scoring="accuracy",
                                                          groups=np.ones(
                                                              y.size),
                                                          random_state=0)

    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) /
                y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(svm,
                                              X,
                                              y,
                                              n_permutations=100,
                                              scoring=scorer,
                                              cv=cv,
                                              random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(svm,
                                                   X,
                                                   y,
                                                   n_permutations=30,
                                                   cv=cv,
                                                   scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
Ejemplo n.º 46
def RandomizedSearchCV_load_or_make(model, data, labels, random_grid, cv=5, scoring="accuracy", n_iter=20, random_state=47):
    import xgboost as xgb
    from xgboost import XGBClassifier
    from sklearn.model_selection import RandomizedSearchCV
    import pickle
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

    # we use the RandomizedSearchCV to find the best parameters for our XGB model

    load_or_make = input("Load or make RandomizedSearchCV?")
    if load_or_make == "load":
        
        # pick RandomizedSearchCV to load
        print('Choose a RandomizedSearchCV:')
        print('1. RandomizedSearchCV10_basic_all_features_neg_log_loss ')

        option = input()
        if option == '1':
            filename = '.../dataset/basic_modeli/RandomizedSearchCV10_basic_all_features_neg_log_loss.sav'
        else:
            print('The requested option does not exist!')
            raise ValueError('The requested option does not exist - while loading the RandomizedSearchCV!')
        
        rand_XGB = pickle.load(open(filename, 'rb'))
        rand_XGB.get_params()
        
        # show results
        rand_XGB_results_df = pd.DataFrame(rand_XGB.cv_results_)[['mean_test_score', 'std_test_score', 'params','rank_test_score']]
        rand_XGB_results_df

        # plot of randomized search results
        rand_XGB_mean_scores = rand_XGB.cv_results_['mean_test_score']
        plt.plot(range(1, len(rand_XGB_mean_scores) + 1), rand_XGB_mean_scores)
        plt.xlabel('k-th model of the RandomizedSearchCV run (XGB)')
        plt.ylabel('Cross-validation accuracy')

        return [rand_XGB, rand_XGB_results_df]
        
    elif load_or_make == "make":
        # getting ready for saving later
        name = input('Enter a name for the new RandomizedSearchCV: ')
        filename = 'data/RandomizedSearchCV_' + name + '.sav'

        model_XGB = model
        # RandomizedSearchCV
        rand_XGB = RandomizedSearchCV(model_XGB, 
                                      param_distributions = random_grid, 
                                      cv=StratifiedKFold(n_splits=cv), 
                                      scoring=scoring, 
                                      n_iter=n_iter,
                                      random_state=random_state, 
                                      return_train_score=False, 
                                      verbose=True,
                                      n_jobs=-1)
        # fit
        rand_XGB.fit(data, labels)

        # save 
        pickle.dump(rand_XGB, open(filename, 'wb'))
        print('RandomizedSearchCV saved to: ' + filename)

        # show results
        rand_XGB_results_df = pd.DataFrame(rand_XGB.cv_results_)[['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
        rand_XGB_results_df

        # plot of randomized search results
        rand_XGB_mean_scores = rand_XGB.cv_results_['mean_test_score']
        plt.plot(range(1, len(rand_XGB_mean_scores) + 1), rand_XGB_mean_scores)
        plt.xlabel('k-th model of the RandomizedSearchCV run (XGB)')
        plt.ylabel('Cross-validation accuracy')

        return [rand_XGB, rand_XGB_results_df]

    else:
        print("Invalid input! Type 'load' or 'make'!")
        raise ValueError("Neither 'load' nor 'make' was entered - while loading/creating the RandomizedSearchCV!")
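
# A hedged usage sketch with a toy dataset (the function prompts for "load"
# or "make" interactively, and "make" pickles into a local data/ directory
# that is assumed to exist; the grid keys are illustrative):
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

X_toy, y_toy = make_classification(n_samples=200, random_state=47)
toy_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3]}
rand_XGB, results_df = RandomizedSearchCV_load_or_make(
    XGBClassifier(), X_toy, y_toy, toy_grid,
    cv=5, scoring='accuracy', n_iter=9, random_state=47)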
Ejemplo n.º 47
def fit_predict(X, y, X_pred):
    predictors = [i for i in X.columns]
    stacking_num = 5
    bagging_num = 3
    bagging_test_size = 0.33
    num_boost_round = 500
    early_stopping_rounds = 100

    stacking_model = []
    bagging_model = []

    l2_error = []
    X = X.values
    y = y.values
    layer_train = np.zeros((X.shape[0], 2))
    SK = StratifiedKFold(n_splits=stacking_num, shuffle=True, random_state=1)
    for k, (train_index, test_index) in enumerate(SK.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=early_stopping_rounds)
        stacking_model.append(gbm)
        # record out-of-fold predictions as the stacking feature (this
        # assignment is missing from the excerpt; without it, layer_train
        # stays all zeros)
        layer_train[test_index, 1] = gbm.predict(
            X_test, num_iteration=gbm.best_iteration)

    X = np.hstack((X, layer_train[:, 1].reshape((-1, 1))))

    predictors.append('lgb_result')

    for bn in range(bagging_num):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=bagging_test_size, random_state=bn)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)

        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200)

        bagging_model.append(gbm)

        l2_error.append(
            mean_squared_error(
                gbm.predict(X_test, num_iteration=gbm.best_iteration), y_test))

        feat_imp = pd.Series(gbm.feature_importance(),
                             predictors).sort_values(ascending=False)

    test_pred = np.zeros((X_pred.shape[0], stacking_num))
    for sn, gbm in enumerate(stacking_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        test_pred[:, sn] = pred

    # hstack once, after all stacking predictions are collected (in the
    # original this line sat inside the loop, growing X_pred every pass)
    X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1, 1))))

    for bn, gbm in enumerate(bagging_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        if bn == 0:
            pred_out = pred
        else:
            pred_out += pred
    return pred_out / bagging_num, feat_imp
                  verbose=False)
        score = model.best_score_["valid_0"]["multi_logloss"]

        return {'loss': score, 'status': STATUS_OK, 'model': model}


    trials = Trials()
    best = fmin(fn=objective, space=space, trials=trials,
                algo=tpe.suggest, max_evals=10, verbose=1)
    hyperparams = space_eval(space, best)
    n_best = trials.best_trial['result']['model'].best_iteration_
    params.update(hyperparams)
    print(params)
    # hyperparameter tuning done

    cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

    p_val = np.zeros((trn.shape[0], n_class))
    p_tst = np.zeros((tst.shape[0], n_class))
    for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
        print(f'training model for CV #{i}')
        clf = lgb.LGBMClassifier(**params)
        clf.fit(trn[i_trn], y[i_trn],
                eval_set=[(trn[i_val], y[i_val])],
                eval_metric='multiclass',
                verbose=3,
                early_stopping_rounds=20)

        p_val[i_val, :] = clf.predict_proba(trn[i_val])
        p_tst += clf.predict_proba(tst) / n_fold
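
    # A hedged follow-up: with the out-of-fold matrix p_val filled, the CV
    # multiclass log loss can be checked before the predictions are used
    # (log_loss assumed imported from sklearn.metrics):
    print(f'CV log loss: {log_loss(y, p_val):.4f}')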
def discriminate(X, y, nDmax):
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(
        y, return_counts=True
    )  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])

    yGood = y[goodInd]
    XGood = X[goodInd]

    classes, classesCount = np.unique(yGood, return_counts=True)
    nClasses = classes.size  # Number of classes or groups

    cvFolds = min(min(classesCount), CVFOLDS)
    if cvFolds < CVFOLDS:
        print(
            'Warning in discriminate: Cross-validation performed with %d folds (instead of %d)'
            % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]  # number of features in X
    nX = XGood.shape[0]  # number of data points in X

    # Use a uniform prior
    myPrior = np.ones(nClasses) * (1.0 / nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # nDmax = int(np.fix(np.sqrt(nX//5)))
    if nDmax < nD:
        print('Warning: Insufficient data for', nD,
              'parameters. PCA projection to', nDmax, 'dimensions.')
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' %
          (sum(pca.explained_variance_ratio_) * 100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components=min(nDmax, nClasses - 1),
                 priors=myPrior,
                 shrinkage=None,
                 solver='svd')
    qdaMod = QDA(priors=myPrior)
    rfMod = RF()  # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaYes = 0
    qdaYes = 0
    rfYes = 0
    cvCount = 0

    skf = StratifiedKFold(n_splits=cvFolds)
    skfList = skf.split(Xr, yGood)

    for train, test in skfList:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array(
            [b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if (goodInd.size == 0):
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses) * (1.0 / ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaModself = ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        ldaYes += np.around(
            (ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])) *
            goodInd.size)
        qdaYes += np.around(
            (qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])) *
            goodInd.size)
        rfYes += np.around(
            (rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])) *
            goodInd.size)
        cvCount += goodInd.size

    ldaYes = int(ldaYes)
    qdaYes = int(qdaYes)
    rfYes = int(rfYes)

    p = 1.0 / nClasses
    ldaP = 0
    qdaP = 0
    rfP = 0
    for k in range(ldaYes, cvCount + 1):
        ldaP += binom.pmf(k, cvCount, p)

    for k in range(qdaYes, cvCount + 1):
        qdaP += binom.pmf(k, cvCount, p)

    for k in range(rfYes, cvCount + 1):
        rfP += binom.pmf(k, cvCount, p)
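
    # Note: each tail sum above is the binomial survival function, so an
    # equivalent (and cheaper) formulation would be, e.g.:
    #   ldaP = binom.sf(ldaYes - 1, cvCount, p)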

    print("Number of classes %d. Chance level %.2f %%" %
          (nClasses, 100.0 / nClasses))
    print("LDA: %.2f %% (%d/%d p=%.4f)" %
          (100.0 * ldaYes / cvCount, ldaYes, cvCount, ldaP))
    print("QDA: %.2f %% (%d/%d p=%.4f)" %
          (100.0 * qdaYes / cvCount, qdaYes, cvCount, qdaP))
    print("RF: %.2f %% (%d/%d p=%.4f)" %
          (100.0 * rfYes / cvCount, rfYes, cvCount, rfP))
    # return ldaYes, qdaYes, rfYes, cvCount, ldaP, qdaP, rfP, nClasses, weights
    return 100.0 * ldaYes / cvCount, 100.0 * qdaYes / cvCount, 100.0 * rfYes / cvCount
def kfold_lightgbm(df, num_folds, stratified=False):
    # Divide in training/validation and test data
    train_df = df[df['FLAG'] != -1]
    test_df = df[df['FLAG'] == -1]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        # random_state requires shuffle=True (a seed with shuffle=False is an
        # error in recent scikit-learn versions)
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['FLAG', 'USRID']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['FLAG'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'FLAG'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'FLAG'].iloc[valid_idx]

        clf = lgb.LGBMClassifier(nthread=4,
                                 n_estimators=3000,
                                 learning_rate=0.02,
                                 num_leaves=31,
                                 colsample_bytree=0.997212866002197,
                                 bagging_fraction=0.7733927534732657,
                                 min_data_in_leaf=37,
                                 min_child_weight=13.05659547343758,
                                 min_split_gain=0.027258234021548238,
                                 reg_lambda=0.12367585365238067,
                                 verbose=0)

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=150)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['FLAG'], oof_preds))
    # Write submission file and plot feature importance
    train_df['RST'] = oof_preds
    test_df['RST'] = sub_preds
    test_df[['USRID', 'RST']].to_csv('submission6.csv', index=False, sep='\t')
    display_importances(feature_importance_df)
    return train_df[['USRID', 'RST']], test_df[['USRID',
                                                'RST']], feature_importance_df
    MAX_FEATURES = 195000
    MAX_LEN = 150
    MODEL_IDENTIFIER = "fastext_minimum_preproc_reg"

    train = pd.read_csv(TRAIN_DATA_FILE)
    test = pd.read_csv(TEST_DATA_FILE)

    print(train.shape, test.shape)

    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y = train[list_classes].values

    #Get validation folds
    train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in list_classes])
    train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
    cvlist1 = list(StratifiedKFold(n_splits=10, shuffle=True, random_state=786).split(train, train['target_str'].astype('category')))
    cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))
    
    # Normalize text
    for df in train, test:
        df["comment_text"] = normalizeString(df["comment_text"])
    #stemmer = PorterStemmer()
    #def custom_tokenize(text):
    #    tokens = wordpunct_tokenize(text)
    #    tokens = [stemmer.stem(token) for token in tokens]
    #    return tokens
        
    # Tokenize comments
    tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=wordpunct_tokenize)
    X = tok.fit_transform(pd.concat([train["comment_text"].astype(str).fillna("na"), test["comment_text"].astype(str).fillna("na")]))
    X_train = X[:len(train), :]
Ejemplo n.º 52
    path, dataset),
                      dtype=np.dtype(int))

nb_rows = len(clust)
Data = np.zeros((nb_rows, nb_columns), dtype=np.float32)

for i in range(nb_rows):
    row = prepare_activity_score_feature_vector(features, labels, clust[i],
                                                clusters)
    Data[i, :] = row
X = np.transpose(Data)
#Activity score features are sorted as label 0 then label 1, so we need to rearrange the labels (0s first then 1s)
labels.sort()
y = np.asarray(labels, dtype=int)
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=5, shuffle=True)
classifier = LogisticRegression(solver='lbfgs', max_iter=500)

max_iter = 100
if (False):
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train, test in cv.split(X, y):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                  180 + angle, color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(n_splits=4)
# Only take the first fold.
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))
X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

print(X_train.shape)
print(X_test.shape)


n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
Ejemplo n.º 54
    def fit(self, data):
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                X, y = node.data
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[
                            test]
                        estimator = fetch_predict_estimator(
                            self.task_type,
                            config,
                            x_p1,
                            y_p1,
                            weight_balance=data.enable_balance,
                            data_balance=data.data_balance)
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification
                                n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[test,
                                           suc_cnt * n_dim:(suc_cnt + 1) *
                                           n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[test, suc_cnt *
                                           n_dim:(suc_cnt + 1) * n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        # Train model for stacking using the other part of training data
        self.meta_learner.fit(feature_p2, y)
        return self
Ejemplo n.º 55
def plot_learning_curve( model, X_train, y_train, X_test, y_test, cv, seed ):

    import warnings
    warnings.filterwarnings("ignore")

    # load libraries
    import numpy as np
    from numpy import loadtxt
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import accuracy_score, make_scorer, log_loss

    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve, StratifiedKFold
    

    #plt.style.use('ggplot')
    malware_dict = { 1 : 'Ramnit', 2 : 'Lollipop', 3 : 'Kelihos_ver3', 4 : 'Vundo', 5 : 'Simda',
                     6 : 'Tracur', 7 : 'Kelihos_ver1', 8 : 'Obfuscator.ACY', 9 : 'Gatak'}

    # Create CV training and test scores for various training set sizes
    train_sizes, train_scores, test_scores = learning_curve(model,
                                               X_train, y_train, cv=StratifiedKFold(n_splits=cv), 
                                               scoring="accuracy",
                                               #scoring=make_scorer(log_loss, needs_proba=True, labels=list(malware_dict.keys())), 
                                               n_jobs=-1,
                                               random_state=seed)

    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    plt.subplots(1, figsize=(12,12))
    plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
    plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create plot
    plt.title("Krivulja učenja")
    plt.xlabel("Veličina skupa za treniranje"), plt.ylabel("Točnost"), plt.legend(loc="best")
    plt.tight_layout(); plt.show()    

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # retrieve performance metrics
    results = model.evals_result()
    epochs = len(results['validation_0']['merror'])
    x_axis = range(0, epochs)

    # plot log loss
    fig, ax = plt.subplots(figsize=(12,12))
    ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test')
    ax.legend()
    plt.ylabel('Log Loss')
    plt.title('XGBoost Log Loss')
    plt.show()

    # plot classification error
    fig, ax = plt.subplots(figsize=(12,12))
    ax.plot(x_axis, results['validation_0']['merror'], label='Train')
    ax.plot(x_axis, results['validation_1']['merror'], label='Test')
    ax.legend()
    plt.ylabel('Classification error')
    plt.title('XGBoost classification error')
    plt.show()
Ejemplo n.º 56
# learning and prediction -------------------------------------------------------------
print("BayesSearch")
bayes_cv_tuner = BayesSearchCV(estimator=ExtraTreesClassifier(
    n_estimators=300, random_state=0, class_weight="balanced"),
                               search_spaces={
                                   'criterion': ["gini", "entropy"],
                                   # 'splitter' is not an ExtraTreesClassifier
                                   # parameter, so it is dropped here
                                   'min_samples_split': (2, 100),
                                   'min_samples_leaf': (1, 100),
                                   'min_weight_fraction_leaf': (0.0, 0.5),
                                   'max_depth': (1, 50),
                               },
                               scoring="roc_auc",
                               cv=StratifiedKFold(n_splits=3,
                                                  shuffle=True,
                                                  random_state=42),
                               n_jobs=-3,
                               n_iter=10,
                               verbose=0,
                               refit=True,
                               random_state=42)

result = bayes_cv_tuner.fit(train_mod[selected_features].values,
                            target_mod.values,
                            callback=status_print)

#Model
#Best ROC-AUC:
#Best params:
Ejemplo n.º 57
def mean_encode(train_data,
                test_data,
                columns,
                target_col,
                reg_method=None,
                alpha=0,
                add_random=False,
                rmean=0,
                rstd=0.1,
                folds=1):
    '''Returns a DataFrame with encoded columns'''
    length_train = len(train_data)
    encoded_cols = []
    target_mean_global = train_data[target_col].mean()
    for col in columns:
        # Getting means for test data
        nrows_cat = train_data.groupby(col)[target_col].count()
        target_means_cats = train_data.groupby(col)[target_col].mean()
        target_means_cats_adj = (target_means_cats * nrows_cat +
                                 target_mean_global * alpha) / (nrows_cat +
                                                                alpha)
        # Mapping means to test data
        encoded_col_test = test_data[col].map(target_means_cats_adj)
        # Getting a train encodings
        if reg_method == 'expanding_mean':
            train_data_shuffled = train_data.sample(frac=1, random_state=1)
            cumsum = train_data_shuffled.groupby(
                col)[target_col].cumsum() - train_data_shuffled[target_col]
            cumcnt = train_data_shuffled.groupby(col).cumcount()
            encoded_col_train = cumsum / (cumcnt)
            encoded_col_train.fillna(target_mean_global, inplace=True)
            if add_random:
                encoded_col_train = encoded_col_train + normal(
                    loc=rmean, scale=rstd, size=(encoded_col_train.shape[0]))
        elif (reg_method == 'k_fold') and (folds > 1):
            kfold = StratifiedKFold(n_splits=folds,
                                    shuffle=True,
                                    random_state=1).split(
                                        train_data[target_col].values,
                                        train_data[target_col])
            parts = []
            for tr_in, val_ind in kfold:
                # divide data

                df_for_estimation, df_estimated = train_data.iloc[
                    tr_in], train_data.iloc[val_ind]
                # getting means on data for estimation (all folds except estimated)
                nrows_cat = df_for_estimation.groupby(col)[target_col].count()
                target_means_cats = df_for_estimation.groupby(
                    col)[target_col].mean()
                target_means_cats_adj = (target_means_cats * nrows_cat +
                                         target_mean_global * alpha) / (
                                             nrows_cat + alpha)
                # Mapping means to estimated fold
                encoded_col_train_part = df_estimated[col].map(
                    target_means_cats_adj)
                if add_random:
                    encoded_col_train_part = encoded_col_train_part + normal(
                        loc=rmean,
                        scale=rstd,
                        size=(encoded_col_train_part.shape[0]))
                # Saving estimated encodings for a fold
                parts.append(encoded_col_train_part)
            encoded_col_train = pd.concat(parts, axis=0)
            encoded_col_train.fillna(target_mean_global, inplace=True)
        else:
            encoded_col_train = train_data[col].map(target_means_cats_adj)
            if add_random:
                encoded_col_train = encoded_col_train + normal(
                    loc=rmean, scale=rstd, size=(encoded_col_train.shape[0]))

        # Saving the column with means
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = target_mean_global
        encoded_cols.append(
            pd.DataFrame({'mean_' + target_col + '_' + col: encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    #Modified to reindex
    all_encoded = all_encoded.reset_index()
    return (all_encoded.iloc[:length_train].reset_index(drop=True),
            all_encoded.iloc[length_train:].reset_index(drop=True))
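

# A hedged usage sketch of mean_encode on a toy frame (column and target
# names are invented for illustration; pd assumed imported):
toy_train = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'c'],
                          'target': [1, 0, 1, 1, 0]})
toy_test = pd.DataFrame({'city': ['a', 'c', 'd']})
enc_train, enc_test = mean_encode(toy_train, toy_test, ['city'], 'target',
                                  reg_method='k_fold', alpha=5, folds=2)
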
def kfold_xgb(df, num_folds, stratified=False):
    # Divide in training/validation and test data
    train_df = df[df['FLAG'] != -1]
    test_df = df[df['FLAG'] == -1]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        # random_state requires shuffle=True (a seed with shuffle=False is an
        # error in recent scikit-learn versions)
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feats = [f for f in train_df.columns if f not in ['FLAG', 'USRID']]
    feature_importance_df = pd.DataFrame()

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['FLAG'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'FLAG'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'FLAG'].iloc[valid_idx]

        train_x = xgb.DMatrix(train_x, label=train_y)
        valid_x = xgb.DMatrix(valid_x, label=valid_y)

        params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eval_metric': 'auc',
            'max_depth': 4,
            'subsample': 0.85,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 0.8,
            'tree_method': 'exact',
            'seed': 0,
            'nthread': 4,
            'gamma': 0.5,
            'min_child_weight': 50,
        }

        watchlist = [(train_x, 'train'), (valid_x, 'val')]
        clf = xgb.train(params,
                        train_x,
                        num_boost_round=3000,
                        evals=watchlist,
                        early_stopping_rounds=90)

        test = xgb.DMatrix(test_df[feats])
        oof_preds[valid_idx] = clf.predict(valid_x,
                                           ntree_limit=clf.best_ntree_limit)
        sub_preds += clf.predict(
            test, ntree_limit=clf.best_ntree_limit) / folds.n_splits

        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        xgb.plot_importance(clf)
        fscore = clf.get_fscore()
        a = list(fscore.keys())
        v = list(fscore.values())
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = a
        fold_importance_df['importance'] = v
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['FLAG'], oof_preds))
    # Write submission file and plot feature importance
    train_df['RST'] = oof_preds
    test_df['RST'] = sub_preds
    # test_df[['USRID', 'RST']].to_csv('submission6.csv', index= False,sep='\t')
    return train_df[['USRID', 'RST']], test_df[['USRID',
                                                'RST']], feature_importance_df
Ejemplo n.º 59
    model = RandomForestClassifier(n_estimators=300,
                                   bootstrap=True,
                                   max_features='sqrt',
                                   n_jobs=2,
                                   random_state=1)
elif args.m == 'dt':
    model = DecisionTreeClassifier(max_depth=10, random_state=1)
elif args.m == 'svm':
    model = SVC(kernel='linear', C=1.0, random_state=1)
elif args.m == 'nb':
    model = GaussianNB()
elif args.m == 'knn':
    model = KNeighborsClassifier(n_neighbors=1)
elif args.m == 'all':
    for mm in ['rf', 'dt', 'svm', 'nb', 'knn']:
        cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
        acc, recall, prec, f1, TN, TP, FP, FN = 0, 0, 0, 0, 0, 0, 0, 0
        for (train, test), i in zip(cv.split(x, y), range(5)):
            if mm == 'rf':
                model = RandomForestClassifier(n_estimators=300,
                                               bootstrap=True,
                                               max_features='sqrt',
                                               n_jobs=2,
                                               random_state=1)
            elif mm == 'dt':
                model = DecisionTreeClassifier(max_depth=10, random_state=1)
            elif mm == 'svm':
                model = SVC(kernel='linear', C=1.0, random_state=1)
            elif mm == 'nb':
                model = GaussianNB()
            elif mm == 'knn':
Ejemplo n.º 60
             "Ubicación"]] = cases_covid[[
                 "Ciudad.de.residencia", "Sexo", "Tipo.de.caso", "Ubicación"
             ]].astype(str)
cases_covid.replace(dict, inplace=True)
# print(cases_covid.columns)
print(cases_covid.info())
# print(cases_covid)
x = cases_covid.loc[:, cases_covid.columns != 'Estado']
y = cases_covid.loc[:, 'Estado']

n_samples, n_features = x.shape

random_state = np.random.RandomState(0)
x = np.c_[x, random_state.randn(n_samples, 200 * n_features)]

cv = StratifiedKFold(n_splits=10)
classifier = svm.SVC(kernel='linear',
                     probability=True,
                     random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(x, y)):
    classifier.fit(x[train], y[train])
    viz = plot_roc_curve(classifier,
                         x[test],
                         y[test],
                         name='ROC fold {}'.format(i),