Code example #1
Score: 0
File: __init__.py — Project: ribner/tooshort
    def oversample(self):
        """Balance the training data using oversampling (and undersampling when needed).

        If the ratio between the two most frequent classes exceeds 4, the
        majority class is first randomly undersampled to a 0.5 ratio and the
        result is then oversampled with SMOTE; otherwise only SMOTE is applied.

        Keyword args:
        None

        Returns:
        os_X, os_y: resampled feature matrix and labels (as returned by the
        imblearn pipeline). The pipeline steps are also saved on the instance
        (self.imb_pipeline_steps) and will be applied automatically during
        the search method, if this method is run first.

        Raises:
        ValueError: if y_train contains fewer than two classes (resampling
        is undefined in that case).
        """
        _, counts = np.unique(self.y_train, return_counts=True)
        if len(counts) < 2:
            raise ValueError("oversample requires at least two classes in y_train")
        sorted_counts = sorted(counts, reverse=True)
        ratio = sorted_counts[0] / sorted_counts[1]
        # Under-sample first only when the imbalance is severe (> 4:1);
        # SMOTE alone handles milder imbalance.
        if ratio > 4:
            pipeline_steps = [('under_sample', RandomUnderSampler(sampling_strategy=0.5)),
                              ('over_sample', SMOTE())]
        else:
            pipeline_steps = [('over_sample', SMOTE())]
        pipeline = Pipeline(steps=pipeline_steps)
        os_X, os_y = pipeline.fit_resample(self.X_train, self.y_train)
        self.imb_pipeline_steps = pipeline_steps
        # os_X/os_y are returned for direct use; the saved steps are reused
        # automatically in grid search, see:
        # https://stackoverflow.com/questions/50245684/using-smote-with-gridsearchcv-in-scikit-learn
        return os_X, os_y
Code example #2
Score: 0
File: rf_smote.py — Project: rohit49plus2/AIED21
    }
    # Repeated cross-validation: 10 repeats of an 8-fold split produced by
    # stratified_group_k_fold (presumably stratified by y and grouped by ids
    # so samples sharing an id stay in one fold — confirm against its def).
    model = RandomForestClassifier()
    conf_matrix_list_of_arrays = []
    scores=[]
    for i in range(10):
        for fold_ind, (train_index, test_index) in enumerate(stratified_group_k_fold(X, y, ids, k=8)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            train_groups, test_groups = ids[train_index], ids[test_index]

            # Reduce dimensionality to 1/5 of the feature count; PCA is fit
            # on the training fold only, so no test information leaks in.
            ipca = IncrementalPCA(n_components=X_train.shape[1]//5, batch_size=120)
            ipca.fit(X_train)
            X_train=ipca.transform(X_train)
            X_test=ipca.transform(X_test)

            # Resample only the training fold; the test fold stays untouched.
            X_train, y_train = pipeline.fit_resample(X_train, y_train)#Smote

            # Inner 5-fold grid search over `parameters` (defined above) for
            # the random forest; evaluate the tuned model on the held-out fold.
            clf = GridSearchCV(model, parameters,cv=5, n_jobs=4)
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)
            conf_matrix = confusion_matrix(y_test, pred)
            conf_matrix_list_of_arrays.append(conf_matrix)
            score=accuracy_score(y_test, pred)
            scores.append(score)

    # Element-wise mean confusion matrix over all 10*8 folds, plus
    # mean/std accuracy, written to the already-open file handle `f`.
    mean_of_conf_matrix_arrays = np.mean(conf_matrix_list_of_arrays, axis=0)
    print(mean_of_conf_matrix_arrays,file=f)
    print('Accuracy: %.7f (%.7f)' % (np.mean(scores), np.std(scores)),file=f)

    f.close()
Code example #3
Score: 0
# Two-view variant: the same repeated grouped CV, run over a log-feature
# matrix (Xlog) and an eye-tracking feature matrix (Xeye) in parallel.
# NOTE(review): this fragment is truncated — the loop body continues past
# the last visible line.
conf_matrix_list_of_arrays = []
scores=[]
for i in range(10):
    for fold_ind, (train_index, test_index) in enumerate(stratified_group_k_fold(Xlog, y, ids, k=8)):
        print("Fold ", fold_ind)
        Xlog_train, Xlog_test = Xlog[train_index], Xlog[test_index]
        Xeye_train, Xeye_test = Xeye[train_index], Xeye[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_groups, test_groups = ids[train_index], ids[test_index]

        # print(data,X_train.shape)
        # PCA keeping ~all variance (0.9999); fit on the training fold only.
        pca = PCA(n_components=0.9999)
        pca.fit(Xlog_train)
        Xlog_train=pca.transform(Xlog_train)
        Xlog_test=pca.transform(Xlog_test)
        # SMOTE-resample the log-view training fold; test fold untouched.
        Xlog_train, ylog_train = pipeline.fit_resample(Xlog_train, y_train)#Smote
        # print(data,X_train.shape)

        # Same PCA + SMOTE treatment for the eye-tracking view.
        pca = PCA(n_components=0.9999)
        pca.fit(Xeye_train)
        Xeye_train=pca.transform(Xeye_train)
        Xeye_test=pca.transform(Xeye_test)

        Xeye_train, yeye_train = pipeline.fit_resample(Xeye_train, y_train)#Smote

        # Tune a random forest on the log view via 5-fold grid search, then
        # score the TRAINING data (train confusion matrix) and get test
        # class probabilities — presumably for a later ensemble/fusion step.
        clf1 = GridSearchCV(rf, parameters_rf,cv=5, n_jobs=4)
        clf1.fit(Xlog_train, ylog_train)
        y_pred = clf1.best_estimator_.predict(Xlog_train)
        m1=confusion_matrix(ylog_train, y_pred)
        pred1 = clf1.predict_proba(Xlog_test)