Code Example #1
    def test_cross_validate(self):
        # data_min and data_maj are fixtures defined elsewhere in the test
        # module; the full arrays appear in Code Example #5 below
        X = np.vstack([data_min, data_maj])
        y = np.hstack(
            [np.repeat(1, len(data_min)),
             np.repeat(0, len(data_maj))])

        # setting cache path
        cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')
        if not os.path.exists(cache_path):
            os.mkdir(cache_path)

        # prepare dataset
        dataset = {'data': X, 'target': y, 'name': 'ballpark_data'}

        # instantiate the classifier
        knn_classifier = KNeighborsClassifier()

        # run the cross-validation with SMOTE oversampling
        results = sv.cross_validate(dataset=dataset,
                                    sampler=sv.SMOTE(),
                                    classifier=knn_classifier)

        self.assertTrue(len(results) > 0)

        # a sklearn Bunch works as the dataset as well
        dataset = datasets.load_wine()

        results = sv.cross_validate(dataset=dataset,
                                    sampler=sv.SMOTE(),
                                    classifier=knn_classifier)

        self.assertTrue(len(results) > 0)
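The second half of the test shows that sv.cross_validate also accepts a bundled sklearn dataset. A minimal self-contained sketch along those lines, assuming smote_variants is importable as sv and exposes the cross_validate and SMOTE APIs used above:

    import smote_variants as sv
    from sklearn import datasets
    from sklearn.neighbors import KNeighborsClassifier

    # load_wine() returns a Bunch with 'data' and 'target' attributes,
    # which cross_validate accepts just like the hand-built dict above
    dataset = datasets.load_wine()
    results = sv.cross_validate(dataset=dataset,
                                sampler=sv.SMOTE(),
                                classifier=KNeighborsClassifier())
    print(results)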
Code Example #2
    def balanceData(self):
        shape = self.trainInputDict["data"].shape
        print("trainInputDict[data].shape : ", shape)
        copy = self.trainInputDict["data"]
        copy = copy.reshape(shape[0], -1)  # flatten each sample to a vector
        print("copy.shape : ", copy.shape)
        npDict = copy.numpy()  # torch tensor -> numpy array
        copyLabel = self.trainInputDict["label"]
        print("copyLabel.shape : ", copyLabel.shape)
        npLabel = copyLabel.numpy()
        # wrap SMOTE so that it can oversample more than two classes
        oversampler = sv.MulticlassOversampling(sv.SMOTE(n_jobs=6))

        X_resampled, y_resampled = oversampler.sample(npDict, npLabel)

        # report the class distribution after oversampling
        for label, count in zip(*np.unique(y_resampled, return_counts=True)):
            print('Class {} has {} instances after oversampling'.format(
                label, count))

        # restore the original sample shape and convert back to tensors
        newData = torch.from_numpy(
            X_resampled.reshape(len(X_resampled), shape[1], shape[2],
                                shape[3]))
        newLabel = torch.from_numpy(y_resampled)
        newData = newData.float()
        return newData, newLabel
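For reference, the flatten → oversample → reshape round trip above can be reproduced outside the class. A minimal sketch on a synthetic batch, assuming 3×32×32 images and four classes (both illustrative choices, not values from the original):

    import torch
    import smote_variants as sv

    data = torch.randn(120, 3, 32, 32)    # synthetic NCHW image batch
    labels = torch.randint(0, 4, (120,))  # synthetic integer class labels

    # flatten each image to a feature vector, as balanceData does
    X = data.reshape(len(data), -1).numpy()
    y = labels.numpy()

    oversampler = sv.MulticlassOversampling(sv.SMOTE(n_jobs=6))
    X_res, y_res = oversampler.sample(X, y)

    # restore the image shape and convert back to float tensors
    newData = torch.from_numpy(X_res.reshape(len(X_res), 3, 32, 32)).float()
    newLabel = torch.from_numpy(y_res)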
Code Example #3
class Config:
    model_names = [
        "RBF SVM", "Decision Tree", "Random Forest", "Neural Net", "LDA",
        "LogReg", "SVC", "KNN"
    ]
    no_of_splits = 5
    Groups = {
        'Group_1': {
            'ProWSyn': sv.ProWSyn(),
            'AND_SMOTE': sv.AND_SMOTE(),
            'SMOTE': sv.SMOTE()
        },
        'Group_2': {
            'G_SMOTE': sv.G_SMOTE(),
            'Random_SMOTE': sv.Random_SMOTE()
        },
        'Group_3': {
            'SMOTE_TomekLinks': sv.SMOTE_TomekLinks(proportion=1.0),
            'VIS_RST': sv.VIS_RST()
        },
        'Group_4': {
            'CBSO': sv.CBSO(),
            'SMOBD': sv.SMOBD(),
            'A_SUWO': sv.A_SUWO()
        }
    }

    classifiers = {
        "RBF SVM": SVC(gamma=2, C=1, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(max_depth=5),
        "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10,
                                                max_features=1),
        "Neural Net": MLPClassifier(alpha=1, max_iter=1000),
        "LDA": LinearDiscriminantAnalysis(),
        "LogReg": LogisticRegression(),
        "SVC": SVC(kernel="linear", C=0.025),
        "KNN": KNeighborsClassifier(n_neighbors=3)
    }

    # setting up directories
    raw_data_dir = r'C:\Users\shubh\Desktop\Methods\raw_data'
    X_filename = r'X_4_feature_3_sec_Acc_m_+Gyr_m1_Scale.npy'
    y_filename = r'y_4_feature_3_sec_Acc_m_+Gyr_m1_Scale.npy'
    saving_dir = r'C:\Users\shubh\Desktop\Methods\k_fold_data'
    oversampled_data_dir = r'C:\Users\shubh\Desktop\Methods\oversampled_data'
    data_shape = 16
    similarity_score_dir = r'C:\Users\shubh\Desktop\Methods\similarity_score'
    oversampled_data_based_on_similarity_dir = r'C:\Users\shubh\Desktop\Methods\oversampled_data_based_on_similarity'
    model_results = r'C:\Users\shubh\Desktop\Methods\model_results'
    without_smote_results_dir = r'C:\Users\shubh\Desktop\Methods\without smote results'
    final_result_dir = r'C:\Users\shubh\Desktop\Methods\final_result'
    # user inputs
    groups_to_analyse = ['Group_1']
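The driver that consumes this Config is not shown. A sketch of one possible loop, loading the arrays and cross-validating every oversampler/classifier pair in the selected groups; the code below is an assumption, and it presumes binary labels since these samplers are used without a multiclass wrapper:

    import os

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    X = np.load(os.path.join(Config.raw_data_dir, Config.X_filename))
    y = np.load(os.path.join(Config.raw_data_dir, Config.y_filename))

    skf = StratifiedKFold(n_splits=Config.no_of_splits)
    for group in Config.groups_to_analyse:
        for sampler_name, sampler in Config.Groups[group].items():
            for train_idx, test_idx in skf.split(X, y):
                # oversample the training split only
                X_res, y_res = sampler.sample(X[train_idx], y[train_idx])
                for clf_name, clf in Config.classifiers.items():
                    clf.fit(X_res, y_res)
                    acc = clf.score(X[test_idx], y[test_idx])
                    print(group, sampler_name, clf_name, acc)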
Code Example #4
#print(X)
#print(Y)
scaler = StandardScaler()
X = scaler.fit_transform(X)  # standardize the features

# the oversampler raises an error unless the inputs are numpy arrays
X = np.array(X)
Y = np.array(Y)

# classification_and_report_generation(X, Y)

# after SMOTE
print("Now SMOTE will be applied")
oversampler = sv.SMOTE()
X_samp, y_samp = oversampler.sample(X, Y)

classification_and_report_generation(X_samp, y_samp)

# after kmeans_SMOTE
print("Now kmeans_SMOTE will be applied")
oversampler = sv.kmeans_SMOTE()
X_samp, y_samp = oversampler.sample(X, Y)

classification_and_report_generation(X_samp, y_samp)


# Next comes the ecoli dataset; here the features are floating-point values
print("ecoli dataset")
Code Example #5
def test_cross_validate():
    data_min = np.array([[5.7996138, -0.25574582], [3.0637093, 2.11750874],
                         [4.91444087, -0.72380123], [1.06414164, 0.08694243],
                         [2.59071708, 0.75283568], [3.44834937, 1.46118085],
                         [2.8036378, 0.69553702], [3.57901791, 0.71870743],
                         [3.81529064, 0.62580927], [3.05005506, 0.33290343],
                         [1.83674689, 1.06998465], [2.08574889, -0.32686821],
                         [3.49417022, -0.92155623], [2.33920982, -1.59057568],
                         [1.95332431, -0.84533309], [3.35453368, -1.10178101],
                         [4.20791149, -1.41874985], [2.25371221, -1.45181929],
                         [2.87401694, -0.74746037], [1.84435381, 0.15715329]])

    data_maj = np.array([[-1.40972752, 0.07111486], [-1.1873495, -0.20838002],
                         [0.51978825, 2.1631319], [-0.61995016, -0.45111475],
                         [2.6093289, -0.40993063], [-0.06624482, -0.45882838],
                         [-0.28836659, -0.59493865], [0.345051, 0.05188811],
                         [1.75694985, 0.16685025], [0.52901288, -0.62341735],
                         [0.09694047, -0.15811278], [-0.37490451, -0.46290818],
                         [-0.32855088, -0.20893795],
                         [-0.98508364, -0.32003935],
                         [0.07579831, 1.36455355], [-1.44496689, -0.44792395],
                         [1.17083343, -0.15804265], [1.73361443, -0.06018163],
                         [-0.05139342, 0.44876765], [0.33731075, -0.06547923],
                         [-0.02803696, 0.5802353], [0.20885408, 0.39232885],
                         [0.22819482, 2.47835768], [1.48216063, 0.81341279],
                         [-0.6240829, -0.90154291], [0.54349668, 1.4313319],
                         [-0.65925018, 0.78058634], [-1.65006105, -0.88327625],
                         [-1.49996313, -0.99378106], [0.31628974, -0.41951526],
                         [0.64402186, 1.10456105], [-0.17725369, -0.67939216],
                         [0.12000555, -1.18672234], [2.09793313, 1.82636262],
                         [-0.11711376, 0.49655609], [1.40513236, 0.74970305],
                         [2.40025472, -0.5971392], [-1.04860983, 2.05691699],
                         [0.74057019, -1.48622202], [1.32230881, -2.36226588],
                         [-1.00093975, -0.44426212],
                         [-2.25927766, -0.55860504],
                         [-1.12592836, -0.13399132], [0.14500925, -0.89070934],
                         [0.90572513, 1.23923502], [-1.25416346, -1.49100593],
                         [0.51229813, 1.54563048], [-1.36854287, 0.0151081],
                         [0.08169257, -0.69722099], [-0.73737846, 0.42595479],
                         [0.02465411, -0.36742946], [-1.14532211, -1.23217124],
                         [0.98038343, 0.59259824], [-0.20721222, 0.68062552],
                         [-2.21596433, -1.96045872], [-1.20519292, -1.8900018],
                         [0.47189299, -0.4737293], [1.18196143, 0.85320018],
                         [0.03255894, -0.77687178], [0.32485141, -0.34609381]])

    X = np.vstack([data_min, data_maj])
    y = np.hstack([np.repeat(1, len(data_min)), np.repeat(0, len(data_maj))])

    # setting cache path
    cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')
    if not os.path.exists(cache_path):
        os.mkdir(cache_path)

    # prepare dataset
    dataset = {'data': X, 'target': y, 'name': 'ballpark_data'}

    # instantiate the classifier
    knn_classifier = KNeighborsClassifier()

    # run the cross-validation with SMOTE oversampling
    results = sv.cross_validate(dataset=dataset,
                                sampler=sv.SMOTE(),
                                classifier=knn_classifier)

    assert len(results) > 0

    # a sklearn Bunch works as the dataset as well
    dataset = datasets.load_wine()

    results = sv.cross_validate(dataset=dataset,
                                sampler=sv.SMOTE(),
                                classifier=knn_classifier)

    assert len(results) > 0
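As in Code Example #1, the test creates ~/smote_test and never deletes it. If leaving the cache behind is undesirable, a short standard-library cleanup can be appended after the final assertion:

    import shutil

    # remove the cache directory created by the test; ignore it if absent
    shutil.rmtree(cache_path, ignore_errors=True)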
Code Example #6
def evaluate_trial(resampler_name, fold):
    RESULTS_PATH = Path(__file__).parents[0] / 'results_final'
    RANDOM_STATE = 42

    resamplers = {
        'SMOTE': sv.SMOTE(random_state=RANDOM_STATE),
        'polynom-fit-SMOTE': sv.polynom_fit_SMOTE(random_state=RANDOM_STATE),
        'Lee': sv.Lee(random_state=RANDOM_STATE),
        'SMOBD': sv.SMOBD(random_state=RANDOM_STATE),
        'G-SMOTE': sv.G_SMOTE(random_state=RANDOM_STATE),
        'LVQ-SMOTE': sv.LVQ_SMOTE(random_state=RANDOM_STATE),
        'Assembled-SMOTE': sv.Assembled_SMOTE(random_state=RANDOM_STATE),
        'SMOTE-TomekLinks': sv.SMOTE_TomekLinks(random_state=RANDOM_STATE),
        'RBO': RBO(random_state=RANDOM_STATE),
        'PA': PA(random_state=RANDOM_STATE)
    }

    for dataset_name in datasets.names():
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE)
        }

        trial_name = f'{dataset_name}_{fold}_{resampler_name}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'

        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)

        X_train, y_train = dataset[fold][0]
        X_test, y_test = dataset[fold][1]

        resampler = resamplers[resampler_name]

        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        X_train, y_train = resampler.sample(X_train, y_train)

        rows = []

        for classifier_name, classifier in classifiers.items():

            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)

            scoring_functions = {
                'Precision': metrics.precision,
                'Recall': metrics.recall,
                'AUC': metrics.auc,
                'G-mean': metrics.g_mean
            }

            for scoring_name, scoring_function in scoring_functions.items():
                score = scoring_function(y_test, predictions)
                rows.append([
                    dataset_name, fold, classifier_name, resampler_name,
                    scoring_name, score
                ])

        columns = [
            'Dataset', 'Fold', 'Classifier', 'Resampler', 'Metric', 'Score'
        ]

        RESULTS_PATH.mkdir(exist_ok=True, parents=True)

        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
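The driver that calls evaluate_trial is not shown in this excerpt. A sketch of one, looping every resampler name over the fold indices; the ten-fold range is an assumption (e.g. 5×2 cross-validation):

    import logging

    if __name__ == '__main__':
        logging.basicConfig(level=logging.INFO)

        for name in ('SMOTE', 'polynom-fit-SMOTE', 'Lee', 'SMOBD', 'G-SMOTE',
                     'LVQ-SMOTE', 'Assembled-SMOTE', 'SMOTE-TomekLinks',
                     'RBO', 'PA'):
            for fold in range(10):  # assumed fold count
                evaluate_trial(name, fold)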
Code Example #7
                results[s.__class__.__name__][0].append(res_sv)
                results[s.__class__.__name__][1].append(res_imb)
            except Exception:
                # some techniques fail on particular datasets; skip those runs
                pass

    # preparing the final dataframe
    for k in results:
        results[k] = [np.mean(results[k][0]), np.mean(results[k][1])]

    results = pd.DataFrame(results).T
    results.columns = ['smote_variants', 'imblearn']

    return results


# In[3]:

# Run the evaluation for the techniques implemented by both smote_variants
# and imblearn, using the same parameters, on 104 datasets

sv_techniques = [sv.SMOTE(), sv.Borderline_SMOTE2(k_neighbors=10), sv.ADASYN()]
imb_techniques = [SMOTE(), BorderlineSMOTE(), ADASYN()]

results = measure(sv_techniques, imb_techniques, imbd.get_data_loaders())

# In[4]:

# Print the results; the unit is seconds

print(results)
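The opening of measure is cut off in this excerpt. A reconstruction sketch consistent with the body above: time each library's resampling call per dataset and collect the durations (the exact names and structure are assumptions):

    import time

    import numpy as np
    import pandas as pd

    def measure(sv_techniques, imb_techniques, data_loaders):
        # per technique: [smote_variants timings, imblearn timings]
        results = {s.__class__.__name__: [[], []] for s in sv_techniques}

        for loader in data_loaders:
            dataset = loader()
            X, y = dataset['data'], dataset['target']
            for s, imb in zip(sv_techniques, imb_techniques):
                try:
                    t0 = time.time()
                    s.sample(X, y)          # smote_variants API
                    res_sv = time.time() - t0

                    t0 = time.time()
                    imb.fit_resample(X, y)  # imblearn API
                    res_imb = time.time() - t0

                    results[s.__class__.__name__][0].append(res_sv)
                    results[s.__class__.__name__][1].append(res_imb)
                except Exception:
                    pass

        # average the timings and build the final dataframe, as above
        for k in results:
            results[k] = [np.mean(results[k][0]), np.mean(results[k][1])]

        results = pd.DataFrame(results).T
        results.columns = ['smote_variants', 'imblearn']
        return results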