Python Preprocessing Exemples, pre_processing.Preprocessing Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : class_imbalance.py Projet : BabafemiOyinlola/Clustering-and-Classification

    def KNN_smote_PCA(self):
        """Evaluate a 7-nearest-neighbour classifier on SMOTE-balanced, PCA-reduced data.

        Pipeline: split the data, one-hot encode column 0 (the sex class),
        rebalance the training set with ``self.smote``, standardize, reduce
        both sets to 5 components with ``self.PCA``, then fit and predict.

        The PCA step is applied *before* fitting so that the reported
        predictions really are "with PCA", matching the other ``*_PCA``
        methods; previously the classifier was fitted and evaluated on the
        unreduced features and PCA only affected cross-validation.

        Returns:
            tuple: ``(cross_val_acc, y_test, pred, metrics)`` where
            ``cross_val_acc`` comes from ``self.cross_validation`` and
            ``metrics`` from ``self.metrics``.
        """
        train, test = self.process_and_split_data()

        # Column 8 is used as the label; the remaining columns are features.
        x_train = np.delete(train, obj=8, axis=1)
        y_train = train[:, 8]
        x_test = np.delete(test, obj=8, axis=1)
        y_test = test[:, 8]

        # One-hot encode the sex class (column 0) for each split.
        new_col = np.array(pd.get_dummies(x_train[:, 0]))
        new_col2 = np.array(pd.get_dummies(x_test[:, 0]))

        # Append the encoded columns, then drop the original sex column.
        features_train = np.column_stack([x_train, new_col])
        features_test = np.column_stack([x_test, new_col2])
        features_train = np.delete(features_train, obj=0, axis=1)
        features_test = np.delete(features_test, obj=0, axis=1)

        # Handle class imbalance on the training split only.
        features_train, y_train = self.smote(features_train, y_train)

        # Standardize data.
        preprocess = Preprocessing()
        features_train = preprocess.standardize_data(features_train)
        features_test = preprocess.standardize_data(features_test)

        # PCA must precede fit/predict so the model sees the reduced features.
        features_train = self.PCA(features_train, 5)
        features_test = self.PCA(features_test, 5)

        knn = KNeighborsClassifier(n_neighbors=7)
        knn.fit(features_train, y_train)
        pred = knn.predict(features_test)

        print()
        print("KNN - Accuracy smote with PCA")
        metrics = self.metrics(pred, y_test)
        print()

        # Cross-validation is run over the recombined train + test samples.
        features = np.vstack((features_train, features_test))
        labels = np.vstack((y_train[:, None], y_test[:, None]))

        cross_val_acc = self.cross_validation(knn, features, labels)

        return cross_val_acc, y_test, pred, metrics

Exemple #2

0

Afficher le fichier

Fichier : class_imbalance.py Projet : BabafemiOyinlola/Clustering-and-Classification

    def logistic_regression_oversampled_PCA(self):
        """Evaluate logistic regression on oversampled, PCA-reduced data.

        The training split is oversampled via ``self.pre_process_oversample``,
        column 0 (the sex class) is one-hot encoded, both splits are
        standardized and reduced to 5 components with ``self.PCA``, and a
        ``LogisticRegression`` model is fitted and scored.

        Returns:
            tuple: ``(cross_val_acc, y_test, pred, metrics)``.
        """
        label_col = 8  # column used as the target
        sex_col = 0    # categorical column that gets one-hot encoded

        train_part, test_part = self.process_and_split_data()
        balanced_train = self.pre_process_oversample(
            1219, "positive", train_part)

        # Separate the target column from the feature columns.
        raw_train = np.delete(balanced_train, obj=label_col, axis=1)
        y_train = balanced_train[:, label_col]
        raw_test = np.delete(test_part, obj=label_col, axis=1)
        y_test = test_part[:, label_col]

        # One-hot encode the sex class for each split.
        dummies_train = np.array(pd.get_dummies(raw_train[:, sex_col]))
        dummies_test = np.array(pd.get_dummies(raw_test[:, sex_col]))

        # Append the encoded columns and drop the original sex column.
        features_train = np.delete(
            np.column_stack([raw_train, dummies_train]), obj=sex_col, axis=1)
        features_test = np.delete(
            np.column_stack([raw_test, dummies_test]), obj=sex_col, axis=1)

        # Standardize both splits.
        scaler = Preprocessing()
        features_train = scaler.standardize_data(features_train)
        features_test = scaler.standardize_data(features_test)

        # Reduce both splits to 5 components.
        n_components = 5
        features_train = self.PCA(features_train, n_components)
        features_test = self.PCA(features_test, n_components)

        model = LogisticRegression()
        model.fit(features_train, y_train)
        pred = model.predict(features_test)

        print()
        print("Logisitic Regression - Accuracy over sampled data after PCA")
        metrics = self.metrics(pred, y_test)

        # Recombine the splits for cross-validation.
        all_features = np.vstack((features_train, features_test))
        all_labels = np.vstack(
            (y_train[:, np.newaxis], y_test[:, np.newaxis]))

        cross_val_acc = self.cross_validation(model, all_features, all_labels)

        return cross_val_acc, y_test, pred, metrics

Exemple #3

0

Afficher le fichier

Fichier : class_imbalance.py Projet : BabafemiOyinlola/Clustering-and-Classification

    def decision_tree_undersampled_PCA(self):
        """Evaluate a decision tree on undersampled, PCA-reduced data.

        The training split is undersampled via
        ``self.pre_process_undersample``; column 0 (the sex class) is one-hot
        encoded; both splits are standardized, reduced to 5 components with
        ``self.PCA``, and a ``DecisionTreeClassifier`` is fitted and scored.

        Returns:
            tuple: ``(cross_val_acc, y_test, pred, metrics)``.
        """
        train, test = self.process_and_split_data()
        train = self.pre_process_undersample(1219, "negative", train)

        # Targets live in column 8; every other column is a feature.
        y_train, y_test = train[:, 8], test[:, 8]
        splits = [np.delete(part, obj=8, axis=1) for part in (train, test)]

        # One-hot encode the sex class, append the encoded columns, and then
        # remove the original sex column (index 0) from each split.
        encoded = [np.array(pd.get_dummies(part[:, 0])) for part in splits]
        splits = [
            np.delete(np.column_stack([part, extra]), obj=0, axis=1)
            for part, extra in zip(splits, encoded)
        ]

        # Standardize both splits (train first, then test).
        preprocess = Preprocessing()
        splits = [preprocess.standardize_data(part) for part in splits]

        # PCA down to 5 components (train first, then test).
        features_train, features_test = [self.PCA(part, 5) for part in splits]

        classifier = DecisionTreeClassifier()
        classifier.fit(features_train, y_train)
        pred = classifier.predict(features_test)

        print()
        print("Decision tree - Accuracy under sampled data with PCA")
        metrics = self.metrics(pred, y_test)

        # Recombine the splits for cross-validation.
        stacked_features = np.vstack((features_train, features_test))
        stacked_labels = np.vstack((y_train[:, None], y_test[:, None]))
        cross_val_acc = self.cross_validation(
            classifier, stacked_features, stacked_labels)

        return cross_val_acc, y_test, pred, metrics

Exemple #4

0

Afficher le fichier

Fichier : class_imbalance.py Projet : BabafemiOyinlola/Clustering-and-Classification

    def decision_tree_smote(self):
        """Evaluate a decision tree on SMOTE-balanced data without PCA.

        Pipeline: split the data, one-hot encode column 0 (the sex class),
        rebalance the training set with ``self.smote``, standardize both
        splits, then fit and score a ``DecisionTreeClassifier``.

        Fixes relative to the earlier version:
          * the test features are now actually standardized (the assignment
            was missing, leaving a no-op expression);
          * the decision-tree predictions are no longer overwritten by a
            stray KNN model, so the reported accuracy belongs to the tree;
          * the fourth return value is the result of ``self.metrics`` as in
            the sibling methods, not the ``metrics`` module object.

        Returns:
            tuple: ``(cross_val_acc, y_test, pred, metrics_result)``.
        """
        train, test = self.process_and_split_data()

        # Column 8 is used as the label; the remaining columns are features.
        x_train = np.delete(train, obj=8, axis=1)
        y_train = train[:, 8]
        x_test = np.delete(test, obj=8, axis=1)
        y_test = test[:, 8]

        # One-hot encode the sex class (column 0) for each split.
        new_col = np.array(pd.get_dummies(x_train[:, 0]))
        new_col2 = np.array(pd.get_dummies(x_test[:, 0]))

        # Append the encoded columns, then drop the original sex column.
        features_train = np.column_stack([x_train, new_col])
        features_test = np.column_stack([x_test, new_col2])
        features_train = np.delete(features_train, obj=0, axis=1)
        features_test = np.delete(features_test, obj=0, axis=1)

        # Handle class imbalance on the training split only.
        features_train, y_train = self.smote(features_train, y_train)

        # Standardize both splits so train and test share the same treatment.
        preprocess = Preprocessing()
        features_train = preprocess.standardize_data(features_train)
        features_test = preprocess.standardize_data(features_test)

        tree = DecisionTreeClassifier()
        tree.fit(features_train, y_train)
        pred = tree.predict(features_test)

        accuracy = metrics.accuracy_score(y_test, pred)
        print("Decision tree - Accuracy smote data without PCA: ", accuracy)
        print()

        # A distinct local name keeps the module-level ``metrics`` usable above.
        # NOTE(review): self.metrics may also print its own report - confirm.
        metrics_result = self.metrics(pred, y_test)

        # Cross-validation is run over the recombined train + test samples.
        features = np.vstack((features_train, features_test))
        labels = np.vstack((y_train[:, None], y_test[:, None]))

        cross_val_acc = self.cross_validation(tree, features, labels)

        return cross_val_acc, y_test, pred, metrics_result

Exemple #5

0

Afficher le fichier

Fichier : class_imbalance.py Projet : BabafemiOyinlola/Clustering-and-Classification

    def KNN_oversampled(self):
        """Evaluate a 7-nearest-neighbour classifier on oversampled data without PCA.

        The training split is oversampled via ``self.pre_process_oversample``,
        column 0 (the sex class) is one-hot encoded, both splits are
        standardized, and a ``KNeighborsClassifier`` is fitted and scored.

        The test features are now standardized like the training features;
        previously the assignment was missing (a bare ``features_test``
        expression), so the distance-based model compared scaled training
        points against unscaled test points.

        Returns:
            tuple: ``(cross_val_acc, y_test, pred, metrics)``.
        """
        train, test = self.process_and_split_data()

        train_oversampled = self.pre_process_oversample(
            1219, "positive", train)

        # Column 8 is used as the label; the remaining columns are features.
        x_train = np.delete(train_oversampled, obj=8, axis=1)
        y_train = train_oversampled[:, 8]
        x_test = np.delete(test, obj=8, axis=1)
        y_test = test[:, 8]

        # One-hot encode the sex class (column 0) for each split.
        new_col = np.array(pd.get_dummies(x_train[:, 0]))
        new_col2 = np.array(pd.get_dummies(x_test[:, 0]))

        # Append the encoded columns, then drop the original sex column.
        features_train = np.column_stack([x_train, new_col])
        features_test = np.column_stack([x_test, new_col2])
        features_train = np.delete(features_train, obj=0, axis=1)
        features_test = np.delete(features_test, obj=0, axis=1)

        # Standardize both splits.
        preprocess = Preprocessing()
        features_train = preprocess.standardize_data(features_train)
        features_test = preprocess.standardize_data(features_test)

        knn = KNeighborsClassifier(n_neighbors=7)
        knn.fit(features_train, y_train)
        pred = knn.predict(features_test)

        print()
        print("KNN - Accuracy over sampled data without PCA")
        metrics = self.metrics(pred, y_test)

        # Cross-validation is run over the recombined train + test samples.
        features = np.vstack((features_train, features_test))
        labels = np.vstack((y_train[:, None], y_test[:, None]))

        cross_val_acc = self.cross_validation(knn, features, labels)

        return cross_val_acc, y_test, pred, metrics

Exemple #6

0

Afficher le fichier

# Select the compute device and the default tensor type.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Only switch the default tensor type to CUDA when a GPU was actually
# selected; forcing it unconditionally defeats the CPU fallback above.
if device.type == 'cuda':
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Dataset selection: keep exactly one fast_loader(...) call active. The
# commented-out lines are alternative sources; the loop below unpacks each
# item of the loader as (xs, ys, title).
# data_loader = fast_loader('yahoo','A3Benchmark')
data_loader = fast_loader('nab', 'realKnownCause')
# data_loader = fast_loader('kpi')

# data-preprocessing
for xs, ys, title in data_loader:
    preprocessor = Preprocessing(xs,
                                 ys,
                                 q_size,
                                 batch_size,
                                 device,
                                 standardization=standardized,
                                 remove_low_freq=low_frq_remove,
                                 window_standardization=window_stand,
                                 scaling=normalized)
    train_x, train_y, test_x, test_y = preprocessor.get_data()
    print('Data are ready')

    train_idx_anomaly, train_idx_normal, test_idx_anomaly, test_idx_normal = preprocessor.get_index(
    )

    anomaly_is_there(test_idx_anomaly)

    # plotting
    window = plt.figure()
    te_l = window.add_subplot(311)