Example #1
def predict_SAME(timestamps_train, Y_train, timestamp_test, bandwidth, tau,
                 n_iterations, n_neighbors):
    '''
    Makes predictions for data restored by SAME

    Parameters
    ----------
        timestamps_train : array_like
            A 1D array of timestamps for the train dataset
        Y_train : array_like
            Original data
        timestamp_test : array_like
            The timestamp(s) for the test dataset
        bandwidth : int
            The size of the bandwidth
        tau : float
            A parameter of the SAME algorithm; see the SAME function for details
        n_iterations : int
            The number of iterations for the SAME algorithm
        n_neighbors : int or tuple
            The number of nearest neighbors for each component; if a single
            number is given, it is used for all components

    Returns
    -------
        predictions : array_like
            Predictions for future values of the time series
    '''
    # Construct the generalized features
    generalized_X_train, generalized_X_test = get_dataset(Y_train,
                                                          bandwidth=bandwidth)
    # Stack the train and test features
    generalized_X = np.append(generalized_X_train,
                              generalized_X_test.reshape(1, -1),
                              axis=0).astype(float)
    # Normalize the numerical features
    generalized_X = normalize(generalized_X)
    # Find a manifold with a geometrically decaying neighbor schedule
    neighbors_list = np.array([50 * 0.93**i
                               for i in range(n_iterations)]).astype(int)
    Z = SAME(generalized_X, neighbors_list, tau)
    # Define modified train and test features
    Z_train = Z[:-1, :]
    Z_test = Z[-1, :]
    # Make predictions with shifted weighted kNN
    predictions = shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                       Z_train, Y_train[bandwidth:], Z_test,
                                       n_neighbors=n_neighbors)
    return predictions
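

# A minimal sketch of the sliding-window ("generalized") feature construction
# that a helper like get_dataset appears to perform above. The original
# get_dataset is not shown here, so the helper name delay_embed and its exact
# windowing are assumptions, not the project's code.
import numpy as np

def delay_embed(y, bandwidth):
    # Row t collects the `bandwidth` values preceding y[t], so row i pairs
    # with the target Y_train[bandwidth:][i], matching the slicing above.
    y = np.asarray(y, dtype=float)
    X_train = np.array([y[t - bandwidth:t] for t in range(bandwidth, len(y))])
    X_test = y[-bandwidth:]  # the most recent window predicts the next value
    return X_train, X_test

y_demo = np.sin(0.1 * np.arange(100))
X_demo, x_last = delay_embed(y_demo, bandwidth=5)
print(X_demo.shape, x_last.shape)  # (95, 5) (5,)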
Example #2
    def train(self, datasets, epochs=None):
        concat_output = concat_map[self.config.get("model_type")]
        # In-memory datasets arrive as tuples and need formatting first
        if isinstance(datasets[0], tuple):
            prep_data = self._format_mem_data(datasets)
        else:
            prep_data = datasets
        # Infer the number of model outputs from the last layer; len() on a
        # single (non-list) input raises TypeError, leaving n_outputs = 1
        n_outputs = 1
        try:
            n_outputs = len(self.keras_model.layers[-1].input)
        except TypeError:
            pass
        out_shape = self.out_shape
        datasets = [
            get_dataset(dt,
                        batch_size=self.config['batch_size'],
                        concat_outputs=concat_output,
                        n_outputs=n_outputs,
                        out_shape=out_shape) for dt in prep_data
        ]
        self._train_from_tfdatasets(datasets, epochs)
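

# The get_dataset helper used above is not shown. A plausible sketch of what
# such a helper might do with tf.data, assuming each prepared item is a
# (features, labels) pair of in-memory arrays; make_tf_dataset is a
# hypothetical name.
import numpy as np
import tensorflow as tf

def make_tf_dataset(features, labels, batch_size):
    # Wrap in-memory arrays in a shuffled, batched tf.data pipeline
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    return ds.shuffle(buffer_size=len(features)).batch(batch_size)

x = np.random.rand(256, 8).astype("float32")
y = np.random.randint(0, 2, size=256)
for xb, yb in make_tf_dataset(x, y, batch_size=32).take(1):
    print(xb.shape, yb.shape)  # (32, 8) (32,)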
Example #3
def predict_knn(timestamps_train, Y_train, timestamp_test, bandwidth,
                n_neighbors):
    '''
    Makes predictions without preparatory manifold restoration

    Parameters
    ----------
        timestamps_train : array_like
            A 1D array of timestamps for the train dataset
        Y_train : array_like
            Original data
        timestamp_test : array_like
            The timestamp(s) for the test dataset
        bandwidth : int
            The size of the bandwidth
        n_neighbors : int or tuple
            The number of nearest neighbors for each component; if a single
            number is given, it is used for all components

    Returns
    -------
        predictions : array_like
            Predictions for future values of the time series
    '''
    # Construct the generalized features
    generalized_X_train, generalized_X_test = get_dataset(Y_train,
                                                          bandwidth=bandwidth)
    # Stack the train and test features
    generalized_X = np.append(generalized_X_train,
                              generalized_X_test.reshape(1, -1),
                              axis=0).astype(float)
    # Normalize the numerical features
    Z = normalize(generalized_X)
    # Define modified train and test features
    Z_train = Z[:-1, :]
    Z_test = Z[-1, :]
    # Make predictions with shifted weighted kNN
    predictions = shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                       Z_train, Y_train[bandwidth:], Z_test,
                                       n_neighbors=n_neighbors)

    return predictions
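

# shifted_weighted_kNN is not defined in this listing. As a rough analogue of
# the weighting idea, distance-weighted kNN regression over the normalized
# window features can be sketched with scikit-learn; this illustrates the
# technique, it is not the original function.
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
Z_train_demo = rng.normal(size=(100, 5))       # window features
y_next = Z_train_demo @ rng.normal(size=5)     # next-step targets
Z_test_demo = rng.normal(size=(1, 5))

# Neighbors vote with weights inversely proportional to their distance
knn = KNeighborsRegressor(n_neighbors=5, weights="distance")
knn.fit(Z_train_demo, y_next)
print(knn.predict(Z_test_demo))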
Example #4
def runComp():
    """
    Run the KNN algorithm on the dataset with the provided flags
    """
    k = 3  # k = 1 gives good results but is unreliable
    X_train, Y_train, X_test, Y_test = get_dataset(sample=50000,
                                                   pollution=0.7,
                                                   train_size=0.5)
    cols = [
        "Time", "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10",
        "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20",
        "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "Amount"
    ]
    X_train = pd.DataFrame(X_train, columns=cols)
    X_test = pd.DataFrame(X_test, columns=cols)

    Y_pred = runKNN(X_train, Y_train, X_test, k)
    # Print the confusion matrix since it's more relevant than the overall accuracy
    cf_knn = confusion_matrix(Y_test, Y_pred)
    print("Confusion matrix for KNN:\n{}".format(cf_knn))
    print("Classification report for standard KNN:\n {}".format(
        classification_report(Y_test, Y_pred)))
    if len(sys.argv) >= 2:
        if "-grid" in sys.argv:
            # Run a grid search to find the overall best configuration for the KNN classifier.
            knnGridSearch(X_train, Y_train, X_test, Y_test)
        elif "-corr" in sys.argv:
            # Print a correlation matrix for the entire dataset, giving some
            # limited insight into how the attributes correlate with the class
            correlationMatrix()
        elif "-nca" in sys.argv:
            # Run the KNN classifier with NCA dimensionality reduction, which
            # in our testing gives slightly better results than PCA
            Y_pred_nca = knn_NCA(X_train, Y_train, X_test, k)
            # Print the confusion matrix since it's more relevant than the overall accuracy
            cf_knn_nca = confusion_matrix(Y_test, Y_pred_nca)
            print("Confusion matrix for KNN with NCA:\n{}".format(cf_knn_nca))
            print("Classification report for KNN with NCA:\n {}".format(
                classification_report(Y_test, Y_pred_nca)))
        elif "-dim" in sys.argv:
            # Function comparing the results of PCA and NCA
            dim_reduc(X_train, Y_train, X_test, Y_test, k)
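

# The knn_NCA helper behind the -nca flag is not shown. With scikit-learn it
# could plausibly be a NeighborhoodComponentsAnalysis step in front of the
# classifier; knn_nca_sketch is a hypothetical name, not the project's code.
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline

def knn_nca_sketch(X_train, Y_train, X_test, k, n_components=10):
    # Learn a supervised linear projection with NCA, then classify in the
    # reduced space with plain KNN
    model = make_pipeline(
        NeighborhoodComponentsAnalysis(n_components=n_components,
                                       random_state=42),
        KNeighborsClassifier(n_neighbors=k),
    )
    model.fit(X_train, Y_train)
    return model.predict(X_test)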
Example #5
def main(architecture, train_path, test_path, test_labels_path, img_size,
         result_folder, grayscale):
    print("Load and create dataset from file...")
    xtrain, ytrain, xtest, ytest = get_dataset(train_path, test_path,
                                               test_labels_path,
                                               (img_size, img_size), grayscale)
    num_classes = np.unique(ytrain).size

    # one-hot result vector encoding
    from tensorflow.keras.utils import to_categorical

    ytrain = to_categorical(ytrain, num_classes=num_classes)
    ytest = to_categorical(ytest, num_classes=num_classes)

    model_input_shape = xtrain[0].shape
    model = None
    # train model
    if architecture == 'vgg19':
        model = build_vgg19(num_classes, model_input_shape)
    elif architecture == 'lenet-5':
        model = build_lenet5(num_classes, model_input_shape)
    elif architecture == 'alex':
        model = build_alexnet(num_classes, model_input_shape)
    elif architecture == 'resnet50':
        model = build_resnet50(num_classes, model_input_shape)

    if model is not None:
        train_model(model,
                    xtrain,
                    ytrain,
                    xtest,
                    ytest,
                    lr=0.001,
                    batch_size=32,
                    epochs=10,
                    result_folder=result_folder)
    else:
        print("model architecture not implemented yet")
Example #6
    # NOTE: this snippet begins mid-call; the call head below is an assumed
    # reconstruction (Keras 1.x API: Convolution2D(nb_filter, nb_row, nb_col, ...)),
    # and nb_filter_d is a hypothetical name for the elided filter count.
    layer_conv_1x1_d = Convolution2D(nb_filter_d,
                                     1,
                                     1,
                                     border_mode='same',
                                     activation='relu',
                                     name=prefix + 'layer_conv_1x1_d',
                                     W_regularizer=l2(0.0002))(layer_max_3x3_d)
    layer_conv_1x1_d = BatchNormalization()(layer_conv_1x1_d)

    output = merge([
        layer_conv_1x1_a, layer_conv_3x3_b, layer_conv_5x5_c, layer_conv_1x1_d
    ],
                   mode='concat')
    return output
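

# This snippet uses the Keras 1.x API (merge, border_mode, W_regularizer),
# which no longer exists. The same inception-style branch in current Keras
# would look roughly like this; a sketch assuming the branch tensors share
# spatial dimensions.
from tensorflow.keras import layers, regularizers

def conv_bn_branch(x, filters, name):
    # Modern equivalent of Convolution2D(..., border_mode='same',
    # W_regularizer=l2(...)) followed by BatchNormalization
    x = layers.Conv2D(filters, (1, 1), padding='same', activation='relu',
                      name=name,
                      kernel_regularizer=regularizers.l2(0.0002))(x)
    return layers.BatchNormalization()(x)

# merge([...], mode='concat') becomes a channel-axis Concatenate:
# output = layers.Concatenate()([branch_a, branch_b, branch_c, branch_d])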


dataset, _ = get_dataset(train_directory)
np.random.shuffle(dataset)
print(dataset.shape)

dataset_features = []
dataset_labels = []
for arr in dataset:
    dataset_features.append(arr[0])
    dataset_labels.append(arr[1])

dataset_features = np.array(dataset_features)
dataset_labels = np.array(dataset_labels)

# Normalize pixel values to [0, 1]
dataset_features = dataset_features / 255.0
Example #7
    def predict_ll_from_scores(self, scores):
        return self.LL.predict_from_scores(scores)


def l2_score(y_true, y_pred):
    # Squared L2 reconstruction error of the autoencoder: the sum of squared
    # differences over each row, giving one score per sample
    return np.square(np.subtract(y_true, y_pred)).sum(axis=1)
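

# A quick check of what l2_score returns on toy data: one squared-error sum
# per row.
import numpy as np

y_true_demo = np.array([[1.0, 2.0], [0.0, 0.0]])
y_pred_demo = np.array([[1.0, 0.0], [3.0, 4.0]])
# Row-wise squared errors: (0 + 4) and (9 + 16)
print(l2_score(y_true_demo, y_pred_demo))  # [ 4. 25.]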


# MAIN PROGRAM

# Obtain the dataset

train_X, train_Y, test_X, test_Y = get_dataset(
    sample=undersampling,
    # Not training the encoder on any outliers gives the best results
    pollution=pollution,  # How much of the outliers to put in the training set
    train_size=train_size  # How much of the inliers to put in the training set
)

# Isolate inliers and outliers for graphing
inliers, outliers = split_inliers_outliers(test_X, test_Y)

# Set up the autoencoder
AE = AutoEncoderOutlierPredictor(hidden_layers=hidden_layers,
                                 activation=activation,
                                 threshold=threshold,
                                 l2_threshold=l2_threshold,
                                 ll_threshold=ll_threshold)

# Fit to the training data; this will take a while
AE.fit(train_X)
Example #8
def predict_LDMM(timestamps_train,
                 Y_train,
                 timestamp_test,
                 bandwidth,
                 lambd,
                 mu,
                 h_sqr,
                 n_iteration,
                 n_neighbors,
                 b=0):
    '''
    Makes predictions for data restored by LDMM

    Parameters
    ----------
        timestamps_train : array_like
            A 1D array of timestamps for the train dataset
        Y_train : array_like
            Original data
        timestamp_test : array_like
            The timestamp(s) for the test dataset
        bandwidth : int
            The size of the bandwidth
        lambd : float
            A parameter of the LDMM algorithm; see the LDMM function for details
        mu : float
            A parameter of the LDMM algorithm; see the LDMM function for details
        h_sqr : float
            A parameter of the LDMM algorithm; see the LDMM function for details
        n_iteration : int
            The number of iterations for the LDMM algorithm
        n_neighbors : int or tuple
            The number of nearest neighbors for each component; if a single
            number is given, it is used for all components
        b : float
            A parameter of the LDMM algorithm; see the LDMM function for details

    Returns
    -------
        predictions : array_like
            Predictions for future values of the time series
    '''
    # Construct the generalized features
    generalized_Y_train, generalized_Y_test = get_dataset(Y_train,
                                                          bandwidth=bandwidth)
    # Stack the train and test features
    generalized_Y = np.append(generalized_Y_train,
                              generalized_Y_test.reshape(1, -1),
                              axis=0).astype(float)
    # Normalize the numerical features
    generalized_Y = normalize(generalized_Y)
    # Find a manifold
    Z = LDMM(generalized_Y,
             lambd=lambd,
             mu=mu,
             h_sqr=h_sqr,
             n_iterations=n_iteration,
             b=b)
    # Define modified train and test features
    Z_train = Z[:-1, :]
    Z_test = Z[-1, :]
    # Make predictions with shifted weighted kNN
    predictions = shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                       Z_train, Y_train[bandwidth:], Z_test,
                                       n_neighbors=n_neighbors)

    return predictions
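

# A minimal usage sketch for predict_LDMM on a synthetic series, assuming the
# helpers it relies on (get_dataset, LDMM, normalize, shifted_weighted_kNN)
# are defined in this module; the parameter values are illustrative, not tuned.
import numpy as np

t = np.arange(200, dtype=float)
y_series = np.sin(0.1 * t) + 0.05 * np.random.randn(200)

pred = predict_LDMM(timestamps_train=t[:-1],
                    Y_train=y_series[:-1],
                    timestamp_test=t[-1],
                    bandwidth=10,
                    lambd=1.0,       # illustrative LDMM parameters
                    mu=1.0,
                    h_sqr=0.1,
                    n_iteration=20,
                    n_neighbors=5)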
Example #9
# -*- coding: utf-8 -*-
from preprocessing import get_dataset
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

x_train, y_train, x_test, y_test = get_dataset(400, 400)

# Fit the scaler on the training data only, then reuse it on the test data
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

pca = PCA(2).fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

x_train_pca = pd.DataFrame(x_train_pca)
x_test_pca = pd.DataFrame(x_test_pca)

plt.scatter(x_train_pca[0], x_train_pca[1], c=y_train, alpha=0.8)
plt.title('Training data projected onto the first two principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

clf = KNeighborsClassifier(n_neighbors=7, weights="distance")
print(clf)
clf.fit(x_train_pca, y_train)
y_pred = clf.predict(x_test_pca)

# Evaluate with the metrics imported above
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, y_pred)))
print("Classification report:\n{}".format(classification_report(y_test, y_pred)))
Example #10
        return self.transform(X=X)

    def _reset(self):
        """
        Resets the variables
        """
        self.scaler_vars = np.empty((30, 2))


if __name__ == "__main__":
    """
    This is just testing code to make sure the scaler gives the same results as the SKlearn one 
    If you are not running this file directly you can ignore it.
    """
    X_train, Y_train, X_test, Y_test = get_dataset(sample=1000,
                                                   pollution=0.7,
                                                   train_size=0.9)
    # Test our awesome scaler
    scaler = StandardScaler()
    X_train_0 = X_train
    X_test_0 = X_test
    X_train_1 = X_train
    X_test_1 = X_test
    scaler.fit(X_train_0)
    X_train_0 = scaler.transform(X_train_0)
    X_test_0 = scaler.transform(X_test_0)
    # Test the boring Sklearn scaler
    skaler = Skaler()
    X_train_1 = skaler.fit_transform(X_train_1)
    X_test_1 = skaler.transform(X_test_1)
Example #11
        # and classify x with the larger of the sums.
        for d_s, c_s in zip(dist, classes):
            bins = [0, 0]
            for d, c in zip(d_s, c_s):
                bins[c] += self.weights[c] / (
                    d + 1e-5)  # add small value to avoid division by zero.

            predictions.append(np.argmax(bins))
        return predictions
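

# The inverse-distance vote above can be checked in isolation. A
# self-contained toy run of the same binning logic; the class weights here
# are hypothetical.
import numpy as np

weights_demo = [1.0, 5.0]             # hypothetical per-class weights
dists = np.array([0.5, 1.0, 2.0])     # distances to the k nearest neighbors
labels = np.array([0, 0, 1])          # the neighbors' classes

bins = [0.0, 0.0]
for d, c in zip(dists, labels):
    bins[c] += weights_demo[c] / (d + 1e-5)  # inverse-distance weighted vote
print(np.argmax(bins))  # 0: votes 1/0.5 + 1/1.0 = 3.0 beat 5/2.0 = 2.5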


# MAIN PROGRAM

train_X, train_Y, test_X, test_Y = get_dataset(
    sample=undersampling,
    pollution=pollution,  # How much of the outliers to put in the training set
    train_size=train_size  # How much of the inliers to put in the training set
)

# Preprocess the data
pipeline = make_pipeline(StandardScaler(),
                         PCA(n_components=n_components)).fit(train_X)

train_X = pd.DataFrame(pipeline.transform(train_X))

# Drop duplicates after transforming the training data. train_Y is
# concatenated first so that feature rows and labels stay aligned.
cleaned = pd.concat([train_X, pd.DataFrame(train_Y)], axis=1).drop_duplicates()
train_X = cleaned.iloc[:, :-1]
train_Y = cleaned.iloc[:, -1]
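

# A small demonstration of the concat-then-drop_duplicates trick above:
# concatenating the labels before deduplicating keeps every surviving feature
# row paired with its label, and avoids dropping rows whose features collide
# but whose labels differ. Toy data only.
import pandas as pd

X_demo = pd.DataFrame([[1.0, 2.0], [1.0, 2.0], [3.0, 4.0]])
y_demo = pd.Series([0, 1, 0], name="label")

# Rows 0 and 1 share features but not labels, so both survive here;
# deduplicating X_demo alone would silently drop row 1.
cleaned_demo = pd.concat([X_demo, y_demo], axis=1).drop_duplicates()
print(cleaned_demo)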