def predict_SAME(timestamps_train, Y_train, timestamp_test, bandwidth, tau, n_iterations, n_neighbors):
    '''
    Makes predictions for data restored by SAME

    Parameters
    ----------
    timestamps_train : array_like
        A 1D array of timestamps for the train dataset
    Y_train : array_like
        Original data
    timestamp_test : array_like
        Timestamp(s) of the test dataset, forwarded unchanged to
        shifted_weighted_kNN
    bandwidth : int
        The size of the bandwidth
    tau : float
        A parameter for SAME algorithm, take a look at SAME function for more details
    n_iterations : int
        A number of iterations for SAME algorithm
    n_neighbors : int, tuple
        A number of nearest neighbors for each component, if one number is given,
        it is treated as a equal number for all components

    Returns
    -------
    predictions : array_like
        Predictions for future values of a time series
    '''
    # Construct the generalized features
    generalized_X_train, generalized_X_test = get_dataset(Y_train, bandwidth=bandwidth)

    # Append the single test row below the train rows so that train and test
    # are normalized and restored together.
    generalized_X = np.append(generalized_X_train,
                              generalized_X_test.reshape(1, -1),
                              axis=0).astype(float)

    # Normalize the numerical features
    generalized_X = normalize(generalized_X)

    # One neighbourhood size per SAME iteration: starts at 50 and shrinks by a
    # factor of 0.93 each step, truncated to int.
    # NOTE(review): the size truncates to 0 for very large n_iterations
    # (roughly > 54) -- confirm SAME tolerates that.
    neighbors_list = np.array([50 * 0.93**i for i in range(n_iterations)]).astype(int)

    # Find a manifold
    Z = SAME(generalized_X, neighbors_list, tau)

    # Define modified train and test features (the test row is the last one)
    Z_train = Z[:-1, :]
    Z_test = Z[-1, :]

    # Make predictions with shifted weighted kNN; the first `bandwidth`
    # timestamps/targets are dropped to pair with the rows of Z_train.
    predictions = shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                       Z_train, Y_train[bandwidth:], Z_test,
                                       n_neighbors=n_neighbors)

    return predictions
def train(self, datasets, epochs=None):
    """Wrap every entry of *datasets* with ``get_dataset`` and train on the result.

    Parameters
    ----------
    datasets : sequence
        Input data. When the first entry is a tuple the whole sequence is
        first passed through ``self._format_mem_data``; otherwise it is used
        as given.
    epochs : optional
        Forwarded unchanged to ``self._train_from_tfdatasets``.
    """
    concat_output = concat_map[self.config.get("model_type")]

    first_is_tuple = isinstance(datasets[0], tuple)
    prep_data = self._format_mem_data(datasets) if first_is_tuple else datasets

    # len() raises TypeError when the last layer's input has no length;
    # in that case a single output is assumed.
    try:
        n_outputs = len(self.keras_model.layers[-1].input)
    except TypeError:
        n_outputs = 1

    out_shape = self.out_shape

    wrapped = []
    for dt in prep_data:
        wrapped.append(
            get_dataset(dt,
                        batch_size=self.config['batch_size'],
                        concat_outputs=concat_output,
                        n_outputs=n_outputs,
                        out_shape=out_shape))

    self._train_from_tfdatasets(wrapped, epochs)
def predict_knn(timestamps_train, Y_train, timestamp_test, bandwidth, n_neighbors):
    '''
    Makes predictions without preparatory manifold restoration

    Parameters
    ----------
    timestamps_train : array_like
        A 1D array of timestamps for the train dataset
    Y_train : array_like
        Original data
    timestamp_test : array_like
        Timestamp(s) of the test dataset, forwarded unchanged to
        shifted_weighted_kNN
    bandwidth : int
        The size of the bandwidth
    n_neighbors : int, tuple
        A number of nearest neighbors for each component, if one number is given,
        it is treated as a equal number for all components

    Returns
    -------
    predictions : array_like
        Predictions for future values of a time series
    '''
    # Generalized features: the train rows plus the single test row
    train_rows, test_row = get_dataset(Y_train, bandwidth=bandwidth)

    # Put the test row below the train rows and normalize everything at once
    stacked = np.concatenate((train_rows, test_row.reshape(1, -1)), axis=0)
    Z = normalize(stacked.astype(float))

    # The last row is the test point, everything before it is train
    Z_train, Z_test = Z[:-1, :], Z[-1, :]

    # The first `bandwidth` timestamps/targets are dropped to pair with Z_train
    return shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                Z_train, Y_train[bandwidth:], Z_test,
                                n_neighbors=n_neighbors)
def runComp():
    """
    Run the KNN algorithm on the dataset with the provided flags.

    The plain KNN run and its report always happen. At most one extra step is
    then chosen from the command line, checked in this order:
    ``-grid``, ``-corr``, ``-nca``, ``-dim``.
    """
    k = 3  # k = 1 gives good results but is unreliable
    X_train, Y_train, X_test, Y_test = get_dataset(sample=50000,
                                                   pollution=0.7,
                                                   train_size=0.5)

    feature_names = [
        "Time",
        "V1", "V2", "V3", "V4", "V5", "V6", "V7",
        "V8", "V9", "V10", "V11", "V12", "V13", "V14",
        "V15", "V16", "V17", "V18", "V19", "V20", "V21",
        "V22", "V23", "V24", "V25", "V26", "V27", "V28",
        "Amount",
    ]
    X_train = pd.DataFrame(X_train, columns=feature_names)
    X_test = pd.DataFrame(X_test, columns=feature_names)

    baseline_pred = runKNN(X_train, Y_train, X_test, k)

    # The confusion matrix is more relevant here than the overall accuracy
    baseline_cm = confusion_matrix(Y_test, baseline_pred)
    print("Confusion matrix for KNN:\n{}".format(baseline_cm))
    baseline_report = classification_report(Y_test, baseline_pred)
    print("Classification report for standard KNN:\n {}".format(baseline_report))

    cli_args = sys.argv
    if len(cli_args) < 2:
        # No flags were given, so there is nothing further to run
        return

    if "-grid" in cli_args:
        # Grid search for the overall best configuration of the KNN classifier
        knnGridSearch(X_train, Y_train, X_test, Y_test)
    elif "-corr" in cli_args:
        # Correlation matrix for the entire dataset, giving some limited
        # insight into how the attributes correlate with the class
        correlationMatrix()
    elif "-nca" in cli_args:
        # KNN with NCA dimensionality reduction, which in the authors' testing
        # gave slightly better results than PCA
        nca_pred = knn_NCA(X_train, Y_train, X_test, k)
        nca_cm = confusion_matrix(Y_test, nca_pred)
        print("Confusion matrix for KNN with NCA:\n{}".format(nca_cm))
        nca_report = classification_report(Y_test, nca_pred)
        print("Classification report for KNN with NCA:\n {}".format(nca_report))
    elif "-dim" in cli_args:
        # Compare the results of PCA and NCA
        dim_reduc(X_train, Y_train, X_test, Y_test, k)
def main(architecture, train_path, test_path, test_labels_path, img_size, result_folder, grayscale):
    """Load the dataset, build the requested model and train it.

    Parameters
    ----------
    architecture : str
        One of 'vgg19', 'lenet-5', 'alex' or 'resnet50'. Any other value
        results in a printed message and no training.
    train_path, test_path, test_labels_path
        Forwarded unchanged to ``get_dataset``.
    img_size : int
        Images are requested as ``(img_size, img_size)``.
    result_folder
        Forwarded unchanged to ``train_model``.
    grayscale
        Forwarded unchanged to ``get_dataset``.
    """
    print("Load and create dataset from file...")
    xtrain, ytrain, xtest, ytest = get_dataset(train_path, test_path,
                                               test_labels_path,
                                               (img_size, img_size), grayscale)

    # The number of classes is taken from the distinct training labels
    num_classes = np.unique(ytrain).size

    # one-hot result vector encoding
    from tensorflow.keras.utils import to_categorical
    ytrain = to_categorical(ytrain, num_classes=num_classes)
    ytest = to_categorical(ytest, num_classes=num_classes)

    # Every sample is assumed to share the shape of the first one
    model_input_shape = xtrain[0].shape

    model = None
    if architecture == 'vgg19':
        model = build_vgg19(num_classes, model_input_shape)
    elif architecture == 'lenet-5':
        model = build_lenet5(num_classes, model_input_shape)
    elif architecture == 'alex':
        model = build_alexnet(num_classes, model_input_shape)
    elif architecture == 'resnet50':
        model = build_resnet50(num_classes, model_input_shape)

    # train model
    if model is not None:
        train_model(model, xtrain, ytrain, xtest, ytest,
                    lr=0.001, batch_size=32, epochs=10,
                    result_folder=result_folder)
    else:
        # The original had a bare string expression here, which does nothing;
        # report the problem instead.
        print("model architecture not implemented yet")
# NOTE(review): this span starts inside a call whose opening line is outside this view; the indentation below is reconstructed -- confirm against the full file.
        1,
        1,
        border_mode='same',
        activation='relu',
        name=prefix + 'layer_conv_1x1_d',
        W_regularizer=l2(0.0002))(layer_max_3x3_d)
    layer_conv_1x1_d = BatchNormalization()(layer_conv_1x1_d)

    # Combine the four branches (a-d) into one output via merge(mode='concat')
    output = merge([
        layer_conv_1x1_a, layer_conv_3x3_b, layer_conv_5x5_c, layer_conv_1x1_d
    ], mode='concat')
    return output


# get_dataset returns a pair here; only its first element is used.
dataset, _ = get_dataset(train_directory)
# np.random.shuffle reorders the entries in place
np.random.shuffle(dataset)
print(dataset.shape)

# Split every entry into its feature (index 0) and its label (index 1)
dataset_features = []
dataset_labels = []
for arr in dataset:
    dataset_features.append(arr[0])
    dataset_labels.append(arr[1])
dataset_features = np.array(dataset_features)
dataset_labels = np.array(dataset_labels)

# normalizing
# presumably the features are 8-bit pixel values, so this maps them to [0, 1] -- confirm
dataset_features = dataset_features / 255.0
# NOTE(review): the method below belongs to a class whose header is outside this view; its indentation is reconstructed -- confirm against the full file.
    def predict_ll_from_scores(self, scores):
        """Return ``self.LL.predict_from_scores(scores)`` unchanged."""
        return self.LL.predict_from_scores(scores)


def l2_score(y_true, y_pred):
    """Return the sum over axis 1 of the squared element-wise difference.

    Both inputs must be at least 2-D for ``sum(axis=1)`` to be valid.
    """
    # Loss squared. The loss function of the autoencoder.
    return np.square(np.subtract(y_true, y_pred)).sum(axis=1)  #sum the row.


# MAIN PROGRAM
# Obtain the dataset
# (undersampling, pollution and train_size are defined outside this view)
train_X, train_Y, test_X, test_Y = get_dataset(
    sample=undersampling,  # Not training the encoder on any outliers gives the best results
    pollution=pollution,  # How much of the outliers to put in the training set
    train_size=train_size  # How much of the inliers to put in the training set
)

# Isolate inliers and outliers for graphing
inliers, outliers = split_inliers_outliers(test_X, test_Y)

# Set up the autoencoder
AE = AutoEncoderOutlierPredictor(hidden_layers=hidden_layers,
                                 activation=activation,
                                 threshold=threshold,
                                 l2_threshold=l2_threshold,
                                 ll_threshold=ll_threshold)

# Fit to training data, this will take a while
# Only the features are passed; train_Y is not used for fitting.
AE.fit(train_X)
def predict_LDMM(timestamps_train, Y_train, timestamp_test, bandwidth, lambd, mu, h_sqr, n_iteration, n_neighbors, b=0):
    '''
    Makes predictions for data restored by LDMM

    Parameters
    ----------
    timestamps_train : array_like
        A 1D array of timestamps for the train dataset
    Y_train : array_like
        Original data
    timestamp_test : array_like
        Timestamp(s) of the test dataset, forwarded unchanged to
        shifted_weighted_kNN
    bandwidth : int
        The size of the bandwidth
    lambd : float
        A parameter for LDMM algorithm, take a look at LDMM function for more details
    mu : float
        A parameter for LDMM algorithm, take a look at LDMM function for more details
    h_sqr : float
        A parameter for LDMM algorithm, take a look at LDMM function for more details
    n_iteration : int
        A number of iterations for LDMM algorithm (passed on as `n_iterations`)
    n_neighbors : int, tuple
        A number of nearest neighbors for each component, if one number is given,
        it is treated as a equal number for all components
    b : float
        A parameter for LDMM algorithm, take a look at LDMM function for more details

    Returns
    -------
    predictions : array_like
        Predictions for future values of a time series
    '''
    # Generalized features: the train rows plus the single test row
    train_rows, test_row = get_dataset(Y_train, bandwidth=bandwidth)

    # Put the test row below the train rows and normalize everything at once
    stacked = np.concatenate((train_rows, test_row.reshape(1, -1)), axis=0)
    scaled = normalize(stacked.astype(float))

    # Find a manifold
    Z = LDMM(scaled, lambd=lambd, mu=mu, h_sqr=h_sqr,
             n_iterations=n_iteration, b=b)

    # The last row is the test point, everything before it is train
    Z_train, Z_test = Z[:-1, :], Z[-1, :]

    # Shifted weighted kNN; the first `bandwidth` timestamps/targets are
    # dropped to pair with the rows of Z_train
    return shifted_weighted_kNN(timestamps_train[bandwidth:], timestamp_test,
                                Z_train, Y_train[bandwidth:], Z_test,
                                n_neighbors=n_neighbors)
# -*- coding: utf-8 -*-
from preprocessing import get_dataset
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

x_train, y_train, x_test, y_test = get_dataset(400, 400)

# Fit the scaler on the training data only and reuse it for the test data.
# Fitting a second scaler on the test set (as before) standardizes the two
# sets with different means/variances, so the PCA projection and the
# classifier would see test points in a different coordinate system.
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

# Project onto the first two principal components of the training data
pca = PCA(2).fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

x_train_pca = pd.DataFrame(x_train_pca)
x_test_pca = pd.DataFrame(x_test_pca)

# Visualize the projected training set, coloured by label
plt.scatter(x_train_pca[0], x_train_pca[1], c=y_train, alpha=0.8)
plt.title('Scatter plot')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

# Distance-weighted 7-nearest-neighbour classifier on the 2-D projection
clf = KNeighborsClassifier(n_neighbors=7, weights="distance")
print(clf)
clf.fit(x_train_pca, y_train)
y_pred = clf.predict(x_test_pca)
# NOTE(review): this span starts with the last statement of a method whose header is outside this view; the indentation below is reconstructed -- confirm against the full file.
        return self.transform(X=X)

    def _reset(self):
        """
        Resets the variables

        Replaces ``self.scaler_vars`` with a new (30, 2) array from
        ``np.empty``, i.e. its contents are uninitialised until overwritten.
        """
        # presumably one row per feature and two statistics per row -- confirm
        self.scaler_vars = np.empty((30, 2))


if __name__ == "__main__":
    """ This is just testing code to make sure the scaler gives the same results as the SKlearn one If you are not running this file directly you can ignore it. """
    X_train, Y_train, X_test, Y_test = get_dataset(sample=1000,
                                                   pollution=0.7,
                                                   train_size=0.9)

    # Test our awesome scaler
    scaler = StandardScaler()

    # These assignments bind new names to the same objects; no copy is made.
    # NOTE(review): if either scaler transforms its input in place, the two
    # runs below would not be independent -- confirm.
    X_train_0 = X_train
    X_test_0 = X_test
    X_train_1 = X_train
    X_test_1 = X_test

    # Fit on the training data only, then transform both sets
    scaler.fit(X_train_0)
    X_train_0 = scaler.transform(X_train_0)
    X_test_0 = scaler.transform(X_test_0)

    # Test the boring Sklearn scaler
    skaler = Skaler()
    X_train_1 = skaler.fit_transform(X_train_1)
    X_test_1 = skaler.transform(X_test_1)
# and classify x with the larger of the sums. for d_s, c_s in zip(dist, classes): bins = [0, 0] for d, c in zip(d_s, c_s): bins[c] += self.weights[c] / ( d + 1e-5) # add small value to avoid division by zero. predictions.append(np.argmax(bins)) return predictions # MAIN PROGRAM train_X, train_Y, test_X, test_Y = get_dataset( sample=undersampling, pollution=pollution, # How much of the outliers to put in the training set train_size=train_size # How much of the inliers to put in the training set ) # Preprocess the data pipeline = make_pipeline(StandardScaler(), PCA(n_components=n_components)).fit(train_X) train_X = pd.DataFrame(pipeline.transform(train_X)) # Drop duplicates after having transformed the training data. # we have to add train_Y in order to not mess up the indices. cleaned = pd.concat([train_X, pd.DataFrame(train_Y)], axis=1).drop_duplicates() train_X = cleaned.iloc[:, :-1] train_Y = cleaned.iloc[:, -1]