def init_data():
    df = read_data()
    X = df.loc[:, df.columns != 'defaultPaymentNextMonth'].values
    y = df.loc[:, df.columns == 'defaultPaymentNextMonth'].values

    # One-hot encode the categorical columns at indices 2 and 3; pass the rest through
    onehotencoder = OneHotEncoder(categories="auto", sparse=False)
    X = ColumnTransformer(
        [("", onehotencoder, [2, 3])],
        remainder="passthrough"
    ).fit_transform(X)

    # Scale the features; with_mean=False so the scaler also handles sparse-style output
    scaler = StandardScaler(with_mean=False)
    X = scaler.fit_transform(X)

    return X, y
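# Example usage of init_data above (a sketch, not part of the original code; it assumes
# the scikit-learn helpers imported here, which the original snippet does not import):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = init_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train.ravel())
print("Test accuracy:", clf.score(X_test, y_test))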
def main():
    np.random.seed(1)
    plt.style.use("bmh")

    dh.generate_data_for_first_func()
    dh.generate_data_for_second_func()
    data = dh.read_data()

    first_label = 'First FFN'
    print(first_label)
    bool_func(data[0], num_on_hidden=4, num_epochs=80, label=first_label)

    second_label = 'Second FFN'
    print(second_label)
    bool_func(data[1], num_on_hidden=8, num_epochs=50, label=second_label)

    plt.show()
def main():
    # Read data and split into X and y
    y, X = read_data(s.data_file_path)

    # Fit a linear regressor based on Maximum Likelihood Estimation
    lm_res = mlel.fitLinearRegression(y, X)

    # Show the linear regressor summary
    print(lm_res.summary())

    # Estimate predicted labels
    y_hat = mlel.yhat(X, lm_res)

    # Plot y versus predicted y
    p.plot(y, y_hat)

    # Compute the L1 error
    l1 = mlel.compute_L1(y, y_hat)
    print('L1 error: ', l1[0])

    # Compute the error between y and y_hat
    error = mlel.error_list(y, y_hat)

    # Plot the error
    p.plot_error(error)

    # Bootstrapping to obtain parameter estimates
    bs_params = bootstrapping.bstrap(s.number_replication, y, X)

    # Get means, lower and upper bounds
    means, lower_bounds, upper_bounds = bootstrapping.compute_CI(bs_params)
    print('Lower bounds: ', lower_bounds)
    print('Upper bounds: ', upper_bounds)

    # Plot the confidence intervals
    p.plotCI(np.asarray(bs_params), lower_bounds, upper_bounds)

    # Cluster with a Gaussian mixture model
    gmm_pred = clustering.gmm_cluster(X, s.n_components)

    # Classification report
    print(classification_report(y, gmm_pred, target_names=s.target_names))
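# Hypothetical illustration only (the project's actual bootstrapping module is defined
# elsewhere): one common way to turn bootstrap replicates of the parameter vector into
# means and 95% percentile confidence bounds, matching the compute_CI call above.
import numpy as np

def percentile_ci(bs_params, alpha=0.05):
    # bs_params: array of shape (number_replication, n_params) with bootstrapped estimates
    bs_params = np.asarray(bs_params)
    means = bs_params.mean(axis=0)
    lower_bounds = np.percentile(bs_params, 100 * alpha / 2, axis=0)
    upper_bounds = np.percentile(bs_params, 100 * (1 - alpha / 2), axis=0)
    return means, lower_bounds, upper_bounds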
def load_data(name="corpus", force_refresh=0) -> object:
    data_path = "data"
    output_path = "/content/drive/My Drive/Colab Notebooks/INF8460/Project/output"
    result = ()
    # NOTE: the second arguments below are absolute Colab paths, so os.path.join
    # effectively ignores data_path.
    if name == "corpus":
        result = read_data(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/corpus.csv"
            ))
    elif name == "train":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/train_ids.csv"
            ))
    elif name == "validation":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/val_ids.csv"
            ))
    elif name == "test":
        result = read_questions(
            os.path.join(
                data_path,
                "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/test.csv"
            ))
    else:
        print("Unknown data set name:", name)
    return result
def generate_heatmap(accuracy, x_range, y_range):
    sbr.heatmap(pd.DataFrame(accuracy), annot=True, cmap="viridis", fmt='g')
    plt.title('Grid-search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization term: $\\lambda$')
    plt.xticks(ticks=np.arange(len(x_range)) + 0.5, labels=x_range)
    plt.yticks(ticks=np.arange(len(y_range)) + 0.5, labels=y_range)
    plt.show()


if __name__ == '__main__':
    df = read_data(filtered=True)
    prediction_target = 'defaultPaymentNextMonth'
    features = df.loc[:, df.columns != prediction_target].values
    targets = df.loc[:, df.columns == prediction_target].values
    design_matrix = create_design_matrix(features)

    data_train, data_test, targets_train, targets_test = train_test_split(
        design_matrix, targets, test_size=0.2, shuffle=True)

    search_start, search_end, n_points = -6, 1, 8
    learning_rates = np.logspace(search_start, search_end, n_points)
    lambda_values = np.logspace(search_start, search_end, n_points)
    iterations = 10000
    accuracy = np.zeros((len(learning_rates), len(lambda_values)))
    )
    exit()

train_file = args[0]
test_file = args[1]
filename = train_file

seq_len = 1014  # Fixed length of a sequence of chars, given
num_classes = 14  # Num of categories/concepts, given
init_step_size = 0.01  # Given
max_epochs = 33  # Num of epochs training happens for - arbitrarily set to 33 to observe step size decay
mini_batch_size = 1  # Given value is 128, but I've set to 1 to run quickly on toy data
momentum = 0.9  # Given
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"  # Alphabet set, given
alph_size = len(alphabet)
step_size = init_step_size

data = data_handling.read_data(filename, alphabet, seq_len, num_classes)
x = data[0]  # Training input character sequences
y = data[1]  # Training input labels

# Function to implement step size decay (halves every 3 epochs, 10 times)
def step_size_decay(epoch):
    global step_size
    if epoch > 1 and epoch <= 30 and epoch % 3 == 1:
        step_size = step_size / 2
    return step_size

# Callback to print epoch count, loss and step size (to observe decay) after every epoch
class FlushCallback(Callback):
    def on_epoch_end(self, epoch, logs={}):
mini_batch_size = 256
learning_rate = 0.001
standard_deviation = 0.01
initial_bias = 0.01
loss = "SCE"  # There are two types of loss: L2 or SCE, i.e. Sigmoid Cross Entropy
verbosity_level = False
network_structure_list = [
    dh.MNIST_WIDTH * dh.MNIST_HEIGHT,  # Input layer size
    32,                                # Hidden (bottleneck) layer size
    dh.MNIST_WIDTH * dh.MNIST_HEIGHT   # Output layer size
]

# Read input data
file_name = sys.argv[1]
data = dh.read_data(file_name)
random.shuffle(data)

N = len(data)
nvd = int(0.1 * N)
print("Number of images: " + str(N))
print("Number of validation images: " + str(nvd))
print("Number of training data: " + str(N - nvd))
training_data = data[nvd:]
validation_data = data[0:nvd]

# Setup the network
ae_one = ae.AutoEncoder(dh.MNIST_WIDTH, dh.MNIST_HEIGHT, network_structure_list,
                        standard_deviation, initial_bias, loss, verbosity_level)
    plt.savefig("KAlleF1_training_E_{}_B_{}.png".format(epochs, batches))
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 10))
    sbr.heatmap(test_accuracy, annot=True, ax=ax, cmap="viridis")
    ax.set_title("Test Accuracy")
    ax.set_xlabel("Neurons per layer")
    ax.set_ylabel("Hidden layers")
    plt.yticks(ticks=np.arange(len(hidden_layers)), labels=hidden_layers)
    plt.xticks(ticks=np.arange(len(neurons_pr_layer)), labels=neurons_pr_layer)
    plt.savefig("KAlleF1_test_E_{}_B_{}.png".format(epochs, batches))
    plt.show()


if __name__ == '__main__':
    df = read_data()
    features = df.loc[:, df.columns != 'defaultPaymentNextMonth'].values
    targets = df.loc[:, df.columns == 'defaultPaymentNextMonth'].values

    #cnn = CondensedNearestNeighbour(random_state=1337)
    #print(f"{features.shape} skalle {np.reshape(targets, (len(targets), )).shape}")
    #features, targets = cnn.fit_resample(features, np.reshape(targets, (len(targets),)))
    #print(f"{features.shape} skalle {targets.shape}")

    sm = SMOTE(random_state=42)
    features, targets = sm.fit_resample(features, targets)

    data_train, data_test, targets_train, targets_test = train_test_split(
        features, targets, test_size=0.2, shuffle=True)
    #sm = SMOTE(random_state=42)
import sys

import numpy as np

import data_handling as dh
import deep_models as dm

#random.seed(1)

if __name__ == "__main__":
    print("Starting main.")

    # ------------------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------------------
    train_images, truth_images = dh.read_data(data_dir="training/images/")
    iw, ih, ic = train_images[0].shape

    if len(train_images) != len(truth_images):
        sys.exit("ERROR: Dimension mismatch.")

    n_images = len(train_images)

    print("Data loaded.")
    print("Number of train images: " + str(len(train_images)))
    print("Number of truth images: " + str(len(truth_images)))
    print("Train image size: " + str(train_images[0].shape))
    print("Truth image size: " + str(truth_images[0].shape))

    # ------------------------------------------------------------------------
    # Augment data
    # ------------------------------------------------------------------------