def main():
    red_x, red_y = dh.load_data('hw2_winequality-red_train.npy')
    red_y = dh.convertWineDataToClasses(red_y)
    red_test_x, red_test_y = dh.load_data('hw2_winequality-red_test.npy')
    red_test_y = dh.convertWineDataToClasses(red_test_y)

    balanced_x, balanced_y = red_x, red_y
    class_dictionary = {'poor': 0, 'median': 1, 'excellent': 2}
    # balanced_x, balanced_y = dh.balanceData(red_x, red_y, class_dictionary.keys())

    # dtree = tree.trainTree(balanced_x, balanced_y, 15)
    # y_pred = tree.testTree(red_test_x, dtree)

    # ensemble = tree.trainBaggingEnsemble(balanced_x, balanced_y, 15, 5)
    # y_pred = tree.testBaggingEnsemble(red_test_x, ensemble)

    # confusion_matrix = cf.calculate_confusion_matrix(y_pred, red_test_y, class_dictionary)
    # cf.print_confusion_matrix(confusion_matrix)
    # print(cf.calculateAccuracy(y_pred, balanced_y))

    ensembles = bf.trainAdaBoost(balanced_x, balanced_y, 1000)

    return 0
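
# A minimal sketch of how the returned AdaBoost ensemble might be evaluated,
# reusing the commented-out confusion-matrix helpers above. `bf.testAdaBoost`
# is a hypothetical prediction helper (not defined in this file) and is only an
# assumption about the boosting module's interface.
def evaluate_adaboost_sketch(ensembles, test_x, test_y, class_dictionary):
    # Hypothetical: obtain ensemble predictions on the held-out test set
    y_pred = bf.testAdaBoost(test_x, ensembles)
    confusion_matrix = cf.calculate_confusion_matrix(y_pred, test_y, class_dictionary)
    cf.print_confusion_matrix(confusion_matrix)
    print(cf.calculateAccuracy(y_pred, test_y))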
def run(data_folder, data_file, metric, save_folder, batch_size=10):
    if not osp.exists(save_folder):
        os.makedirs(save_folder)

    y, X = load_data(data_folder, data_file, metric=metric,
                     standardize_subjects=True)

    print(save_folder)
    print('Batch size: {}'.format(batch_size))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100,
                                                        stratify=y)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=50,
                                                          stratify=y_train)

    print('Train: Class 1: {}; Class 0: {}'.format(np.sum(y_train == 1),
                                                   np.sum(y_train == 0)))
    print('Valid: Class 1: {}; Class 0: {}'.format(np.sum(y_valid == 1),
                                                   np.sum(y_valid == 0)))
    print('Test: Class 1: {}; Class 0: {}'.format(np.sum(y_test == 1),
                                                  np.sum(y_test == 0)))

    input_shape = X_train.shape[1:]
    network = get_network(n_classes=np.unique(y).size, input_shape=input_shape)

    # mean_train = X_train.mean(axis=0, keepdims=True)
    # std_train = X_train.std(axis=0, keepdims=True)
    # X_train = (X_train - mean_train) / (std_train + 0.0001)
    # X_valid = (X_valid - mean_train) / (std_train + 0.0001)
    # X_test = (X_test - mean_train) / (std_train + 0.0001)

    y_train = to_categorical(y_train, num_classes=2)
    y_valid = to_categorical(y_valid, num_classes=2)
    y_test = to_categorical(y_test, num_classes=2)

    early_stopping = EarlyStopping(monitor='val_balanced_accuracy',
                                   min_delta=0.001, patience=50,
                                   verbose=1, mode='max')
    csv_logger = CSVLogger(osp.join(save_folder, 'training.log'))
    tensorboard = TensorBoard(log_dir=osp.join(save_folder, 'tensorboard'),
                              histogram_freq=0, write_graph=True,
                              write_images=True)

    network.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=1000,
                verbose=1, validation_data=(X_valid, y_valid), shuffle=True,
                callbacks=[early_stopping, csv_logger, tensorboard])

    loss, acc, bal_acc = network.evaluate(x=X_test, y=y_test,
                                          batch_size=y_test.size, verbose=1)
    print('Test: ')
    print(loss, acc, bal_acc)
    metrics_test = np.array([loss, acc, bal_acc])

    # network = run_final_model(X, y, batch_size)
    # network.save(osp.join(save_folder, 'final_weights_{}.h5'.format(metric)))

    np.save(osp.join(save_folder, 'metrics_test_{}.npy'.format(metric)),
            metrics_test)
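
# A minimal usage sketch, assuming the script is invoked directly. The folder,
# file, and metric names below are placeholders, not values from the original
# project.
if __name__ == '__main__':
    run(data_folder='data', data_file='features.csv', metric='some_metric',
        save_folder='results', batch_size=10)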
def main():
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")

    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_

    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)

    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]

    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()
    # NOTE: This gives a scipy sparse matrix which stores the location
    #       of the "hot" encoding (instead of potentially storing
    #       many many "cold" encodings (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    # Fit the linear regression model on prepared data
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Do some testing
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    # Get metrics
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # Due to the above results being unsatisfactory,
    # try a decision tree regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Now do some testing on the tree regression model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)
    # The above testing gives no error

    # Cross validation is performed on 10 folds (training and validating
    # 10 times, choosing a different fold for validation each time
    # and training on the remaining folds)
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
    # As cross validation expects a utility function instead of a
    # cost function (whereas we want to use a cost function), we must
    # flip the sign of the scores.
    tree_rmse_scores = np.sqrt(-scores)

    # Double check against cross validation on the linear reg. model
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)

    print("TREE RMSE SCORES")
    display_scores(tree_rmse_scores)
    print("LINEAR REG RMSE SCORES")
    display_scores(lin_rmse_scores)

    # This shows that the Decision Tree is overfitting
    # Therefore we try the Random Forest Regressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg, housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    print("RANDOM FOREST REG RMSE SCORES")
    display_scores(forest_rmse_scores)

    # Fine-tuning by automatically searching for hyperparams
    # Grid indicates to try firstly all permutations of the first dict,
    # followed by the permutations of options in the second dict.
    param_grid = [
        {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
        {"bootstrap": [False], "n_estimators": [3, 10],
         "max_features": [2, 3, 4]},
    ]
    forest_reg = RandomForestRegressor()
    # We use five-fold cross validation
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    # The best parameters are found using:
    print(f"Best hyperparams: {grid_search.best_params_}")
    # The best estimator:
    print(f"Best Estimator: {grid_search.best_estimator_}")
    # The evaluation scores:
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Examine the relative importance of each attribute for accurate predictions
    feature_importances = grid_search.best_estimator_.feature_importances_

    # Displaying the importance scores next to their attribute names
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))
    # NOTE: The above may indicate which features may be dropped

    # Evaluation on test set
    # Select the best estimator found by the grid search as the final model
    final_model = grid_search.best_estimator_

    # Separate test set into predictors and labels
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # NOTE: Only transform test data, DO NOT FIT the model on test data
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    # Compute 95% confidence interval for the generalization error
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                             loc=squared_errors.mean(),
                             scale=stats.sem(squared_errors)))

    # The following is inserted into our SelectImportantFeatures'
    # fit method; however, we add it here for testing later.
    top_k_feature_indices = top_importances(feature_importances, 5)

    # New pipeline, now reducing the data's features to be
    # restricted to the top 5 most important features
    prep_and_feature_pipeline = Pipeline([
        ("prep", full_pipeline),
        ("feature", SelectImportantFeatures(feature_importances, 5)),
    ])
    trimmed_housing = prep_and_feature_pipeline.fit_transform(housing)

    # NOTE: If we were to do trimmed_housing[0:3] and
    #       housing_prepared[0:3, top_k_feature_indices],
    #       the output would be the same.
    print(trimmed_housing[0:3])
    print(housing_prepared[0:3, top_k_feature_indices])
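
# The pipeline above relies on top_importances() and SelectImportantFeatures,
# which are defined elsewhere in the project. Below is a minimal sketch of what
# such a feature-selecting transformer could look like; it is an assumption
# about the original implementation, not a copy of it.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

def top_importances(feature_importances, k):
    # Indices of the k largest importance scores, in ascending index order
    return np.sort(np.argpartition(feature_importances, -k)[-k:])

class SelectImportantFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        # Remember which columns to keep
        self.top_k_feature_indices_ = top_importances(self.feature_importances, self.k)
        return self

    def transform(self, X):
        # Keep only the top-k columns of the prepared feature matrix
        return X[:, self.top_k_feature_indices_]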
def pre_train(args):
    Metadata = json.load(open("./data/Metadata.json", "r"))
    n_mfcc = Metadata["n_mfcc"]
    # audio_length = Metadata["max audio length"]
    audio_length = 216
    n_classes = len(Metadata["mapping"])

    model_name = args.model
    save_as = None
    if args.save_as is not None:
        save_as = args.save_as[:-3] if args.save_as[-3:] == '.pt' else args.save_as

    model = None
    hyperparameters = {}

    # Specification for each model
    if model_name.lower() == "mlp" or model_name.lower() == "baseline":
        model_name = "mlp"
        model = MLP(input_size=n_mfcc * audio_length, output_size=n_classes)
        hyperparameters = {
            "optimizer": torch.optim.Adam,
            "loss_fnc": nn.CrossEntropyLoss(),
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": 0.001 if args.lr == -1 else args.lr,
            "eval_every": 10 if args.eval_every == -1 else args.eval_every
        }
        print("Created MLP baseline model")
    elif model_name.lower() == "average":
        model = Average(input_size=audio_length, output_size=n_classes)
        hyperparameters = {
            "optimizer": torch.optim.Adam,
            "loss_fnc": nn.CrossEntropyLoss(),
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": 0.001 if args.lr == -1 else args.lr,
            "eval_every": 10 if args.eval_every == -1 else args.eval_every
        }
        print("Created Averaging CNN baseline model")
    elif model_name.lower() == "cnn":
        model = CNN(n_mfcc=n_mfcc, n_classes=n_classes)
        hyperparameters = {
            "optimizer": torch.optim.Adam,
            "loss_fnc": nn.CrossEntropyLoss(),
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": 0.001 if args.lr == -1 else args.lr,
            "eval_every": 50 if args.eval_every == -1 else args.eval_every
        }
        print("Created CNN model")
    elif model_name.lower() == "rnn":
        model = RNN(n_mfcc=n_mfcc, n_classes=n_classes, hidden_size=100)
        hyperparameters = {
            "optimizer": torch.optim.Adam,
            "loss_fnc": nn.CrossEntropyLoss(),
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": 0.01 if args.lr == -1 else args.lr,
            "eval_every": 50 if args.eval_every == -1 else args.eval_every
        }
        print("Created RNN model")
    else:
        raise ValueError(f"Model '{model_name}' does not exist")

    print("Loading Data...")
    train_iter, valid_iter, test_iter = load_data(args.batch_size, n_mfcc,
                                                  overfit=args.overfit)

    final_train_loss, final_train_acc, \
    final_valid_loss, final_valid_acc, \
    final_test_loss, final_test_acc = training_loop(
        model, train_iter, valid_iter, test_iter,
        save_as=save_as, **hyperparameters)

    # Final evaluation of Validation and Test Data
    print()
    print()
    print(f"Training Loss: {final_train_loss:.4f}\tTraining Accuracy: {final_train_acc*100:.2f}")
    print(f"Validation Loss: {final_valid_loss:.4f}\tValidation Accuracy: {final_valid_acc*100:.2f}")
    print(f"Testing Loss: {final_test_loss:.4f}\tTesting Accuracy: {final_test_acc*100:.2f}")
    print()
    print()

    # Summary statistics
    print("Model Summary:")
    summary(model, input_size=(n_mfcc, audio_length))
    print()
    print()

    train_predictions, train_labels = get_predictions_and_labels(model, train_iter)
    valid_predictions, valid_labels = get_predictions_and_labels(model, valid_iter)
    test_predictions, test_labels = get_predictions_and_labels(model, test_iter)

    # Plotting confusion matrices
    CM = confusion_matrix(train_labels, train_predictions)
    plot_confusion_matrix(CM, list(Metadata["mapping"].values()),
                          title="Training Data")
    CM = confusion_matrix(valid_labels, valid_predictions)
    plot_confusion_matrix(CM, list(Metadata["mapping"].values()),
                          title="Validation Data")
    CM = confusion_matrix(test_labels, test_predictions)
    plot_confusion_matrix(CM, list(Metadata["mapping"].values()),
                          title="Testing Data")

    # Accuracy for top 2 guesses
    acc = evaluate_top_k(model, valid_iter, 2)
    print(f"Accuracy for top 2 guesses: {100*acc:.2f}")
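
# evaluate_top_k() is provided by the project's utilities. Below is a rough
# standalone sketch of top-k accuracy, under the assumption that each batch
# yields (inputs, labels) tensors and that the model returns one logit vector
# per sample; it is an illustration, not the original helper.
import torch

def evaluate_top_k_sketch(model, data_iter, k):
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in data_iter:
            logits = model(inputs)
            topk = logits.topk(k, dim=1).indices               # (batch, k) class indices
            correct += (topk == labels.view(-1, 1)).any(dim=1).sum().item()
            total += labels.size(0)
    return correct / total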
def main():
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")

    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_

    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)

    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]

    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()
    # NOTE: This gives a scipy sparse matrix which stores the location
    #       of the "hot" encoding (instead of potentially storing
    #       many many "cold" encodings (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)
    print("Finished preparing data")

    svr_reg = SVR()

    # # Try a support vector machine regressor with a grid search
    # param_grid = [
    #     {'kernel': ['linear'],
    #      'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    #     {'kernel': ['rbf'],
    #      'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    #      'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    # ]
    #
    # grid_search = GridSearchCV(svr_reg, param_grid, cv=5,
    #                            scoring="neg_mean_squared_error",
    #                            return_train_score=True)
    # grid_search.fit(housing_prepared, housing_labels)
    #
    # # Best SVR score
    # best_svr_score = np.sqrt(-grid_search.best_score_)
    # print(f"Best SVR Estimator Score: {best_svr_score}")

    # Using a randomized search instead of a grid search
    param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }
    rnd_search = RandomizedSearchCV(svr_reg, param_distribs, n_iter=50, cv=5,
                                    scoring="neg_mean_squared_error",
                                    verbose=2, random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)

    best_svr_score = np.sqrt(-rnd_search.best_score_)
    print(f"Best SVR Estimator Score: {best_svr_score}")
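
# The randomized search above samples hyperparameters from scipy.stats
# distributions rather than a fixed grid. A standalone illustration (not part
# of the original script) of the kinds of values these distributions propose:
from scipy.stats import expon, reciprocal

sample_C = reciprocal(20, 200000).rvs(5, random_state=42)   # log-uniform over [20, 200000]
sample_gamma = expon(scale=1.0).rvs(5, random_state=42)     # exponential with mean 1.0
print('Example C candidates:', sample_C)
print('Example gamma candidates:', sample_gamma)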
from keras.models import Sequential
from keras.layers import Flatten, Dense, Conv2D, Dropout
from keras.layers import MaxPooling2D, Lambda, Cropping2D

from data_handling import load_data, flip_augmentation

#%%
X, y = load_data(log_file='data2/driving_log.csv')
X, y = flip_augmentation(X, y)

# This is the network architecture
model = Sequential()
model.add(Cropping2D(cropping=((55, 25), (0, 0)), input_shape=(160, 320, 3)))
model.add(Lambda(lambda x: (x / 255.0)))
model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

# The model is compiled, trained, and saved to file
model.compile(loss='mse', optimizer='adam')
model.fit(X, y, validation_split=0.2, shuffle=True, epochs=5)
model.save('model.h5')
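
# A minimal inference sketch using the model.h5 saved above; the zero-filled
# array is only a placeholder for a real (160, 320, 3) camera frame.
from keras.models import load_model
import numpy as np

inference_model = load_model('model.h5')
frame = np.zeros((1, 160, 320, 3))                       # placeholder camera frame
steering_angle = inference_model.predict(frame)[0, 0]    # single regression output
print('Predicted steering angle:', steering_angle)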
def run(data_folder, save_folder, batch_size=10):
    if not osp.exists(save_folder):
        os.makedirs(save_folder)

    y = load_labels(data_folder)
    X = load_data(data_folder)

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    metrics_test = np.zeros((cv.get_n_splits(X, y), 3))

    for (i_cv, (train_id, test_id)) in enumerate(cv.split(X, y)):
        print('{}/{}'.format(i_cv + 1, cv.get_n_splits(X, y)))

        X_train, X_test = X[train_id], X[test_id]
        y_train, y_test = y[train_id], y[test_id]
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                              test_size=30,
                                                              stratify=y_train)

        print('Train: Class 1: {}; Class 0: {}'.format(np.sum(y_train == 1),
                                                       np.sum(y_train == 0)))
        print('Valid: Class 1: {}; Class 0: {}'.format(np.sum(y_valid == 1),
                                                       np.sum(y_valid == 0)))
        print('Test: Class 1: {}; Class 0: {}'.format(np.sum(y_test == 1),
                                                      np.sum(y_test == 0)))

        network = get_network(n_classes=np.unique(y).size)

        # Standardize each split using statistics from the training split only
        mean_train = X_train.mean(axis=0, keepdims=True)
        std_train = X_train.std(axis=0, keepdims=True)
        X_train = (X_train - mean_train) / (std_train + 0.0001)
        X_valid = (X_valid - mean_train) / (std_train + 0.0001)
        X_test = (X_test - mean_train) / (std_train + 0.0001)

        y_train = to_categorical(y_train, num_classes=2)
        y_valid = to_categorical(y_valid, num_classes=2)
        y_test = to_categorical(y_test, num_classes=2)

        early_stopping = EarlyStopping(monitor='val_balanced_accuracy',
                                       min_delta=0.001, patience=50,
                                       verbose=1, mode='max')
        csv_logger = CSVLogger(
            osp.join(save_folder, 'training{}.log'.format(i_cv + 1)))
        tensorboard = TensorBoard(
            log_dir=osp.join(save_folder, 'tensorboard{}'.format(i_cv + 1)),
            histogram_freq=0, write_graph=True, write_images=True)

        network.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=1000,
                    verbose=1, validation_data=(X_valid, y_valid), shuffle=True,
                    callbacks=[early_stopping, csv_logger, tensorboard])

        loss, acc, bal_acc = network.evaluate(x=X_test, y=y_test,
                                              batch_size=y_test.size, verbose=1)
        print('Test: ')
        print(loss, acc, bal_acc)
        metrics_test[i_cv] = [loss, acc, bal_acc]

    print('Avg. test metrics:')
    print(metrics_test.mean(axis=0))
    np.save(osp.join(save_folder, 'metrics_test_cv.npy'), metrics_test)
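
# A minimal usage sketch, assuming the script is invoked directly; the folder
# names are placeholders rather than paths from the original project.
if __name__ == '__main__':
    run(data_folder='data', save_folder='results_cv', batch_size=10)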