def collect_data(database, tables, train=False):
    """
    Retrieve data from the database for different settings: CBD,
    Suburban/Rural and a combination of these two.

    Parameters
    ----------
    database : str
        Name of the database to connect to.
    tables : list of str
        Tables to read; when empty/falsy, all tables in the database
        are used.
    train : bool, optional
        Passed as the ``training`` flag to ``db_funcs.read_data`` for
        every table after the first.

    Returns
    -------
    tuple
        (data_full, data_suburb, data_cbd). Empty numpy arrays when
        ``tables`` resolves to nothing.
    """
    # Local import: pandas.DataFrame.append() was removed in pandas 2.0,
    # so the concatenation below uses pd.concat instead.
    import pandas as pd

    connection = db_funcs.setup_connection(database)
    cursor = connection.cursor()

    # No specific tables were specified, so all tables in the
    # database are used for training.
    if not tables:
        tables = db_funcs.unique_tables(cursor)

    data_suburb, data_cbd, data_full = np.array([]), np.array([]), np.array([])

    # Extract all data, accumulate per-table frames and concatenate once
    # (replaces the removed DataFrame.append pattern).
    suburb_parts, cbd_parts, full_parts = [], [], []
    for i, table in enumerate(tables):
        # NOTE(review): the original code always read the first table with
        # training=True and only honoured `train` for subsequent tables;
        # that behavior is preserved here — confirm it is intentional.
        suburb, cbd, full = db_funcs.read_data(
            connection, table, training=True if i == 0 else train)
        suburb_parts.append(suburb)
        cbd_parts.append(cbd)
        full_parts.append(full)

    if full_parts:
        data_suburb = pd.concat(suburb_parts)
        data_cbd = pd.concat(cbd_parts)
        data_full = pd.concat(full_parts)

    db_funcs.close_connection(connection, cursor)
    return data_full, data_suburb, data_cbd
def get_data(database, tables):
    """
    Retrieve the data for the suburbs, CBDs and the combined dataset.

    Parameters
    ----------
    database : str
        Name of the database to connect to.
    tables : list of str
        Tables to read; every table is read with ``training=True``.

    Returns
    -------
    tuple
        (data_full, data_suburb, data_cbd). Empty numpy arrays when
        ``tables`` is empty.
    """
    # Local import: pandas.DataFrame.append() was removed in pandas 2.0,
    # so the concatenation below uses pd.concat instead.
    import pandas as pd

    connection = db_funcs.setup_connection(database)
    cursor = connection.cursor()

    data_suburb, data_cbd, data_full = np.array([]), np.array([]), np.array([])

    # Accumulate per-table frames and concatenate once
    # (replaces the removed DataFrame.append pattern).
    suburb_parts, cbd_parts, full_parts = [], [], []
    for table in tables:
        suburb, cbd, full = db_funcs.read_data(connection, table, training=True)
        suburb_parts.append(suburb)
        cbd_parts.append(cbd)
        full_parts.append(full)

    if full_parts:
        data_suburb = pd.concat(suburb_parts)
        data_cbd = pd.concat(cbd_parts)
        data_full = pd.concat(full_parts)

    db_funcs.close_connection(connection, cursor)
    return data_full, data_suburb, data_cbd
def main():
    """Run the full analysis pipeline on the Denver cutout table."""
    table = "denver_cutout"
    store_results = False
    method = "RFR"

    connection = db_funcs.setup_connection("denver")
    connection.autocommit = True
    cursor = connection.cursor()

    data_suburb, data_cbd, data_full = db_funcs.read_data(
        connection, table, extra_features=True, training=True)

    # Exploratory plots for each area morphology.
    datasets = ((data_suburb, 'suburbs'), (data_cbd, 'CBD'), (data_full, 'combined'))
    for data, label in datasets:
        correlation_matrix(data, label)
    for data, label in datasets:
        violin_plot(data, label)

    print("\n>>> Running with only geometric features <<<")
    #test_geom_features_split(data_suburb, data_cbd, cursor, table, store_results, method)
    test_geom_features_single(data_full, cursor, table, store_results, method)
    print(80 * '-')

    print("\n>>> Running with geometric and non-geometric features <<<")
    #test_all_features_split(data_suburb, data_cbd, cursor, table, store_results, method)
    test_all_features_single(data_full, cursor, table, store_results, method)

    db_funcs.close_connection(connection, cursor)
def split_network(train_suburbs, train_cbds, test_db, tables_test, model_suburbs,
                  scaler_suburbs, model_cbds, scaler_cbds, method, save_predictions,
                  labels, save_model, test_subsets, feature_subset):
    """
    Perform the machine learning based on a split training network that
    separates the CBDs and Suburban and Rural areas.

    Two independent models are used: one for CBD buildings and one for
    suburban/rural buildings. When training data is supplied the models are
    (re)trained per region; otherwise the pre-trained ``model_*``/``scaler_*``
    objects are used for prediction only.

    Parameters
    ----------
    train_suburbs, train_cbds :
        Training data per region; empty (len == 0) means "use the supplied
        pre-trained model/scaler for that region instead of training".
    test_db : str or falsy
        Database to run predictions against. When falsy, only training is
        performed (useful for training and storing a model to a file).
    tables_test : list of str
        Tables to predict on; when empty, all tables in ``test_db`` are used.
    model_suburbs, scaler_suburbs, model_cbds, scaler_cbds :
        Pre-trained per-region model/scaler pairs, used only when no
        training data is given.
    method : str
        ML method identifier; "RFR" additionally yields feature importances.
    save_predictions : bool
        Store predicted heights back into the database.
    labels : bool
        Whether ground-truth labels are available in the test data (enables
        statistics and error plots; also passed as ``training`` to read_data).
    save_model : bool
        Forwarded to ``train_from_data`` to persist the trained model.
    test_subsets, feature_subset :
        Forwarded to ``ml_funcs.get_features_and_labels`` to select features.
    """
    reading_time = []
    predict_time = []

    # Extract training features/labels only for the regions that actually
    # have training data; the other branch keeps using the supplied model.
    if len(train_suburbs) != 0:
        train_feat_suburb, train_label_suburb = \
            ml_funcs.get_features_and_labels(train_suburbs, "split", test_subsets,
                                             feature_subset, labels=True)
    if len(train_cbds) != 0:
        train_feat_cbd, train_label_cbd = \
            ml_funcs.get_features_and_labels(train_cbds, "split", test_subsets,
                                             feature_subset, labels=True)

    # A database to perform the tests/ predictions on is specified.
    if test_db:
        connection = db_funcs.setup_connection(test_db)
        connection.autocommit = True
        cursor = connection.cursor()

        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            # The 'cbds' table is auxiliary (not a prediction target); skip it.
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            # Time how long reading the table into dataframes takes.
            starttime = time()
            test_suburbs, test_cbds, _ = db_funcs.read_data(connection, table,
                                                            training=labels)
            endtime = time()
            duration = endtime - starttime
            reading_time.append(duration)

            # With labels available, get_features_and_labels returns a
            # (features, labels) pair per region; without, features only.
            if labels:
                test_feat_suburbs, test_labels_suburbs = \
                    ml_funcs.get_features_and_labels(test_suburbs, "split",
                                                     test_subsets, feature_subset,
                                                     labels=labels)
                test_feat_cbds, test_labels_cbds = \
                    ml_funcs.get_features_and_labels(test_cbds, "split",
                                                     test_subsets, feature_subset,
                                                     labels=labels)
            else:
                test_feat_suburbs = \
                    ml_funcs.get_features_and_labels(test_suburbs, "split",
                                                     test_subsets, feature_subset,
                                                     labels=labels)
                test_feat_cbds = ml_funcs.get_features_and_labels(test_cbds, "split",
                                                                  test_subsets,
                                                                  feature_subset,
                                                                  labels=labels)

            # Empty placeholders so len() checks below work when a region
            # produced no predictions.
            pred_cbds, pred_suburbs = np.array([]), np.array([])

            starttime = time()

            # There is no training data specified, use model.
            if len(train_suburbs) == 0 and len(train_cbds) == 0:
                # RFR also returns feature importances; other methods do not.
                if method == "RFR":
                    # There must be test features for the CBD present.
                    if len(test_feat_cbds) != 0:
                        pred_cbds, imp_cbds = predict_from_model(method,
                                                                 test_feat_cbds,
                                                                 model_cbds,
                                                                 scaler_cbds, 'CBD')
                    else:
                        print("Warning: no CBD data present in test set {0}".format(table))

                    # There must be test features for the suburbs/rural areas present.
                    if len(test_feat_suburbs) != 0:
                        pred_suburbs, imp_suburbs = predict_from_model(method,
                                                                       test_feat_suburbs,
                                                                       model_suburbs,
                                                                       scaler_suburbs,
                                                                       'suburbs')
                    else:
                        print("Warning: no rural/suburban data present in test set {0}"\
                            .format(table))
                else:
                    # There must be test features for the CBD present.
                    if len(test_feat_cbds) != 0:
                        pred_cbds = predict_from_model(method, test_feat_cbds,
                                                       model_cbds, scaler_cbds, 'CBD')
                    else:
                        print("Warning: no CBD data present in test set {0}".format(table))

                    # There must be test features for the suburbs/rural areas present.
                    if len(test_feat_suburbs) != 0:
                        pred_suburbs = predict_from_model(method, test_feat_suburbs,
                                                          model_suburbs,
                                                          scaler_suburbs, 'suburbs')
                    else:
                        print("Warning: no rural/suburban data present in test set {0}"\
                            .format(table))

            # There is training data specified, check which area morphologies are present.
            else:
                if method == "RFR":
                    # Train and predict per region only when both the training
                    # and testing sets contain that morphology.
                    if len(train_suburbs) != 0 and len(test_feat_suburbs) != 0:
                        pred_suburbs, imp_suburbs = train_from_data(method,
                                                                    train_feat_suburb,
                                                                    train_label_suburb,
                                                                    test_feat_suburbs,
                                                                    save_model,
                                                                    'suburbs')
                    else:
                        print("Warning: training and testing data do not both contain " +\
                              "suburban/rural data!")

                    if len(train_cbds) != 0 and len(test_feat_cbds) != 0:
                        pred_cbds, imp_cbds = train_from_data(method, train_feat_cbd,
                                                              train_label_cbd,
                                                              test_feat_cbds,
                                                              save_model, 'CBD')
                    else:
                        print("Warning: training and testing data do not both contain CBD data!")
                else:
                    if len(train_suburbs) != 0 and len(test_feat_suburbs) != 0:
                        pred_suburbs = train_from_data(method, train_feat_suburb,
                                                       train_label_suburb,
                                                       test_feat_suburbs,
                                                       save_model, 'suburbs')
                    else:
                        print("Warning: training and testing data do not both contain " +\
                              "suburban/rural data!")

                    if len(train_cbds) != 0 and len(test_feat_cbds) != 0:
                        pred_cbds = train_from_data(method, train_feat_cbd,
                                                    train_label_cbd, test_feat_cbds,
                                                    save_model, 'CBD')
                    else:
                        print("Warning: training and testing data do not both contain CBD data!")

            endtime = time()
            duration = endtime - starttime
            predict_time.append(duration)

            # Labels are present: print statistics for the height predictions.
            if labels:
                # imp_* only exist when method == "RFR" AND the corresponding
                # pred_* is non-empty, which the guards below ensure.
                if method == "RFR":
                    if len(pred_suburbs) != 0:
                        ml_funcs.get_statistics(test_labels_suburbs, pred_suburbs,
                                                "split", feature_subset, imp_suburbs)
                        generate_plots.plot_cumulative_errors(test_labels_suburbs,
                                                              pred_suburbs, 'suburbs')
                    if len(pred_cbds) != 0:
                        ml_funcs.get_statistics(test_labels_cbds, pred_cbds, "split",
                                                feature_subset, imp_cbds)
                        generate_plots.plot_cumulative_errors(test_labels_cbds,
                                                              pred_cbds, 'CBDs')
                else:
                    if len(pred_suburbs) != 0:
                        ml_funcs.get_statistics(test_labels_suburbs, pred_suburbs,
                                                "split", feature_subset)
                        generate_plots.plot_cumulative_errors(test_labels_suburbs,
                                                              pred_suburbs, 'suburbs')
                    if len(pred_cbds) != 0:
                        ml_funcs.get_statistics(test_labels_cbds, pred_cbds, "split",
                                                feature_subset)
                        generate_plots.plot_cumulative_errors(test_labels_cbds,
                                                              pred_cbds, 'CBD')

            # Store predictions in database.
            if save_predictions:
                if len(pred_suburbs) != 0:
                    height_values = list(zip(test_suburbs.id, pred_suburbs))
                    db_funcs.store_predictions(cursor, height_values, table,
                                               method, 'split')
                if len(pred_cbds) != 0:
                    height_values = list(zip(test_cbds.id, pred_cbds))
                    db_funcs.store_predictions(cursor, height_values, table,
                                               method, 'split')

        db_funcs.close_connection(connection, cursor)

        print("\n>> Total duration (s) of reading data " + \
              "into dataframes: {0} ({1})".format(sum(reading_time),
                                                  timedelta(seconds=sum(reading_time))))
        print("\n>> Total duration (s) of the building " + \
              " height predictions: {0} ({1})".format(sum(predict_time),
                                                      timedelta(seconds=sum(predict_time))))

    # No test database is specified, only train the model based on the training data.
    # Useful when training and storing a model to a file.
    else:
        if len(train_suburbs) != 0:
            train_from_data(method, train_feat_suburb, train_label_suburb,
                            np.array([]), save_model, 'suburbs')
        if len(train_cbds) != 0:
            train_from_data(method, train_feat_cbd, train_label_cbd,
                            np.array([]), save_model, 'CBD')
def single_network(train_data, test_db, tables_test, model, scaler, method,
                   save_predictions, labels, save_model, test_subsets,
                   feature_subset):
    """
    Perform the machine learning based on a single training network that
    combines the CBDs and Suburban and Rural areas.

    Parameters
    ----------
    train_data :
        Combined training data; empty (len == 0) means "use the supplied
        pre-trained ``model``/``scaler`` for prediction only".
    test_db : str or falsy
        Database to run predictions against. When falsy, only training is
        performed (useful when training and storing a model to a file).
    tables_test : list of str
        Tables to predict on; when empty, all tables in ``test_db`` are used.
    model, scaler :
        Pre-trained model/scaler, used only when no training data is given.
    method : str
        ML method identifier; "RFR" additionally yields feature importances.
    save_predictions : bool
        Store predicted heights back into the database.
    labels : bool
        Whether ground-truth labels are available in the test data (enables
        statistics and error plots; also passed as ``training`` to read_data).
    save_model : bool
        Forwarded to ``train_from_data`` to persist the trained model.
    test_subsets, feature_subset :
        Forwarded to ``ml_funcs.get_features_and_labels`` to select features.
    """
    reading_time = []
    predict_time = []

    if len(train_data) != 0:
        train_features, train_labels = \
            ml_funcs.get_features_and_labels(train_data, "single", test_subsets,
                                             feature_subset, labels=True)

    # A database to perform the tests/ predictions on is specified.
    if test_db:
        connection = db_funcs.setup_connection(test_db)
        connection.autocommit = True
        cursor = connection.cursor()

        # If no specific tables are selected, perform predictions
        # for all tables in the specified testing database.
        if not tables_test:
            tables_test = db_funcs.unique_tables(cursor)

        for table in tables_test:
            # The 'cbds' table is auxiliary (not a prediction target); skip it.
            if table == 'cbds':
                continue

            print(80*'-')
            print(80*'-')

            # Time how long reading the table into a dataframe takes.
            starttime = time()
            _, _, test_data = db_funcs.read_data(connection, table, training=labels)
            endtime = time()
            duration = endtime - starttime
            reading_time.append(duration)

            # With labels available, a (features, labels) pair is returned;
            # without, features only.
            if labels:
                test_features, test_labels = \
                    ml_funcs.get_features_and_labels(test_data, "single",
                                                     test_subsets, feature_subset,
                                                     labels=labels)
            else:
                test_features = ml_funcs.get_features_and_labels(test_data, "single",
                                                                 test_subsets,
                                                                 feature_subset,
                                                                 labels=labels)

            starttime = time()

            # No training data: predict with the supplied model. RFR also
            # returns feature importances; other methods do not.
            if len(train_data) == 0:
                if method == "RFR":
                    predictions, importances = predict_from_model(method,
                                                                  test_features,
                                                                  model, scaler,
                                                                  'combined')
                else:
                    predictions = predict_from_model(method, test_features,
                                                     model, scaler, 'combined')
            # Training data supplied: train (and optionally save) a new model.
            else:
                if method == "RFR":
                    predictions, importances = train_from_data(method,
                                                               train_features,
                                                               train_labels,
                                                               test_features,
                                                               save_model,
                                                               'combined')
                else:
                    predictions = train_from_data(method, train_features,
                                                  train_labels, test_features,
                                                  save_model, 'combined')

            endtime = time()
            duration = endtime - starttime
            predict_time.append(duration)

            # Labels are present: print statistics for the height predictions.
            if labels:
                if method == "RFR":
                    ml_funcs.get_statistics(test_labels, predictions, "single",
                                            feature_subset, importances)
                else:
                    ml_funcs.get_statistics(test_labels, predictions, "single",
                                            feature_subset)

                generate_plots.plot_cumulative_errors(test_labels, predictions,
                                                      'combined',)

            # Store predictions in database.
            if save_predictions:
                height_values = list(zip(test_data.id, predictions))
                db_funcs.store_predictions(cursor, height_values, table,
                                           method, 'combined')

        db_funcs.close_connection(connection, cursor)

        print("\n>> Total duration (s) of reading data " + \
              "into dataframes: {0} ({1})".format(sum(reading_time),
                                                  timedelta(seconds=sum(reading_time))))
        print("\n>> Total duration (s) of the building " + \
              " height predictions: {0} ({1})".format(sum(predict_time),
                                                      timedelta(seconds=sum(predict_time))))

    # No test database is specified, only train the model based on the training data.
    # Useful when training and storing a model to a file.
    else:
        # Bug fix: the original tested len(train_features), which is undefined
        # (NameError) when train_data is empty. Test the argument instead,
        # consistent with split_network.
        if len(train_data) != 0:
            train_from_data(method, train_features, train_labels, np.array([]),
                            save_model, 'combined')