def violin_plot(data, name):
    """
    Draw a violin plot of the building height per building type and save it.

    data -- table-like object providing the 'bldg_type' and 'rel_height'
            columns (presumably a pandas DataFrame; confirm with the caller).
    name -- string appended to the output file name.

    The figure is written to ./Figures/Violin_BldTypes_<name>.pdf. When the
    directory check fails, a message is printed and nothing is saved.
    """
    sns.set_style("ticks")
    figure = plt.figure(figsize=(8, 6))

    building_types = data['bldg_type']
    heights = data['rel_height']
    axes = sns.violinplot(
        x=building_types,
        y=heights,
        scale='width',
        width=0.75,
        color='steelblue',
    )

    # Slant the category labels so long building-type names stay readable.
    tick_labels = axes.get_xticklabels()
    axes.set_xticklabels(tick_labels, rotation=45,
                         horizontalalignment='right')

    figure.tight_layout()
    sns.despine()

    axes.set_xlabel('Building Type')
    axes.set_ylabel('Building Height [m]')

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/Violin_BldTypes_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def correlation_matrix(data, name):
    """
    Compute the correlation matrix for the non-geometric features and the
    building height, and save it as an annotated heatmap.

    data -- table-like object providing the columns 'rel_height',
            'avg_hh_income', 'avg_hh_size', 'pop_density', 'h_mean' and
            'num_amenities' (presumably a pandas DataFrame; confirm with
            the caller).
    name -- string appended to the output file name.

    The figure is written to ./Figures/Correlation_NewFeatures_<name>.pdf.
    When the directory check fails, a message is printed and nothing is
    saved. The current figure is cleared afterwards in both cases.
    """
    sns.set_style("ticks")

    corr_matrix = data[[
        'rel_height', 'avg_hh_income', 'avg_hh_size', 'pop_density',
        'h_mean', 'num_amenities'
    ]].corr()

    # Human-readable tick labels, in the same order as the columns above.
    features = [
        'Building Height', 'Avg. HH. Income', 'Avg. HH. Size',
        'Population Density', 'Raster Height', '#Amenities'
    ]

    fig = plt.figure(figsize=(5, 5))

    # Create mask to only show one half of the matrix (the upper triangle,
    # diagonal included, is hidden). The builtin `bool` is used because the
    # `np.bool` alias is deprecated and missing from some NumPy releases.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # `linewidths` is the heatmap parameter for the cell separator width;
    # the duplicate `linewidth` alias with the same value was dropped.
    heatmap = sns.heatmap(corr_matrix,
                          xticklabels=features,
                          yticklabels=features,
                          cmap='RdBu',
                          annot=True,
                          square=True,
                          mask=mask,
                          linewidths=.5,
                          cbar_kws={
                              "shrink": 0.6,
                              "label": "Correlation"
                          },
                          vmin=-1,
                          vmax=1)

    heatmap.set_xticklabels(heatmap.get_xticklabels(),
                            rotation=45,
                            horizontalalignment='right')
    heatmap.tick_params(left=False, bottom=False)

    fig.tight_layout()

    if generate_plots.directory_exists("./Figures"):
        plt.savefig("./Figures/Correlation_NewFeatures_" + name + ".pdf",
                    bbox_inches="tight", dpi=300, transparent=True)
    else:
        print("Directory: ./Figures does not exist!")

    plt.clf()
def rf_min_samples_leaf(train_features, train_labels, test_features,
                        test_labels, name):
    """
    Sweep the random forest `min_samples_leaf` setting and plot the
    resulting train and test accuracy.

    train_features, train_labels -- data used to fit the scaler and forests.
    test_features, test_labels   -- held-out data scored with each forest.
    name -- string passed to the scaling helper and appended to the output
            file name.

    Accuracy values are whatever `compute_accuracy` returns (the original
    comments describe it as based on the mean absolute percentage error).
    The figure is written to ./Figures/Min_Samples_Leaf_<name>.pdf. When the
    directory check fails, a message is printed and nothing is saved.
    """
    sns.set()
    sns.set_style("ticks")

    train_scores = []
    test_scores = []

    # Fine-grained steps for small leaves, coarser steps for large ones.
    small_leaves = np.linspace(2, 24, 12, dtype=int)
    large_leaves = np.linspace(25, 750, num=30, dtype=int)
    min_samples_leaf = np.hstack((small_leaves, large_leaves))

    # The scaler is fitted on the training data and reused for the test data.
    scaled_train, fitted_scaler = ml_funcs.apply_scaling(
        train_features, 'RF', name, save_scaler=False)
    scaled_test = fitted_scaler.transform(test_features)

    for leaf_size in min_samples_leaf:
        print("Samples leaf:", leaf_size)

        forest = RandomForestRegressor(min_samples_leaf=leaf_size,
                                       n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the data the forest was fitted on.
        train_prediction = forest.predict(scaled_train)
        train_scores.append(compute_accuracy(train_prediction, train_labels))

        # Score on the held-out data.
        test_prediction = forest.predict(scaled_test)
        test_scores.append(compute_accuracy(test_prediction, test_labels))

    figure = plt.figure(figsize=(10, 6))
    sns.lineplot(x=min_samples_leaf, y=train_scores, label='Train')
    sns.lineplot(x=min_samples_leaf, y=test_scores, label='Test')

    plt.legend(frameon=False, loc='upper right')
    plt.xlabel('Minimum samples in leaf')
    plt.ylabel('Accuracy score [%]')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/Min_Samples_Leaf_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def rf_max_depth(train_features, train_labels, test_features, test_labels,
                 name):
    """
    Sweep the random forest `max_depth` setting from 1 to 35 and plot the
    resulting train and test accuracy.

    train_features, train_labels -- data used to fit the scaler and forests.
    test_features, test_labels   -- held-out data scored with each forest.
    name -- string passed to the scaling helper and appended to the output
            file name.

    Accuracy values are whatever `compute_accuracy` returns (the original
    comments describe it as based on the mean absolute percentage error).
    The figure is written to ./Figures/Max_Depth_<name>.pdf. When the
    directory check fails, a message is printed and nothing is saved.
    """
    sns.set()
    sns.set_style("ticks")

    train_scores = []
    test_scores = []

    # Candidate tree depths: the integers 1 through 35.
    max_depth = np.linspace(1, 35, 35, dtype=int)

    # The scaler is fitted on the training data and reused for the test data.
    scaled_train, fitted_scaler = ml_funcs.apply_scaling(
        train_features, 'RF', name, save_scaler=False)
    scaled_test = fitted_scaler.transform(test_features)

    for tree_depth in max_depth:
        print("Depth:", tree_depth)

        forest = RandomForestRegressor(max_depth=tree_depth,
                                       n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the data the forest was fitted on.
        train_prediction = forest.predict(scaled_train)
        train_scores.append(compute_accuracy(train_prediction, train_labels))

        # Score on the held-out data.
        test_prediction = forest.predict(scaled_test)
        test_scores.append(compute_accuracy(test_prediction, test_labels))

    figure = plt.figure(figsize=(10, 6))
    sns.lineplot(x=max_depth, y=train_scores, label='Train')
    sns.lineplot(x=max_depth, y=test_scores, label='Test')

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('Maximum tree depth')
    plt.ylabel('Accuracy score [%]')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/Max_Depth_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def rf_n_estimators(train_features, train_labels, test_features, test_labels,
                    name):
    """
    Sweep the number of trees in the random forest (powers of two from 1 to
    512) and plot the resulting train and test accuracy.

    train_features, train_labels -- data used to fit the scaler and forests.
    test_features, test_labels   -- held-out data scored with each forest.
    name -- string passed to the scaling helper and appended to the output
            file name.

    Accuracy values are whatever `compute_accuracy` returns (the original
    comments describe it as based on the mean absolute percentage error).
    The figure is written to ./Figures/N_Estimators_<name>.pdf. When the
    directory check fails, a message is printed and nothing is saved.
    """
    sns.set()
    sns.set_style("ticks")

    train_scores = []
    test_scores = []

    # The number of trees in the random forest.
    n_estimators = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

    # The scaler is fitted on the training data and reused for the test data.
    scaled_train, fitted_scaler = ml_funcs.apply_scaling(
        train_features, 'RF', name, save_scaler=False)
    scaled_test = fitted_scaler.transform(test_features)

    for tree_count in n_estimators:
        print("Num estimators:", tree_count)

        forest = RandomForestRegressor(n_estimators=tree_count,
                                       n_jobs=-1,
                                       random_state=0)
        forest.fit(scaled_train, train_labels)

        # Score on the data the forest was fitted on.
        train_prediction = forest.predict(scaled_train)
        train_scores.append(compute_accuracy(train_prediction, train_labels))

        # Score on the held-out data.
        test_prediction = forest.predict(scaled_test)
        test_scores.append(compute_accuracy(test_prediction, test_labels))

    figure = plt.figure(figsize=(10, 6))
    sns.lineplot(x=n_estimators, y=train_scores, label='Train')
    sns.lineplot(x=n_estimators, y=test_scores, label='Test')

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('Number of estimators')
    plt.ylabel('Accuracy score [%]')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/N_Estimators_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def svr_C(train_features, train_labels, test_features, test_labels, name):
    """
    Sweep the `C` parameter of a linear SVR over ten evenly spaced values
    between 1e-4 and 1 and plot the resulting train and test accuracy.

    train_features, train_labels -- data used to fit the scaler and models.
    test_features, test_labels   -- held-out data scored with each model.
    name -- string passed to the scaling helper and appended to the output
            file name.

    Accuracy values are whatever `compute_accuracy` returns (the original
    comments describe it as based on the mean absolute percentage error).
    The figure is written to ./Figures/C_<name>.pdf. When the directory
    check fails, a message is printed and nothing is saved.
    """
    sns.set()
    sns.set_style("ticks")

    train_scores = []
    test_scores = []

    c_values = np.linspace(1e-4, 1, 10)

    # The scaler is fitted on the training data and reused for the test data.
    scaled_train, fitted_scaler = ml_funcs.apply_scaling(
        train_features, 'SVR', name, save_scaler=False)
    scaled_test = fitted_scaler.transform(test_features)

    for penalty in c_values:
        print("C:", penalty)

        model = LinearSVR(C=penalty, max_iter=2000, random_state=0)
        model.fit(scaled_train, train_labels)

        # Score on the data the model was fitted on.
        train_prediction = model.predict(scaled_train)
        train_scores.append(compute_accuracy(train_prediction, train_labels))

        # Score on the held-out data.
        test_prediction = model.predict(scaled_test)
        test_scores.append(compute_accuracy(test_prediction, test_labels))

    figure = plt.figure(figsize=(10, 6))
    sns.lineplot(x=c_values, y=train_scores, label='Train')
    sns.lineplot(x=c_values, y=test_scores, label='Test')

    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('C')
    plt.ylabel('Accuracy score [%]')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/C_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def emperical_cdf(ground_truth, rfr, svr, mlr, city, env):
    """
    Plot the empirical cumulative distribution of the absolute errors of
    three sets of predictions against the ground truth.

    ground_truth -- reference values.
    rfr, svr, mlr -- predictions plotted under the labels 'RFR', 'SVR' and
                     'MLR'; each must support subtraction from
                     `ground_truth`.
    city -- string used in the output file name; 'Seattle' selects an
            x-axis range of 0-100, any other value 0-8.
    env  -- string used in the output file name.

    The figure is written to ./Figures/Cumulative_Errors_<city>_<env>.pdf.
    When the directory check fails, a message is printed and nothing is
    saved.
    """
    sns.set()
    sns.set_style("white")
    sns.set_style("ticks")

    # For every model: sorted absolute errors on x, cumulative share on y.
    curves = []
    for curve_label, predictions in (('RFR', rfr), ('SVR', svr), ('MLR', mlr)):
        sorted_errors = np.sort(abs(ground_truth - predictions))
        proportions = np.linspace(0, 1, len(sorted_errors))
        curves.append((curve_label, sorted_errors, proportions))

    figure, axes = plt.subplots()
    for curve_label, sorted_errors, proportions in curves:
        axes.plot(sorted_errors, proportions, label=curve_label)

    axes.set_xlabel("Error [m]")
    axes.set_ylabel("Cumulative Frequency")

    # Seattle gets a much wider error axis than the other cities.
    x_upper = 100 if city == 'Seattle' else 8
    axes.set_xlim([0, x_upper])
    axes.set_ylim([0, 1])

    axes.legend(frameon=False, loc='lower right')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/Cumulative_Errors_" + city + "_" + env + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)
def svr_maxiter_tolerance(train_features, train_labels, test_features,
                          test_labels, name):
    """
    Plot the linear SVR accuracy against the maximum number of iterations
    for several stopping tolerances.

    For each tolerance in (1e-3, 1e-4, 1e-5), a model is fitted for each of
    50 iteration limits between 100 and 5000, and one train curve and one
    test curve are drawn.

    train_features, train_labels -- data used to fit the scaler and models.
    test_features, test_labels   -- held-out data scored with each model.
    name -- string passed to the scaling helper and appended to the output
            file name.

    Accuracy values are whatever `compute_accuracy` returns (the original
    comments describe it as based on the mean absolute percentage error).
    The figure is written to ./Figures/MaxIter_Tolerance_<name>.pdf. When
    the directory check fails, a message is printed and nothing is saved.
    """
    sns.set()
    sns.set_style("ticks")

    # One list of scores per tolerance, in the order of `tolerances`.
    train_results = []
    test_results = []

    tolerances = [1e-3, 1e-4, 1e-5]
    tol_labels = ['1e-3', '1e-4', '1e-5']
    max_iter = np.linspace(100, 5000, 50, dtype=int)

    # The scaler is fitted on the training data and reused for the test data.
    scaled_train, fitted_scaler = ml_funcs.apply_scaling(
        train_features, 'SVR', name, save_scaler=False)
    scaled_test = fitted_scaler.transform(test_features)

    for tol in tolerances:
        train_curve = []
        test_curve = []
        print("Tolerance:", tol)

        for iter_limit in max_iter:
            print("Max. iterations:", iter_limit)

            model = LinearSVR(tol=tol, max_iter=iter_limit, random_state=0)
            model.fit(scaled_train, train_labels)

            # Score on the data the model was fitted on.
            train_prediction = model.predict(scaled_train)
            train_curve.append(
                compute_accuracy(train_prediction, train_labels))

            # Score on the held-out data.
            test_prediction = model.predict(scaled_test)
            test_curve.append(compute_accuracy(test_prediction, test_labels))

        train_results.append(train_curve)
        test_results.append(test_curve)

    figure = plt.figure(figsize=(10, 6))

    for tol_text, train_curve, test_curve in zip(tol_labels, train_results,
                                                 test_results):
        train_legend = 'Train (tol' + tol_text + ')'
        sns.lineplot(x=max_iter, y=train_curve, label=train_legend)

        test_legend = 'Test (tol' + tol_text + ')'
        sns.lineplot(x=max_iter, y=test_curve, label=test_legend)

    # Anchor the legend just outside the right edge of the axes.
    plt.legend(frameon=False, loc='lower left', bbox_to_anchor=(1.0, 0.0))
    plt.xlabel('Maximum number of iterations')
    plt.ylabel('Accuracy score [%]')

    figure.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
        return

    target = "./Figures/MaxIter_Tolerance_" + name + ".pdf"
    plt.savefig(target, bbox_inches="tight", dpi=300, transparent=True)