def save_train_results(grid, data_name, clf_name):
    """Saves grid search cross-validation results and pickles the entire
    pipeline and best estimator.

    Args:
        grid (GridSearchCV object): Trained grid search object.
        data_name (str): Name of data set algorithm was trained on.
        clf_name (str): Type of algorithm.

    """
    # get cross-validation results and best estimator
    results = pd.DataFrame(grid.cv_results_)
    best_clf = grid.best_estimator_

    # save cross-validation results as CSV
    parentdir = 'models'
    target = '{}/{}'.format(parentdir, clf_name)
    resfile = get_abspath('{}_cv_results.csv'.format(data_name), target)
    results.to_csv(resfile, index=False)

    # save grid search object and best estimator as pickled model files
    gridpath = get_abspath('{}_grid.pkl'.format(data_name), target)
    bestpath = get_abspath('{}_best_estimator.pkl'.format(data_name), target)
    save_pickled_model(grid, gridpath)
    save_pickled_model(best_clf, bestpath)
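# Usage sketch for save_train_results (illustrative only; the toy data,
# estimator and parameter grid below are assumptions, not part of the
# original pipeline).
def _example_save_train_results():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    grid = GridSearchCV(DecisionTreeClassifier(random_state=0),
                        param_grid={'max_depth': [3, 5, 10]}, cv=5)
    grid.fit(X, y)

    # writes <data>_cv_results.csv plus the pickled grid/estimator under models/DT
    save_train_results(grid, data_name='example', clf_name='DT')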
def generate_kurtosis_plot(name):
    """Plots mean kurtosis as a function of number of components.

    Args:
        name (str): Dataset name.

    """
    resdir = 'results/ICA'
    df = pd.read_csv(get_abspath('{}_kurtosis.csv'.format(name), resdir))

    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))

    # plot mean kurtosis as a function of number of components
    x = df['n']
    kurt = df['kurtosis']
    ax.plot(x, kurt, marker='.', color='g')
    ax.set_title('ICA Mean Kurtosis ({})'.format(name))
    ax.set_ylabel('Mean Kurtosis')
    ax.set_xlabel('# Components')
    ax.grid(color='grey', linestyle='dotted')

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/ICA'
    plotpath = get_abspath('{}_kurtosis.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label to determine if wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = wine_red.append(wine_white)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # save to CSV
    save_dataset(df, 'winequality.csv', sep=',', subdir=tdir)
def nn_cluster_datasets(X, name, km_k, gmm_k):
    """Generates datasets for ANN classification by appending cluster label to
    original dataset.

    Args:
        X (Numpy.Array): Original attributes.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.

    """
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km.fit(X)
    gmm.fit(X)

    # add cluster labels to original attributes
    km_x = np.concatenate((X, km.labels_[:, None]), axis=1)
    gmm_x = np.concatenate((X, gmm.predict(X)[:, None]), axis=1)

    # save results
    resdir = 'results/NN'
    kmfile = get_abspath('{}_km_labels.csv'.format(name), resdir)
    gmmfile = get_abspath('{}_gmm_labels.csv'.format(name), resdir)
    save_array(array=km_x, filename=kmfile, subdir=resdir)
    save_array(array=gmm_x, filename=gmmfile, subdir=resdir)
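# Assumed imports for the clustering helpers above (a sketch; in particular,
# GMM is assumed to be an alias for scikit-learn's Gaussian mixture model,
# which is not confirmed by the original module's import block).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM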
def pca_experiment(X, name, dims, evp):
    """Runs PCA on the specified dataset and saves the dataset projected onto
    the components needed to explain at least the `evp` share (e.g. 85%) of
    total variance.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X)  # get principal components

    # keep components up to where cumulative explained variance exceeds threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()
    res = comps[:, :evrc.where(evrc > evp).idxmin()]
    evars = pd.concat((ev, evr, evrc), axis=1)

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
def rf_experiment(X, y, name, theta):
    """Runs RF on the specified dataset and saves feature importance metrics
    and the reduced dataset as CSV files.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        theta (float): Min cumulative information gain threshold.

    """
    rfc = RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=0)
    fi = rfc.fit(X, y).feature_importances_

    # get feature importance and sort by value in descending order
    i = [i + 1 for i in range(len(fi))]
    fi = pd.DataFrame({'importance': fi, 'feature': i})
    fi.sort_values('importance', ascending=False, inplace=True)
    fi['i'] = i
    cumfi = fi['importance'].cumsum()
    fi['cumulative'] = cumfi

    # generate dataset that meets cumulative feature importance threshold
    idxs = fi.loc[:cumfi.where(cumfi > theta).idxmin(), :]
    idxs = list(idxs.index)
    reduced = X[:, idxs]

    # save results as CSV
    resdir = 'results/RF'
    fifile = get_abspath('{}_fi.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=reduced, filename=resfile, subdir=resdir)
    fi.to_csv(fifile, index_label=None)
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label to determine if wine is red or not
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = wine_red.append(wine_white)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').as_matrix()
    y = df['class'].as_matrix()
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='winequality.csv', subdir=tdir)
def run_experiment(problem, prefix, gamma, shape=None):
    """Run a policy iteration experiment.

    Args:
        problem (str): Gym problem name.
        prefix (str): Prefix for CSV and plot outputs.
        gamma (float): Gamma value.
        shape (tuple(int)): Shape of state space array.

    """
    problem = gym.make(problem)
    policy, rewards, iters, value_fn = policy_iteration(problem, gamma=gamma)
    idxs = [i for i in range(0, iters)]
    print('{}: {} iterations to converge'.format(prefix, iters))

    # save results as CSV
    resdir = 'results/PI'
    q = get_abspath('{}_policy.csv'.format(prefix), resdir)
    r = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    v = get_abspath('{}_value_fn.csv'.format(prefix), resdir)
    pdf = pd.DataFrame(policy)
    rdf = pd.DataFrame(np.column_stack([idxs, rewards]), columns=['k', 'r'])
    vdf = pd.DataFrame(value_fn)
    pdf.to_csv(q, index=False)
    rdf.to_csv(r, index=False)
    vdf.to_csv(v, index=False)

    # plot results
    tdir = 'plots/PI'
    polgrid = pdf.as_matrix().reshape(shape)
    heatmap = vdf.as_matrix().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)
    return iters
def generate_component_plots(name, rdir, pdir):
    """Generates plots of result files for given dataset.

    Args:
        name (str): Dataset name.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    metrics = pd.read_csv(get_abspath('{}_metrics.csv'.format(name), rdir))

    # get figure and axes
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(15, 3))

    # plot SSE for K-Means
    k = metrics['k']
    metric = metrics['sse']
    ax1.plot(k, metric, marker='o', markersize=5, color='g')
    ax1.set_title('K-Means SSE ({})'.format(name))
    ax1.set_ylabel('Sum of squared error')
    ax1.set_xlabel('Number of clusters (k)')
    ax1.grid(color='grey', linestyle='dotted')

    # plot Silhouette Score for K-Means
    metric = metrics['silhouette_score']
    ax2.plot(k, metric, marker='o', markersize=5, color='b')
    ax2.set_title('K-Means Avg Silhouette Score ({})'.format(name))
    ax2.set_ylabel('Mean silhouette score')
    ax2.set_xlabel('Number of clusters (k)')
    ax2.grid(color='grey', linestyle='dotted')

    # plot log-likelihood for EM
    metric = metrics['log-likelihood']
    ax3.plot(k, metric, marker='o', markersize=5, color='r')
    ax3.set_title('GMM Log-likelihood ({})'.format(name))
    ax3.set_ylabel('Log-likelihood')
    ax3.set_xlabel('Number of clusters (k)')
    ax3.grid(color='grey', linestyle='dotted')

    # plot BIC for EM
    metric = metrics['bic']
    ax4.plot(k, metric, marker='o', markersize=5, color='k')
    ax4.set_title('GMM BIC ({})'.format(name))
    ax4.set_ylabel('BIC')
    ax4.set_xlabel('Number of clusters (k)')
    ax4.grid(color='grey', linestyle='dotted')

    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    # save figure
    plotpath = get_abspath('{}_components.png'.format(name), pdir)
    plt.savefig(plotpath)
    plt.clf()
def main():
    """Run code to generate results.

    """
    combined = get_abspath('combined_results.csv', 'results/NN')
    try:
        os.remove(combined)
    except OSError:
        pass
    with open(combined, 'a') as f:
        f.write('dataset,algorithm,accuracy,elapsed_time\n')

    names = ['digits', 'abalone']
    dimred_algos = ['PCA', 'ICA', 'RP', 'RF']
    cluster_algos = ['km', 'gmm']

    # generate results
    for name in names:
        # get labels
        filepath = get_abspath('{}.csv'.format(name), 'data/experiments')
        data = np.loadtxt(filepath, delimiter=',')
        X = data[:, :-1]
        y = data[:, -1]

        # save base dataset results
        ann = create_ann(name=name)
        acc, elapsed = ann_experiment(X, y, name, ann)
        with open(combined, 'a') as f:
            f.write('{},{},{},{}\n'.format(name, 'base', acc, elapsed))

        for d in dimred_algos:
            # get attributes
            resdir = 'results/{}'.format(d)
            filepath = get_abspath('{}_projected.csv'.format(name), resdir)
            X = np.loadtxt(filepath, delimiter=',')

            # train ANN and get test score, elapsed time
            ann = create_ann(name=name)
            plot_learning_curve(ann, d, name, X, y)
            acc, elapsed = ann_experiment(X, y, name, ann)
            with open(combined, 'a') as f:
                f.write('{},{},{},{}\n'.format(name, d, acc, elapsed))

        for c in cluster_algos:
            # get attributes
            resdir = 'results/NN'
            filepath = get_abspath('{}_{}_labels.csv'.format(name, c), resdir)
            X = np.loadtxt(filepath, delimiter=',')

            # train ANN and get test score, elapsed time
            ann = create_ann(name=name)
            plot_learning_curve(ann, c, name, X, y)
            acc, elapsed = ann_experiment(X, y, name, ann)
            with open(combined, 'a') as f:
                f.write('{},{},{},{}\n'.format(name, c, acc, elapsed))
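# Hypothetical sketches of the ANN helpers used in main() above (create_ann
# and ann_experiment are defined elsewhere in the project); the classifier,
# hyperparameters and 70/30 split below are illustrative assumptions only.
import timeit
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier


def create_ann(name):
    """Returns a fresh MLP classifier for the given dataset name."""
    return MLPClassifier(hidden_layer_sizes=(50,), max_iter=500,
                         random_state=0)


def ann_experiment(X, y, name, ann):
    """Fits the ANN on a train split and returns (test accuracy, fit time)."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    start = timeit.default_timer()
    ann.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start
    return ann.score(X_test, y_test), elapsed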
def create_timing_curve(estimator, dataset, data_name, clf_name):
    """Generates a timing curve for the specified estimator, saves tabular
    results to CSV and saves a plot of the timing curve.

    Args:
        estimator (object): Target classifier.
        dataset (pandas.DataFrame): Source data set.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # set training sizes and intervals
    train_sizes = np.arange(0.1, 1.0, 0.05)

    # initialise variables
    train_time = []
    predict_time = []
    df_final = []

    # iterate through training sizes and capture training and predict times
    for i, train_data in enumerate(train_sizes):
        X_train, X_test, y_train, y_test = split_data(
            dataset, test_size=1 - train_data)
        start_train = timeit.default_timer()
        estimator.fit(X_train, y_train)
        end_train = timeit.default_timer()
        estimator.predict(X_test)
        end_predict = timeit.default_timer()
        train_time.append(end_train - start_train)
        predict_time.append(end_predict - end_train)
        df_final.append([train_data, train_time[i], predict_time[i]])

    # save timing results to CSV
    timedata = pd.DataFrame(
        data=df_final,
        columns=['Training Data Percentage', 'Train Time', 'Test Time'],
    )
    resdir = 'results'
    res_tgt = '{}/{}'.format(resdir, clf_name)
    timefile = get_abspath('{}_timing_curve.csv'.format(data_name), res_tgt)
    timedata.to_csv(timefile, index=False)

    # generate timing curve plot
    plt.figure(2)
    plt.plot(train_sizes, train_time, marker='.', color='b', label='Train')
    plt.plot(train_sizes, predict_time, marker='.', color='g', label='Predict')
    plt.legend(loc='best')
    plt.grid(linestyle='dotted')
    plt.xlabel('Samples used for training as a percentage of total')
    plt.ylabel('Elapsed user time in seconds')

    # save timing curve plot as PNG
    plotdir = 'plots'
    plt.title("Timing Curve with {} on {}".format(clf_name, data_name))
    plot_tgt = '{}/{}'.format(plotdir, clf_name)
    plotpath = get_abspath('{}_TC.png'.format(data_name), plot_tgt)
    plt.savefig(plotpath)
    plt.close()
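# Hypothetical sketch of the split_data helper used by create_timing_curve
# (the real helper lives elsewhere in the project); it assumes the class
# label is the last column of the source DataFrame.
def split_data(dataset, test_size=0.3):
    """Splits a DataFrame into train/test feature and label arrays."""
    from sklearn.model_selection import train_test_split
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    return train_test_split(X, y, test_size=test_size, random_state=0)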
def ga_mating_curve():
    """Plots the mating rate validation curve for genetic algorithms and saves
    it as a PNG file.

    """
    # load datasets
    resdir = 'results/NN/GA'
    df_10 = pd.read_csv(get_abspath('results_50_10_10.csv', resdir))
    df_20 = pd.read_csv(get_abspath('results_50_20_10.csv', resdir))
    df_30 = pd.read_csv(get_abspath('results_50_30_10.csv', resdir))

    # get columns
    iters = df_10['iteration']
    train_10 = df_10['MSE_train']
    test_10 = df_10['MSE_test']
    train_20 = df_20['MSE_train']
    test_20 = df_20['MSE_test']
    train_30 = df_30['MSE_train']
    test_30 = df_30['MSE_test']

    # create validation curve for training data
    plt.figure(0)
    plt.plot(iters, train_10, color='b', label='# of mates: 10')
    plt.plot(iters, train_20, color='g', label='# of mates: 20')
    plt.plot(iters, train_30, color='r', label='# of mates: 30')
    plt.xlim(xmin=-30)
    plt.legend(loc='best')
    plt.grid(color='grey', linestyle='dotted')
    plt.title('GA Validation Curve - Mating Rate (train)')
    plt.xlabel('Iterations')
    plt.ylabel('Mean squared error')

    # save training validation curve plot as PNG
    plotdir = 'plots/NN/GA'
    plotpath = get_abspath('GA_MA_train.png', plotdir)
    plt.savefig(plotpath, bbox_inches='tight')
    plt.clf()

    # create validation curve for test data
    plt.figure(0)
    plt.plot(iters, test_10, color='b', label='# of mates: 10')
    plt.plot(iters, test_20, color='g', label='# of mates: 20')
    plt.plot(iters, test_30, color='r', label='# of mates: 30')
    plt.xlim(xmin=-30)
    plt.legend(loc='best')
    plt.grid(color='grey', linestyle='dotted')
    plt.title('GA Validation Curve - Mating Rate (test)')
    plt.xlabel('Iterations')
    plt.ylabel('Mean squared error')

    # save test validation curve plot as PNG
    plotdir = 'plots/NN/GA'
    plotpath = get_abspath('GA_MA_test.png', plotdir)
    plt.savefig(plotpath, bbox_inches='tight')
    plt.clf()
def main():
    """Run code to generate clustering results.

    """
    print('Running base clustering experiments')
    start_time = timeit.default_timer()

    winepath = get_abspath('winequality.csv', 'data/experiments')
    seismicpath = get_abspath('seismic-bumps.csv', 'data/experiments')
    wine = np.loadtxt(winepath, delimiter=',')
    seismic = np.loadtxt(seismicpath, delimiter=',')
    rdir = 'results/clustering'
    pdir = 'plots/clustering'

    # split data into X and y
    wX = wine[:, :-1]
    wY = wine[:, -1]
    sX = seismic[:, :-1]
    sY = seismic[:, -1]

    # run clustering experiments
    clusters = [2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 18, 20, 25, 30, 45, 80, 120]
    clustering_experiment(wX, wY, 'winequality', clusters, rdir=rdir)
    clustering_experiment(sX, sY, 'seismic-bumps', clusters, rdir=rdir)

    # generate 2D data for cluster visualization
    get_cluster_data(wX, wY, 'winequality', km_k=15, gmm_k=15, rdir=rdir)
    get_cluster_data(sX, sY, 'seismic-bumps', km_k=20, gmm_k=15, rdir=rdir)

    # generate component plots (metrics to choose size of k)
    generate_component_plots(name='winequality', rdir=rdir, pdir=pdir)
    generate_component_plots(name='seismic-bumps', rdir=rdir, pdir=pdir)

    # generate validation plots (relative performance of clustering)
    generate_validation_plots(name='winequality', rdir=rdir, pdir=pdir)
    generate_validation_plots(name='seismic-bumps', rdir=rdir, pdir=pdir)

    # generate cluster plots from 2D projections
    df_wine = pd.read_csv(get_abspath('winequality_2D.csv', rdir))
    df_seismic = pd.read_csv(get_abspath('seismic-bumps_2D.csv', rdir))
    generate_cluster_plots(df_wine, name='winequality', pdir=pdir)
    generate_cluster_plots(df_seismic, name='seismic-bumps', pdir=pdir)

    # generate neural network datasets with cluster labels
    nn_cluster_datasets(wX, name='winequality', km_k=15, gmm_k=15)
    nn_cluster_datasets(sX, name='seismic-bumps', km_k=20, gmm_k=15)

    # calculate and print running time
    end_time = timeit.default_timer()
    elapsed = end_time - start_time
    print('Completed clustering experiments in {} seconds'.format(elapsed))
def run_experiment(problem, prefix, alpha, gamma, d, shape=None):
    """Run Q-Learning experiment for specified Gym problem and write results
    to CSV files.

    Args:
        problem (str): Gym problem name.
        prefix (str): Prefix for CSV and plot outputs.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        d (float): Epsilon decay rate.
        shape (tuple(int)): Shape of state space matrix.

    """
    episodes = 5000
    size = episodes // 100

    # instantiate environment and run Q-learner
    start = time.time()
    env = gym.make(problem)
    Q, rewards, visits = q_learning(env, alpha, d, gamma)
    env.close()
    end = time.time()
    elapsed = end - start

    # average rewards over fixed-size episode chunks
    k = [i for i in range(0, episodes, size)]
    chunks = list(chunk_list(rewards, size))
    rewards = [sum(chunk) / len(chunk) for chunk in chunks]

    # save results as CSV
    resdir = 'results/QL'
    qf = get_abspath('{}_policy.csv'.format(prefix), resdir)
    rf = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    vf = get_abspath('{}_visits.csv'.format(prefix), resdir)
    qdf = pd.DataFrame(Q)
    vdf = pd.DataFrame(visits)
    rdf = pd.DataFrame(np.column_stack([k, rewards]), columns=['k', 'r'])
    qdf.to_csv(qf, index=False)
    vdf.to_csv(vf, index=False)
    rdf.to_csv(rf, index=False)

    # write timing results and average reward in last iteration
    combined = get_abspath('summary.csv', 'results/QL')
    with open(combined, 'a') as f:
        f.write('{},{},{}\n'.format(prefix, elapsed, rdf.iloc[-1, 1]))

    # plot results
    tdir = 'plots/QL'
    polgrid = qdf.as_matrix().argmax(axis=1).reshape(shape)
    heatmap = vdf.as_matrix().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)
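# Hypothetical sketch of the chunk_list helper used above to average rewards
# over fixed-size episode windows (an assumption about its behaviour; the
# real helper is defined elsewhere in the project).
def chunk_list(lst, size):
    """Yields successive chunks of length `size` from `lst`."""
    for i in range(0, len(lst), size):
        yield lst[i:i + size]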
def create_timing_curve(estimator, dataset, data_name, clf_name):
    """Generates a timing curve for the specified estimator, saves tabular
    results to CSV and saves a plot of the timing curve.

    Args:
        estimator (object): Target classifier.
        dataset (pandas.DataFrame): Source data set.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # set training sizes and intervals
    train_sizes = np.arange(0.01, 1.0, 0.03)

    # initialise variables
    train_time = []
    predict_time = []
    df_final = []

    # iterate through training sizes and capture training and predict times
    for i, train_data in enumerate(train_sizes):
        X_train, X_test, y_train, y_test = split_data(
            dataset, test_size=1 - train_data)
        start_train = timeit.default_timer()
        estimator.fit(X_train, y_train)
        end_train = timeit.default_timer()
        estimator.predict(X_test)
        end_predict = timeit.default_timer()
        train_time.append(end_train - start_train)
        predict_time.append(end_predict - end_train)
        df_final.append([train_data, train_time[i], predict_time[i]])

    # save timing results to CSV
    timedata = pd.DataFrame(
        data=df_final,
        columns=['Training Data Percentage', 'Train Time', 'Test Time'])
    resdir = 'results'
    res_tgt = '{}/{}'.format(resdir, clf_name)
    timefile = get_abspath('{}_timing_curve.csv'.format(data_name), res_tgt)
    timedata.to_csv(timefile, index=False)

    # generate timing curve plot
    plt.figure()
    plt.title("Timing Curve ({})".format(data_name))
    plt.grid()
    plt.plot(train_sizes, train_time, marker='.', color='y', label='Train')
    plt.plot(train_sizes, predict_time, marker='.', color='dodgerblue',
             label='Predict')
    plt.legend(loc='best')
    plt.xlabel('Training Set Size (%)')
    plt.ylabel('Elapsed user time in seconds')

    # save timing curve plot as PNG
    plotdir = 'plots'
    plot_tgt = '{}/{}'.format(plotdir, clf_name)
    plotpath = get_abspath('{}_TC.png'.format(data_name), plot_tgt)
    plt.savefig(plotpath)
    plt.close()
def main():
    """Run code to generate clustering results.

    """
    print('Running base clustering experiments')
    start_time = timeit.default_timer()

    digitspath = get_abspath('digits.csv', 'data/experiments')
    abalonepath = get_abspath('abalone.csv', 'data/experiments')
    digits = np.loadtxt(digitspath, delimiter=',')
    abalone = np.loadtxt(abalonepath, delimiter=',')
    rdir = 'results/clustering'
    pdir = 'plots/clustering'

    # split data into X and y
    dX = digits[:, :-1]
    dY = digits[:, -1]
    aX = abalone[:, :-1]
    aY = abalone[:, -1]

    # run clustering experiments
    clusters = [2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50]
    clustering_experiment(dX, dY, 'digits', clusters, rdir=rdir)
    clustering_experiment(aX, aY, 'abalone', clusters, rdir=rdir)

    # generate 2D data for cluster visualization
    get_cluster_data(dX, dY, 'digits', km_k=10, gmm_k=10, rdir=rdir)
    get_cluster_data(aX, aY, 'abalone', km_k=5, gmm_k=10, rdir=rdir)

    # generate component plots (metrics to choose size of k)
    generate_component_plots(name='digits', rdir=rdir, pdir=pdir)
    generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)

    # generate validation plots (relative performance of clustering)
    generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)
    generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)

    # generate cluster plots from 2D projections
    df_digits = pd.read_csv(get_abspath('digits_2D.csv', rdir))
    df_abalone = pd.read_csv(get_abspath('abalone_2D.csv', rdir))
    generate_cluster_plots(df_digits, name='digits', pdir=pdir)
    generate_cluster_plots(df_abalone, name='abalone', pdir=pdir)

    # generate neural network datasets with cluster labels
    nn_cluster_datasets(dX, name='digits', km_k=10, gmm_k=10)
    nn_cluster_datasets(aX, name='abalone', km_k=3, gmm_k=10)

    # calculate and print running time
    end_time = timeit.default_timer()
    elapsed = end_time - start_time
    print('Completed clustering experiments in {} seconds'.format(elapsed))
def generate_variance_plot(name, evp):
    """Plots explained variance and cumulative explained variance ratios as a
    function of principal components.

    Args:
        name (str): Dataset name.
        evp (float): Explained variance percentage threshold.

    """
    resdir = 'results/PCA'
    df = pd.read_csv(get_abspath('{}_variances.csv'.format(name), resdir))

    # get figure and axes
    fig, (ax, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(5, 3))

    # plot explained variance and cumulative explained variance ratios
    x = df['n']
    evr = df['evr']
    evr_cum = df['evr_cum']
    ax.plot(x, evr, marker='.', color='b', label='EVR')
    ax.plot(x, evr_cum, marker='.', color='g', label='Cumulative EVR')
    vmark = evr_cum.where(evr_cum > evp).idxmin() + 1
    fig.suptitle('PCA Explained Variance by PC ({})'.format(name))
    ax.set_title(
        '{:.2%} Cumulative Variance \n Explained by {} Components'.format(
            evr_cum[vmark - 1], vmark
        )
    )
    ax.set_ylabel('Explained Variance')
    ax.set_xlabel('Principal Component')
    ax.axvline(x=vmark, linestyle='--', color='r')
    ax.grid(color='grey', linestyle='dotted')

    loss = df['loss']
    ax1.plot(x, loss, marker='.', color='r')
    ax1.set_title('PCA Mean Loss ({})'.format(name))
    ax1.set_ylabel('Mean loss')
    ax1.set_xlabel('# Components')

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/PCA'
    plotpath = get_abspath('{}_explvar.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
def generate_validation_plots(name, rdir, pdir):
    """Generates plots of validation metrics (accuracy, adjusted mutual info)
    for the given dataset.

    Args:
        name (str): Dataset name.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    metrics = pd.read_csv(get_abspath('{}_metrics.csv'.format(name), rdir))

    # get figure and axes
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))

    # plot accuracy
    k = metrics['k']
    km = metrics['km_acc']
    gmm = metrics['gmm_acc']
    ax1.plot(k, km, marker='o', markersize=5, color='b', label='K-Means')
    ax1.plot(k, gmm, marker='o', markersize=5, color='g', label='GMM')
    ax1.set_title('Accuracy Score ({})'.format(name))
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Number of clusters (k)')
    ax1.grid(color='grey', linestyle='dotted')
    ax1.legend(loc='best')

    # plot adjusted mutual info
    km = metrics['km_adjmi']
    gmm = metrics['gmm_adjmi']
    ax2.plot(k, km, marker='o', markersize=5, color='r', label='K-Means')
    ax2.plot(k, gmm, marker='o', markersize=5, color='k', label='GMM')
    ax2.set_title('Adjusted Mutual Info ({})'.format(name))
    ax2.set_ylabel('Adjusted mutual information score')
    ax2.set_xlabel('Number of clusters (k)')
    ax2.grid(color='grey', linestyle='dotted')
    ax2.legend(loc='best')

    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    # save figure
    plotpath = get_abspath('{}_validation.png'.format(name), pdir)
    plt.savefig(plotpath)
    plt.clf()
def generate_fi_plot(name, theta):
    """Plots feature importance and cumulative feature importance values
    sorted by feature index.

    Args:
        name (str): Dataset name.
        theta (float): Cumulative feature importance threshold.

    """
    resdir = 'results/RF'
    df = pd.read_csv(get_abspath('{}_fi.csv'.format(name), resdir))

    # get figure and axes
    fig, ax1 = plt.subplots(nrows=1, ncols=1,
                            figsize=(7 if name == 'abalone' else 12, 3))

    # plot feature importance and cumulative feature importance
    ax2 = ax1.twinx()
    x = df['i']
    fi = df['importance']
    cumfi = df['cumulative']
    ax1.bar(x, height=fi, color='b', tick_label=df['feature'], align='center')
    ax2.plot(x, cumfi, color='r', label='Cumulative Info Gain')
    fig.suptitle('Feature Importance ({})'.format(name))
    ax1.set_title('{:.2%} Explained variance percentage by {} Features'.format(
        cumfi.loc[cumfi.where(cumfi > theta).idxmin()],
        cumfi.where(cumfi > theta).idxmin() + 1,
    ))
    ax1.set_ylabel('Gini Gain')
    ax2.set_ylabel('Cumulative Gini Gain')
    ax1.set_xlabel('Feature Index')
    ax2.axhline(y=theta, linestyle='--', color='r')
    ax1.grid(b=None)
    ax2.grid(b=None)

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/RF'
    plotpath = get_abspath('{}_fi.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
def preprocess_seismic():
    """Cleans and generates seismic bumps dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.

    """
    # get file path
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read arff file and convert to record array
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using Pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((df, onehot_cols), axis=1)

    # drop columns that have only 1 unique value (features add no information)
    for col in df.columns:
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # drop columns with low correlation with class and higher (over 0.8)
    # correlation with other attributes
    df.drop(columns=['gdenergy', 'maxenergy'], inplace=True)

    # save to CSV
    save_dataset(df, 'seismic-bumps.csv', sep=',', subdir=tdir)
def ica_experiment(X, name, dims):
    """Runs ICA on the specified dataset and saves mean kurtosis results as a
    CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    ica = FastICA(random_state=0, max_iter=5000)
    kurt = {}
    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        df = pd.DataFrame(tmp)
        df = df.kurt(axis=0)
        kurt[dim] = df.abs().mean()

    res = pd.DataFrame.from_dict(kurt, orient='index')
    res.rename(columns={0: 'kurtosis'}, inplace=True)

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Runs ICA on the specified dataset and saves mean kurtosis and
    reconstruction loss results as a CSV file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.
        max_iter (int): Maximum number of FastICA iterations.
        tol (float): FastICA convergence tolerance.

    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt = []
    loss = []
    X = StandardScaler().fit_transform(X)
    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        df = pd.DataFrame(tmp)
        df = df.kurt(axis=0)
        kurt.append(kurtosistest(tmp).statistic.mean())
        proj = ica.inverse_transform(tmp)
        loss.append(((X - proj)**2).mean())

    res = pd.DataFrame({"kurtosis": kurt, "loss": loss})

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
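# Assumed imports for the ICA experiments above (a sketch; names are not
# confirmed against the original module's import block).
from scipy.stats import kurtosistest
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler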
def plot_delta_curve(deltas, prefix, tdir):
    """Plots delta as a function of number of episodes.

    Args:
        deltas (pandas.dataframe): Deltas dataframe.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot delta curve
    k = deltas['k']
    d = deltas['d']
    ax.plot(k, d, color='g')
    ax.set_title('Delta Convergence ({})'.format(prefix))
    ax.set_ylabel('Delta Value')
    ax.set_xlabel('Episodes')
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_deltas.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
def plot_reward_curve(rewards, prefix, tdir, xlabel='Iterations'):
    """Plots rewards as a function of number of iterations or episodes.

    Args:
        rewards (pandas.dataframe): Rewards dataframe.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.
        xlabel (str): Label for the x-axis.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot reward curve
    k = rewards['k']
    r = rewards['r']
    ax.plot(k, r, color='b')
    ax.set_title('Average Rewards ({})'.format(prefix))
    ax.set_ylabel('Average Reward')
    ax.set_xlabel(xlabel)
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_rewards.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
def plot_grid(heat, prefix, tdir, big=False, policy_for_annot=None):
    """Plots grid using a scalar-based heatmap.

    Args:
        heat (numpy.array): Heat map values.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.
        big (bool): Use a larger figure (currently unused; size is inferred
            from the grid shape instead).
        policy_for_annot (numpy.array): Policy array to use for annotations.

    """
    figsize = (5, 5)
    if heat.shape[0] > 10:
        figsize = (8, 8)
    if policy_for_annot is not None:
        policy_for_annot = get_annotations(policy_for_annot)

    # generate graph
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    ax = sns.heatmap(heat, annot=policy_for_annot, fmt='')
    ax.set_title('Optimal Policy ({})'.format(prefix), fontsize=20)
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_policygrid.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
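# Hypothetical sketch of the get_annotations helper used by plot_grid,
# assuming a FrozenLake-style action encoding (0=left, 1=down, 2=right,
# 3=up); the real mapping may differ.
def get_annotations(policy):
    """Maps integer actions in a policy grid to arrow annotations."""
    import numpy as np
    arrows = {0: '<', 1: 'v', 2: '>', 3: '^'}
    return np.vectorize(lambda a: arrows.get(int(a), '?'))(policy)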
def plot_delta_combined(df1, df2, df3, prefix, tdir):
    """Plot combined delta chart for all values of gamma.

    Args:
        df1 (pandas.dataframe): Gamma 0.1.
        df2 (pandas.dataframe): Gamma 0.9.
        df3 (pandas.dataframe): Gamma 0.99.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot delta curves
    k = df1['k']
    r1 = df1['d']
    r2 = df2['d']
    r3 = df3['d']
    ax.plot(k, r1, color='b', label='Gamma 0.1')
    ax.plot(k, r2, color='r', label='Gamma 0.9')
    ax.plot(k, r3, color='g', label='Gamma 0.99')
    ax.set_title('Delta Convergence ({})'.format(prefix))
    ax.set_ylabel('Delta')
    ax.set_xlabel('Episodes')
    ax.legend(loc='best')
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_combined_deltas.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
def basic_results(grid, X_test, y_test, data_name, clf_name):
    """Gets best fit against test data for best estimator from a particular
    grid object.

    Note: test score function is the same scoring function used for training.

    Args:
        grid (GridSearchCV object): Trained grid search object.
        X_test (numpy.Array): Test features.
        y_test (numpy.Array): Test labels.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # get best score, test score, scoring function and best parameters
    clf = clf_name
    dn = data_name
    bs = grid.best_score_
    ts = grid.score(X_test, y_test)
    sf = grid.scorer_
    bp = grid.best_params_

    # write results to a combined results file
    parentdir = 'results'
    resfile = get_abspath('combined_results.csv', parentdir)
    with open(resfile, 'a') as f:
        f.write('{}|{}|{}|{}|{}|{}\n'.format(clf, dn, bs, ts, sf, bp))
def histogram(labels, dataname, outfile, outpath='plots/datasets'):
    """Generates a histogram of class labels in a given dataset and saves it
    to an output folder in the project directory.

    Args:
        labels (numpy.Array): Array containing class labels.
        dataname (str): Name of dataset (e.g. winequality).
        outfile (str): Name of output file.
        outpath (str): Project folder to save plot file.

    """
    # get number of bins
    bins = len(np.unique(labels))

    # set figure params
    sns.set(font_scale=1.3, rc={'figure.figsize': (8, 8)})

    # create plot and set params
    fig, ax = plt.subplots()
    ax.hist(labels, bins=bins)
    fig.suptitle('Class frequency in ' + dataname)
    ax.set_xlabel('Class')
    ax.set_ylabel('Frequency')

    # save plot
    plt.savefig(get_abspath(outfile, outpath))
    plt.close()
def generate_contingency_matrix(kmeans_contigency, gmm_continigency, name,
                                pdir):
    """Generates side-by-side contingency heatmaps for K-Means and GMM
    cluster assignments against the true labels.

    Args:
        kmeans_contigency (pandas.DataFrame): K-Means contingency matrix.
        gmm_continigency (pandas.DataFrame): GMM contingency matrix.
        name (str): Dataset name.
        pdir (str): Output directory.

    """
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 3))
    ax1 = sns.heatmap(kmeans_contigency, linewidths=.5, cmap="YlGnBu", ax=ax1)
    ax1.set_title('K-Means Clusters ({})'.format(name))
    ax1.set_xlabel('Cluster')
    ax1.set_ylabel('True label')

    ax2 = sns.heatmap(gmm_continigency, linewidths=.5, cmap="YlGnBu", ax=ax2)
    ax2.set_title('GMM Clusters ({})'.format(name))
    ax2.set_xlabel('Cluster')
    ax2.set_ylabel('True label')

    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)
    fig.suptitle('Contingency Plot - {}'.format(name), fontsize=12)

    # save figure
    plotdir = pdir
    plotpath = get_abspath('{}_contingecy.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
def rp_experiment(X, y, name, dims):
    """Runs Randomized Projections on the specified dataset and saves
    reconstruction error and pairwise distance correlation results as a
    CSV file.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    re = defaultdict(dict)
    pdc = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

    re = pd.DataFrame(pd.DataFrame(re).T.mean(axis=1))
    re.rename(columns={0: 'recon_error'}, inplace=True)
    pdc = pd.DataFrame(pd.DataFrame(pdc).T.mean(axis=1))
    pdc.rename(columns={0: 'pairwise_dc'}, inplace=True)
    metrics = pd.concat((re, pdc), axis=1)

    # save results as CSV
    resdir = 'results/RP'
    resfile = get_abspath('{}_metrics.csv'.format(name), resdir)
    metrics.to_csv(resfile, index_label='n')
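# Hypothetical sketches of the two helpers used by rp_experiment above. These
# are common formulations for random-projection diagnostics; the original
# implementations are not shown in this module, so treat them as assumptions.
import numpy as np
from scipy.spatial.distance import pdist


def reconstruction_error(projector, X):
    """Mean squared error between X and its reconstruction from the projected
    space, using the pseudo-inverse of the projection matrix."""
    W = projector.components_
    if hasattr(W, 'toarray'):  # SparseRandomProjection stores a sparse matrix
        W = W.toarray()
    reconstructed = projector.transform(X).dot(np.linalg.pinv(W).T)
    return np.mean((X - reconstructed) ** 2)


def pairwise_dist_corr(X1, X2):
    """Correlation between pairwise distances computed before and after
    projection."""
    return np.corrcoef(pdist(X1), pdist(X2))[0, 1]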