Example #1
def save_train_results(grid, data_name, clf_name):
    """Saves grid search cross-validation results and pickles the entire
    pipeline and best estimator.

    Args:
        grid (GridSearchCV object): Trained grid search object.
        data_name (str): Name of data set algorithm was trained on.
        clf_name (str): Type of algorithm.

    """
    # get cross-validation results and best estimator
    results = pd.DataFrame(grid.cv_results_)
    best_clf = grid.best_estimator_

    # save cross-validation results as CSV
    parentdir = 'models'
    target = '{}/{}'.format(parentdir, clf_name)
    resfile = get_abspath('{}_cv_results.csv'.format(data_name), target)
    results.to_csv(resfile, index=False)

    # save grid search object and best estimator as pickled model files
    gridpath = get_abspath('{}_grid.pkl'.format(data_name), target)
    bestpath = get_abspath('{}_best_estimator.pkl'.format(data_name), target)
    save_pickled_model(grid, gridpath)
    save_pickled_model(best_clf, bestpath)
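These snippets lean on two project helpers, get_abspath and save_pickled_model, that are not shown anywhere in the listing. A minimal sketch of what they plausibly look like, assuming get_abspath resolves a file name against a subdirectory of the working directory and creates it on demand:

import os
import pickle


def get_abspath(filename, filepath):
    # assumed helper: resolve filename against a project subdirectory,
    # creating the directory if needed, and return the absolute path
    target = os.path.abspath(os.path.join(os.getcwd(), filepath))
    if not os.path.exists(target):
        os.makedirs(target)
    return os.path.join(target, filename)


def save_pickled_model(model, filepath):
    # assumed helper: serialize a fitted estimator to disk with pickle
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)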
Example #2
def generate_kurtosis_plot(name):
    """Plots mean kurtosis as a function of number of components.

    Args:
        name (str): Dataset name.

    """
    resdir = 'results/ICA'
    df = pd.read_csv(get_abspath('{}_kurtosis.csv'.format(name), resdir))

    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))

    # plot mean kurtosis as a function of number of components
    x = df['n']
    kurt = df['kurtosis']
    ax.plot(x, kurt, marker='.', color='g')
    ax.set_title('ICA Mean Kurtosis ({})'.format(name))
    ax.set_ylabel('Mean Kurtosis')
    ax.set_xlabel('# Components')
    ax.grid(color='grey', linestyle='dotted')

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/ICA'
    plotpath = get_abspath('{}_kurtosis.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
Example #3
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label indicating whether the wine is red
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = pd.concat((wine_red, wine_white), ignore_index=True)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # save to CSV
    save_dataset(df, 'winequality.csv', sep=',', subdir=tdir)
Example #4
def nn_cluster_datasets(X, name, km_k, gmm_k):
    """Generates datasets for ANN classification by appending cluster label to
    original dataset.

    Args:
        X (Numpy.Array): Original attributes.
        name (str): Dataset name.
        km_k (int): Number of clusters for K-Means.
        gmm_k (int): Number of components for GMM.

    """
    km = KMeans(random_state=0).set_params(n_clusters=km_k)
    gmm = GMM(random_state=0).set_params(n_components=gmm_k)
    km.fit(X)
    gmm.fit(X)

    # add cluster labels to original attributes
    km_x = np.concatenate((X, km.labels_[:, None]), axis=1)
    gmm_x = np.concatenate((X, gmm.predict(X)[:, None]), axis=1)

    # save results
    resdir = 'results/NN'
    kmfile = get_abspath('{}_km_labels.csv'.format(name), resdir)
    gmmfile = get_abspath('{}_gmm_labels.csv'.format(name), resdir)
    save_array(array=km_x, filename=kmfile, subdir=resdir)
    save_array(array=gmm_x, filename=gmmfile, subdir=resdir)
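As a sanity check, the label-append step runs standalone on synthetic data; a minimal sketch assuming GMM is the usual GaussianMixture alias:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

X = np.random.RandomState(0).rand(100, 4)
km = KMeans(n_clusters=3, random_state=0).fit(X)
gmm = GMM(n_components=3, random_state=0).fit(X)

# each cluster label becomes one extra column on the attribute matrix
km_x = np.concatenate((X, km.labels_[:, None]), axis=1)
gmm_x = np.concatenate((X, gmm.predict(X)[:, None]), axis=1)
print(km_x.shape, gmm_x.shape)  # (100, 5) (100, 5)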
Example #5
def pca_experiment(X, name, dims, evp):
    """Run PCA on specified dataset and saves dataset with components that
    explain at least 85% of total variance.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (int): Number of components.
        evp (float): Explained variance percentage threshold.

    """
    pca = PCA(random_state=0, svd_solver='full', n_components=dims)
    comps = pca.fit_transform(X)  # get principal components

    # cumulative explained variance greater than threshold
    r = range(1, dims + 1)
    ev = pd.Series(pca.explained_variance_, index=r, name='ev')
    evr = pd.Series(pca.explained_variance_ratio_, index=r, name='evr')
    evrc = evr.rename('evr_cum').cumsum()
    res = comps[:, :evrc.where(evrc > evp).idxmin()]
    evars = pd.concat((ev, evr, evrc), axis=1)

    # save results as CSV
    resdir = 'results/PCA'
    evfile = get_abspath('{}_variances.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=res, filename=resfile, subdir=resdir)
    evars.to_csv(evfile, index_label='n')
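The selection idiom `evrc.where(evrc > evp).idxmin()` reads as: blank out every cumulative ratio at or below the threshold, then take the index of the smallest survivor, i.e. the first component that crosses it. Because the series is indexed from 1, that index doubles as the number of components to keep. A quick standalone check on random data, assuming a 0.85 threshold:

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(200, 10)
pca = PCA(random_state=0, svd_solver='full', n_components=10)
comps = pca.fit_transform(X)

evrc = pd.Series(pca.explained_variance_ratio_, index=range(1, 11)).cumsum()
n_keep = evrc.where(evrc > 0.85).idxmin()  # first index past the threshold
print(n_keep, comps[:, :n_keep].shape)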
Example #6
def rf_experiment(X, y, name, theta):
    """Run RF on specified dataset and saves feature importance metrics and best
    results CSV.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        theta (float): Min cumulative information gain threshold.

    """
    rfc = RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=0)
    fi = rfc.fit(X, y).feature_importances_

    # get feature importance and sort by value in descending order
    i = [i + 1 for i in range(len(fi))]
    fi = pd.DataFrame({'importance': fi, 'feature': i})
    fi.sort_values('importance', ascending=False, inplace=True)
    fi['i'] = i
    cumfi = fi['importance'].cumsum()
    fi['cumulative'] = cumfi

    # generate dataset that meets cumulative feature importance threshold
    idxs = fi.loc[:cumfi.where(cumfi > theta).idxmin(), :]
    idxs = list(idxs.index)
    reduced = X[:, idxs]

    # save results as CSV
    resdir = 'results/RF'
    fifile = get_abspath('{}_fi.csv'.format(name), resdir)
    resfile = get_abspath('{}_projected.csv'.format(name), resdir)
    save_array(array=reduced, filename=resfile, subdir=resdir)
    fi.to_csv(fifile, index_label=None)
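The same cumulative-threshold pattern drives the feature selection above. An equivalent standalone version on a toy problem (hypothetical theta of 0.75), using searchsorted on the sorted cumulative importances:

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
rfc = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)

fi = pd.Series(rfc.feature_importances_).sort_values(ascending=False)
cumfi = fi.cumsum()
# number of top features needed to reach the threshold, then their
# original column positions
n_keep = int(np.searchsorted(cumfi.values, 0.75)) + 1
keep = list(fi.index[:n_keep])
print(keep, X[:, keep].shape)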
Example #7
def preprocess_winequality():
    """Cleans and generates wine quality dataset for experiments as a
    CSV file.

    """
    # get file paths
    sdir = 'data/winequality'
    tdir = 'data/experiments'
    wr_file = get_abspath('winequality-red.csv', sdir)
    ww_file = get_abspath('winequality-white.csv', sdir)

    # load as data frame
    wine_red = pd.read_csv(wr_file, sep=';')
    wine_white = pd.read_csv(ww_file, sep=';')

    # encode artificial label indicating whether the wine is red
    wine_red['red'] = 1
    wine_white['red'] = 0

    # combine datasets and format column names
    df = pd.concat((wine_red, wine_white), ignore_index=True)
    df.columns = ['_'.join(col.split(' ')) for col in df.columns]
    df.rename(columns={'quality': 'class'}, inplace=True)

    # split out X data and scale (Gaussian zero mean and unit variance)
    X = df.drop(columns='class').to_numpy()
    y = df['class'].to_numpy()
    X_scaled = StandardScaler().fit_transform(X)
    data = np.concatenate((X_scaled, y[:, np.newaxis]), axis=1)

    # save to CSV
    save_array(array=data, filename='winequality.csv', subdir=tdir)
Example #8
def run_experiment(problem, prefix, gamma, shape=None):
    """Run a policy iteration experiment.

    Args:
        problem (str): Gym problem name.
        prefix (str): Prefix for CSV and plot outputs.
        gamma (float): Gamma value.
        shape (tuple(int)): Shape of state space array.

    """
    problem = gym.make(problem)
    policy, rewards, iters, value_fn = policy_iteration(problem, gamma=gamma)
    idxs = [i for i in range(0, iters)]
    print('{}: {} iterations to converge'.format(prefix, iters))

    # save results as CSV
    resdir = 'results/PI'
    q = get_abspath('{}_policy.csv'.format(prefix), resdir)
    r = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    v = get_abspath('{}_value_fn.csv'.format(prefix), resdir)
    pdf = pd.DataFrame(policy)
    rdf = pd.DataFrame(np.column_stack([idxs, rewards]), columns=['k', 'r'])
    vdf = pd.DataFrame(value_fn)
    pdf.to_csv(q, index=False)
    rdf.to_csv(r, index=False)
    vdf.to_csv(v, index=False)

    # plot results
    tdir = 'plots/PI'
    polgrid = pdf.to_numpy().reshape(shape)
    heatmap = vdf.to_numpy().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)

    return iters
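policy_iteration itself is defined elsewhere. A compact sketch of the standard algorithm over a Gym toy-text environment's transition model env.P, under the assumption that the returned rewards series holds one value per improvement iteration (here the mean state value stands in for it):

import numpy as np


def policy_iteration(env, gamma=0.9, theta=1e-8):
    # sketch only: nS/nA come from Gym's Discrete spaces; env.P[s][a] is a
    # list of (prob, next_state, reward, done) tuples
    nS, nA = env.observation_space.n, env.action_space.n
    policy, V = np.zeros(nS, dtype=int), np.zeros(nS)
    rewards, iters = [], 0
    while True:
        while True:  # policy evaluation (in-place sweeps)
            delta = 0.0
            for s in range(nS):
                v = sum(p * (r + gamma * V[s2])
                        for p, s2, r, _ in env.P[s][policy[s]])
                delta = max(delta, abs(v - V[s]))
                V[s] = v
            if delta < theta:
                break
        stable = True  # policy improvement
        for s in range(nS):
            q = [sum(p * (r + gamma * V[s2]) for p, s2, r, _ in env.P[s][a])
                 for a in range(nA)]
            best = int(np.argmax(q))
            stable = stable and best == policy[s]
            policy[s] = best
        rewards.append(V.mean())
        iters += 1
        if stable:
            return policy, rewards, iters, V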
Example #9
def generate_component_plots(name, rdir, pdir):
    """Generates plots of result files for given dataset.

    Args:
        name (str): Dataset name.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    metrics = pd.read_csv(get_abspath('{}_metrics.csv'.format(name), rdir))

    # get figure and axes
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(15, 3))

    # plot SSE for K-Means
    k = metrics['k']
    metric = metrics['sse']
    ax1.plot(k, metric, marker='o', markersize=5, color='g')
    ax1.set_title('K-Means SSE ({})'.format(name))
    ax1.set_ylabel('Sum of squared error')
    ax1.set_xlabel('Number of clusters (k)')
    ax1.grid(color='grey', linestyle='dotted')

    # plot Silhouette Score for K-Means
    metric = metrics['silhouette_score']
    ax2.plot(k, metric, marker='o', markersize=5, color='b')
    ax2.set_title('K-Means Avg Silhouette Score ({})'.format(name))
    ax2.set_ylabel('Mean silhouette score')
    ax2.set_xlabel('Number of clusters (k)')
    ax2.grid(color='grey', linestyle='dotted')

    # plot log-likelihood for EM
    metric = metrics['log-likelihood']
    ax3.plot(k, metric, marker='o', markersize=5, color='r')
    ax3.set_title('GMM Log-likelihood ({})'.format(name))
    ax3.set_ylabel('Log-likelihood')
    ax3.set_xlabel('Number of clusters (k)')
    ax3.grid(color='grey', linestyle='dotted')

    # plot BIC for EM
    metric = metrics['bic']
    ax4.plot(k, metric, marker='o', markersize=5, color='k')
    ax4.set_title('GMM BIC ({})'.format(name))
    ax4.set_ylabel('BIC')
    ax4.set_xlabel('Number of clusters (k)')
    ax4.grid(color='grey', linestyle='dotted')

    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    # save figure
    plotpath = get_abspath('{}_components.png'.format(name), pdir)
    plt.savefig(plotpath)
    plt.clf()
Example #10
def main():
    """Run code to generate results.

    """
    combined = get_abspath('combined_results.csv', 'results/NN')

    try:
        os.remove(combined)
    except OSError:
        pass

    with open(combined, 'a') as f:
        f.write('dataset,algorithm,accuracy,elapsed_time\n')

    names = ['digits', 'abalone']
    dimred_algos = ['PCA', 'ICA', 'RP', 'RF']
    cluster_algos = ['km', 'gmm']

    # generate results
    for name in names:
        # load data and split into attributes and labels
        filepath = get_abspath('{}.csv'.format(name), 'data/experiments')
        data = np.loadtxt(filepath, delimiter=',')
        X = data[:, :-1]
        y = data[:, -1]

        # save base dataset results
        ann = create_ann(name=name)

        acc, elapsed = ann_experiment(X, y, name, ann)
        with open(combined, 'a') as f:
            f.write('{},{},{},{}\n'.format(name, 'base', acc, elapsed))

        for d in dimred_algos:
            # get attributes
            resdir = 'results/{}'.format(d)
            filepath = get_abspath('{}_projected.csv'.format(name), resdir)
            X = np.loadtxt(filepath, delimiter=',')

            # train ANN and get test score, elapsed time
            ann = create_ann(name=name)
            plot_learning_curve(ann, d, name, X, y)
            acc, elapsed = ann_experiment(X, y, name, ann)
            with open(combined, 'a') as f:
                f.write('{},{},{},{}\n'.format(name, d, acc, elapsed))

        for c in cluster_algos:
            # get attributes
            resdir = 'results/NN'
            filepath = get_abspath('{}_{}_labels.csv'.format(name, c), resdir)
            X = np.loadtxt(filepath, delimiter=',')

            # train ANN and get test score, elapsed time
            ann = create_ann(name=name)
            plot_learning_curve(ann, c, name, X, y)
            acc, elapsed = ann_experiment(X, y, name, ann)
            with open(combined, 'a') as f:
                f.write('{},{},{},{}\n'.format(name, c, acc, elapsed))
Example #11
def create_timing_curve(estimator, dataset, data_name, clf_name):
    """Generates a timing curve for the specified estimator, saves tabular
    results to CSV and saves a plot of the timing curve.

    Args:
        estimator (object): Target classifier.
        dataset (pandas.DataFrame): Source data set.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # set training sizes and intervals
    train_sizes = np.arange(0.1, 1.0, 0.05)

    # initialise variables
    train_time = []
    predict_time = []
    df_final = []

    # iterate through training sizes and capture training and predict times
    for i, train_data in enumerate(train_sizes):
        X_train, X_test, y_train, y_test = split_data(dataset,
                                                      test_size=1 - train_data)
        start_train = timeit.default_timer()
        estimator.fit(X_train, y_train)
        end_train = timeit.default_timer()
        estimator.predict(X_test)
        end_predict = timeit.default_timer()
        train_time.append(end_train - start_train)
        predict_time.append(end_predict - end_train)
        df_final.append([train_data, train_time[i], predict_time[i]])

    # save timing results to CSV
    timedata = pd.DataFrame(
        data=df_final,
        columns=['Training Data Percentage', 'Train Time', 'Test Time'],
    )
    resdir = 'results'
    res_tgt = '{}/{}'.format(resdir, clf_name)
    timefile = get_abspath('{}_timing_curve.csv'.format(data_name), res_tgt)
    timedata.to_csv(timefile, index=False)

    # generate timing curve plot
    plt.figure(2)
    plt.plot(train_sizes, train_time, marker='.', color='b', label='Train')
    plt.plot(train_sizes, predict_time, marker='.', color='g', label='Predict')
    plt.legend(loc='best')
    plt.grid(linestyle='dotted')
    plt.xlabel('Samples used for training as a percentage of total')
    plt.ylabel('Elapsed user time in seconds')

    # save timing curve plot as PNG
    plotdir = 'plots'
    plt.title("Timing Curve with {} on {}".format(clf_name, data_name))
    plot_tgt = '{}/{}'.format(plotdir, clf_name)
    plotpath = get_abspath('{}_TC.png'.format(data_name), plot_tgt)
    plt.savefig(plotpath)
    plt.close()
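split_data is another assumed project helper; presumably a thin wrapper over train_test_split that peels the label off the last column, matching how the preprocessed CSVs above are laid out:

from sklearn.model_selection import train_test_split


def split_data(df, test_size=0.3, seed=0):
    # assumed helper: label in the last column, attributes in the rest
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    return train_test_split(X, y, test_size=test_size, random_state=seed)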
Example #12
def ga_mating_curve():
    """Plots the mating rate validation curve for genetic algorithms and
    saves it as a PNG file.

    """
    # load datasets
    resdir = 'results/NN/GA'
    df_10 = pd.read_csv(get_abspath('results_50_10_10.csv', resdir))
    df_20 = pd.read_csv(get_abspath('results_50_20_10.csv', resdir))
    df_30 = pd.read_csv(get_abspath('results_50_30_10.csv', resdir))

    # get columns
    iters = df_10['iteration']
    train_10 = df_10['MSE_train']
    test_10 = df_10['MSE_test']
    train_20 = df_20['MSE_train']
    test_20 = df_20['MSE_test']
    train_30 = df_30['MSE_train']
    test_30 = df_30['MSE_test']

    # create validation curve for training data
    plt.figure(0)
    plt.plot(iters, train_10, color='b', label='# of mates: 10')
    plt.plot(iters, train_20, color='g', label='# of mates: 20')
    plt.plot(iters, train_30, color='r', label='# of mates: 30')
    plt.xlim(left=-30)
    plt.legend(loc='best')
    plt.grid(color='grey', linestyle='dotted')
    plt.title('GA Validation Curve - Mating Rate (train)')
    plt.xlabel('Iterations')
    plt.ylabel('Mean squared error')

    # save training validation curve plot as PNG
    plotdir = 'plots/NN/GA'
    plotpath = get_abspath('GA_MA_train.png', plotdir)
    plt.savefig(plotpath, bbox_inches='tight')
    plt.clf()

    # create validation curve for test data
    plt.figure(0)
    plt.plot(iters, test_10, color='b', label='# of mates: 10')
    plt.plot(iters, test_20, color='g', label='# of mates: 20')
    plt.plot(iters, test_30, color='r', label='# of mates: 30')
    plt.xlim(left=-30)
    plt.legend(loc='best')
    plt.grid(color='grey', linestyle='dotted')
    plt.title('GA Validation Curve - Mating Rate (test)')
    plt.xlabel('Iterations')
    plt.ylabel('Mean squared error')

    # save test validation curve plot as PNG
    plotdir = 'plots/NN/GA'
    plotpath = get_abspath('GA_MA_test.png', plotdir)
    plt.savefig(plotpath, bbox_inches='tight')
    plt.clf()
Example #13
def main():
    """Run code to generate clustering results.

    """
    print('Running base clustering experiments')
    start_time = timeit.default_timer()

    winepath = get_abspath('winequality.csv', 'data/experiments')
    seismicpath = get_abspath('seismic-bumps.csv', 'data/experiments')
    wine = np.loadtxt(winepath, delimiter=',')
    seismic = np.loadtxt(seismicpath, delimiter=',')
    rdir = 'results/clustering'
    pdir = 'plots/clustering'

    # split data into X and y
    wX = wine[:, :-1]
    wY = wine[:, -1]
    sX = seismic[:, :-1]
    sY = seismic[:, -1]

    # run clustering experiments
    clusters = [2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 18, 20, 25, 30, 45, 80, 120]
    clustering_experiment(wX, wY, 'winequality', clusters, rdir=rdir)
    clustering_experiment(sX, sY, 'seismic-bumps', clusters, rdir=rdir)

    # generate 2D data for cluster visualization
    get_cluster_data(wX, wY, 'winequality', km_k=15, gmm_k=15, rdir=rdir)
    get_cluster_data(sX, sY, 'seismic-bumps', km_k=20, gmm_k=15, rdir=rdir)

    # generate component plots (metrics to choose size of k)
    generate_component_plots(name='winequality', rdir=rdir, pdir=pdir)
    generate_component_plots(name='seismic-bumps', rdir=rdir, pdir=pdir)

    # generate validation plots (relative performance of clustering)
    generate_validation_plots(name='winequality', rdir=rdir, pdir=pdir)
    generate_validation_plots(name='seismic-bumps', rdir=rdir, pdir=pdir)

    # generate cluster visualization plots from 2D data
    df_wine = pd.read_csv(get_abspath('winequality_2D.csv', rdir))
    df_seismic = pd.read_csv(get_abspath('seismic-bumps_2D.csv', rdir))
    generate_cluster_plots(df_wine, name='winequality', pdir=pdir)
    generate_cluster_plots(df_seismic, name='seismic-bumps', pdir=pdir)

    # generate neural network datasets with cluster labels
    nn_cluster_datasets(wX, name='winequality', km_k=15, gmm_k=15)
    nn_cluster_datasets(sX, name='seismic-bumps', km_k=20, gmm_k=15)

    # calculate and print running time
    end_time = timeit.default_timer()
    elapsed = end_time - start_time
    print "Completed clustering experiments in {} seconds".format(elapsed)
Example #14
def run_experiment(problem, prefix, alpha, gamma, d, shape=None):
    """Run Q-Learning experiment for specified Gym problem and write results
    to CSV files.

    Args:
        problem (str): Gym problem name.
        prefix (str): Prefix for CSV and plot outputs.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        d (float): Epsilon decay rate.
        shape (tuple(int)): Shape of state space matrix.

    """
    episodes = 5000
    size = episodes // 100

    # instantiate environment and run Q-learner
    start = time.time()
    env = gym.make(problem)
    Q, rewards, visits = q_learning(env, alpha, d, gamma)
    env.close()
    end = time.time()
    elapsed = end - start

    # average rewards
    k = [i for i in range(0, episodes, size)]
    chunks = list(chunk_list(rewards, size))
    rewards = [sum(chunk) / len(chunk) for chunk in chunks]

    # save results as CSV
    resdir = 'results/QL'
    qf = get_abspath('{}_policy.csv'.format(prefix), resdir)
    rf = get_abspath('{}_rewards.csv'.format(prefix), resdir)
    vf = get_abspath('{}_visits.csv'.format(prefix), resdir)
    qdf = pd.DataFrame(Q)
    vdf = pd.DataFrame(visits)
    rdf = pd.DataFrame(np.column_stack([k, rewards]), columns=['k', 'r'])
    qdf.to_csv(qf, index=False)
    vdf.to_csv(vf, index=False)
    rdf.to_csv(rf, index=False)

    # write timing results and average reward in last iteration
    combined = get_abspath('summary.csv', 'results/QL')
    with open(combined, 'a') as f:
        f.write('{},{},{}\n'.format(prefix, elapsed, rdf.iloc[-1, 1]))

    # plot results
    tdir = 'plots/QL'
    polgrid = qdf.to_numpy().argmax(axis=1).reshape(shape)
    heatmap = vdf.to_numpy().reshape(shape)
    plot_grid(heatmap, prefix, tdir, policy_for_annot=polgrid)
Example #15
def create_timing_curve(estimator, dataset, data_name, clf_name):
    """Generates a timing curve for the specified estimator and saves the
    tabular results and plot (variant with finer training-size steps).

    """
    # set training sizes and intervals
    train_sizes = np.arange(0.01, 1.0, 0.03)

    # initialise variables
    train_time = []
    predict_time = []
    df_final = []

    # iterate through training sizes and capture training and predict times
    for i, train_data in enumerate(train_sizes):
        X_train, X_test, y_train, y_test = split_data(dataset,
                                                      test_size=1 - train_data)
        start_train = timeit.default_timer()
        estimator.fit(X_train, y_train)
        end_train = timeit.default_timer()
        estimator.predict(X_test)
        end_predict = timeit.default_timer()
        train_time.append(end_train - start_train)
        predict_time.append(end_predict - end_train)
        df_final.append([train_data, train_time[i], predict_time[i]])

    # save timing results to CSV
    timedata = pd.DataFrame(
        data=df_final,
        columns=['Training Data Percentage', 'Train Time', 'Test Time'])
    resdir = 'results'
    res_tgt = '{}/{}'.format(resdir, clf_name)
    timefile = get_abspath('{}_timing_curve.csv'.format(data_name), res_tgt)
    timedata.to_csv(timefile, index=False)

    # generate timing curve plot
    plt.figure()
    plt.title("Timing Curve ({})".format(data_name))
    plt.grid()
    plt.plot(train_sizes, train_time, marker='.', color='y', label='Train')
    plt.plot(train_sizes,
             predict_time,
             marker='.',
             color='dodgerblue',
             label='Predict')
    plt.legend(loc='best')
    plt.xlabel('Training Set Size (%)')
    plt.ylabel('Elapsed user time in seconds')

    # save timing curve plot as PNG
    plotdir = 'plots'
    plot_tgt = '{}/{}'.format(plotdir, clf_name)
    plotpath = get_abspath('{}_TC.png'.format(data_name), plot_tgt)
    plt.savefig(plotpath)
    plt.close()
Example #16
def main():
    """Run code to generate clustering results.

    """
    print('Running base clustering experiments')
    start_time = timeit.default_timer()

    digitspath = get_abspath('digits.csv', 'data/experiments')
    abalonepath = get_abspath('abalone.csv', 'data/experiments')
    digits = np.loadtxt(digitspath, delimiter=',')
    abalone = np.loadtxt(abalonepath, delimiter=',')
    rdir = 'results/clustering'
    pdir = 'plots/clustering'

    # split data into X and y
    dX = digits[:, :-1]
    dY = digits[:, -1]
    aX = abalone[:, :-1]
    aY = abalone[:, -1]

    # run clustering experiments
    clusters = [2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50]
    clustering_experiment(dX, dY, 'digits', clusters, rdir=rdir)
    clustering_experiment(aX, aY, 'abalone', clusters, rdir=rdir)

    # generate 2D data for cluster visualization
    get_cluster_data(dX, dY, 'digits', km_k=10, gmm_k=10, rdir=rdir)
    get_cluster_data(aX, aY, 'abalone', km_k=5, gmm_k=10, rdir=rdir)

    # generate component plots (metrics to choose size of k)
    generate_component_plots(name='digits', rdir=rdir, pdir=pdir)
    generate_component_plots(name='abalone', rdir=rdir, pdir=pdir)

    # generate validation plots (relative performance of clustering)
    generate_validation_plots(name='digits', rdir=rdir, pdir=pdir)
    generate_validation_plots(name='abalone', rdir=rdir, pdir=pdir)

    # generate cluster visualization plots from 2D data
    df_digits = pd.read_csv(get_abspath('digits_2D.csv', rdir))
    df_abalone = pd.read_csv(get_abspath('abalone_2D.csv', rdir))
    generate_cluster_plots(df_digits, name='digits', pdir=pdir)
    generate_cluster_plots(df_abalone, name='abalone', pdir=pdir)

    # generate neural network datasets with cluster labels
    nn_cluster_datasets(dX, name='digits', km_k=10, gmm_k=10)
    nn_cluster_datasets(aX, name='abalone', km_k=3, gmm_k=10)

    # calculate and print running time
    end_time = timeit.default_timer()
    elapsed = end_time - start_time
    print "Completed clustering experiments in {} seconds".format(elapsed)
Example #17
def generate_variance_plot(name, evp):
    """Plots explained variance and cumulative explained variance ratios as a
    function of principal components.

    Args:
        name (str): Dataset name.
        evp (float): Explained variance percentage threshold.

    """
    resdir = 'results/PCA'
    df = pd.read_csv(get_abspath('{}_variances.csv'.format(name), resdir))

    # get figure and axes
    fig, (ax, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(5, 3))

    # plot explained variance and cumulative explain variance ratios
    x = df['n']
    evr = df['evr']
    evr_cum = df['evr_cum']
    ax.plot(x, evr, marker='.', color='b', label='EVR')
    ax.plot(x, evr_cum, marker='.', color='g', label='Cumulative EVR')
    vmark = evr_cum.where(evr_cum > evp).idxmin() + 1
    fig.suptitle('PCA Explained Variance by PC ({})'.format(name))
    ax.set_title(
        '{:.2%} Cumulative Variance \n Explained by {} Components'.format(
             evr_cum[vmark-1], vmark
        )
    )
    ax.set_ylabel('Explained Variance')
    ax.set_xlabel('Principal Component')
    ax.axvline(x=vmark, linestyle='--', color='r')
    ax.grid(color='grey', linestyle='dotted')
    loss = df['loss']
    ax1.plot(x, loss, marker='.', color='r')
    ax1.set_title('PCA Mean Loss ({})'.format(name))
    ax1.set_ylabel('Mean loss')
    ax1.set_xlabel('# Components')

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/PCA'
    plotpath = get_abspath('{}_explvar.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
Example #18
def generate_validation_plots(name, rdir, pdir):
    """Generates plots of validation metrics (accuracy, adjusted mutual info)
    for both datasets.

    Args:
        name (str): Dataset name.
        rdir (str): Input file directory.
        pdir (str): Output directory.

    """
    metrics = pd.read_csv(get_abspath('{}_metrics.csv'.format(name), rdir))

    # get figure and axes
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))

    # plot accuracy
    k = metrics['k']
    km = metrics['km_acc']
    gmm = metrics['gmm_acc']
    ax1.plot(k, km, marker='o', markersize=5, color='b', label='K-Means')
    ax1.plot(k, gmm, marker='o', markersize=5, color='g', label='GMM')
    ax1.set_title('Accuracy Score ({})'.format(name))
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('Number of clusters (k)')
    ax1.grid(color='grey', linestyle='dotted')
    ax1.legend(loc='best')

    # plot adjusted mutual info
    km = metrics['km_adjmi']
    gmm = metrics['gmm_adjmi']
    ax2.plot(k, km, marker='o', markersize=5, color='r', label='K-Means')
    ax2.plot(k, gmm, marker='o', markersize=5, color='k', label='GMM')
    ax2.set_title('Adjusted Mutual Info ({})'.format(name))
    ax2.set_ylabel('Adjusted mutual information score')
    ax2.set_xlabel('Number of clusters (k)')
    ax2.grid(color='grey', linestyle='dotted')
    ax2.legend(loc='best')

    # change layout size, font size and width between subplots
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    # save figure
    plotpath = get_abspath('{}_validation.png'.format(name), pdir)
    plt.savefig(plotpath)
    plt.clf()
Example #19
def generate_fi_plot(name, theta):
    """Plots feature importance and cumulative feature importance values sorted
    by feature index.

    Args:
        name (str): Dataset name.
        theta (float): Cumulative feature importance threshold.

    """
    resdir = 'results/RF'
    df = pd.read_csv(get_abspath('{}_fi.csv'.format(name), resdir))

    # get figure and axes
    fig, ax1 = plt.subplots(nrows=1,
                            ncols=1,
                            figsize=(7 if name == 'abalone' else 12, 3))

    # plot feature importances and cumulative importance on twin axes
    ax2 = ax1.twinx()
    x = df['i']
    fi = df['importance']
    cumfi = df['cumulative']

    ax1.bar(x, height=fi, color='b', tick_label=df['feature'], align='center')
    ax2.plot(x, cumfi, color='r', label='Cumulative Info Gain')
    fig.suptitle('Feature Importance ({})'.format(name))
    ax1.set_title('{:.2%} Cumulative Importance from {} Features'.format(
        cumfi.loc[cumfi.where(cumfi > theta).idxmin()],
        cumfi.where(cumfi > theta).idxmin() + 1,
    ))
    ax1.set_ylabel('Gini Gain')
    ax2.set_ylabel('Cumulative Gini Gain')
    ax1.set_xlabel('Feature Index')
    ax2.axhline(y=theta, linestyle='--', color='r')
    ax1.grid(False)
    ax2.grid(False)

    # change layout size, font size and width
    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)

    # save figure
    plotdir = 'plots/RF'
    plotpath = get_abspath('{}_fi.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
Example #20
def preprocess_seismic():
    """Cleans and generates seismic bumps dataset for experiments as a
    CSV file. Uses one-hot encoding for categorical features.

    """
    # get file path
    sdir = 'data/seismic-bumps'
    tdir = 'data/experiments'
    seismic_file = get_abspath('seismic-bumps.arff', sdir)

    # read arff file and convert to record array
    rawdata = arff.loadarff(seismic_file)
    df = pd.DataFrame(rawdata[0])

    # apply one-hot encoding to categorical features using Pandas get_dummies
    cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    cats = df[cat_cols]
    onehot_cols = pd.get_dummies(cats, prefix=cat_cols)

    # drop original categorical columns and append one-hot encoded columns
    df.drop(columns=cat_cols, inplace=True)
    df = pd.concat((df, onehot_cols), axis=1)

    # drop columns that have only 1 unique value (features add no information)
    for col in df.columns:
        if len(np.unique(df[col])) == 1:
            df.drop(columns=col, inplace=True)

    # drop columns with low correlation to the class and high (over 0.8)
    # correlation with other attributes
    df.drop(columns=['gdenergy', 'maxenergy'], inplace=True)

    # save to CSV
    save_dataset(df, 'seismic-bumps.csv', sep=',', subdir=tdir)
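On a small frame, the one-hot step is easy to verify:

import pandas as pd

cats = pd.DataFrame({'shift': ['W', 'N', 'W'], 'ghazard': ['a', 'b', 'a']})
print(pd.get_dummies(cats, prefix=['shift', 'ghazard']))
# -> columns shift_N, shift_W, ghazard_a, ghazard_b with 0/1 indicators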
Example #21
def ica_experiment(X, name, dims):
    """Run ICA on specified dataset and saves mean kurtosis results as CSV
    file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    ica = FastICA(random_state=0, max_iter=5000)
    kurt = {}

    for dim in dims:
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        df = pd.DataFrame(tmp)
        df = df.kurt(axis=0)
        kurt[dim] = df.abs().mean()

    res = pd.DataFrame.from_dict(kurt, orient='index')
    res.rename(columns={0: 'kurtosis'}, inplace=True)

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
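The metric here is mean absolute excess kurtosis of the recovered components; pandas' kurt is Fisher-normalized, so Gaussian noise scores near zero while heavier-tailed (more non-Gaussian, hence more independent-looking) signals score higher:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
print(pd.DataFrame(rng.normal(size=(5000, 3))).kurt(axis=0).abs().mean())
# ~0 for Gaussian noise
print(pd.DataFrame(rng.laplace(size=(5000, 3))).kurt(axis=0).abs().mean())
# ~3 for Laplace (heavy tails), which ICA rewards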
Example #22
def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Run ICA on specified dataset and saves mean kurtosis results as CSV
    file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.
        max_iter (int): Maximum number of FastICA iterations.
        tol (float): FastICA convergence tolerance.

    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt = []
    loss = []

    X = StandardScaler().fit_transform(X)
    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        kurt.append(kurtosistest(tmp).statistic.mean())
        proj = ica.inverse_transform(tmp)
        loss.append(((X - proj)**2).mean())

    # index by component count so the saved 'n' column is meaningful
    res = pd.DataFrame({"kurtosis": kurt, "loss": loss}, index=dims)

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
Example #23
def plot_delta_curve(deltas, prefix, tdir):
    """Plots delta as a function of number of episodes.

    Args:
        deltas (pandas.DataFrame): Deltas dataframe.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot reward curve
    k = deltas['k']
    d = deltas['d']
    ax.plot(k, d, color='g')
    ax.set_title('Delta Convergence ({})'.format(prefix))
    ax.set_ylabel('Delta Value')
    ax.set_xlabel('Episodes')
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_deltas.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
Example #24
def plot_reward_curve(rewards, prefix, tdir, xlabel='Iterations'):
    """Plots rewards as a function of number of iterations or episodes.

    Args:
        rewards (pandas.dataframe): Rewards dataframe.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot reward curve
    k = rewards['k']
    r = rewards['r']
    ax.plot(k, r, color='b')
    ax.set_title('Average Rewards ({})'.format(prefix))
    ax.set_ylabel('Average Reward')
    ax.set_xlabel(xlabel)
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_rewards.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
Example #25
def plot_grid(heat, prefix, tdir, big=False, policy_for_annot=None):
    """Plots grid using a scalar-based heatmap.

    Args:
        heat (numpy.array): Heat map values.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.
        big (bool): Use a larger figure size (unused; inferred from shape).
        policy_for_annot (numpy.array): Policy array to use for annotations.

    """
    figsize = (5, 5)
    if heat.shape[0] > 10:
        figsize = (8, 8)
    if policy_for_annot is not None:
        policy_for_annot = get_annotations(policy_for_annot)

    # generate graph
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    ax = sns.heatmap(heat, annot=policy_for_annot, fmt='')
    ax.set_title('Optimal Policy ({})'.format(prefix), fontsize=20)
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_policygrid.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
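get_annotations is assumed to convert a grid of action indices into glyphs for the heatmap; a hypothetical version for the four FrozenLake actions:

import numpy as np


def get_annotations(policy_grid):
    # hypothetical helper: map Gym FrozenLake actions
    # (0=left, 1=down, 2=right, 3=up) to arrow annotations
    arrows = np.array([u'\u2190', u'\u2193', u'\u2192', u'\u2191'])
    return arrows[np.asarray(policy_grid, dtype=int)]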
Example #26
def plot_delta_combined(df1, df2, df3, prefix, tdir):
    """Plot combined delta chart for all values of gamma.

    Args:
        df1 (pandas.dataframe): Gamma 0.1.
        df2 (pandas.dataframe): Gamma 0.9.
        df3 (pandas.dataframe): Gamma 0.99.
        prefix (str): Prefix for CSV and plot outputs.
        tdir (str): Target directory.

    """
    # get figure and axes
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    # plot reward curve
    k = df1['k']
    r1 = df1['d']
    r2 = df2['d']
    r3 = df3['d']
    ax.plot(k, r1, color='b', label='Gamma 0.1')
    ax.plot(k, r2, color='r', label='Gamma 0.9')
    ax.plot(k, r3, color='g', label='Gamma 0.99')
    ax.set_title('Delta Convergence ({})'.format(prefix))
    ax.set_ylabel('Delta')
    ax.set_xlabel('Episodes')
    ax.legend(loc='best')
    ax.grid(linestyle='dotted')
    fig.tight_layout()

    # save figure
    plotpath = get_abspath('{}_combined_deltas.png'.format(prefix), tdir)
    plt.savefig(plotpath)
    plt.close()
Example #27
def basic_results(grid, X_test, y_test, data_name, clf_name):
    """Gets best fit against test data for best estimator from a particular
    grid object. Note: test score funtion is the same scoring function used
    for training.

    Args:
        grid (GridSearchCV object): Trained grid search object.
        X_test (numpy.Array): Test features.
        y_test (numpy.Array): Test labels.
        data_name (str): Name of data set being tested.
        clf_name (str): Type of algorithm.

    """
    # get best score, test score, scoring function and best parameters
    clf = clf_name
    dn = data_name
    bs = grid.best_score_
    ts = grid.score(X_test, y_test)
    sf = grid.scorer_
    bp = grid.best_params_

    # write results to a combined results file
    parentdir = 'results'
    resfile = get_abspath('combined_results.csv', parentdir)
    with open(resfile, 'a') as f:
        f.write('{}|{}|{}|{}|{}|{}\n'.format(clf, dn, bs, ts, sf, bp))
Example #28
def histogram(labels, dataname, outfile, outpath='plots/datasets'):
    """Generates a histogram of class labels in a given dataset and saves it
    to an output folder in the project directory.

    Args:
        labels (numpy.Array): Array containing class labels.
        dataname (str): Name of dataset (e.g. winequality).
        outfile (str): Name of output file.
        outpath (str): Project folder to save plot file.
    """
    # get number of bins
    bins = len(np.unique(labels))

    # set figure params
    sns.set(font_scale=1.3, rc={'figure.figsize': (8, 8)})

    # create plot and set params
    fig, ax = plt.subplots()
    ax.hist(labels, bins=bins)
    fig.suptitle('Class frequency in ' + dataname)
    ax.set_xlabel('Class')
    ax.set_ylabel('Frequency')

    # save plot
    plt.savefig(get_abspath(outfile, outpath))
    plt.close()
Example #29
def generate_contingency_matrix(kmeans_contingency, gmm_contingency, name,
                                pdir):
    """Generates side-by-side contingency heatmaps of K-Means and GMM
    cluster assignments against true labels.

    Args:
        kmeans_contingency (pandas.DataFrame): K-Means contingency table.
        gmm_contingency (pandas.DataFrame): GMM contingency table.
        name (str): Dataset name.
        pdir (str): Output directory.

    """
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 3))
    ax1 = sns.heatmap(kmeans_contingency, linewidths=.5, cmap="YlGnBu", ax=ax1)
    ax1.set_title('K-Means Clusters ({})'.format(name))
    ax1.set_xlabel('Cluster')
    ax1.set_ylabel('True label')

    ax2 = sns.heatmap(gmm_contingency, linewidths=.5, cmap="YlGnBu", ax=ax2)
    ax2.set_title('GMM Clusters ({})'.format(name))
    ax2.set_xlabel('Cluster')
    ax2.set_ylabel('True label')

    fig.tight_layout()
    for ax in fig.axes:
        ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
        for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
            item.set_fontsize(8)
    plt.subplots_adjust(wspace=0.3)

    fig.suptitle('Contingency Plot - {}'.format(name), fontsize=12)
    # save figure
    plotdir = pdir
    plotpath = get_abspath('{}_contingency.png'.format(name), plotdir)
    plt.savefig(plotpath)
    plt.clf()
Example #30
def rp_experiment(X, y, name, dims):
    """Run Randomized Projections on specified dataset and saves reconstruction
    error and pairwise distance correlation results as CSV file.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels (unused).
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    re = defaultdict(dict)
    pdc = defaultdict(dict)

    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

    re = pd.DataFrame(pd.DataFrame(re).T.mean(axis=1))
    re.rename(columns={0: 'recon_error'}, inplace=True)
    pdc = pd.DataFrame(pd.DataFrame(pdc).T.mean(axis=1))
    pdc.rename(columns={0: 'pairwise_dc'}, inplace=True)
    metrics = pd.concat((re, pdc), axis=1)

    # save results as CSV
    resdir = 'results/RP'
    resfile = get_abspath('{}_metrics.csv'.format(name), resdir)
    metrics.to_csv(resfile, index_label='n')
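reconstruction_error and pairwise_dist_corr are also project helpers; plausible definitions, assuming reconstruction via the pseudo-inverse of the projection matrix and Pearson correlation between flattened pairwise-distance matrices:

import numpy as np
from scipy.sparse import issparse
from sklearn.metrics.pairwise import pairwise_distances


def reconstruction_error(projection, X):
    # assumed helper: project with W, invert with the pseudo-inverse,
    # and report mean squared reconstruction error
    W = projection.components_
    if issparse(W):
        W = np.asarray(W.todense())
    reconstructed = ((np.linalg.pinv(W) @ W) @ X.T).T
    return np.mean(np.square(X - reconstructed))


def pairwise_dist_corr(X1, X2):
    # assumed helper: how well the projection preserves pairwise
    # distance structure relative to the original data
    d1 = pairwise_distances(X1).ravel()
    d2 = pairwise_distances(X2).ravel()
    return np.corrcoef(d1, d2)[0, 1]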