# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold  # pre-0.18 cross-validation API
from sklearn.linear_model import LogisticRegression

# Project-local helpers used below (prune_sparse_samples, balance_data,
# phi_agglomerate, kill_outcome, sanitise_predictors, get_BRBMs, get_mrmsd
# and the other plot_* functions) are defined elsewhere in the package.


def agglo_logit_calc(Xps1, Yps1, nonmusic_subreddits):
    """ Handles fitting and scoring of the agglomeration->logistic regression
        machine learning scheme.
    """

    Xps1 = Xps1.toarray()

    logit = LogisticRegression()
    (n_samples_1, _) = Xps1.shape

    n_folds = 4
    rand = 0
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    logit_1 = 0.0
    logit_20 = 0.0

    n_lo = 1
    n_hi = 155
    step = 1
    n_groups_gen = range(n_lo, n_hi + 1, step)

    agglo_1s = [0.0 for _ in n_groups_gen]
    agglo_20s = [0.0 for _ in n_groups_gen]

    params = np.empty([len(n_groups_gen), n_folds], dtype=object)
    logit_params = []

    for i_fold, (train, test) in enumerate(kf):
        print(i_fold)  # progress: report the current fold

        # Baseline: logistic regression on the full, un-agglomerated predictors.
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)

        logit_1 += 100.0 * logit.score(Xps1[test], Yps1[test])

        # Stricter test set for this fold: only users active in >= 20
        # subreddits, rebalanced across the outcome classes.
        (Xps20_test, Yps20_test) = prune_sparse_samples(Xps1[test], Yps1[test], threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)

        logit_20 += 100.0 * logit.score(Xps20_test, Yps20_test)

        for j, n_groups in enumerate(n_groups_gen):

            agglo = phi_agglomerate(N=n_groups).fit(Xps1[train], Yps1[train])
            Xagglo_train_1, _ = agglo.transform(Xps1[train])
            Xagglo_test_1, _ = agglo.transform(Xps1[test])
            Xagglo_test_20, _ = agglo.transform(Xps20_test)

            logit.fit(Xagglo_train_1, Yps1[train])

            params[j][i_fold] = logit.coef_

            agglo_1s[j] += 100.0 * logit.score(Xagglo_test_1, Yps1[test]) / n_folds
            agglo_20s[j] += 100.0 * logit.score(Xagglo_test_20, Yps20_test) / n_folds

    logit_1 /= n_folds
    logit_20 /= n_folds

    return (n_lo, n_hi, logit_1, logit_20, n_groups_gen, agglo_1s, agglo_20s, params, logit_params)
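
# A minimal sketch of the fit/transform contract that agglo_logit_calc
# assumes of phi_agglomerate. This is a hypothetical stand-in, not the
# project's implementation (which groups predictors by their phi
# coefficient with the outcome); sklearn's FeatureAgglomeration supplies
# the grouping here purely so the interface can be exercised.
from sklearn.cluster import FeatureAgglomeration


class phi_agglomerate_sketch(object):
    """ Hypothetical stand-in: pool the predictors into N groups. """

    def __init__(self, N):
        self._agglo = FeatureAgglomeration(n_clusters=N)

    def fit(self, X, Y):
        # Y is accepted for interface compatibility; plain feature
        # agglomeration is unsupervised and ignores it.
        self._agglo.fit(X)
        return self

    def transform(self, X):
        # Mirror the (Xagglo, groups) pair unpacked in agglo_logit_calc:
        # the pooled matrix plus each original predictor's group label.
        return self._agglo.transform(X), self._agglo.labels_
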

# Hypothetical driver signature, inferred from the variables used below.
def main(X, Y, genres, nonmusic_subreddits, music_subreddits):
    """ Prepare the subreddit data and produce all of the plots. """
    X = X.tocsr()
    nonmusic_subreddits = np.array(nonmusic_subreddits, dtype=object)

    (X, Y, genres) = kill_outcome(X, Y, genres, 'classical')
    (X, Y, genres) = kill_outcome(X, Y, genres, 'electronic')

    # Delete those predictors I failed to exclude when I created the pickle.
    # Delete any predictors which are empty after killing outcomes
    (X, nonmusic_subreddits) = sanitise_predictors(X, nonmusic_subreddits,
                                                   music_subreddits)

    # Both pruning levels are needed: plot_LDA_histogram compares them below.
    (Xps1, Yps1) = prune_sparse_samples(X, Y, threshold=1)
    (Xps20, Yps20) = prune_sparse_samples(X, Y, threshold=20)

    (Xps1, Yps1) = balance_data(Xps1, Yps1)
    (Xps20, Yps20) = balance_data(Xps20, Yps20)

    ###########
    #  Plots  #
    ###########

    plot_LDA_histogram(Xps1, Xps20, Yps1, Yps20)

    plot_sparsity(Xps1, Yps1)

    plot_agglo_logit(Xps1, Yps1, nonmusic_subreddits)

    plot_RBM(Xps1, Yps1)

    graph_music_taste(Xps1, Yps1, nonmusic_subreddits)
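
# Hypothetical sketches of the two preprocessing helpers called above,
# reconstructed from their call sites; the project's own implementations
# live elsewhere and may differ in detail.
def prune_sparse_samples_sketch(X, Y, threshold=1):
    """ Hypothetical: keep samples with >= threshold nonzero predictors. """
    counts = np.asarray((X != 0).sum(axis=1)).ravel()
    keep = np.where(counts >= threshold)[0]
    return X[keep], Y[keep]


def balance_data_sketch(X, Y, seed=0):
    """ Hypothetical: downsample every outcome class to the smallest one. """
    rng = np.random.RandomState(seed)
    classes = np.unique(Y)
    n_min = min(np.sum(Y == c) for c in classes)
    keep = np.sort(np.concatenate([
        rng.choice(np.where(Y == c)[0], size=n_min, replace=False)
        for c in classes
    ]))
    return X[keep], Y[keep]
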
def plot_RBM(Xps1, Yps1):
    """ Produce a plot of RBM classification accuracy and model variation
    """

    ######################
    #  Stat/create data  #
    ######################

    n_lo = 10
    n_hi = 140

    N_range = range(n_lo, n_hi + 1, 10)
    rand = 0
    n_folds = 4
    BRBMs = get_BRBMs(Xps1, Yps1, N_range, rand, n_folds)

    (n_samples_1, _) = Xps1.shape
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    #################
    #  Test models  #
    #################

    logit = LogisticRegression()
    logit_score = [0.0 for _ in N_range]
    logit_score_20 = [0.0 for _ in N_range]
    params = np.empty([len(N_range), n_folds], dtype=object)
    logit_params = []

    logit_1 = 0.0
    logit_20 = 0.0

    for j_fold, (train, test) in enumerate(kf):

        # Stricter test set for this fold, as in agglo_logit_calc.
        (Xps20_test, Yps20_test) = prune_sparse_samples(Xps1[test], Yps1[test], threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)

        for i, N in enumerate(N_range):

            rbm = BRBMs[i][j_fold]

            Xps1_train_trans = rbm.transform(Xps1[train])

            logit.fit(Xps1_train_trans, Yps1[train])
            params[i][j_fold] = logit.coef_

            Xps1_test_trans = rbm.transform(Xps1[test])
            logit_score[i] += 100.0 * logit.score(Xps1_test_trans, Yps1[test]) / n_folds

            Xps20_test_trans = rbm.transform(Xps20_test)
            logit_score_20[i] += 100.0 * logit.score(Xps20_test_trans, Yps20_test) / n_folds

        # Baseline: logistic regression on the raw predictors, no RBM features.
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)
        logit_1 += (100.0 * logit.score(Xps1[test], Yps1[test])) / n_folds
        logit_20 += (100.0 * logit.score(Xps20_test, Yps20_test)) / n_folds

    ############################################
    #  Plot - subplot 1 - prediction accuracy  #
    ############################################

    plot_n_lo = 0
    plot_n_hi = n_hi

    snscol = sns.color_palette("Set1", n_colors=8, desat=0.5)

    labelfontsize = 16
    linewidth = 2

    fig = plt.figure(figsize=(10, 4.0))
    fig.add_subplot(121)
    plt.tight_layout(pad=2, w_pad=5)
    plt.title("Model accuracy", size=22)
    plt.xlabel("Number of hidden units", size=labelfontsize)
    plt.ylabel("Correct predictions (%)", size=labelfontsize)

    plt.plot(N_range, logit_score, label="RBM features", linewidth=linewidth, color=snscol[0])
    plt.plot(N_range, logit_score_20, label=u"RBM features (≥20 subreddits)", linewidth=linewidth, color=snscol[1])

    plt.plot(
        [plot_n_lo, plot_n_hi],
        [logit_1, logit_1],
        label="No RBM",
        linestyle=("dashed"),
        linewidth=linewidth,
        color=snscol[0],
    )
    plt.plot(
        [plot_n_lo, plot_n_hi],
        [logit_20, logit_20],
        label=u"No RBM (≥20 subreddits)",
        linestyle=("dashed"),
        linewidth=linewidth,
        color=snscol[1],
    )

    axes = plt.gca()
    axes.set_xlim(plot_n_lo, plot_n_hi)
    axes.set_ylim(60, 72)

    plt.legend(fontsize=12.5, loc=4)

    #######################################
    #  Plot - subplot 2 - Parameter RMSD  #
    #######################################

    fig.add_subplot(122)
    plt.title("Model instability", size=22)
    plt.xlabel("Number of hidden units", size=labelfontsize)
    plt.ylabel("Mean parameter fluctuations", size=labelfontsize)

    mrmsds = []

    # params is structured as:
    # params[k][j][i] = the ith model parameter of the jth model (jth fold of
    #                   the cross-validation) for the kth number of hidden units
    for param_sets in params:
        mrmsds.append(get_mrmsd(param_sets))

    mrmsd_logit = get_mrmsd(logit_params)

    plt.plot(N_range, mrmsds, linewidth=linewidth, color=snscol[2], label="RBM features")
    plt.plot(
        [plot_n_lo, plot_n_hi],
        [mrmsd_logit, mrmsd_logit],
        label="No RBM",
        linestyle=("dashed"),
        linewidth=linewidth,
        color=snscol[2],
    )

    plt.legend(fontsize=12.5, loc=1)
    axes = plt.gca()
    axes.set_xlim(plot_n_lo, plot_n_hi)
    axes.set_ylim(0.00, 0.45)

    plt.savefig("README_figs/RBMs_logit.svg")
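
# Hypothetical sketches of the two helpers plot_RBM leans on, reconstructed
# from their call sites; the project's own versions are defined elsewhere
# (get_BRBMs in particular may cache its fitted models).
from sklearn.neural_network import BernoulliRBM


def get_BRBMs_sketch(Xps1, Yps1, N_range, rand, n_folds):
    """ Hypothetical: one BernoulliRBM per (hidden-unit count, fold) pair,
        fit on that fold's training split and indexed as BRBMs[i][j_fold].
        Yps1 is unused here; it is kept only to match the call signature.
    """
    (n_samples, _) = Xps1.shape
    kf = KFold(n_samples, n_folds=n_folds, shuffle=True, random_state=rand)
    BRBMs = [[None] * n_folds for _ in N_range]
    for j_fold, (train, _) in enumerate(kf):
        for i, N in enumerate(N_range):
            BRBMs[i][j_fold] = BernoulliRBM(n_components=N,
                                            random_state=rand).fit(Xps1[train])
    return BRBMs


def get_mrmsd_sketch(param_sets):
    """ Hypothetical: mean over parameters of each coefficient's
        root-mean-square deviation across the cross-validation folds.
    """
    coefs = np.vstack([np.asarray(p).ravel() for p in param_sets])
    # std with ddof=0 is the RMS deviation about the per-parameter mean;
    # average it over all parameters.
    return np.mean(np.std(coefs, axis=0))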