def agglo_logit_calc(Xps1, Yps1, nonmusic_subreddits):
    """
    Handles fitting and scoring of the agglomeration->logistic regression
    machine learning scheme.
    """
    Xps1 = Xps1.toarray()
    logit = LogisticRegression()

    (n_samples_1, _) = Xps1.shape
    n_folds = 4
    rand = 0
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    logit_1 = 0.0
    logit_20 = 0.0

    n_lo = 1
    n_hi = 155
    step = 1
    n_groups_gen = range(n_lo, n_hi + 1, step)
    agglo_1s = [0.0 for _ in n_groups_gen]
    agglo_20s = [0.0 for _ in n_groups_gen]
    params = np.empty([len(n_groups_gen), n_folds], dtype=object)
    logit_params = []

    for i_fold, (train, test) in enumerate(kf):
        print(i_fold)

        # Baseline: logistic regression on the raw predictors.
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)
        logit_1 += 100.0 * logit.score(Xps1[test], Yps1[test])

        # Score the same baseline on the denser (>=20 subreddits) test subset.
        (Xps20_test, Yps20_test) = prune_sparse_samples(Xps1[test], Yps1[test],
                                                        threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)
        logit_20 += 100.0 * logit.score(Xps20_test, Yps20_test)

        # Agglomerate predictors into n_groups groups, then refit and score.
        for j, n_groups in enumerate(n_groups_gen):
            agglo = phi_agglomerate(N=n_groups).fit(Xps1[train], Yps1[train])
            Xagglo_train_1, _ = agglo.transform(Xps1[train])
            Xagglo_test_1, _ = agglo.transform(Xps1[test])
            Xagglo_test_20, _ = agglo.transform(Xps20_test)

            logit.fit(Xagglo_train_1, Yps1[train])
            params[j][i_fold] = logit.coef_
            agglo_1s[j] += 100.0 * logit.score(Xagglo_test_1, Yps1[test]) / n_folds
            agglo_20s[j] += 100.0 * logit.score(Xagglo_test_20, Yps20_test) / n_folds

    logit_1 /= n_folds
    logit_20 /= n_folds

    return (n_lo, n_hi, logit_1, logit_20, n_groups_gen,
            agglo_1s, agglo_20s, params, logit_params)
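

# `phi_agglomerate` is defined elsewhere in this repo (it appears to group
# predictors by their phi correlation coefficients before the logistic
# regression). The class below is NOT that implementation: it is a minimal,
# hypothetical stand-in with the same interface -- fit(X, Y), then
# transform(X) returning (reduced matrix, feature-group labels) -- built on
# unsupervised Ward clustering via sklearn's FeatureAgglomeration instead of
# a supervised phi-based grouping.
from sklearn.cluster import FeatureAgglomeration


class phi_agglomerate_sketch(object):
    """Hypothetical sketch: cluster correlated predictors into N groups and
    sum each group into a single column."""

    def __init__(self, N):
        self.N = N

    def fit(self, X, Y):
        # Y is accepted for interface compatibility only; this sketch
        # clusters the feature columns unsupervised, unlike the real
        # phi_agglomerate.
        self.agglo_ = FeatureAgglomeration(n_clusters=self.N).fit(X)
        self.labels_ = self.agglo_.labels_
        return self

    def transform(self, X):
        # Sum the columns belonging to each feature group.
        Xred = np.zeros((X.shape[0], self.N))
        for group in range(self.N):
            Xred[:, group] = X[:, self.labels_ == group].sum(axis=1)
        return Xred, self.labels_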
X = X.tocsr()
nonmusic_subreddits = array(nonmusic_subreddits, dtype=object)

(X, Y, genres) = kill_outcome(X, Y, genres, 'classical')
(X, Y, genres) = kill_outcome(X, Y, genres, 'electronic')

# Delete those predictors I failed to exclude when I created the pickle.
# Delete any predictors which are empty after killing outcomes.
(X, nonmusic_subreddits) = sanitise_predictors(X, nonmusic_subreddits,
                                               music_subreddits)

(Xps1, Yps1) = prune_sparse_samples(X, Y, threshold=1)
(Xps20, Yps20) = prune_sparse_samples(X, Y, threshold=20)
(Xps1, Yps1) = balance_data(Xps1, Yps1)
(Xps20, Yps20) = balance_data(Xps20, Yps20)  # needed by plot_LDA_histogram

###########
#  Plots  #
###########
plot_LDA_histogram(Xps1, Xps20, Yps1, Yps20)
plot_sparsity(Xps1, Yps1)
plot_agglo_logit(Xps1, Yps1, nonmusic_subreddits)
plot_RBM(Xps1, Yps1)
graph_music_taste(Xps1, Yps1, nonmusic_subreddits)
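

# `prune_sparse_samples` and `balance_data` are defined elsewhere in the
# repo. The sketches below are hypothetical minimal versions inferred from
# the call sites: pruning is assumed to drop users who posted in fewer than
# `threshold` predictor subreddits, and balancing is assumed to undersample
# every class down to the size of the rarest class.
def prune_sparse_samples_sketch(X, Y, threshold=1):
    # Keep only samples with at least `threshold` nonzero predictors.
    # Works for dense arrays and scipy.sparse matrices alike.
    n_nonzero = np.asarray((X != 0).sum(axis=1)).ravel()
    mask = n_nonzero >= threshold
    return X[mask], Y[mask]


def balance_data_sketch(X, Y, random_state=0):
    # Undersample each class to the size of the rarest class.
    rng = np.random.RandomState(random_state)
    labels = np.unique(Y)
    n_min = min(np.sum(Y == label) for label in labels)
    keep = []
    for label in labels:
        idx = np.where(Y == label)[0]
        keep.extend(rng.choice(idx, size=n_min, replace=False))
    keep = np.sort(np.array(keep))
    return X[keep], Y[keep]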
def plot_RBM(Xps1, Yps1):
    """
    Produce a plot of RBM classification accuracy and model variation
    """
    ######################
    #  Stat/create data  #
    ######################
    n_lo = 10
    n_hi = 140
    N_range = range(n_lo, n_hi + 1, 10)
    rand = 0
    n_folds = 4
    BRBMs = get_BRBMs(Xps1, Yps1, N_range, rand, n_folds)

    (n_samples_1, n_features) = Xps1.shape
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    #################
    #  Test models  #
    #################
    logit = LogisticRegression()
    logit_score = [0.0 for i in N_range]
    logit_score_20 = [0.0 for i in N_range]
    params = np.empty([len(N_range), n_folds], dtype=object)
    logit_params = []
    logit_1 = 0.0
    logit_20 = 0.0

    for j_fold, (train, test) in enumerate(kf):
        (Xps20_test, Yps20_test) = prune_sparse_samples(Xps1[test], Yps1[test],
                                                        threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)

        # Logistic regression on RBM-transformed features, one RBM per N.
        for i, N in enumerate(N_range):
            rbm = BRBMs[i][j_fold]
            Xps1_train_trans = rbm.transform(Xps1[train])
            logit.fit(Xps1_train_trans, Yps1[train])
            params[i][j_fold] = logit.coef_

            Xps1_test_trans = rbm.transform(Xps1[test])
            logit_score[i] += 100.0 * logit.score(Xps1_test_trans, Yps1[test]) / n_folds
            Xps20_test_trans = rbm.transform(Xps20_test)
            logit_score_20[i] += 100.0 * logit.score(Xps20_test_trans, Yps20_test) / n_folds

        # Baseline: logistic regression on the raw predictors.
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)
        logit_1 += (100.0 * logit.score(Xps1[test], Yps1[test])) / n_folds
        logit_20 += (100.0 * logit.score(Xps20_test, Yps20_test)) / n_folds

    ############################################
    #  Plot - subplot 1 - prediction accuracy  #
    ############################################
    plot_n_lo = 0
    plot_n_hi = n_hi
    snscol = sns.color_palette("Set1", n_colors=8, desat=0.5)
    labelfontsize = 16
    linewidth = 2

    fig = plt.figure(figsize=(10, 4.0))
    fig.add_subplot(121)
    plt.tight_layout(pad=2, w_pad=5)
    plt.title("Model accuracy", size=22)
    plt.xlabel("Number of hidden units", size=labelfontsize)
    plt.ylabel("Correct predictions (%)", size=labelfontsize)
    plt.plot(N_range, logit_score, label="RBM features",
             linewidth=linewidth, color=snscol[0])
    plt.plot(N_range, logit_score_20, label=u"RBM features (≥20 subreddits)",
             linewidth=linewidth, color=snscol[1])
    plt.plot([plot_n_lo, plot_n_hi], [logit_1, logit_1], label="No RBM",
             linestyle="dashed", linewidth=linewidth, color=snscol[0])
    plt.plot([plot_n_lo, plot_n_hi], [logit_20, logit_20],
             label=u"No RBM (≥20 subreddits)", linestyle="dashed",
             linewidth=linewidth, color=snscol[1])
    axes = plt.gca()
    axes.set_xlim(plot_n_lo, plot_n_hi)
    axes.set_ylim(60, 72)
    plt.legend(fontsize=12.5, loc=4)

    #######################################
    #  Plot - subplot 2 - Parameter RMSD  #
    #######################################
    fig.add_subplot(122)
    plt.title("Model instability", size=22)
    plt.xlabel("Number of hidden units", size=labelfontsize)
    plt.ylabel("Mean parameter fluctuations", size=labelfontsize)

    # params is structured as:
    # params[k][j][i] = the ith model parameter of the jth model (jth fold in
    # the cross-validation) for the kth number of hidden units
    mrmsds = []
    for k, param_sets in enumerate(params):
        mrmsd = get_mrmsd(param_sets)
        mrmsds.append(mrmsd)
    mrmsd_logit = get_mrmsd(logit_params)

    plt.plot(N_range, mrmsds, linewidth=linewidth, color=snscol[2],
             label="RBM features")
    plt.plot([plot_n_lo, plot_n_hi], [mrmsd_logit, mrmsd_logit],
             label="No RBM", linestyle="dashed", linewidth=linewidth,
             color=snscol[2])
    plt.legend(fontsize=12.5, loc=1)
    axes = plt.gca()
    axes.set_xlim(plot_n_lo, plot_n_hi)
    axes.set_ylim(0.00, 0.45)
    plt.savefig("README_figs/RBMs_logit.svg")
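

# `get_BRBMs` and `get_mrmsd` are defined elsewhere in the repo. Below are
# hypothetical minimal sketches inferred from how they are used above:
# get_BRBMs is assumed to pre-fit one BernoulliRBM per (hidden-unit count,
# CV fold) pair on the training folds, and get_mrmsd is assumed to return
# the per-parameter root-mean-square deviation of the fitted coefficients
# across folds, averaged over parameters.
def get_BRBMs_sketch(X, Y, N_range, rand, n_folds):
    # One unsupervised BernoulliRBM per (n_components, fold) pair, indexed
    # as BRBMs[i][j_fold] to match the loop in plot_RBM above. Y is accepted
    # for interface compatibility; the RBM fit itself is unsupervised.
    from sklearn.neural_network import BernoulliRBM
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True, random_state=rand)
    rbms = []
    for N in N_range:
        row = []
        for train, _ in kf:
            row.append(BernoulliRBM(n_components=N, random_state=rand).fit(X[train]))
        rbms.append(row)
    return rbms


def get_mrmsd_sketch(param_sets):
    # Stack each fold's coefficient vector, take the RMS deviation of every
    # parameter about its cross-fold mean, then average over parameters.
    P = np.array([np.ravel(p) for p in param_sets])
    return np.mean(np.sqrt(np.mean((P - P.mean(axis=0)) ** 2, axis=0)))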