Example #1
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit several DPGMMs and keep the run with the lowest BIC."""
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None
    for i in range(n_init):
        # vary the seed per restart; a fixed seed would make every restart identical
        seed = None if rand_state is None else rand_state + i
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, n_iter=iters, random_state=seed)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))
    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
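
# A minimal usage sketch (assumes the deprecated sklearn.mixture.DPGMM from
# scikit-learn <= 0.19; the data and parameter values below are illustrative):
import numpy as np
from sklearn.mixture import DPGMM  # removed in scikit-learn >= 0.20

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + 5])  # two blobs

labels, probs, bic, log_prob = get_best_dpgmm(
    X_demo, num_c=8, cv_type='diag', alpha=1.0, iters=50, n_init=3,
    rand_state=0)
print(bic, np.unique(labels))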
axes[2].imshow(feats_log_normed,  # head of this call was truncated; restored from the duplicate cell below
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)
fig.tight_layout()

# Clustering with DP-GMM
n_components = 32
dpgmm = DPGMM(n_components=n_components,
              tol=1e-3,
              n_iter=32,
              alpha=1000,
              covariance_type='diag',
              verbose=True)
dpgmm.fit(feats_log.T)
preds_proba = dpgmm.predict_proba(feats_log.T)
preds = np.argmax(preds_proba, axis=1)
np.unique(preds)
# resynthesis: replace each frame with its cluster mean
resynthesis = dpgmm.means_[preds.astype(int), :]
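
# Note: DPGMM was removed in scikit-learn >= 0.20. A rough equivalent of the
# block above is BayesianGaussianMixture with a Dirichlet-process prior
# (a sketch, not an exact drop-in; the variational algorithms differ):
from sklearn.mixture import BayesianGaussianMixture

bgmm = BayesianGaussianMixture(
    n_components=32,
    covariance_type='diag',
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=1000,  # plays the role of alpha above
    max_iter=32,
    tol=1e-3,
    verbose=1)
bgmm.fit(feats_log.T)
bgmm_preds = bgmm.predict(feats_log.T)
bgmm_resynthesis = bgmm.means_[bgmm_preds, :]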

fig, axes = plt.subplots(4, 1, figsize=(18, 8))
axes[0].set_title(feature)
axes[1].set_title('Prediction Probability')
axes[2].set_title('Resynthesis')
axes[3].set_title('Max(Prediction Probability)')

axes[0].imshow(feats_log,
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)

def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    feat = {}
    print ('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
           'n_singv {}, scale {} normalize {}, round_to {}'.format(
               fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
               round_to))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)

    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(
        dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(
        feat['{}-LPF'.format(feature)], cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(dim_red,
        feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))

    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print "\tPlot data and pre-processing"
    for name in (feature,
                 '{}-LPF'.format(feature),
                 '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red),
                 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        data_wide = np.array([feat[name][m:m + length, :]
                              for m in range(len(feat[name]) - length)])
        data_wide = data_wide.reshape(
            data_wide.shape[0], data_wide.shape[1]*data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            K_MIN, K_MAX = 2, 16
            KM = [KMeans(n_clusters=l, init='k-means++').fit(data_wide)
                  for l in range(K_MIN, K_MAX + 1)]

            # compute scores to assess fit
            scores_bic = [computeBic(km, data_wide) for km in KM]
            scores_inertia = [km.inertia_ for km in KM]
            scores_silhouette = [silhouette_score(data_wide, km.labels_,
                                                  metric='euclidean')
                                 for km in KM]

            # get best clusters
            ks = np.arange(K_MIN, K_MAX + 1)
            idx_best_bic = findElbow(np.dstack((ks, scores_bic))[0])
            idx_best_inertia = findElbow(np.dstack((ks, scores_inertia))[0])
            idx_best_silhouette = findElbow(np.dstack((ks, scores_silhouette))[0])
            idx_best = int(np.median(
                (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        elif clustering == 'DPGMM':
            n_components = 12
            dpgmm = DPGMM(
                n_components=n_components, tol=1e-3, n_iter=32, alpha=1000,
                covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # compute scores to assess fit; silhouette on the centroids is not
            # meaningful, so keep a placeholder
            scores_bic = dpgmm.bic(data_wide)
            scores_silhouette = [0.0]

            # get clusters and cluster allocations; DPGMM keeps all
            # n_components, so k_best is an upper bound on the effective K
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T

        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = ["{}:{}".format(int(x / 60), int(x % 60))
                   for x in beat_times[::step_size]]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in range(changes.shape[0]-1):
            if changes[c] and changes[c+1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i+1, 0])
        ax.set_title(name)

        if centroids.shape[1] == 3:
            centroids = centroids.reshape(
                1, centroids.shape[0], centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(
                1, centroids.shape[0] * length, centroids.shape[1] // length)
        else:
            centroids = centroids.reshape(
                centroids.shape[0] * length,
                centroids.shape[1] // length).T
        ax.imshow(centroids,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(range(0, centroids.shape[1], 4))
        ax.set_xticklabels(range(k_best))
        ax.grid(False)

        # plot elbow curves (only meaningful for the KMEANS codebook)
        if clustering == 'KMEANS':
            c = 1
            for k, v, idx in (('BIC', scores_bic, idx_best_bic),
                              ('INERTIA', scores_inertia, idx_best_inertia),
                              ('SILHOUETTE', scores_silhouette,
                               idx_best_silhouette)):
                ax = fig.add_subplot(gs[i+1, c])
                ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
                ax.plot(range(K_MIN, K_MAX + 1), v, 'b*-')
                ax.set_xlim((K_MIN, K_MAX + 1))
                ax.set_xlabel('Number of clusters')
                ax.set_ylabel('Score')
                ax.grid(True)
                ax.axvline(idx + K_MIN, color='r')
                c += 1
        i += 2

        """
        if 'SVD' in name:
            # scikit-image clustering
            segments_slic = slic(
                data, n_segments=10, compactness=10, sigma=1)
            segments_quickshift = quickshift(
                data, kernel_size=3, max_dist=6, ratio=0.5)

            ax = fig.add_subplot(gs[k, 0])
            ax.set_title('{} with quickshift'.format(name))
            ax.imshow(mark_boundaries(data, segments_quickshift, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)

            ax = fig.add_subplot(gs[k, 1])

            ax.set_title('{} with slic'.format(name))
            ax.imshow(mark_boundaries(data, segments_slic, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)
            k += 1
        """

    plt.tight_layout()

    # save with large size
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length, dim_red))
    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length, dim_red))

    plt.close(fig)
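
# plotClustering relies on two helpers that are not shown in this example:
# findElbow and computeBic. One plausible findElbow, assuming it receives the
# (K, score) pairs built above and returns the index of the point farthest
# from the straight line joining the first and last points:
def findElbow(points):
    """Index of the elbow: the point with the largest perpendicular
    distance to the line through the two endpoints."""
    points = np.asarray(points, dtype=float)
    p1, p2 = points[0], points[-1]
    direction = (p2 - p1) / np.linalg.norm(p2 - p1)
    vecs = points - p1
    # remove the component parallel to the endpoint-to-endpoint line
    proj = np.outer(vecs.dot(direction), direction)
    dists = np.linalg.norm(vecs - proj, axis=1)
    return int(np.argmax(dists))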
# train and test are pandas DataFrames loaded earlier in the notebook
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:
# IDs for the train and test rows, matching the concatenated predictions below
df = pd.DataFrame({"SK_ID_CURR": pd.concat([train['SK_ID_CURR'],
                                            test['SK_ID_CURR']],
                                           ignore_index=True)})

print('dirichlet process gaussian mixture begins****************')
dpgmm = DPGMM(n_components=3)
print('fitting****************')
dpgmm.fit(X)  # unsupervised: the target y is not used
print('predicting on train****************')
dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1]
print('predicting on test****************')
dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1]
tr_te_concatenated = np.concatenate(
    [dpgmm_X_prediction, dpgmm_X_test_prediction])
df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated

print('final tr_te shape', df.shape)
print(df.head())

df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)

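# On scikit-learn >= 0.20 the same feature can be built with
# BayesianGaussianMixture; a sketch, assuming the Dirichlet-process behavior
# of DPGMM is what was intended:
from sklearn.mixture import BayesianGaussianMixture

bgmm = BayesianGaussianMixture(
    n_components=3, weight_concentration_prior_type='dirichlet_process')
bgmm.fit(X)
tr_te_bgmm = np.concatenate(
    [bgmm.predict_proba(X)[:, 1], bgmm.predict_proba(X_test)[:, 1]])
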
Example #5
def plotClustering(fullpath,
                   order=1,
                   sr=4,
                   cutoff=.1,
                   n_singv=3,
                   feature='chroma',
                   dim_red='SVD',
                   round_to=0,
                   normalize=1,
                   scale=1,
                   length=4,
                   clustering='KMEANS'):
    feat = {}
    print(
        'Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
        'n_singv {}, scale {} normalize {}, round_to {}'.format(
            fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
            round_to))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(filename,
                                               file_ext,
                                               feature,
                                               scale,
                                               round_to,
                                               normalize,
                                               beat_sync=True,
                                               save=True)

    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(feat['{}-LPF'.format(feature)],
                                              cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))

    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print "\tPlot data and pre-processing"
    for name in (feature, '{}-LPF'.format(feature), '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red), 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        data_wide = np.array([
            feat[name][m:m + length, :]
            for m in range(len(feat[name]) - length)
        ])
        data_wide = data_wide.reshape(data_wide.shape[0],
                                      data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            K_MIN, K_MAX = 2, 16
            KM = [
                KMeans(n_clusters=l, init='k-means++').fit(data_wide)
                for l in range(K_MIN, K_MAX + 1)
            ]

            # compute scores to assess fit
            scores_bic = [computeBic(km, data_wide) for km in KM]
            scores_inertia = [km.inertia_ for km in KM]
            scores_silhouette = [
                silhouette_score(data_wide, km.labels_, metric='euclidean')
                for km in KM
            ]

            # get best clusters
            ks = np.arange(K_MIN, K_MAX + 1)
            idx_best_bic = findElbow(np.dstack((ks, scores_bic))[0])
            idx_best_inertia = findElbow(np.dstack((ks, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((ks, scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        elif clustering == 'DPGMM':
            n_components = 12
            dpgmm = DPGMM(n_components=n_components,
                          tol=1e-3,
                          n_iter=32,
                          alpha=1000,
                          covariance_type='diag',
                          verbose=True)
            dpgmm.fit(data_wide)

            # compute scores to assess fit; silhouette on the centroids is not
            # meaningful, so keep a placeholder
            scores_bic = dpgmm.bic(data_wide)
            scores_silhouette = [0.0]

            # get clusters and cluster allocations; DPGMM keeps all
            # n_components, so k_best is an upper bound on the effective K
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T

        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = [
            "{}:{:02d}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in range(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i + 1, 0])
        ax.set_title(name)

        if centroids.shape[1] == 3:
            centroids = centroids.reshape(1, centroids.shape[0],
                                          centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(1, centroids.shape[0] * length,
                                          centroids.shape[1] // length)
        else:
            centroids = centroids.reshape(centroids.shape[0] * length,
                                          centroids.shape[1] // length).T
        ax.imshow(centroids,
                  interpolation='nearest',
                  origin='lower',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        ax.set_xticks(range(0, centroids.shape[1], 4))
        ax.set_xticklabels(range(k_best))
        ax.grid(False)

        # plot elbow curves (only meaningful for the KMEANS codebook)
        if clustering == 'KMEANS':
            c = 1
            for k, v, idx in (('BIC', scores_bic, idx_best_bic),
                              ('INERTIA', scores_inertia, idx_best_inertia),
                              ('SILHOUETTE', scores_silhouette,
                               idx_best_silhouette)):
                ax = fig.add_subplot(gs[i + 1, c])
                ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
                ax.plot(range(K_MIN, K_MAX + 1), v, 'b*-')
                ax.set_xlim((K_MIN, K_MAX + 1))
                ax.set_xlabel('Number of clusters')
                ax.set_ylabel('Score')
                ax.grid(True)
                ax.axvline(idx + K_MIN, color='r')
                c += 1
        i += 2
        """
        if 'SVD' in name:
            # scikit-image clustering
            segments_slic = slic(
                data, n_segments=10, compactness=10, sigma=1)
            segments_quickshift = quickshift(
                data, kernel_size=3, max_dist=6, ratio=0.5)

            ax = fig.add_subplot(gs[k, 0])
            ax.set_title('{} with quickshift'.format(name))
            ax.imshow(mark_boundaries(data, segments_quickshift, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)

            ax = fig.add_subplot(gs[k, 1])

            ax.set_title('{} with slic'.format(name))
            ax.imshow(mark_boundaries(data, segments_slic, mode='outer'),
                      interpolation='nearest',
                      origin='low',
                      aspect='auto',
                      cmap=plt.cm.Oranges)
            ax.set_xticks(ts[::step_size])
            ax.set_xticklabels(beat_times[::step_size], rotation=60)
            ax.grid(False)
            k += 1
        """

    plt.tight_layout()

    # save with large size
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))
    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    plt.close(fig)
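
# The other helper, computeBic, is also not part of this excerpt. A common
# sketch of a BIC-style score for a fitted KMeans model, assuming spherical
# Gaussian clusters with a shared variance (an illustration, not the
# repository's exact implementation):
def computeBic(kmeans, X):
    n, d = X.shape
    k = kmeans.n_clusters
    # pooled within-cluster variance; inertia_ is the sum of squared distances
    denom = (n - k) * d
    variance = kmeans.inertia_ / denom if denom > 0 else 1e-12
    log_likelihood = (-0.5 * n * d * np.log(2 * np.pi * variance)
                      - 0.5 * kmeans.inertia_ / variance)
    n_params = k * d + 1  # centroid coordinates plus the shared variance
    return -2 * log_likelihood + n_params * np.log(n)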
# (truncated plotting cell: only the last two panels of this figure survive)
axes[1].imshow(feats_log,
               aspect='auto', origin='lower', interpolation='nearest',
               cmap=plt.cm.plasma)
axes[2].imshow(feats_log_normed,
               aspect='auto', origin='lower', interpolation='nearest',
               cmap=plt.cm.plasma)
fig.tight_layout()


# Clustering with DP-GMM
n_components = 32
dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000,
              covariance_type='diag', verbose=True)
dpgmm.fit(feats_log.T)
preds_proba = dpgmm.predict_proba(feats_log.T)
preds = np.argmax(preds_proba, axis=1)
np.unique(preds)
# resynthesis: replace each frame with its cluster mean
resynthesis = dpgmm.means_[preds.astype(int), :]

fig, axes = plt.subplots(4, 1, figsize=(18, 8))
axes[0].set_title(feature)
axes[1].set_title('Prediction Probability')
axes[2].set_title('Resynthesis')
axes[3].set_title('Max(Prediction Probability)')

axes[0].imshow(feats_log,
               aspect='auto', origin='lower', interpolation='nearest',
               cmap=plt.cm.plasma)
axes[1].imshow(preds_proba.T,