def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit a DPGMM ``n_init`` times on ``X`` and keep the fit with the lowest BIC.

    Parameters
    ----------
    X : array
        Data handed to ``DPGMM.fit``/``bic``/``predict``/``predict_proba``/
        ``score``. ``X.shape[0]`` sizes the fallback output vectors.
    num_c : int
        Forwarded to ``DPGMM`` as ``n_components``.
    cv_type : str
        Forwarded to ``DPGMM`` as ``covariance_type``.
    alpha : float
        Forwarded to ``DPGMM`` as ``alpha``.
    iters : int
        Accepted for interface compatibility; it is not forwarded to the
        model. NOTE(review): presumably meant as the model's iteration
        count -- confirm before wiring it in, since that changes results.
    n_init : int
        Number of independent fits to try.
    rand_state : optional
        Forwarded to ``DPGMM`` as ``random_state`` for every fit.
        NOTE(review): if this is a fixed integer seed every restart is
        probably identical -- confirm against the DPGMM implementation.

    Returns
    -------
    list
        ``[labels, probabilities, bic, log_prob]`` of the best fit. When no
        fit is selected (``n_init == 0`` or every BIC is NaN/inf) the first
        two entries are zero vectors of length ``X.shape[0]`` and the last
        two are ``None``.
    """
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None

    for _ in range(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            # Remember the running minimum; without this every fit would
            # compare against +inf and the last one would always win.
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))

    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) fig.tight_layout() # Clustering with DP-GMM n_components = 32 dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000, covariance_type='diag', verbose=True) dpgmm.fit(feats_log.T) preds_proba = dpgmm.predict_proba(feats_log.T) preds = np.argmax(preds_proba, axis=1) np.unique(preds) # resynthesis by sampling from clusters resynthesis = dpgmm.means_[preds.astype(int), :] fig, axes = plt.subplots(4, 1, figsize=(18, 8)) axes[0].set_title(feature) axes[1].set_title('Prediction Probability') axes[2].set_title('Resynthesis') axes[3].set_title('Max(Prediction Probability)') axes[0].imshow(feats_log, aspect='auto', origin='low', interpolation='nearest',
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    """Cluster sliding windows of several feature variants and save the plots.

    For the raw feature and five filtered / dimension-reduced variants of it,
    windows of ``length`` consecutive frames are flattened and clustered.
    Each variant gets two grid rows: the data with cluster-change markers on
    a twin axis, and below it the codebook (centroids) plus, for K-means,
    the BIC / inertia / silhouette curves over the K sweep.

    Parameters
    ----------
    fullpath : str
        Path of the input file. It is split with ``os.path.splitext``; the
        stem is passed to ``extractFeature`` and prefixes the output PNGs.
    order, sr, cutoff :
        Forwarded to ``lpf``.
    n_singv :
        Forwarded to ``dim_red_fn``; also used to recognise centroids whose
        width equals ``n_singv * length``.
    feature, scale, round_to, normalize :
        Forwarded to ``extractFeature``.
    dim_red :
        Forwarded to ``dim_red_fn`` and used in the variant names.
    length : int
        Number of consecutive frames per clustering window.
    clustering : {'KMEANS', 'DPGMM'}
        Codebook method. K-means sweeps K from 2 to 16 and picks K from the
        elbows of the three score curves; DPGMM fits a single 12-component
        model and therefore draws no score curves.

    Returns
    -------
    None
        The figure is written to ``<stem>_clustering_..._<dim_red>.png`` and
        a smaller ``..._small.png`` copy, then closed.

    Raises
    ------
    ValueError
        If ``clustering`` is not ``'KMEANS'`` or ``'DPGMM'``.
    """
    # Fail before any feature extraction instead of with a NameError deep
    # inside the plotting loop.
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    K_MIN, K_MAX = 2, 16
    k_values = list(range(K_MIN, K_MAX + 1))

    feat = {}
    print('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
          'n_singv {}, scale {} normalize {}, round_to {}'.format(
              fullpath, feature, order, sr, cutoff, n_singv, scale,
              normalize, round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # names of the derived feature variants
    residual = '{}-LPF'.format(feature)
    red_of_lpf = '{}(LPF)'.format(dim_red)
    lpf_of_red = 'LPF({})'.format(dim_red)
    lpf_of_residual = 'LPF({}-LPF)'.format(feature)
    red_of_lpf_of_residual = '{}(LPF({}-LPF))'.format(dim_red, feature)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat[red_of_lpf] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat[lpf_of_red] = lpf(feat[dim_red], cutoff, sr, order)
    feat[residual] = feat[feature] - feat['LPF']
    feat[lpf_of_residual] = lpf(feat[residual], cutoff, sr, order)
    feat[red_of_lpf_of_residual] = dim_red_fn(
        dim_red, feat[lpf_of_residual], n_singv)

    names = (feature, residual, red_of_lpf, lpf_of_red, lpf_of_residual,
             red_of_lpf_of_residual)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    # two grid rows per variant
    gs = mpl.gridspec.GridSpec(2 * len(names), 4, width_ratios=[1, 1, 1, 1])

    print("\tPlot data and pre-processing")
    for plot_idx, name in enumerate(names):
        row = 2 * plot_idx
        data = feat[name]

        # stack `length` consecutive frames into one flat vector per window
        data_wide = np.array([data[m:m + length, :]
                              for m in range(len(data) - length)])
        data_wide = data_wide.reshape(
            data_wide.shape[0], data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            models = [KMeans(n_clusters=k, init='k-means++').fit(data_wide)
                      for k in k_values]

            # compute scores to assess fit
            scores_bic = [computeBic(m, data_wide) for m in models]
            scores_inertia = [m.inertia_ for m in models]
            scores_silhouette = [
                silhouette_score(data_wide, m.labels_, metric='euclidean')
                for m in models]

            # get best clusters
            idx_best_bic = findElbow(np.dstack((k_values, scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((k_values, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((k_values, scores_silhouette))[0])
            idx_best = int(np.median(
                (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1
            # the "+ 1" can step past the last fitted model; stay in range
            idx_best = min(idx_best, len(models) - 1)

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = models[idx_best].cluster_centers_
            centroid_idx = models[idx_best].labels_
            elbow_curves = (
                ('BIC', scores_bic, idx_best_bic),
                ('INERTIA', scores_inertia, idx_best_inertia),
                ('SILHOUETTE', scores_silhouette, idx_best_silhouette),
            )
        else:  # 'DPGMM'
            n_components = 12
            dpgmm = DPGMM(
                n_components=n_components, tol=1e-3, n_iter=32, alpha=1000,
                covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # get clusters and cluster allocations
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
            # a single model is fitted, so there is no K sweep to plot
            elbow_curves = ()

        # plot data
        if data.shape[1] == 3:
            image = data.reshape(1, data.shape[0], data.shape[1])
        else:
            image = data.T
        ax = fig.add_subplot(gs[row, :])
        ax.set_title(name)
        ax.imshow(image, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        # NOTE(review): the minutes:seconds labels assume beat_times is in
        # seconds -- confirm against extractFeature.
        xlabels = ["{}:{}".format(int(x / 60), int(x % 60))
                   for x in beat_times[::step_size]]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature: mark positions where the cluster
        # label changes, dropping a mark when the next position is marked too
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        changes[:-1] &= ~changes[1:]
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[row + 1, 0])
        ax.set_title(name)
        n_centroids = centroids.shape[0]
        # floor division keeps the reshape dimensions integral
        if centroids.shape[1] == 3:
            codebook = centroids.reshape(1, n_centroids, centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            codebook = centroids.reshape(
                1, n_centroids * length, centroids.shape[1] // length)
        else:
            codebook = centroids.reshape(
                n_centroids * length, centroids.shape[1] // length).T
        ax.imshow(codebook, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        # one tick per centroid, so tick and label counts always match
        cols_per_centroid = max(1, codebook.shape[1] // n_centroids)
        ax.set_xticks(list(range(0, codebook.shape[1], cols_per_centroid)))
        ax.set_xticklabels(list(range(k_best)))
        ax.grid(False)

        # plot elbow curve
        for col, (label, scores, idx) in enumerate(elbow_curves, 1):
            ax = fig.add_subplot(gs[row + 1, col])
            ax.set_title('{}, {} best K {}'.format(name, label, idx + K_MIN))
            ax.plot(k_values, scores, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')

    out_stem = "{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red)

    # save with large size
    plt.tight_layout()
    plt.savefig(out_stem + ".png")

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig(out_stem + "_small.png")
    plt.close(fig)
# Columns 2 onward of each frame become the model inputs; column 1 of the
# training frame becomes ``y`` (cast to int).
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1].astype('int')

test_dataset = test.values
X_test = test_dataset[:, 2:]

print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:

# Start the output frame from the id column only.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

print('dirichlet process gaussian mixture begins****************')
dpgmm = DPGMM(n_components=3)

print('fitting****************')
dpgmm_train = dpgmm.fit(X, y)

# Column 1 of predict_proba is kept for the train rows, then the test rows.
print('predicting on train****************')
dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1]

print('predicting on test****************')
dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1]

# Train rows first, test rows second.
# NOTE(review): this assumes df lists the train ids followed by the test ids
# -- confirm against the cell that builds df.
tr_te_concatenated = np.concatenate(
    (dpgmm_X_prediction, dpgmm_X_test_prediction), axis=0)
df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated

print('final tr_te shape', df.shape)
print(df.head())

df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)
print(df.head())
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    """Cluster sliding windows of several feature variants and save the plots.

    For the raw feature and five filtered / dimension-reduced variants of it,
    windows of ``length`` consecutive frames are flattened and clustered.
    Each variant gets two grid rows: the data with cluster-change markers on
    a twin axis, and below it the codebook (centroids) plus, for K-means,
    the BIC / inertia / silhouette curves over the K sweep.

    Parameters
    ----------
    fullpath : str
        Path of the input file. It is split with ``os.path.splitext``; the
        stem is passed to ``extractFeature`` and prefixes the output PNGs.
    order, sr, cutoff :
        Forwarded to ``lpf``.
    n_singv :
        Forwarded to ``dim_red_fn``; also used to recognise centroids whose
        width equals ``n_singv * length``.
    feature, scale, round_to, normalize :
        Forwarded to ``extractFeature``.
    dim_red :
        Forwarded to ``dim_red_fn`` and used in the variant names.
    length : int
        Number of consecutive frames per clustering window.
    clustering : {'KMEANS', 'DPGMM'}
        Codebook method. K-means sweeps K from 2 to 16 and picks K from the
        elbows of the three score curves; DPGMM fits a single 12-component
        model and therefore draws no score curves.

    Returns
    -------
    None
        The figure is written to ``<stem>_clustering_..._<dim_red>.png`` and
        a smaller ``..._small.png`` copy, then closed.

    Raises
    ------
    ValueError
        If ``clustering`` is not ``'KMEANS'`` or ``'DPGMM'``.
    """
    # Reject an unsupported method up front; otherwise the failure is a
    # NameError raised only after feature extraction has already run.
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    K_MIN, K_MAX = 2, 16
    k_values = list(range(K_MIN, K_MAX + 1))

    feat = {}
    print('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
          'n_singv {}, scale {} normalize {}, round_to {}'.format(
              fullpath, feature, order, sr, cutoff, n_singv, scale,
              normalize, round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # names of the derived feature variants
    residual = '{}-LPF'.format(feature)
    red_of_lpf = '{}(LPF)'.format(dim_red)
    lpf_of_red = 'LPF({})'.format(dim_red)
    lpf_of_residual = 'LPF({}-LPF)'.format(feature)
    red_of_lpf_of_residual = '{}(LPF({}-LPF))'.format(dim_red, feature)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(filename, file_ext, feature,
                                               scale, round_to, normalize,
                                               beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat[red_of_lpf] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat[lpf_of_red] = lpf(feat[dim_red], cutoff, sr, order)
    feat[residual] = feat[feature] - feat['LPF']
    feat[lpf_of_residual] = lpf(feat[residual], cutoff, sr, order)
    feat[red_of_lpf_of_residual] = dim_red_fn(
        dim_red, feat[lpf_of_residual], n_singv)

    names = (feature, residual, red_of_lpf, lpf_of_red, lpf_of_residual,
             red_of_lpf_of_residual)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    # two grid rows per variant
    gs = mpl.gridspec.GridSpec(2 * len(names), 4, width_ratios=[1, 1, 1, 1])

    print("\tPlot data and pre-processing")
    for plot_idx, name in enumerate(names):
        row = 2 * plot_idx
        data = feat[name]

        # stack `length` consecutive frames into one flat vector per window
        data_wide = np.array([
            data[m:m + length, :] for m in range(len(data) - length)
        ])
        data_wide = data_wide.reshape(data_wide.shape[0],
                                      data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            models = [
                KMeans(n_clusters=k, init='k-means++').fit(data_wide)
                for k in k_values
            ]

            # compute scores to assess fit
            scores_bic = [computeBic(m, data_wide) for m in models]
            scores_inertia = [m.inertia_ for m in models]
            scores_silhouette = [
                silhouette_score(data_wide, m.labels_, metric='euclidean')
                for m in models
            ]

            # get best clusters
            idx_best_bic = findElbow(np.dstack((k_values, scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((k_values, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((k_values, scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia,
                     idx_best_silhouette))) + 1
            # the "+ 1" can step past the last fitted model; stay in range
            idx_best = min(idx_best, len(models) - 1)

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = models[idx_best].cluster_centers_
            centroid_idx = models[idx_best].labels_
            elbow_curves = (
                ('BIC', scores_bic, idx_best_bic),
                ('INERTIA', scores_inertia, idx_best_inertia),
                ('SILHOUETTE', scores_silhouette, idx_best_silhouette),
            )
        else:  # 'DPGMM'
            n_components = 12
            dpgmm = DPGMM(n_components=n_components,
                          tol=1e-3,
                          n_iter=32,
                          alpha=1000,
                          covariance_type='diag',
                          verbose=True)
            dpgmm.fit(data_wide)

            # get clusters and cluster allocations
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)
            # a single model is fitted, so there is no K sweep to plot
            elbow_curves = ()

        # plot data
        if data.shape[1] == 3:
            image = data.reshape(1, data.shape[0], data.shape[1])
        else:
            image = data.T
        ax = fig.add_subplot(gs[row, :])
        ax.set_title(name)
        ax.imshow(image,
                  interpolation='nearest',
                  origin='low',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        # NOTE(review): the minutes:seconds labels assume beat_times is in
        # seconds -- confirm against extractFeature.
        xlabels = [
            "{}:{}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature: mark positions where the cluster
        # label changes, dropping a mark when the next position is marked too
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        changes[:-1] &= ~changes[1:]
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[row + 1, 0])
        ax.set_title(name)
        n_centroids = centroids.shape[0]
        # floor division keeps the reshape dimensions integral
        if centroids.shape[1] == 3:
            codebook = centroids.reshape(1, n_centroids, centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            codebook = centroids.reshape(1, n_centroids * length,
                                         centroids.shape[1] // length)
        else:
            codebook = centroids.reshape(n_centroids * length,
                                         centroids.shape[1] // length).T
        ax.imshow(codebook,
                  interpolation='nearest',
                  origin='low',
                  aspect='auto',
                  cmap=plt.cm.Oranges)
        # one tick per centroid, so tick and label counts always match
        cols_per_centroid = max(1, codebook.shape[1] // n_centroids)
        ax.set_xticks(list(range(0, codebook.shape[1], cols_per_centroid)))
        ax.set_xticklabels(list(range(k_best)))
        ax.grid(False)

        # plot elbow curve
        for col, (label, scores, idx) in enumerate(elbow_curves, 1):
            ax = fig.add_subplot(gs[row + 1, col])
            ax.set_title('{}, {} best K {}'.format(name, label, idx + K_MIN))
            ax.plot(k_values, scores, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')

    out_stem = "{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red)

    # save with large size
    plt.tight_layout()
    plt.savefig(out_stem + ".png")

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig(out_stem + "_small.png")
    plt.close(fig)
cmap=plt.cm.plasma) axes[1].imshow(feats_log, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) axes[2].imshow(feats_log_normed, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) fig.tight_layout() # Clustering with DP-GMM n_components = 32 dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000, covariance_type='diag', verbose=True) dpgmm.fit(feats_log.T) preds_proba = dpgmm.predict_proba(feats_log.T) preds = np.argmax(preds_proba, axis=1) np.unique(preds) # resynthesis by sampling from clusters resynthesis = dpgmm.means_[preds.astype(int), :] fig, axes = plt.subplots(4, 1, figsize=(18, 8)) axes[0].set_title(feature) axes[1].set_title('Prediction Probability') axes[2].set_title('Resynthesis') axes[3].set_title('Max(Prediction Probability)') axes[0].imshow(feats_log, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) axes[1].imshow(preds_proba.T,