def _fit_dpgmm(self, x): # clustering k = max(self.crange) for r in xrange(self.repeats): # info if self.debug is True: print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1), # fit and evaluate model model_kwargs = {} if 'alpha' in self.clus_kwargs: model_kwargs.update(alpha=self.clus_kwargs['alpha']) if 'conv_thresh' in self.clus_kwargs: model_kwargs.update(thresh=self.clus_kwargs['conv_thresh']) if 'max_iter' in self.clus_kwargs: model_kwargs.update(n_iter=self.clus_kwargs['max_iter']) model = DPGMM(n_components=k, covariance_type=self.cvtype, **model_kwargs) model.fit(x) self._labels[r] = model.predict(x) self._parameters[r] = model.means_ self._ll[r] = model.score(x).sum() # evaluate goodness of fit for this run #self._gof[r] = self.gof(x, self._ll[r], k) if self.gof_type == 'aic': self._gof[r] = model.aic(x) if self.gof_type == 'bic': self._gof[r] = model.bic(x) # debug if self.debug is True: print self._gof[r], model.n_components, model.weights_.shape[0]
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit a DPGMM ``n_init`` times and return the fit with the lowest BIC.

    Parameters
    ----------
    X : array
        Data to fit; ``X.shape[0]`` is used as the number of samples.
    num_c : int
        Passed to the model as ``n_components``.
    cv_type : str
        Passed to the model as ``covariance_type``.
    alpha : float
        Passed to the model as ``alpha``.
    iters : int
        Passed to the model as ``n_iter``.
    n_init : int
        Number of independent fits to try.
    rand_state : optional
        Passed to the model as ``random_state``.

    Returns
    -------
    list
        ``[labels, probabilities, bic, log_prob]`` for the best fit, where
        ``labels`` comes from ``predict``, ``probabilities`` from
        ``predict_proba``, ``bic`` from ``bic`` and ``log_prob`` is the sum
        of ``score(X)``. If no fit was selected (for example
        ``n_init == 0``), the first two entries are zero vectors of length
        ``X.shape[0]`` and the last two are ``None``.
    """
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None

    # NOTE(review): presumably a fixed integer `rand_state` makes every
    # restart identical, so n_init > 1 would add nothing -- confirm.
    for _ in range(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, n_iter=iters, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            # Track the running minimum; without this update every
            # restart would overwrite the result and the last fit would
            # be returned instead of the best one.
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))

    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
def plot_num_iters_dpgmm(X, num_c, cv_type, alpha, max_iters, n_init):
    """Plot the best BIC of a DPGMM against its number of iterations.

    For every iteration count in ``np.arange(1, max_iters)`` (so
    ``max_iters`` itself is excluded) the model is fitted ``n_init`` times
    and the lowest BIC of those fits is kept.

    Parameters
    ----------
    X : array
        Data to fit and score.
    num_c : int
        Passed to the model as ``n_components``.
    cv_type : str
        Passed to the model as ``covariance_type``.
    alpha : float
        Passed to the model as ``alpha``.
    max_iters : int
        Exclusive upper bound of the ``n_iter`` values to try.
    n_init : int
        Number of fits per iteration count; with ``0`` every point of the
        curve is ``np.inf``.

    Returns
    -------
    The matplotlib figure holding the curve.
    """
    iter_counts = np.arange(1, max_iters)

    bic = []
    for iters in iter_counts:
        best_bic = np.inf
        for _ in range(n_init):
            # Use the function's own parameters here: the previous code
            # referenced the undefined names `comp` and `a`.
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                          alpha=alpha, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(iter_counts, bic)
    ax.set_title('BIC vs. Number of Iterations DPGMM')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('BIC score')
    return fig
def plot_alpha_dpgmm(X, num_c, cv_type, alphas, iters, n_init):
    """Plot the best BIC of a DPGMM for each concentration value.

    Every value in ``alphas`` is tried ``n_init`` times with ``num_c``
    components, covariance type ``cv_type`` and ``n_iter=iters``; the
    lowest BIC of those fits (``np.inf`` if none is lower) becomes that
    value's point on the curve.

    Returns the matplotlib figure holding the curve.
    """
    def lowest_bic(concentration):
        # BIC of each restart, in the order the fits were run
        restart_scores = []
        for _ in range(n_init):
            model = DPGMM(n_components=num_c, covariance_type=cv_type,
                          alpha=concentration, n_iter=iters)
            model.fit(X)
            restart_scores.append(model.bic(X))
        # np.inf leads the list so it plays the role of the initial best
        return min([np.inf] + restart_scores)

    bic = [lowest_bic(a) for a in alphas]

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(alphas, bic, 'bo-', lw=2)
    ax.set_title('BIC vs. Alpha DPGMM')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('BIC score')
    return fig
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0,
                   normalize=1, scale=1, length=4, clustering='KMEANS'):
    """Cluster a feature and its pre-processed variants, then plot them.

    The feature is obtained from ``extractFeature`` and five derived
    versions are built from it with ``lpf`` and ``dim_red_fn``. For the
    raw feature and each derived version (except the plain low-pass one,
    which is only an intermediate), windows of ``length`` consecutive
    frames are flattened into vectors and clustered. Each version gets two
    rows of a 12x4 grid: the data with the cluster changes marked on a
    twin axis, then the codebook and, for KMEANS, the three elbow curves.

    The figure is written next to the input as
    ``<base>_clustering_..._.png`` and, after shrinking to 36x24, as
    ``..._small.png``; it is then closed. Nothing is returned.

    Args:
        fullpath: path of the input; split into base name and extension.
        order, sr, cutoff: forwarded to ``lpf``.
        n_singv: forwarded to ``dim_red_fn``; also used to recognise the
            reduced-dimension codebook layout.
        feature: feature name forwarded to ``extractFeature``.
        dim_red: reduction name forwarded to ``dim_red_fn``.
        round_to, normalize, scale: forwarded to ``extractFeature``.
        length: number of consecutive frames per clustered window.
        clustering: ``'KMEANS'`` (K from 2 to 16, chosen from the elbows of
            BIC, inertia and silhouette) or ``'DPGMM'`` (one 12-component
            model; its BIC is shown in the codebook title and no elbow
            curves are drawn).

    Raises:
        ValueError: if ``clustering`` is neither ``'KMEANS'`` nor
            ``'DPGMM'``.
    """
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    feat = {}
    print('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
          'n_singv {}, scale {} normalize {}, round_to {}'.format(
              fullpath, feature, order, sr, cutoff, n_singv, scale,
              normalize, round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # names of the derived versions stored in `feat`
    residual = '{}-LPF'.format(feature)
    red_of_lpf = '{}(LPF)'.format(dim_red)
    lpf_of_red = 'LPF({})'.format(dim_red)
    lpf_of_residual = 'LPF({}-LPF)'.format(feature)
    red_of_lpf_of_residual = '{}(LPF({}-LPF))'.format(dim_red, feature)

    # extract feature and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat[red_of_lpf] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat[lpf_of_red] = lpf(feat[dim_red], cutoff, sr, order)
    feat[residual] = feat[feature] - feat['LPF']
    feat[lpf_of_residual] = lpf(feat[residual], cutoff, sr, order)
    feat[red_of_lpf_of_residual] = dim_red_fn(
        dim_red, feat[lpf_of_residual], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])

    # candidate cluster counts for the KMEANS codebook
    K_MIN, K_MAX = 2, 16
    k_values = range(K_MIN, K_MAX + 1)

    print("\tPlot data and pre-processing")
    plotted = (feature, residual, red_of_lpf, lpf_of_red, lpf_of_residual,
               red_of_lpf_of_residual)
    for pos, name in enumerate(plotted):
        row = 2 * pos  # each version uses two consecutive grid rows
        data = feat[name]

        # stack `length` consecutive frames into one flat vector per window
        windows = np.array([data[m:m + length, :]
                            for m in range(len(data) - length)])
        data_wide = windows.reshape(
            windows.shape[0], windows.shape[1] * windows.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            models = [KMeans(n_clusters=n, init='k-means++').fit(data_wide)
                      for n in k_values]

            # compute scores to assess fit
            scores_bic = [computeBic(m, data_wide) for m in models]
            scores_inertia = [m.inertia_ for m in models]
            scores_silhouette = [
                silhouette_score(data_wide, m.labels_, metric='euclidean')
                for m in models]

            # get best clusters
            idx_best_bic = findElbow(
                np.dstack((k_values, scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((k_values, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((k_values, scores_silhouette))[0])
            # NOTE(review): the "+ 1" is kept from the original; if
            # findElbow returns a 0-based index into the curve this picks
            # the model after the median elbow and can run past the last
            # model -- confirm against findElbow.
            idx_best = int(np.median(
                (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = models[idx_best].cluster_centers_
            centroid_idx = models[idx_best].labels_
            codebook_title = name
            elbow_curves = (('BIC', scores_bic, idx_best_bic),
                            ('INERTIA', scores_inertia, idx_best_inertia),
                            ('SILHOUETTE', scores_silhouette,
                             idx_best_silhouette))
        else:  # 'DPGMM'
            dpgmm = DPGMM(
                n_components=12, tol=1e-3, n_iter=32, alpha=1000,
                covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # get clusters and cluster allocations
            centroids = dpgmm.means_
            k_best = centroids.shape[0]
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)

            # A single model has no score-vs-K curve, so report its BIC in
            # the codebook title and skip the elbow plots.
            codebook_title = '{} (DPGMM BIC {})'.format(
                name, dpgmm.bic(data_wide))
            elbow_curves = ()

        # plot data
        if data.shape[1] == 3:
            image = data.reshape(1, data.shape[0], data.shape[1])
        else:
            image = data.T
        ax = fig.add_subplot(gs[row, :])
        ax.set_title(name)
        ax.imshow(image, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        xlabels = ["{}:{}".format(int(x / 60), int(x % 60))
                   for x in beat_times[::step_size]]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature: mark where the cluster changes,
        # dropping a mark when the very next window changes again
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in range(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[row + 1, 0])
        ax.set_title(codebook_title)
        n_centroids, n_dims = centroids.shape
        # `//` keeps the reshape sizes integral on Python 2 and 3 alike
        if n_dims == 3:
            codebook = centroids.reshape(1, n_centroids, n_dims)
        elif n_dims == n_singv * length:
            codebook = centroids.reshape(
                1, n_centroids * length, n_dims // length)
        else:
            codebook = centroids.reshape(
                n_centroids * length, n_dims // length).T
        ax.imshow(codebook, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        # one tick per centroid: each centroid spans `length` columns
        ax.set_xticks(range(0, codebook.shape[1], length))
        ax.set_xticklabels(range(k_best))
        ax.grid(False)

        # plot elbow curves (KMEANS only)
        for col, (score_name, scores, idx) in enumerate(elbow_curves, 1):
            ax = fig.add_subplot(gs[row + 1, col])
            ax.set_title('{}, {} best K {}'.format(
                name, score_name, idx + K_MIN))
            ax.plot(k_values, scores, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')

    out_base = "{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red)

    # save with large size (once; it used to be written twice)
    fig.tight_layout()
    fig.savefig(out_base + ".png")

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    fig.tight_layout()
    fig.savefig(out_base + "_small.png")
    plt.close(fig)
# For a growing number of chunks, fit a DPGMM on a random sample of
# X_unlabeled and report its AIC/BIC and how many components it used.
for chunks in np.arange(1, opts.size, step=3):
    # Sample size: the combined size of the first `chunks` chunks
    size = int(np.sum(chunk_sizes[:chunks]))

    # Fit a Dirichlet process mixture of Gaussians using up to ten components
    dpgmm = DPGMM(n_components=10, alpha=10.0, covariance_type='full')

    # Take `size` distinct rows of X_unlabeled in random order
    indices = np.random.permutation(X_unlabeled.shape[0])
    X = X_unlabeled[indices[:size], ]

    print("fitting a model with", size, "data points")
    with timeit():
        dpgmm.fit(X)
    print("Done!")
    print("AIC for this model & data: ", dpgmm.aic(X))
    print("BIC for this model & data: ", dpgmm.bic(X))

    Y_hat = dpgmm.predict(X)
    # Count the distinct labels: the largest label index is not the number
    # of components that actually received points.
    n_used = np.unique(Y_hat).size
    print("Model assigned points to", n_used, "components")

    # How can I best check this out?
    #color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
    #for i, (clf, title) in enumerate([(gmm, 'GMM'),
    #(dpgmm, 'Dirichlet Process GMM')]):
    #splot = plt.subplot(2, 1, 1 + i)
    #Y_ = clf.predict(X)
    #for i, (mean, covar, color) in enumerate(zip(
    #clf.means_, clf._get_covars(), color_iter)):
    #v, w = linalg.eigh(covar)
    #u = w[0] / linalg.norm(w[0])
    ## as the DP will not use every component it has access to
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0,
                   normalize=1, scale=1, length=4, clustering='KMEANS'):
    """Cluster a feature and its pre-processed variants, then plot them.

    The feature is obtained from ``extractFeature`` and five derived
    versions are built from it with ``lpf`` and ``dim_red_fn``. For the
    raw feature and each derived version (except the plain low-pass one,
    which is only an intermediate), windows of ``length`` consecutive
    frames are flattened into vectors and clustered. Each version gets two
    rows of a 12x4 grid: the data with the cluster changes marked on a
    twin axis, then the codebook and, for KMEANS, the three elbow curves.

    The figure is written next to the input as
    ``<base>_clustering_..._.png`` and, after shrinking to 36x24, as
    ``..._small.png``; it is then closed. Nothing is returned.

    Args:
        fullpath: path of the input; split into base name and extension.
        order, sr, cutoff: forwarded to ``lpf``.
        n_singv: forwarded to ``dim_red_fn``; also used to recognise the
            reduced-dimension codebook layout.
        feature: feature name forwarded to ``extractFeature``.
        dim_red: reduction name forwarded to ``dim_red_fn``.
        round_to, normalize, scale: forwarded to ``extractFeature``.
        length: number of consecutive frames per clustered window.
        clustering: ``'KMEANS'`` (K from 2 to 16, chosen from the elbows of
            BIC, inertia and silhouette) or ``'DPGMM'`` (one 12-component
            model; its BIC is shown in the codebook title and no elbow
            curves are drawn).

    Raises:
        ValueError: if ``clustering`` is neither ``'KMEANS'`` nor
            ``'DPGMM'``.
    """
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    feat = {}
    print('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
          'n_singv {}, scale {} normalize {}, round_to {}'.format(
              fullpath, feature, order, sr, cutoff, n_singv, scale,
              normalize, round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # names of the derived versions stored in `feat`
    residual = '{}-LPF'.format(feature)
    red_of_lpf = '{}(LPF)'.format(dim_red)
    lpf_of_red = 'LPF({})'.format(dim_red)
    lpf_of_residual = 'LPF({}-LPF)'.format(feature)
    red_of_lpf_of_residual = '{}(LPF({}-LPF))'.format(dim_red, feature)

    # extract feature and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat[red_of_lpf] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat[lpf_of_red] = lpf(feat[dim_red], cutoff, sr, order)
    feat[residual] = feat[feature] - feat['LPF']
    feat[lpf_of_residual] = lpf(feat[residual], cutoff, sr, order)
    feat[red_of_lpf_of_residual] = dim_red_fn(
        dim_red, feat[lpf_of_residual], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])

    # candidate cluster counts for the KMEANS codebook
    K_MIN, K_MAX = 2, 16
    k_values = range(K_MIN, K_MAX + 1)

    print("\tPlot data and pre-processing")
    plotted = (feature, residual, red_of_lpf, lpf_of_red, lpf_of_residual,
               red_of_lpf_of_residual)
    for pos, name in enumerate(plotted):
        row = 2 * pos  # each version uses two consecutive grid rows
        data = feat[name]

        # stack `length` consecutive frames into one flat vector per window
        windows = np.array([data[m:m + length, :]
                            for m in range(len(data) - length)])
        data_wide = windows.reshape(
            windows.shape[0], windows.shape[1] * windows.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            models = [KMeans(n_clusters=n, init='k-means++').fit(data_wide)
                      for n in k_values]

            # compute scores to assess fit
            scores_bic = [computeBic(m, data_wide) for m in models]
            scores_inertia = [m.inertia_ for m in models]
            scores_silhouette = [
                silhouette_score(data_wide, m.labels_, metric='euclidean')
                for m in models]

            # get best clusters
            idx_best_bic = findElbow(
                np.dstack((k_values, scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((k_values, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((k_values, scores_silhouette))[0])
            # NOTE(review): the "+ 1" is kept from the original; if
            # findElbow returns a 0-based index into the curve this picks
            # the model after the median elbow and can run past the last
            # model -- confirm against findElbow.
            idx_best = int(np.median(
                (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = models[idx_best].cluster_centers_
            centroid_idx = models[idx_best].labels_
            codebook_title = name
            elbow_curves = (('BIC', scores_bic, idx_best_bic),
                            ('INERTIA', scores_inertia, idx_best_inertia),
                            ('SILHOUETTE', scores_silhouette,
                             idx_best_silhouette))
        else:  # 'DPGMM'
            dpgmm = DPGMM(
                n_components=12, tol=1e-3, n_iter=32, alpha=1000,
                covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # get clusters and cluster allocations
            centroids = dpgmm.means_
            k_best = centroids.shape[0]
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)

            # A single model has no score-vs-K curve, so report its BIC in
            # the codebook title and skip the elbow plots.
            codebook_title = '{} (DPGMM BIC {})'.format(
                name, dpgmm.bic(data_wide))
            elbow_curves = ()

        # plot data
        if data.shape[1] == 3:
            image = data.reshape(1, data.shape[0], data.shape[1])
        else:
            image = data.T
        ax = fig.add_subplot(gs[row, :])
        ax.set_title(name)
        ax.imshow(image, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        xlabels = ["{}:{}".format(int(x / 60), int(x % 60))
                   for x in beat_times[::step_size]]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature: mark where the cluster changes,
        # dropping a mark when the very next window changes again
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in range(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[row + 1, 0])
        ax.set_title(codebook_title)
        n_centroids, n_dims = centroids.shape
        # `//` keeps the reshape sizes integral on Python 2 and 3 alike
        if n_dims == 3:
            codebook = centroids.reshape(1, n_centroids, n_dims)
        elif n_dims == n_singv * length:
            codebook = centroids.reshape(
                1, n_centroids * length, n_dims // length)
        else:
            codebook = centroids.reshape(
                n_centroids * length, n_dims // length).T
        ax.imshow(codebook, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        # one tick per centroid: each centroid spans `length` columns
        ax.set_xticks(range(0, codebook.shape[1], length))
        ax.set_xticklabels(range(k_best))
        ax.grid(False)

        # plot elbow curves (KMEANS only)
        for col, (score_name, scores, idx) in enumerate(elbow_curves, 1):
            ax = fig.add_subplot(gs[row + 1, col])
            ax.set_title('{}, {} best K {}'.format(
                name, score_name, idx + K_MIN))
            ax.plot(k_values, scores, 'b*-')
            ax.set_xlim((K_MIN, K_MAX + 1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx + K_MIN, color='r')

    out_base = "{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red)

    # save with large size (once; it used to be written twice)
    fig.tight_layout()
    fig.savefig(out_base + ".png")

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    fig.tight_layout()
    fig.savefig(out_base + "_small.png")
    plt.close(fig)