def _fit_dpgmm(self, x): # clustering k = max(self.crange) for r in xrange(self.repeats): # info if self.debug is True: print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1), # fit and evaluate model model_kwargs = {} if 'alpha' in self.clus_kwargs: model_kwargs.update(alpha=self.clus_kwargs['alpha']) if 'conv_thresh' in self.clus_kwargs: model_kwargs.update(thresh=self.clus_kwargs['conv_thresh']) if 'max_iter' in self.clus_kwargs: model_kwargs.update(n_iter=self.clus_kwargs['max_iter']) model = DPGMM(n_components=k, covariance_type=self.cvtype, **model_kwargs) model.fit(x) self._labels[r] = model.predict(x) self._parameters[r] = model.means_ self._ll[r] = model.score(x).sum() # evaluate goodness of fit for this run #self._gof[r] = self.gof(x, self._ll[r], k) if self.gof_type == 'aic': self._gof[r] = model.aic(x) if self.gof_type == 'bic': self._gof[r] = model.bic(x) # debug if self.debug is True: print self._gof[r], model.n_components, model.weights_.shape[0]
def fit_vel_profile_dpgmm(vel_profile, n_comps=5, dp=False, n_samples=1000):
    """Fit a (DP-)GMM to a 1-D velocity profile.

    The profile is treated as an unnormalised histogram over the interval
    [0, 1]: each bin ``i`` contributes
    ``floor(vel_profile[i] / sum(vel_profile) * n_samples)`` copies of its
    time stamp ``t[i]``, and the mixture model is fitted to those samples.

    :param vel_profile: 1-D array-like of profile values.
    :param n_comps: number of mixture components passed to the model.
    :param dp: if True fit a ``DPGMM`` (n_iter=1000, alpha=10), otherwise a
        plain ``GMM``.
    :param n_samples: approximate total number of synthetic samples.
    :return: the fitted model.
    :raises ValueError: if the profile sums to zero or is not finite.
    """
    # Work in floats so the ratio below is never an integer division.
    profile = np.asarray(vel_profile, dtype=float)
    integral = profile.sum()
    if integral == 0 or not np.isfinite(integral):
        raise ValueError("vel_profile must have a finite, non-zero sum")

    t = np.linspace(0, 1, len(profile))

    # Integer number of samples per bin; bins with a non-positive share
    # contribute nothing (the original skipped them as well).
    counts = np.floor(profile / integral * n_samples).astype(int)
    counts[counts < 0] = 0

    # One vectorised repeat instead of growing an array inside a loop.
    fit_data = np.repeat(t, counts).reshape(-1, 1)

    # fit Dirichlet-Process Gaussian Mixture Model,
    # something wrong with the module? The clusters seem merged...
    if dp:
        model = DPGMM(n_components=n_comps, n_iter=1000, alpha=10)
    else:
        model = GMM(n_components=n_comps)
    model.fit(fit_data)
    return model
def plot_GPLVM_data_cluster(results_dir, n_clusters=None, VB=False):
    """Cluster the summary data and scatter-plot it in the 2-D GPLVM space.

    Reads ``summary.csv``, ``GPLVM-datasets-2.csv``, ``datasets.csv`` and
    ``methods.csv`` from `results_dir`, fits a mixture model to the
    transposed summary array and plots the first two columns of the GPLVM
    coordinates coloured by cluster.

    :param results_dir: directory containing the CSV files listed above.
    :param n_clusters: number of components; ``None`` selects a default
        ``DPGMM()``.
    :param VB: when `n_clusters` is given, use ``VBGMM`` instead of ``GMM``.
    """
    def _read_lines(name):
        # Read a text file into a list of lines without trailing newlines,
        # closing the handle afterwards.
        with open(os.path.join(results_dir, name), 'r') as handle:
            return [line.rstrip('\n') for line in handle]

    # Load relevant datasets
    data_array = np.genfromtxt(os.path.join(results_dir, 'summary.csv'),
                               delimiter=',')
    X = np.genfromtxt(os.path.join(results_dir, 'GPLVM-datasets-2.csv'),
                      delimiter=',')
    datasets = _read_lines('datasets.csv')
    # Not used below; still read so a missing file fails as it did before.
    methods = _read_lines('methods.csv')

    # Fit a mixture model
    if n_clusters is None:
        m = DPGMM()
        plot_title = 'CRP MoG'
    elif VB:
        m = VBGMM(alpha=10, n_components=n_clusters)
        plot_title = '%d clusters with VB' % n_clusters
    else:
        m = GMM(n_components=n_clusters, n_init=100)
        plot_title = '%d clusters with EM' % n_clusters
    m.fit(data_array.T)
    clusters = m.predict(data_array.T)

    # Plot
    #clf()
    figure(1)
    pretty_scatter(X[:, 0], X[:, 1], clusters,
                   200 * np.ones(X[:, 0].shape), datasets)
    xlabel('Dimension 1')
    ylabel('Dimension 2')
    title(plot_title)
    show()
def try_covar(type_str, x_words): clf = DPGMM(n_components=20, covariance_type=type_str, alpha=30, n_iter=1000) clf.fit(x_data) y_ = clf.predict(x_data) print type_str print_centers(x_words, y_, clf) print
def load_build_dpggm(dpggm_model_name, x_data): if os.path.isfile(dpggm_model_name): clf = load_dpggm(dpggm_model_name) else: clf = DPGMM(n_components=30, covariance_type='diag', alpha=5, n_iter=1000) logging.info("Fitting with DPGMM") clf.fit(x_data) pickle.dump(clf, open(dpggm_model_name, 'wb')) logging.info("Fitted") print clf.converged_ return clf
def main():
    """Search over DPGMM concentration values and log the mean scores.

    Command line: ``<infile> <N> <num_random>``. The tab-separated input
    (minus its last seven columns) is standardised with the statistics of
    the full file, ``num_random`` rows are drawn with replacement, and a
    diagonal DPGMM with ``N`` components is fitted for each alpha in
    ``[0.01, 0.1, 1, 10]``. The report is printed and appended to
    ``parameter_search_results.txt``.

    :return: 0 on success, 1 when the argument count is wrong.
    """
    if len(sys.argv) != 4:
        print(__doc__)
        return 1

    infile = sys.argv[1]
    N = int(sys.argv[2])
    num_random = int(sys.argv[3])

    print("Reading in", infile)
    reader = fileinput.input(infile)
    try:
        fullarr = np.loadtxt(reader, delimiter='\t')[:, :-7]
    finally:
        # fileinput keeps the file open until closed explicitly
        reader.close()

    # Column statistics of the full data set, shaped (1, n_columns).
    stds = fullarr.std(axis=0)[np.newaxis, :]
    means = fullarr.mean(axis=0)[np.newaxis, :]
    stds[stds == 0] = 1.0  # constant columns: avoid division by zero

    num_lines = num_random
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines,
                                       replace=True), :]
    fullarr = (fullarr - means) / stds

    report = []
    print("Parameter searching...")
    # -inf guarantees the first finite score is always accepted.
    best_score = -np.inf
    best_alpha = -1
    for alpha in [0.01, 0.1, 1, 10]:
        print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))
        report.append(
            "Learning infinite GMM with N={}, alpha={}\n".format(N, alpha))

        igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha,
                     init_params='wmc')
        igmm.fit(fullarr)
        # mean per-sample score
        score = np.mean(igmm.score(fullarr))

        print('{}: {} with {} clusters'.format(alpha, score,
                                               igmm.n_components))
        report.append('{}: {} with {} clusters\n'.format(
            alpha, score, igmm.n_components))

        if score > best_score:
            best_score = score
            best_alpha = alpha

    print('Best alpha={}, score={}'.format(best_alpha, best_score))
    report.append('Best alpha={}, score={}\n'.format(best_alpha, best_score))

    with open('parameter_search_results.txt', 'a+') as outf:
        outf.write(''.join(report))
    return 0
def train_dpgmm(X, n_components=3, covariance_type='diag', alpha=1.0,
                random_state=None, thresh=None, tol=0.001, verbose=False,
                min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
    """Train an infinite (Dirichlet process) Gaussian mixture model.

    Every keyword is forwarded unchanged to the ``DPGMM`` constructor.

    :param X: training data handed to ``DPGMM.fit``.
    :param n_components: ``n_components`` of the model.
    :param covariance_type: ``covariance_type`` of the model.
    :param alpha: ``alpha`` of the model.
    :param random_state: ``random_state`` of the model.
    :param thresh: ``thresh`` of the model.
    :param tol: ``tol`` of the model (previously accepted but silently
        dropped).
    :param verbose: ``verbose`` of the model.
    :param min_covar: ``min_covar`` of the model.
    :param n_iter: ``n_iter`` of the model.
    :param params: ``params`` of the model.
    :param init_params: ``init_params`` of the model.
    :return: a trained DPGMM clustering model
    """
    model = DPGMM(n_components=n_components,
                  covariance_type=covariance_type,
                  alpha=alpha,
                  random_state=random_state,
                  thresh=thresh,
                  tol=tol,
                  verbose=verbose,
                  min_covar=min_covar,
                  n_iter=n_iter,
                  params=params,
                  init_params=init_params)
    model = model.fit(X)
    return model
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit `n_init` DPGMMs and return the results of the lowest-BIC run.

    :param X: data with samples along axis 0.
    :param num_c: ``n_components`` of each model.
    :param cv_type: ``covariance_type`` of each model.
    :param alpha: ``alpha`` of each model.
    :param iters: ``n_iter`` of each model; ``None`` keeps the library
        default.
    :param n_init: number of models to fit.
    :param rand_state: ``random_state`` of each model.
    :return: ``[labels, probabilities, bic, log_prob]`` of the best run.
        If no run yields a BIC below infinity the zero-filled vectors and
        ``None`` placeholders are returned.
    """
    model_kwargs = dict(n_components=num_c, covariance_type=cv_type,
                        alpha=alpha, random_state=rand_state)
    if iters is not None:
        model_kwargs['n_iter'] = iters

    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None

    for _ in range(n_init):
        dpgmm = DPGMM(**model_kwargs)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            # Remember the new optimum so later, worse runs cannot
            # overwrite the results.
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))

    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
def train_DPGMM(d, max_n_comp=100, max_n_iter=500): '''Imports Data, Trains a DPGMM, Generates predictions testing''' print "Training Model..." gmm = DPGMM(max_n_comp, n_iter=max_n_iter) start = timeit.default_timer() gmm.fit(d) end = timeit.default_timer() print "Training completed in %f seconds" % (end-start) print print "Converged: " print gmm.converged_ print return gmm
def plot_num_iters_dpgmm(X, num_c, cv_type, alpha, max_iters, n_init):
    """Plot the best BIC over `n_init` fits against the iteration budget.

    For every ``n_iter`` in ``1 .. max_iters - 1`` a DPGMM is fitted
    `n_init` times and the smallest BIC is kept.

    :param X: data to fit.
    :param num_c: ``n_components`` of each model.
    :param cv_type: ``covariance_type`` of each model.
    :param alpha: ``alpha`` of each model.
    :param max_iters: exclusive upper bound of the iteration counts tried.
    :param n_init: number of fits per iteration count.
    :return: the matplotlib figure.
    """
    iter_counts = np.arange(1, max_iters)
    bic = []
    for iters in iter_counts:
        best_bic = np.inf
        for _ in xrange(n_init):
            # the original referenced the undefined names `comp` and `a`
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                          alpha=alpha, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(iter_counts, bic)
    ax.set_title('BIC vs. Number of Iterations DPGMM')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('BIC score')
    return fig
def plot_alpha_dpgmm(X, num_c, cv_type, alphas, iters, n_init):
    """Plot, for each value in `alphas`, the lowest BIC reached over
    `n_init` DPGMM fits of `X`, and return the figure."""

    def _fit_and_score(concentration):
        # one fit with the given alpha, scored by BIC on the training data
        model = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=concentration, n_iter=iters)
        model.fit(X)
        return model.bic(X)

    lowest_bics = []
    for concentration in alphas:
        # seeding with inf mirrors a running "keep if strictly smaller" scan
        candidates = [np.inf]
        candidates.extend(_fit_and_score(concentration)
                          for _ in xrange(n_init))
        lowest_bics.append(min(candidates))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(alphas, lowest_bics, 'bo-', lw=2)
    ax.set_title('BIC vs. Alpha DPGMM')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('BIC score')
    return fig
def dpgmm_segmenter(factors, width=MEDIAN_WIDTH):
    """Segment `factors` with a DPGMM and return (boundaries, labels).

    The factors are median-filtered and scaled, a 10-component diagonal
    DPGMM is fitted to ten stacked copies of them, and the per-row
    predictions are turned into boundaries by ``find_boundaries``. The
    final labels come from ``segment_labeling`` with k-means.

    NOTE(review): the median filter always uses the module constant
    ``MEDIAN_WIDTH``; `width` only reaches ``find_boundaries`` - confirm
    that this is intended.
    """
    smoothed = median_filter(factors, size=(MEDIAN_WIDTH, 1), mode='mirror')
    scaled = pre.scale(smoothed, axis=1)

    model = DPGMM(n_components=10, covariance_type='diag', alpha=10,
                  n_iter=100)
    model.fit(np.tile(scaled, (10, 1)))
    found_boundaries, found_labels = find_boundaries(model.predict(scaled),
                                                     width)

    # Use the DPGMM result only if it found more than one segment type;
    # otherwise fall back to a single segment spanning everything.
    n_found = len(np.unique(found_labels))
    if n_found > 1:
        boundaries, n_types = found_boundaries, n_found
    else:
        boundaries, n_types = [0, scaled.shape[0] - 1], 1

    # There cannot be more segment types than segments.
    if len(boundaries) < n_types + 1:
        n_types = len(boundaries) - 1

    seg_labels = segment_labeling(scaled, boundaries, c_method='kmeans',
                                  k=n_types)
    return np.array(boundaries), seg_labels
def main():
    """Train a diagonal DPGMM on standardised data and pickle it.

    Command line: ``<infile glob> <outfile> <N> <alpha>``. All matching
    tab-separated files are read (minus their last seven columns),
    standardised with the statistics of the full data, 10000 rows are drawn
    with replacement and a DPGMM with ``N`` components is fitted. The model
    is written to ``<outfile>_10000``.

    :return: 0 on success, 1 on bad arguments or when no file matches.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        return 1

    infiles = glob(sys.argv[1])
    outfile = sys.argv[2]
    N = int(sys.argv[3])
    alpha = float(sys.argv[4])

    if not infiles:
        # fileinput would silently fall back to reading stdin
        print("No input files match " + sys.argv[1])
        return 1

    print("Reading in", len(infiles), "files")
    reader = fileinput.input(infiles)
    try:
        fullarr = np.loadtxt(reader, delimiter='\t')[:, :-7]
    finally:
        reader.close()

    # Column statistics of the full data set, shaped (1, n_columns).
    stds = fullarr.std(axis=0)[np.newaxis, :]
    means = fullarr.mean(axis=0)[np.newaxis, :]
    stds[stds == 0] = 1.0  # constant columns: avoid division by zero

    num_lines = 10000
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines,
                                       replace=True), :]
    fullarr = (fullarr - means) / stds

    print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))
    igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha,
                 init_params='wmc')
    igmm.fit(fullarr)

    print("Infinite GMM trained, saving")
    # num_lines is an int: format it instead of concatenating str + int
    with open('{}_{}'.format(outfile, num_lines), 'wb') as out_model:
        pickle.dump(igmm, out_model)

    print("Score:", igmm.score(fullarr))
    print("Num Components:", igmm.n_components)
    return 0
def main(method,cluster_num=30,alpha=.5): f ='/Users/davidgreenfield/Downloads/features_csv_tmp.csv' #f ='/Users/davidgreenfield/Downloads/features_f500.csv' cols=range(1,4096) feats =np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=(cols)) asins = np.loadtxt(open(f,"rb"),delimiter=",",skiprows=1,usecols=([0]),dtype=str) if method == 'kmeans': k_means=cluster.KMeans(n_clusters=cluster_num) k_means.fit(feats) y = k_means.labels_ if MAKE_GRAPH==1: print "hello 1" create_graph(k_means) elif method == 'GMM_VB': gmm_vb = VBGMM.fit(feats,n_components=50,alpha=.5) y = gmm_vb.predict(feats) cluster_no = len(np.unique(y)) elif method == 'GMM_DP': gmm_dp = DPGMM(n_components=50,alpha=alpha) gmm_dp.fit(feats) y = gmm_dp.predict(feats) cluster_no = len(np.unique(y)) clusters=[] groups={} data=load_data('./data/boots_aws.csv') for i in range(0,cluster_num): groups[i]=np.where(y==i) ids=asins[groups[i]] clusters.append(ids) links=[data[x]['url'] for x in ids] create_html(links,"templates/groups/group"+str(i)+".html") output_clusters(clusters,"outputs/clusters.csv")
def __init__(self, cluster_method=2, cluter_tag=False, train_path=None, event_info_path=None, city_id=None): self.loss_choice = 0 # 0:reg; 1:pairwise ranking self.ndim = 20 self.tr_method = 0 # 0:SGD1; 1:SGD2 self.cluster_method = cluster_method # 0:DPGMM; 1:GMM; 2:K-means self.n_components = 20 self.city_id = city_id # SGD self.niters1 = 10 self.lr1 = 0.01 self.lambda1 = 0.001 self.neg_num1 = 5 self.beta1 = 1 self.alpha1 = 1 self.ins_weight = [self.beta1, self.alpha1] pois = [] if cluter_tag == True: events = set([entry[1] for entry in csv.reader(open(train_path, "r"))]) for entry in csv.reader(open(event_info_path, "r")): event = entry[0] if event in events: poi = map(float, entry[3].split(" ")) pois.append(poi) if not checkGeoScope(poi, self.city_id): print 'Invalic location' sys.exit(1) if self.cluster_method == 0: cluster = DPGMM(n_components=500, covariance_type='diag', alpha=1, n_iter=50) cluster.fit(pois) centers = removeDup(cluster.means_) outputCenterforVis(centers) self.n_components = len(centers) cluster_fd = open(settings["DPGMM_CLUSTER"], "wb") pickle.dump([centers, None], cluster_fd) self.model_path = settings["GEOMF"] outputCenterforVis(centers) elif self.cluster_method == 1: cluster = GMM(n_components = self.n_components, covariance_type='diag', min_covar=1e-7, n_init=10, random_state=0, n_iter=100) cluster.fit(pois) outputCenterforVis(cluster.means_) labels = deterClusterRel(pois, cluster.means_) #showNumInEachCluster(labels, self.n_components) dis_variances = calDisVariance(self.n_components, labels, pois) dis_variances = smoothVar(dis_variances) covars = smoothVar(cluster.covars_) cluster_fd = open(settings["GMM_CLUSTER"], "wb") pickle.dump([cluster.means_, covars, dis_variances], cluster_fd) elif self.cluster_method == 2: cluster = KMeans(n_clusters = self.n_components, max_iter=300, init='k-means++') cluster.fit(pois) means, variances= calCenterCov(self.n_components, cluster.labels_, pois) outputCenterforVis(means) dis_variances = 
calDisVariance(self.n_components, cluster.labels_, pois) variances = smoothVar(variances) dis_variances = smoothVar(dis_variances) cluster_fd = open(settings["KMEANS_CLUSTER"], "wb") pickle.dump([means, variances, dis_variances], cluster_fd) else: print 'Invalid choice of clustering method' sys.exit(1)
#print 'feature num', len(feature_idx) #fn = fn[:, feature_idx] #X = StandardScaler().fit_transform(fn) fold = 3 kf = StratifiedKFold(label, n_folds=fold, shuffle=True) #kf = KFold(len(label), n_folds=fold, shuffle=True) clf = RFC(n_estimators=100, criterion='entropy') rounds = 1 acc_sum = [[] for i in range(fold)] for train, test in kf: train_fn = fn[train] #n_class = len(np.unique(label[train])) d = DPGMM(n_components=50, covariance_type='spherical',alpha=10) d.fit(train_fn) #print 'mixture mean', d.means_ preds = d.predict(train_fn) print '# of M by DP', len(np.unique(preds)) acc_sum[0].append(ARI(label[train], preds)) #acc_sum[0].append(SS(train_fn, preds)) #n_class = len(np.unique(preds)) n_class = 32 g = GMM(n_components=n_class, covariance_type='spherical', init_params='wmc', n_iter=100) g.fit(train_fn) #g.means_ = np.array([x_train[y_train == i].mean(axis=0) for i in np.unique(y_train)]) preds = g.predict(train_fn) #prob = np.sort(g.predict_proba(train_fd)) acc_sum[1].append(ARI(label[train], preds)) #acc_sum[1].append(SS(train_fn, preds))
aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) axes[1].imshow(feats_log, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) axes[2].imshow(feats_log_normed, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) fig.tight_layout() # Clustering with DP-GMM n_components = 32 dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000, covariance_type='diag', verbose=True) dpgmm.fit(feats_log.T) preds_proba = dpgmm.predict_proba(feats_log.T) preds = np.argmax(preds_proba, axis=1) np.unique(preds) # resynthesis by sampling from clusters resynthesis = dpgmm.means_[preds.astype(int), :] fig, axes = plt.subplots(4, 1, figsize=(18, 8)) axes[0].set_title(feature) axes[1].set_title('Prediction Probability') axes[2].set_title('Resynthesis') axes[3].set_title('Max(Prediction Probability)') axes[0].imshow(feats_log, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma)
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    """Plot a feature, several filtered/reduced variants and their codebooks.

    For each variant the frames are stacked into windows of `length` rows,
    a codebook is built with k-means or a DPGMM, and the data, the cluster
    changes and the codebook are drawn. With k-means the BIC, inertia and
    silhouette elbow curves are drawn as well. The figure is saved next to
    the input file in a large and a small version.

    :param fullpath: path of the input file; its stem names the outputs.
    :param order, sr, cutoff: forwarded to ``lpf``.
    :param n_singv: forwarded to ``dim_red_fn``.
    :param feature: feature name forwarded to ``extractFeature``.
    :param dim_red: reduction name forwarded to ``dim_red_fn``.
    :param round_to, normalize, scale: forwarded to ``extractFeature``.
    :param length: number of consecutive frames per window.
    :param clustering: ``'KMEANS'`` or ``'DPGMM'``.
    :raises ValueError: for any other `clustering` value.
    """
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError("unknown clustering: %r" % (clustering,))

    # range of k tried by the k-means model selection
    K_MIN, K_MAX = 2, 16
    k_values = list(xrange(K_MIN, K_MAX + 1))

    feat = {}
    print(
        'Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
        'n_singv {}, scale {} normalize {}, round_to {}'.format(
            fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
            round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(filename, file_ext, feature,
                                               scale, round_to, normalize,
                                               beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(dim_red, feat['LPF'],
                                                 n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(feat['{}-LPF'.format(feature)],
                                              cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print("\tPlot data and pre-processing")
    for name in (feature, '{}-LPF'.format(feature),
                 '{}(LPF)'.format(dim_red), 'LPF({})'.format(dim_red),
                 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        # stack `length` consecutive frames into one flat row per window
        data_wide = np.array([
            feat[name][m:m + length, :]
            for m in xrange(len(feat[name]) - length)
        ])
        data_wide = data_wide.reshape(data_wide.shape[0],
                                      data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            KM = [KMeans(n_clusters=n_k, init='k-means++').fit(data_wide)
                  for n_k in k_values]

            # compute scores to assess fit
            scores_bic = [computeBic(km, data_wide) for km in KM]
            scores_inertia = [km.inertia_ for km in KM]
            scores_silhouette = [
                silhouette_score(data_wide, km.labels_, metric='euclidean')
                for km in KM
            ]

            # get best clusters
            idx_best_bic = findElbow(np.dstack((k_values, scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((k_values, scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((k_values, scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia,
                     idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        else:  # 'DPGMM'
            n_components = 12
            dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32,
                          alpha=1000, covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # The DPGMM has no per-k scores, so no elbow curves are drawn
            # for it further down.
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)

        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T
        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data, interpolation='nearest', origin='low', aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = [
            "{}:{}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        # of two adjacent change markers keep only the later one
        for c in xrange(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i + 1, 0])
        ax.set_title(name)
        if centroids.shape[1] == 3:
            centroids = centroids.reshape(1, centroids.shape[0],
                                          centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(1, centroids.shape[0] * length,
                                          centroids.shape[1] // length)
        else:
            centroids = centroids.reshape(centroids.shape[0] * length,
                                          centroids.shape[1] // length).T
        ax.imshow(centroids, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        ax.set_xticks(xrange(0, centroids.shape[1], 4))
        ax.set_xticklabels(xrange(k_best))
        ax.grid(False)

        # plot elbow curve (k-means only: needs one score per tried k)
        if clustering == 'KMEANS':
            c = 1
            for k, v, idx in (('BIC', scores_bic, idx_best_bic),
                              ('INERTIA', scores_inertia, idx_best_inertia),
                              ('SILHOUETTE', scores_silhouette,
                               idx_best_silhouette)):
                ax = fig.add_subplot(gs[i + 1, c])
                ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
                ax.plot(k_values, v, 'b*-')
                ax.set_xlim((K_MIN, K_MAX + 1))
                ax.set_xlabel('Number of clusters')
                ax.set_ylabel('Score')
                ax.grid(True)
                ax.axvline(idx + K_MIN, color='r')
                c += 1

        i += 2

    # save with large size
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))
    plt.close(fig)
def __init__(self, cluster_method=2, cluter_tag=False, train_path=None, event_info_path=None, city_id=None): self.loss_choice = 0 # 0:reg; 1:pairwise ranking self.ndim = 20 self.tr_method = 0 # 0:SGD1; 1:SGD2 self.cluster_method = cluster_method # 0:DPGMM; 1:GMM; 2:K-means self.n_components = 20 self.city_id = city_id # SGD self.niters1 = 10 self.lr1 = 0.01 self.lambda1 = 0.001 self.neg_num1 = 5 self.beta1 = 1 self.alpha1 = 1 self.ins_weight = [self.beta1, self.alpha1] pois = [] if cluter_tag == True: events = set( [entry[1] for entry in csv.reader(open(train_path, "r"))]) for entry in csv.reader(open(event_info_path, "r")): event = entry[0] if event in events: poi = map(float, entry[3].split(" ")) pois.append(poi) if not checkGeoScope(poi, self.city_id): print 'Invalic location' sys.exit(1) if self.cluster_method == 0: cluster = DPGMM(n_components=500, covariance_type='diag', alpha=1, n_iter=50) cluster.fit(pois) centers = removeDup(cluster.means_) outputCenterforVis(centers) self.n_components = len(centers) cluster_fd = open(settings["DPGMM_CLUSTER"], "wb") pickle.dump([centers, None], cluster_fd) self.model_path = settings["GEOMF"] outputCenterforVis(centers) elif self.cluster_method == 1: cluster = GMM(n_components=self.n_components, covariance_type='diag', min_covar=1e-7, n_init=10, random_state=0, n_iter=100) cluster.fit(pois) outputCenterforVis(cluster.means_) labels = deterClusterRel(pois, cluster.means_) #showNumInEachCluster(labels, self.n_components) dis_variances = calDisVariance(self.n_components, labels, pois) dis_variances = smoothVar(dis_variances) covars = smoothVar(cluster.covars_) cluster_fd = open(settings["GMM_CLUSTER"], "wb") pickle.dump([cluster.means_, covars, dis_variances], cluster_fd) elif self.cluster_method == 2: cluster = KMeans(n_clusters=self.n_components, max_iter=300, init='k-means++') cluster.fit(pois) means, variances = calCenterCov(self.n_components, cluster.labels_, pois) outputCenterforVis(means) dis_variances = 
calDisVariance(self.n_components, cluster.labels_, pois) variances = smoothVar(variances) dis_variances = smoothVar(dis_variances) cluster_fd = open(settings["KMEANS_CLUSTER"], "wb") pickle.dump([means, variances, dis_variances], cluster_fd) else: print 'Invalid choice of clustering method' sys.exit(1)
data_cluster_train_ds = data_cluster_train """if you want clustering on the dissimilarity space uncomment below and change accordingly""" # print 'Calculating dissimilarity space for training queries...' # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean') # data_cluster_train_ds = sc.squareform(data_cluster_train_ds) # # plt.figure(1) # # plt.imshow(data_cluster_train_ds) # # plt.colorbar() # # plt.title('Initial dissimilarity') print 'Training a Dirichlet Process Gaussian Mixture model...' dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50) dpgmm.fit(data_cluster_train_ds) prediction = dpgmm.predict(data_cluster_train_ds) clusters = np.unique(prediction) print 'Found %i clusters!' % clusters.shape[0] print clusters """create the reordered input data according to the clusters it is only needed if you want to visuallize the clustering afterwards""" #data_cluster = np.zeros((1, data_cluster_train.shape[1])) # each cluster is a list of lists that contains the indices # of the queries for each cluster each_cluster = [] for i in xrange(clusters.shape[0]):
axes[2].imshow(feats_log_normed, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) fig.tight_layout() # Clustering with DP-GMM n_components = 32 dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000, covariance_type='diag', verbose=True) dpgmm.fit(feats_log.T) preds_proba = dpgmm.predict_proba(feats_log.T) preds = np.argmax(preds_proba, axis=1) np.unique(preds) # resynthesis by sampling from clusters resynthesis = dpgmm.means_[preds.astype(int), :] fig, axes = plt.subplots(4, 1, figsize=(18, 8)) axes[0].set_title(feature) axes[1].set_title('Prediction Probability') axes[2].set_title('Resynthesis') axes[3].set_title('Max(Prediction Probability)') axes[0].imshow(feats_log, aspect='auto', origin='low',
class DPGMMClusterModel(BaseEstimator, TransformerMixin):
    """Bag-of-clusters text features built from a DPGMM over word vectors.

    ``fit`` clusters the word2vec vectors of the dictionary words with a
    DPGMM; clusters larger than ``recluster_thresh`` get their own
    sub-model. ``transform`` turns each text into a vector of cluster
    counts: the global clusters first, followed by one block per
    re-clustered cluster.
    """

    def __init__(self, w2v_model=None, n_components=None,
                 no_above=0.9, no_below=8, dataname="", stoplist=None,
                 dictionary=None, recluster_thresh=1000, alpha=5):
        self.w2v_model = w2v_model
        self.no_above = no_above
        self.no_below = no_below
        self.alpha = alpha
        self.n_components = n_components
        # Sub-models use half as many components. Guard against the default
        # n_components=None, which used to raise a TypeError right here.
        if n_components is None:
            self.n_sub_components = None
        else:
            self.n_sub_components = int(n_components / 2)
        self.stoplist = stoplist
        self.dataname = dataname
        self.dictionary = dictionary
        self.dpgmm = None
        self.scaler = None
        self.cluster_info = None
        # a list of sub-clusterer
        self.feature_crd = {}
        self.subdpgmms = []
        self.reclustered = []
        self.recluster_thresh = recluster_thresh

    def should_cluster_word(self, word):
        """Return True if `word` is in the dictionary, longer than one
        character, known to the word2vec model (if any) and not in the
        stoplist (if any)."""
        return (word in self.dictionary.token2id) and (len(word) > 1) and \
               (self.w2v_model is None or word in self.w2v_model) and \
               (self.stoplist is None or word not in self.stoplist)

    # constructs a dictionary and a DPGMM model on 9000 middle frequency words from X
    # X is a sequence of texts
    def fit(self, X, y=None):
        """Build the dictionary / word vectors if missing and fit the DPGMM.

        :param X: sequence of tokenised texts.
        :param y: ignored.
        :return: self
        """
        # either construct a dictionary from X, trim it
        if self.dictionary is None:
            self.dictionary = corpora.Dictionary(X)
        # or use an existing dictionary and trim the given set of words
        self.dictionary.filter_extremes(no_below=self.no_below,
                                        no_above=self.no_above, keep_n=9000)

        if self.w2v_model is None:
            w2v_corpus = [[word for word in text
                           if self.should_cluster_word(word)]
                          for text in X]
            self.w2v_model = w2v_models.build_word2vec(
                w2v_corpus, size=100, window=10, min_count=self.no_below,
                dataname=self.dataname + "_dpgmm")

        word_list = np.array([word for word in self.dictionary.token2id
                              if self.should_cluster_word(word)])

        # This was reclustering clause - I need to re-write this
        # else:
        #     # note the double loop here!!
        #     word_list = np.array([word for text in X for word in text if self.should_cluster_word(word)])

        # construct a list of words to cluster
        # remove rare and frequent words
        # remove words of length 1
        # remove stopwords
        vec_list = [self.w2v_model[word] for word in word_list]

        logging.info("DPGMM received %i words" % len(vec_list))

        # save word representations
        filename = "w2v_vocab_%s_%.1f_%.0f.lcsv" % (self.dataname,
                                                    self.no_above,
                                                    self.no_below)
        io.save_words_representations(filename, word_list, vec_list)

        self.scaler = StandardScaler()
        vecs = self.scaler.fit_transform(np.array(vec_list))

        self.dpgmm = DPGMM(n_components=self.n_components,
                           covariance_type='diag', alpha=self.alpha,
                           n_iter=1000, tol=0.0001)
        self.dpgmm.fit(vecs)
        logging.info("DPGMM converged: %s" % self.dpgmm.converged_)

        # save information about found clusters
        self.cluster_info = []
        y_ = self.dpgmm.predict(vecs)

        for i, cluster_center in enumerate(self.dpgmm.means_):
            cluster_words = word_list[y_ == i]
            cluster_size = len(cluster_words)
            if cluster_size > self.recluster_thresh and self.recluster_thresh > 0:
                logging.info("DPGMM: reclustering %i words for cluster %i"
                             % (len(cluster_words), i))
                sub_dpgmm = DPGMMClusterModel(
                    w2v_model=self.w2v_model,
                    n_components=self.n_sub_components,
                    dictionary=self.dictionary,
                    dataname="%s-%i" % (self.dataname, i),
                    stoplist=self.stoplist)
                # recluster words. Note the double array
                sub_dpgmm.fit([cluster_words])
                self.subdpgmms.append(sub_dpgmm)
                self.reclustered.append(i)
            if cluster_size > 0:
                #cluster_center_original = self.scaler.inverse_transform(cluster_center)
                #similar_words = self.w2v_model.most_similar_cosmul(positive=[cluster_center_original], topn=cluster_size)
                #central_words = [word for word, _ in similar_words if word in cluster_words]
                central_words = cluster_words[0:10]
            else:
                central_words = []
            self.cluster_info.append({'cnt': i, 'size': cluster_size,
                                      'words': central_words})

        filename = "clusters_%s_%i_%.1f_%.0f.txt" % (self.dataname,
                                                     self.n_components,
                                                     self.no_above,
                                                     self.no_below)
        io.save_cluster_info(filename, self.cluster_info)

        # setting up the coordinates for the features
        self.feature_crd = {
            'global': range(0, self.n_components),
            'reclustered': [
                i for i in range(0, self.n_components
                                 + self.n_sub_components * len(self.reclustered))
                if i not in self.reclustered]}

        return self

    # calculate cluster counts for one text
    def clusterize(self, text):
        """Return the cluster bin counts of one tokenised text.

        :param text: iterable of words.
        :return: array of shape ``(1, n_components + n_sub_components *
            len(self.reclustered))`` laid out as
            ``[global, reclustered1, reclustered2, ...]``.
        """
        word_list = [word for word in text if self.should_cluster_word(word)]
        vec_list = np.array([self.w2v_model[word] for word in word_list])
        bincounts = np.zeros(
            (self.n_components + self.n_sub_components * len(self.reclustered),))

        if len(vec_list) > 0:
            # assign words to clusters
            predictions = self.dpgmm.predict(
                self.scaler.transform(np.array(vec_list)))
            global_bincount = np.bincount(predictions,
                                          minlength=self.n_components)
            # re-assign words in large clusters
            bincounts[0:self.n_components] = global_bincount
            start = self.n_components
            for i, subdpgmm in zip(self.reclustered, self.subdpgmms):
                n_sub = subdpgmm.dpgmm.n_components
                # if words in respective clusters exists - recluster them
                vecs_torecluster = vec_list[predictions == i]
                if len(vecs_torecluster) > 0:
                    # Separate name: reusing `predictions` here would
                    # corrupt the mask used for the next sub-model.
                    sub_predictions = subdpgmm.dpgmm.predict(
                        subdpgmm.scaler.transform(np.array(vecs_torecluster)))
                    bincounts[start:start + n_sub] = np.bincount(
                        sub_predictions, minlength=n_sub)
                # advance even when the block stays empty so later blocks
                # keep their position
                start += n_sub
                # erase the count inthe global counts

        # returns a vector of cluster bin counts: [ global, reclustered1, reclustered2, ...]
        return bincounts.reshape((1, len(bincounts)))

    # for a text, constructs a bincount of clusters present in the sentence
    # X is a list of texts. One text is one string! Not tokenized
    def transform(self, X):
        """Turn raw (untokenised) texts into cluster-count vectors.

        :param X: list of strings.
        :return: 2-D array with one row per text. When no model has been
            fitted, a ``(len(X), 1)`` array of zeros is returned.
        """
        # Text pre-processing
        x_clean = [tu.normalize_punctuation(text).split() for text in X]
        logging.info("DPGGM: Text prepocessed")

        # Vectorize using W2V model
        if self.dpgmm is not None:
            logging.info("Vectorizing a corpus")
            size = self.w2v_model.layer1_size
            if len(X) > 0:
                vecs = np.concatenate([self.clusterize(z) for z in x_clean],
                                      axis=0)
            else:
                vecs = np.zeros(size).reshape((1, size))
            logging.info("DPGMM: returning pre-processed data of shape %s"
                         % (vecs.shape, ))
        else:
            logging.info("W2V Averaged: no model was provided.")
            vecs = np.zeros((len(X), 1))

        return vecs
max_components = 8 # Count the number of clusters the DPGMM chooses num_clusters = [] size_sample = [] # Try clustering at different sample sizes for iteration in range(int(np.floor(len(gaussian_data) / 10)) - 2): # Number of samples to use max_sample_value = ((iteration + 2) * 10) sample_set = gaussian_data[0:max_sample_value] size_sample.append(max_sample_value - 0) # Fit Dirichlet Process Gaussian Mixture Model dpgmm_model = DPGMM(n_components = max_components, n_iter=1000, alpha=1.0) fitted_dpgmm = dpgmm_model.fit(sample_set) dpgmm_predictions = fitted_dpgmm.predict(gaussian_data) num_clusters.append(len(set(dpgmm_predictions))) # Append predicted labels to dataframe gaussian_data['predicted'] = dpgmm_predictions # Give a unique color to each category unique_categories = list(set(gaussian_data['predicted'])) color_labels = ['r', 'y', 'g', 'b', 'c', 'm', 'k', 'w'] colors = [color_labels[unique_categories.index(i)] for i in gaussian_data['predicted']] # Plot predicted data plt.scatter(gaussian_data['x'], gaussian_data['y'], c=colors) plt.xlim([-12,12]) plt.ylim([-12,12])
v = vector[0] / sp.linalg.norm(vector[0]) angle = 180* np.arctan(v[1] / v[0]) / np.pi e = Ellipse(xy=center, width=width, height=height, angle=angle, color='m', alpha=0.5, clip_box = ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title(u'GMM', fontsize=20) plt.grid(True) # DPGMM n_components = 3 dpgmm = DPGMM(n_components=n_components, alpha=1, covariance_type='full', random_state=0) dpgmm.fit(x) centers = dpgmm.means_ covs = dpgmm._get_covars() print 'DPGMM均值 = \n', centers print 'DPGMM方差 = \n', covs y_hat = dpgmm.predict(x) # print y_hat ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm) plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o') for i, cc in enumerate(zip(centers, covs)): if i not in y_hat:
data_cluster_test = query_features(testing, 15, 10, 23, data) data_cluster_train_ds = data_cluster_train """if you want clustering on the dissimilarity space uncomment below and change accordingly""" # print 'Calculating dissimilarity space for training queries...' # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean') # data_cluster_train_ds = sc.squareform(data_cluster_train_ds) # # plt.figure(1) # # plt.imshow(data_cluster_train_ds) # # plt.colorbar() # # plt.title('Initial dissimilarity') print 'Training a Dirichlet Process Gaussian Mixture model...' dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50) dpgmm.fit(data_cluster_train_ds) prediction = dpgmm.predict(data_cluster_train_ds) clusters = np.unique(prediction) print 'Found %i clusters!' % clusters.shape[0] print clusters """create the reordered input data according to the clusters it is only needed if you want to visuallize the clustering afterwards""" #data_cluster = np.zeros((1, data_cluster_train.shape[1])) # each cluster is a list of lists that contains the indices # of the queries for each cluster each_cluster = [] for i in xrange(clusters.shape[0]): cluster = data_cluster_train[prediction == clusters[i], :]
#labeled_datafile.close() unlabeled_datafile.close() for chunks in np.arange(1, opts.size, step = 3): # Sample the specified number of points from X_unlabeled size = np.cumsum(chunk_sizes[:chunks])[-1] # Fit a Dirichlet process mixture of Gaussians using up to ten components dpgmm = DPGMM(n_components=10, alpha=10.0, covariance_type='full') indices = np.arange(X_unlabeled.shape[0]) np.random.shuffle(indices) X = X_unlabeled[indices[:size],] print("fitting a model with", size, "data points") with timeit(): dpgmm.fit(X) print("Done!") print("AIC for this model & data: ", dpgmm.aic(X)) print("BIC for this model & data: ", dpgmm.bic(X)) Y_hat = dpgmm.predict(X) print ("Model assigned points to", np.max(Y_hat), "components") # How can I best check this out? #color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm']) #for i, (clf, title) in enumerate([(gmm, 'GMM'), #(dpgmm, 'Dirichlet Process GMM')]): #splot = plt.subplot(2, 1, 1 + i) #Y_ = clf.predict(X) #for i, (mean, covar, color) in enumerate(zip( #clf.means_, clf._get_covars(), color_iter)):
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    """Cluster sliding windows of a feature and save diagnostic figures.

    The raw feature and five low-pass-filtered / dimension-reduced variants
    of it are each cut into windows of ``length`` consecutive frames; the
    flattened windows are clustered with K-means or a DP-GMM.  For every
    variant the figure gets one row with the data (cluster changes marked on
    a twin x axis) and one row with the codebook (centroids) and, for
    K-means only, the model-selection curves.  The figure is saved next to
    the input file at full size and again as ``*_small.png``.

    Parameters
    ----------
    fullpath : str
        Input path; split with ``os.path.splitext`` into the base name
        (also used as prefix of the output images) and the extension.
    order, sr, cutoff
        Forwarded to ``lpf`` as ``lpf(x, cutoff, sr, order)``.
    n_singv
        Forwarded to ``dim_red_fn`` (presumably the number of components
        kept -- confirm against ``dim_red_fn``).
    feature : str
        Feature name handed to ``extractFeature``; also used in titles.
    dim_red : str
        Dimension-reduction name handed to ``dim_red_fn``.
    round_to, normalize, scale
        Forwarded unchanged to ``extractFeature``.
    length : int
        Number of consecutive frames in one clustering window.
    clustering : {'KMEANS', 'DPGMM'}
        Codebook algorithm.

    Raises
    ------
    ValueError
        If ``clustering`` is not one of the supported names.
    """
    # Fail before any expensive work; an unknown name used to surface only
    # later as a NameError on ``centroid_idx``.
    if clustering not in ('KMEANS', 'DPGMM'):
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    # range of K swept by K-means (inclusive)
    K_MIN, K_MAX = 2, 16

    feat = {}
    print('Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
          'n_singv {}, scale {} normalize {}, round_to {}'.format(
              fullpath, feature, order, sr, cutoff, n_singv, scale,
              normalize, round_to))
    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)

    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(
        dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(
        feat['{}-LPF'.format(feature)], cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    # two grid rows per plotted variant (6 variants -> 12 rows)
    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])
    i = 0
    print("\tPlot data and pre-processing")
    for name in (feature,
                 '{}-LPF'.format(feature),
                 '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red),
                 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        # stack `length` consecutive frames into one flat row per window
        data_wide = np.array([feat[name][m:m+length, :]
                              for m in xrange(len(feat[name])-length)])
        data_wide = data_wide.reshape(
            data_wide.shape[0], data_wide.shape[1]*data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            KM = [KMeans(n_clusters=n_k, init='k-means++').fit(data_wide)
                  for n_k in xrange(K_MIN, K_MAX+1)]

            # compute scores to assess fit
            scores_bic = [computeBic(km, data_wide) for km in KM]
            scores_inertia = [km.inertia_ for km in KM]
            scores_silhouette = [silhouette_score(data_wide, km.labels_,
                                                  metric='euclidean')
                                 for km in KM]

            # get best clusters
            idx_best_bic = findElbow(np.dstack(
                (xrange(K_MIN, K_MAX+1), scores_bic))[0])
            idx_best_inertia = findElbow(np.dstack(
                (xrange(K_MIN, K_MAX+1), scores_inertia))[0])
            idx_best_silhouette = findElbow(np.dstack(
                (xrange(K_MIN, K_MAX+1), scores_silhouette))[0])
            idx_best = int(np.median(
                (idx_best_bic, idx_best_inertia, idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_

            elbow_curves = (('BIC', scores_bic, idx_best_bic),
                            ('INERTIA', scores_inertia, idx_best_inertia),
                            ('SILHOUETTE', scores_silhouette,
                             idx_best_silhouette))
        else:  # clustering == 'DPGMM'
            n_components = 12
            dpgmm = DPGMM(
                n_components=n_components, tol=1e-3, n_iter=32, alpha=1000,
                covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # get clusters and cluster allocations
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)

            # A single DP-GMM fit has no sweep over K, so there are no
            # per-K score curves or elbow indices to draw.
            elbow_curves = ()

        # plot data
        if data.shape[1] == 3:
            # three columns: show the frames as a single row of RGB pixels
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T

        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data, interpolation='nearest', origin='lower',
                  aspect='auto', cmap=plt.cm.Oranges)
        # presumably beat times are in seconds, shown as minutes:seconds
        xlabels = ["{}:{}".format(int(x / 60), int(x % 60))
                   for x in beat_times[::step_size]]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        # when two consecutive positions are both change points, keep only
        # the later one so the tick labels do not pile up
        for c in xrange(changes.shape[0]-1):
            if changes[c] and changes[c+1]:
                changes[c] = False

        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i+1, 0])
        ax.set_title(name)
        # `//` keeps the reshape arguments integral on Python 2 and 3 alike
        if centroids.shape[1] == 3:
            centroids = centroids.reshape(
                1, centroids.shape[0], centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(
                1, centroids.shape[0]*length, centroids.shape[1]//length)
        else:
            centroids = centroids.reshape(
                centroids.shape[0] * length, centroids.shape[1] // length).T

        ax.imshow(centroids, interpolation='nearest', origin='lower',
                  aspect='auto', cmap=plt.cm.Oranges)
        ax.set_xticks(xrange(0, centroids.shape[1], 4))
        ax.set_xticklabels(xrange(k_best))
        ax.grid(False)

        # plot elbow curves in the remaining columns of the codebook row
        for col, (label, scores, idx) in enumerate(elbow_curves, 1):
            ax = fig.add_subplot(gs[i+1, col])
            ax.set_title('{}, {} best K {}'.format(name, label, idx+K_MIN))
            ax.plot(xrange(K_MIN, K_MAX+1), scores, 'b*-')
            ax.set_xlim((K_MIN, K_MAX+1))
            ax.set_xlabel('Number of clusters')
            ax.set_ylabel('Score')
            ax.grid(True)
            ax.axvline(idx+K_MIN, color='r')

        i += 2

    out_base = "{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red)

    # save with large size
    plt.tight_layout()
    plt.savefig(out_base + ".png")

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig(out_base + "_small.png")
    plt.close(fig)
def dpgmm_simple(X, init_numC, random_state):
    """Fit a DPGMM to ``X`` and report the clusters it actually uses.

    ``init_numC`` is passed as ``n_components`` and ``random_state`` is
    forwarded to the model.  Returns ``(cluster_num, y)`` where ``y`` holds
    the predicted label of every row of ``X`` and ``cluster_num`` is the
    number of distinct labels in ``y``.
    """
    estimator = DPGMM(
        n_components=init_numC,
        n_iter=100,
        tol=0.000001,
        random_state=random_state,
    )
    estimator.fit(X)
    assignments = estimator.predict(X)
    # count only the components that received at least one sample
    distinct_labels = np.unique(assignments)
    return len(distinct_labels), assignments
Y = ds.target return X, Y def test1(): print 'test1' model = VDPGMM(T = 10, alpha = 1, max_iter = 50) X, Y = getXY('iris') model.fit(X) y = model.predict(X) print 'VDPGMM' print len(np.unique(y)), np.unique(y) print [np.sum(y == label) for label in np.unique(y)] from sklearn.mixture import DPGMM model = DPGMM(n_components = 10, alpha = 1, n_iter = 50) model.fit(X) y = model.predict(X) print 'DPGMM' print len(np.unique(y)), np.unique(y) print [np.sum(y == label) for label in np.unique(y)] def test2(): print 'test2' np.random.seed(1) X = np.concatenate((2 + np.random.randn(100, 2), 5 + np.random.randn(100, 2), 10 + np.random.randn(100, 2))) T = 10 model = VDPGMM(T=T, alpha=.5, max_iter=100, thresh=1e-5) model.fit(X) plt.clf() h = plt.subplot()
img.reshape((1, img.shape[0], img.shape[1], img.shape[2])), -1, 1) img = vgg16.preprocess_input(img.astype('float32')) """ Scaling activations to fit random initialization scheme""" actvs = get_activations(model, layer, img).squeeze() actvs /= np.max(actvs) * 0.1 """ Clustering with dirichlet process Gaussian Mixture Model""" dpgmm = DPGMM(n_components=50, alpha=1, verbose=2, tol=0.01, n_iter=250, min_covar=1e-6) #dpgmm = BayesianGaussianMixture(n_components=50, covariance_type="diag", reg_covar = 1e-6, # weight_concentration_prior_type="dirichlet_process", # weight_concentration_prior=1, verbose=2, # tol=0.01, max_iter=250, init_params='random', # mean_precision_prior=actvs.std(), # mean_prior=np.repeat(actvs.max()/5,actvs.shape[0])) dpgmm.fit( np.transpose(actvs.reshape(actvs.shape[0], actvs.shape[1] * actvs.shape[2]))) labels = dpgmm.predict( np.transpose(actvs.reshape(actvs.shape[0], actvs.shape[1] * actvs.shape[2]))) labels = labels.reshape((actvs.shape[1], actvs.shape[2])) plt.subplot(1, 2, 2) plt.imshow(labels, interpolation="nearest") plt.title('Labelmap from layer ' + str(layer))
# Build plain arrays from the train / test tables: column 1 of the train
# table is used as y, columns from index 2 on are used as the features.
# NOTE(review): `train` and `test` are presumably pandas DataFrames defined
# earlier in the script -- confirm.
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# In[5]:

# Rebind `df` to a frame that keeps only the SK_ID_CURR column of the
# previous `df`; the mixture-model feature is added to it below.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})
print('dirichlet process gaussian mixture begins****************')
dpgmm = DPGMM(n_components=3)
print('fitting****************')
# NOTE(review): DPGMM is an unsupervised mixture model, so `y` is presumably
# ignored by fit() -- confirm against the installed scikit-learn version.
dpgmm_train = dpgmm.fit(X, y)
print('predicting on train****************')
# keep only the membership probability of the component at index 1
dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1]
print('predicting on test****************')
dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1]
# Train predictions first, then test predictions.
# assumes the rows of `df` are ordered train-then-test and match in length
# -- TODO confirm
tr_te_concatenated = np.concatenate(
    [dpgmm_X_prediction, dpgmm_X_test_prediction])
df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated
print('final tr_te shape', df.shape)
print(df.head())
# write the id column plus the new feature, without the index column
df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)
print(df.head())
clip_box=ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title(u'GMM', fontsize=20) plt.grid(True) # DPGMM n_components = 3 dpgmm = DPGMM(n_components=n_components, alpha=1, covariance_type='full', random_state=0) dpgmm.fit(x) centers = dpgmm.means_ covs = dpgmm._get_covars() print 'DPGMM均值 = \n', centers print 'DPGMM方差 = \n', covs y_hat = dpgmm.predict(x) # print y_hat ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm) plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o') for i, cc in enumerate(zip(centers, covs)): if i not in y_hat: