# Fit a two-component Gaussian mixture to the intensity column of a histogram CSV.
#
# The path is a raw string: it contains backslash sequences such as "\U", which a
# normal string literal may interpret as (invalid) escape sequences.
with open(r'C:\Users\kenny\Desktop\CAAM 495 - Senior Design\LandLL\histograms\code\histogram_2.csv') as histfile:
    histdata = csv.reader(histfile)
    intensities = []
    frequencies = []
    histogram = []
    for row in histdata:
        # if int(row[0]) > -500:  <-- Use this line of code to block out noise
        # histogram.append([float(i) for i in row])
        intensities.append(float(row[0]))  # first column only
        # frequencies.append(float(row[1]))

# GMM.fit expects a 2-D (n_samples, n_features) array, hence the column reshape.
n = len(intensities)
x = np.asarray(intensities, order='F').reshape(n, 1)
# plt.hist(myarray,bins=256)
# plt.show()
print(x)

gmm = mixture.GMM(n_components=2)  # gmm for two components
gmm.fit(x)  # train it
print(gmm)

# linspace = np.linspace(-10, 10, 1000)
#
# fig, ax1 = plt.subplots()
# ax2 = ax1.twinx()
#
# ax1.hist(x, 100)  # draw samples
# ax2.plot(linspace, np.exp(gmm.score_samples(linspace)[0]), 'r')  # draw GMM
# plt.show()
# Generate random sample following a sine curve np.random.seed(0) X = np.zeros((n_samples, 2)) step = 4 * np.pi / n_samples for i in xrange(X.shape[0]): x = i * step - 6 X[i, 0] = x + np.random.normal(0, 0.1) X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2)) color_iter = itertools.cycle( ['navy', 'turquoise', 'cornflowerblue', 'darkorange']) for i, (clf, title) in enumerate([ (mixture.GMM(n_components=10, covariance_type='full', n_iter=100), "Expectation-maximization"), (mixture.DPGMM(n_components=10, covariance_type='full', alpha=0.01, n_iter=100), "Dirichlet Process,alpha=0.01"), (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100., n_iter=100), "Dirichlet Process,alpha=100.") ]): clf.fit(X) splot = plt.subplot(3, 1, 1 + i) Y_ = clf.predict(X) for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(),
corr_list.append(corr) file.close() corr_list = np.array(corr_list) ##============================= part#1: PDF of the original data ============================== plt.hist(corr_list, histtype='step', bins=500, normed=1, alpha=0.5) plt.plot([], [], color='blue', label="original pdf") #plt.title("Hidden batch distribution after one sample (with the initialization we calculated)") #plt.xlabel("Value of hidden batch") #plt.ylabel("Frequency") #plt.axis([-1, 1, 0, 7]) # un-normalized version: y ~ [0, 300]; normalized version: y ~ [0, 7] #plt.show() ##============================= part#2: get the two Gaussian mixture ============================== np.random.seed(1) g = mixture.GMM(n_components=2) ##=========== transform the corr_list into the required format (a wired format) =========== list = [] for corr in corr_list: list.append([corr]) corr_list = np.array(list) obs = corr_list #obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) g.fit(obs) #GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, n_components=2, n_init=1, n_iter=100, params='wmc', random_state=None, thresh=None, tol=0.001) ##=========== get model parameters =========== ## show the learned model: # weight:
import pylab as pl
from sklearn import mixture

# Draw one correlated 2-D Gaussian sample, save a scatter plot of it, then fit a
# single-component GMM per covariance type and save a contour plot of each fit.
n_samples = 300
c_types = ['full', 'diag', 'spherical']

np.random.seed(0)
C = np.array([[0., -0.7], [3.5, 1.7]])
X_train = np.dot(np.random.randn(n_samples, 2), C)

# Raw data figure.
pl.figure(dpi=100, figsize=(3, 3))
pl.scatter(X_train[:, 0], X_train[:, 1], .8)
pl.axis('tight')
pl.savefig('GaussianFit-data.svg')
pl.close()

# Evaluation grid; it does not depend on the covariance type.
x = np.linspace(-15.0, 20.0, num=200)
y = np.linspace(-10.0, 10.0, num=200)
X, Y = np.meshgrid(x, y)
XX = np.c_[X.ravel(), Y.ravel()]  # flatten

for c_type in c_types:
    clf = mixture.GMM(n_components=1, covariance_type=c_type)
    clf.fit(X_train)

    # First element returned by eval() is negated and log-scaled for contouring.
    Z = np.log(-clf.eval(XX)[0]).reshape(X.shape)

    pl.figure(dpi=100, figsize=(3, 3))
    CS = pl.contour(X, Y, Z)
    pl.scatter(X_train[:, 0], X_train[:, 1], .8)
    pl.axis('tight')
    pl.savefig('GaussianFit-%s.svg' % c_type)
    pl.close()
meanDist = run_grmean_meanDist(samples[sample], dataDir, thr, hemi) stdev = run_stdev_meanDist(samples[sample], dataDir, thr, hemi) meanDist_norm = np.zeros((10242)) meanDist_norm[cort] = (meanDist[np.nonzero(meanDist)] - meanDist[np.nonzero(meanDist)].mean() ) / meanDist[np.nonzero(meanDist)].std() trt_4 = run_icc_meanDist(samples[sample], dataDir, fsDir, thr, hemi, ['1a', '1b', '2a', '2b']) trt_2 = run_icc_meanDist(samples[sample], dataDir, fsDir, thr, hemi, ['1ab', '2ab']) nan_mask = run_nan_grmask(samples[sample], dataDir, hemi) data_gmm = {} for n_comp in num_gmm_comps: data = meanDist * 1000 gmm = mixture.GMM(n_components=n_comp, n_iter=1000) gmm.fit(data[cort]) bic = gmm.bic(data[cort]) aic = gmm.aic(data[cort]) res = np.zeros(10242) res[cort] = gmm.predict(data[cort]) res[cort] = res[cort] + 1 data_gmm[n_comp] = res homogeneity = homogeneity_score(res[cort], yeo7[0][cort]) df_gmm_eval.loc[len(df_gmm_eval)] = [ str(thr), hemi, n_comp, bic, aic, homogeneity, gmm.converged_ ] for node in range(10242): df.loc[len(df)] = [
d1_2=sc.spatial.distance.cdist(segedpsds1[lone_half:lone],basis_set,'sqeuclidean') d2=sc.spatial.distance.cdist(segedpsds2[:ltwo_half],basis_set,'sqeuclidean') d2_2=sc.spatial.distance.cdist(segedpsds2[ltwo_half:ltwo],basis_set,'sqeuclidean') mx=np.max([np.max(d1),np.max(d2),np.max(d1_2),np.max(d2_2)]) #convert to similarity matrices: s1=1-(d1/mx) s1_2=1-(d1_2/mx) s2=1-(d2/mx) s2_2=1-(d2_2/mx) #estimate GMMs: mod1=mixture.GMM(n_components=k,n_iter=100000,n_init=5,covariance_type='full') mod1.fit(s1) mod2=mixture.GMM(n_components=k2,n_iter=100000,n_init=5,covariance_type='full') mod2.fit(s2) len2=len(s2) len1=len(d1) #calculate likelihoods for held out data: score1_1=mod1.score(s1_2) score2_1=mod2.score(s1_2) score1_2=mod1.score(s2_2) score2_2=mod2.score(s2_2)
def cluster_gmm(do_plot=False):
    """Cluster the normalised (x, y) values of every SNP with a BIC-selected GMM.

    For each record yielded by ``hqa.generate_snp_data()`` the ``x_norm`` /
    ``y_norm`` values are collected, rows containing non-finite values are
    dropped, and GMMs with 1..5 components are fitted; the model with the
    lowest BIC is kept and used to predict a cluster per sample.

    Parameters
    ----------
    do_plot : bool
        When true, save a scatter/ellipse figure per SNP to
        ``imgout/<snp_id>-best.png``. Defaults to False, which matches the
        previously hard-coded behaviour.
    """
    for snp_data_for_all_samples in hqa.generate_snp_data():
        snp_id = snp_data_for_all_samples['snp_id']
        print(snp_id)

        snp_data = snp_data_for_all_samples['snp_data']
        x_vals = [
            float('nan') if s['x_norm'] is None else s['x_norm']
            for s in snp_data
        ]
        y_vals = [
            float('nan') if s['y_norm'] is None else s['y_norm']
            for s in snp_data
        ]
        X = np.transpose([x_vals, y_vals])

        # remove any non-finite values
        X = X[np.all(np.isfinite(X), axis=1), ]
        if len(X) == 0:
            # Nothing to fit; GMM.fit would raise on an empty sample.
            print('SNP ID: {}, no finite samples, skipping'.format(snp_id))
            continue

        print('finding model')
        best_model, best_n_components, best_cv_type, lowest_info_crit = \
            _select_gmm_by_bic(X, snp_id)
        print('found best n_components: {}, cv_type: {}'.format(
            best_n_components, best_cv_type))

        Y_ = best_model.predict(X)

        if do_plot:
            _plot_snp_clusters(
                X, Y_, best_model,
                'SNP ID: {}, # Clusts: {}, BIC: {}'.format(
                    snp_id, best_n_components, lowest_info_crit),
                'imgout/{}-best.png'.format(snp_id))


def _select_gmm_by_bic(X, snp_id, cv_types=('tied',), max_components=5):
    """Return (model, n_components, cv_type, bic) of the lowest-BIC GMM fit."""
    best = None
    # Never ask for more components than there are samples (fit would raise).
    upper = min(max_components, len(X))
    for cv_type in cv_types:  # other options: 'spherical', 'diag', 'full'
        print(cv_type)
        for n_components in range(1, upper + 1):
            model = mixture.GMM(n_components=n_components,
                                covariance_type=cv_type,
                                n_iter=100)
            model.fit(X)
            info_crit = model.bic(X)
            print('SNP ID: {}, # Clusts: {}, BIC: {}'.format(
                snp_id, n_components, info_crit))
            if best is None or info_crit < best[3]:
                best = (model, n_components, cv_type, info_crit)
    return best


def _plot_snp_clusters(X, Y_, model, title, out_path):
    """Save a scatter plot of the clusters with one ellipse per used component."""
    color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
    fig = pylab.figure()
    ax = fig.add_subplot(111, aspect='equal')

    for j, (mean, covar, color) in enumerate(
            zip(model.means_, model._get_covars(), color_iter)):
        # Components that received no samples are not drawn.
        members = Y_ == j
        if not np.any(members):
            continue

        v, w = linalg.eigh(covar)
        u = w[0] / linalg.norm(w[0])
        ax.scatter(X[members, 0], X[members, 1], .8, color=color)

        # Plot an ellipse to show the Gaussian component. arctan2 gives the
        # same orientation (mod 180 degrees) without dividing by u[0].
        angle = 180 * np.arctan2(u[1], u[0]) / np.pi  # convert to degrees
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180 + angle,
                                  color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

    fig.suptitle(title)
    fig.savefig(out_path, bbox_inches='tight')
    # Close (not just clear) the figure so one figure per SNP does not pile up.
    pylab.close(fig)
# Generate random sample following a sine curve np.random.seed(0) X = np.zeros((n_samples, 2)) step = 4*np.pi/n_samples for i in xrange(X.shape[0]): x = i*step-6 X[i,0] = x+np.random.normal(0, 0.1) X[i,1] = 3*(np.sin(x)+np.random.normal(0, .2)) color_iter = itertools.cycle (['r', 'g', 'b', 'c', 'm']) for i, (clf, title) in enumerate([ (mixture.GMM(n_components=10, covariance_type='diag'), \ "Expectation-maximization"), (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=0.01), "Dirichlet Process,alpha=0.01"), (mixture.DPGMM(n_components=10, covariance_type='diag', alpha=100.), "Dirichlet Process,alpha=100.") ]): clf.fit(X, n_iter=100) splot = pl.subplot(3, 1, 1+i) Y_ = clf.predict(X) for i, (mean, covar, color) in enumerate(zip(clf.means, clf.covars, color_iter)): v, w = linalg.eigh(covar) u = w[0] / linalg.norm(w[0]) # as the DP will not use every component it has access to
import matplotlib.pyplot as plt fig, ax = plt.subplots() ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, extent=[xmin, xmax, ymin, ymax]) #ax.plot(gps[:,0], gps[:,1], 'k.', markersize=1) ax.set_xlim([xmin, xmax]) ax.set_ylim([ymin, ymax]) plt.show() n_components = 10 covariance_type = 'spherical' # 'full'#'diag' gmm = mixture.GMM(n_components=n_components, covariance_type=covariance_type, min_covar=0.00001, n_iter=1000) #gmm._set_covars(np.ones((n_components, 2)) * 0.05) gmm.fit(data) import itertools import matplotlib as mpl def plot_mixture(gmm, ax, scale=1.0): color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm']) for n, color in zip(range(gmm.n_components), color_iter): v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2]) u = w[0] / np.linalg.norm(w[0]) angle = np.arctan2(u[1], u[0]) angle = 180 * angle / np.pi # convert to degrees
def fit_gaussians_etc(x, y, surrogate_repeat, gain, run_direc, file, savefig=True, fz=14):
    """Fit a two-component Gaussian mixture to a firing-rate profile and plot it.

    The rate profile ``y`` over positions ``x`` is smoothed, converted into a
    pseudo-sample of positions (each ``x`` repeated proportionally to its
    rate), and fitted with a 2-component full-covariance GMM. Each fitted
    Gaussian is rescaled so that its peak equals the maximum of ``y`` found in
    a window around the Gaussian's peak; the Gaussian whose peak is nearest to
    the global maximum of ``y`` is rescaled to that global maximum instead.

    Parameters
    ----------
    x : array
        Bin positions (evenly spaced is assumed by the half-step resampling).
    y : array
        Firing rate per bin.
    surrogate_repeat, run_direc, savefig
        Accepted for interface compatibility; not used by this function.
    gain : number
        Divides the peak-distance cutoff when ``file`` ends with
        ``'normalised.hkl'``.
    file : str
        File name; only its suffix is inspected.
    fz : int
        Font size of the axis labels.

    Returns
    -------
    (fig22, ax22)
        The matplotlib figure and axes holding the plot.

    Raises
    ------
    ValueError
        If the smoothed rate profile has a maximum of zero.
    """
    # window length is in bins (1 FR bin = 2cm) => 3*2cm = 6cm Kernel
    # (Chen = 8.5cm, Ravassard = 5cm)
    y = signale.tools.smooth(y, window_len=3.)

    # generate one more x-point between all existing x-points
    half_step = numpy.diff(x)[0] / 2.
    x_doublePointNum = numpy.arange(x[0], x[-1] + half_step, half_step)

    # in case firing rates are too small (<5) rescale them so that the
    # resulting histogram represents the shape of the data better
    peak_rate = max(y)
    if peak_rate == 0:
        raise ValueError("firing-rate profile is all zero; nothing to fit")
    if peak_rate < 5:
        input_y = 5. * (y / peak_rate)
    else:
        # NOTE(review): `multi` is not defined in this function; it has to be
        # provided at module level.
        input_y = multi * y

    # generate data histogram: each position repeated by its rounded rate
    data = numpy.repeat(x, numpy.around(input_y).astype(int))

    # fit two gaussians
    gmm = mixture.GMM(n_components=2, covariance_type='full', min_covar=0.0000001)
    gmm.fit(numpy.vstack(data))

    # get functions for two fitted gaussians
    gauss1 = (gmm.weights_[0] * matplotlib.mlab.normpdf(
        x_doublePointNum, gmm.means_[0], numpy.sqrt(gmm.covars_[0])))[0]
    gauss2 = (gmm.weights_[1] * matplotlib.mlab.normpdf(
        x_doublePointNum, gmm.means_[1], numpy.sqrt(gmm.covars_[1])))[0]

    # basic values for the FR distribution y
    std = numpy.std(y)
    mean = numpy.mean(y)

    # basic values for the gaussians
    mg1a = max(gauss1)
    mg2a = max(gauss2)
    stdg1 = numpy.sqrt(gmm.covars_[0])[0][0]
    stdg2 = numpy.sqrt(gmm.covars_[1])[0][0]

    # x difference between the two gaussian peaks
    xDiff = abs(x_doublePointNum[numpy.argmax(gauss1)] - x_doublePointNum[numpy.argmax(gauss2)])

    # x-window cutoff depending on the normalisation of the input file
    if file.endswith('normalised.hkl'):
        xDiff_cutoff = 0.2 / float(gain)
    else:
        xDiff_cutoff = 0.20

    # divisor of the standard deviation that defines the search window;
    # for larger fields 1/8 of the std is used
    # NOTE(review): the chained comparison means
    # (xDiff > cutoff) and (cutoff < max(x)/4); confirm this is intended.
    std_dev1 = 2.
    std_dev2 = 2.
    if xDiff > xDiff_cutoff < (1. / 4.) * max(x) and stdg1 > 0.3:
        std_dev1 = 8.
    if xDiff > xDiff_cutoff < (1. / 4.) * max(x) and stdg2 > 0.3:
        std_dev2 = 8.

    # FR maximum in the window around each gaussian peak
    g1_maxFR = _max_rate_near_peak(gauss1, stdg1, std_dev1, x, x_doublePointNum, y)
    g2_maxFR = _max_rate_near_peak(gauss2, stdg2, std_dev2, x, x_doublePointNum, y)

    # set gauss closest to y distribution maximum to its maximum
    nearest_yMax_gauss = signale.tools.findNearest(
        numpy.array([x_doublePointNum[numpy.argmax(gauss1)],
                     x_doublePointNum[numpy.argmax(gauss2)]]),
        x[numpy.argmax(y)])[0]
    if nearest_yMax_gauss == 0:
        g1_maxFR = max(y)
    else:
        g2_maxFR = max(y)

    # first normalise each gauss to max=1, then multiply with its new maximum
    gauss1 = g1_maxFR * (gauss1 / mg1a)
    gauss2 = g2_maxFR * (gauss2 / mg2a)

    # plot colours: the bigger gauss is drawn in red, the smaller in black
    if max(gauss1) >= max(gauss2):
        colour = ['r', 'k']
    else:
        colour = ['k', 'r']

    # plot data and gaussians from mixture model
    fig22, ax22 = pl.subplots(1, 1, figsize=(18, 12))
    ax22.axhline(mean, linestyle='-', color=custom_plot.pretty_colors_set2[0], alpha=0.8, zorder=0)
    ax22.axhspan(mean - std, mean + std, facecolor=custom_plot.pretty_colors_set2[0], alpha=0.2,
                 linewidth=False, zorder=0)
    ax22.plot(x, y, 'b')
    ax22.plot(x_doublePointNum, gauss1, linewidth=2, color=colour[0])
    ax22.plot(x_doublePointNum, gauss2, linewidth=2, color=colour[1])
    ax22.plot(x_doublePointNum, gauss1 + gauss2, linewidth=2, color='g')
    ax22.set_xlabel('Position from start point (m)', fontsize=fz)
    ax22.set_ylabel('Firing rate (Hz)', fontsize=fz)
    ax22.set_ylim(0, max(gauss1 + gauss2) + 0.01)
    ax22.set_xlim(0, max(x))

    return fig22, ax22


def _max_rate_near_peak(gauss, sigma, sigma_divisor, x, x_fine, y):
    """Return max(y) over the index window of +/- sigma/sigma_divisor around the gauss peak."""
    peak = numpy.argmax(gauss) + 1
    if peak >= len(x) - 1:
        # fit maximum is outside the data: use a position close to the track end
        peak = len(x) - 3

    # indices between which the FR maximum is searched
    centre = x_fine[peak - 1]
    half_width = sigma / sigma_divisor
    lo = numpy.argmin(abs(x_fine[0:peak] - (centre - half_width)))
    hi = numpy.argmin(abs(x_fine[peak:-1] - (centre + half_width))) + peak

    # special cases
    if lo >= len(y) - 1 and hi + 1 >= len(y) - 1:
        # whole window lies outside the data
        lo = len(y) - 5
        hi = len(y) - 2
    if lo == hi + 1:
        # empty window: widen it on both sides
        lo -= 2
        hi += 2
    if lo < 0:
        lo = 0
    if hi + 1 > len(y) - 1:
        # only the upper index lies outside the data
        hi = len(y) - 2

    return max(y[lo:hi + 1])
plt.xticks(()) plt.yticks(()) plt.show() #Alternative clustering methods aff = cluster.AffinityPropagation() aff.fit(X_train) print(aff.cluster_centers_indices_.shape) ms = cluster.MeanShift() ms.fit(X_train) print(ms.cluster_centers_.shape) from sklearn import mixture gm = mixture.GMM(n_components=n_digits, covariance_type='tied', random_state=42) gm.fit(X_train) # Print train clustering and confusion matrix y_pred = gm.predict(X_test) print("Adjusted rand score:{:.2}".format( metrics.adjusted_rand_score(y_test, y_pred))) print("Homogeneity score:{:.2} ".format( metrics.homogeneity_score(y_test, y_pred))) print("Completeness score: {:.2} ".format( metrics.completeness_score(y_test, y_pred)))
S2.append(x) else: S3.append(x) # print(S1) # print(S2) # print(S3) # 'spherical', 'diag', 'tied', 'full' covariance = 'tied' n_gauss = 4 accuracy_results = [] for i in xrange(num_simulation): gmix1 = mixture.GMM(n_components=n_gauss, covariance_type=covariance) gmix1.fit(S1) # print gmix1.means_ gmix2 = mixture.GMM(n_components=n_gauss, covariance_type=covariance) gmix2.fit(S2) # print gmix2.means_ gmix3 = mixture.GMM(n_components=n_gauss, covariance_type=covariance) gmix3.fit(S3) # print gmix3.means_ y_train_pred1 = gmix1.score_samples(X_train) y_train_pred2 = gmix2.score_samples(X_train) y_train_pred3 = gmix1.score_samples(X_train)
data = pd.read_csv( 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data', sep=",", header=None) colnames = [ 'preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class' ] data.columns = colnames X, y = data.iloc[:, :-1], data.iloc[:, -1] X.columns = colnames[:len(colnames) - 1] rp = SparseRandomProjection(n_components=6) projected_data = rp.fit_transform(X) gm = mixture.GMM(n_components=2, covariance_type='diag') gm.fit(projected_data) X_expect = y y_pred = gm.predict(projected_data) both = pd.concat([pd.DataFrame(y_pred), pd.DataFrame(y)], 1) both.columns = ['pred', 'class'] from sklearn.metrics import accuracy_score print "Accuracy" print accuracy_score(both['class'], both['pred']) for k in range(1, 8): model = mixture.GMM(n_components=k, covariance_type='diag') labels = model.fit_predict(projected_data) if k == 2:
def M_EM(gm, X):
    """Fit a diagonal-covariance GMM to ``X`` with scikit-learn's EM.

    The number of components is taken from ``gm.k``. EM is restarted 5 times
    with at most 10 iterations each and a convergence threshold of 1e-2.

    Returns
    -------
    tuple
        ``(means_, covars_)`` of the fitted scikit-learn model.
    """
    from sklearn import mixture

    em_options = dict(
        n_components=gm.k,
        covariance_type='diag',
        n_init=5,
        n_iter=10,
        thresh=1e-2,
    )
    fitted = mixture.GMM(**em_options)
    fitted.fit(X)
    return fitted.means_, fitted.covars_
def cluster_and_display_waters(site_number, w_positions_np):
    """Cluster water positions with a spherical GMM and display them in Coot.

    The number of components (1..20, always fewer than the number of
    positions) is chosen by a penalised log-likelihood score. Every position
    is added to a Coot generic object coloured by its cluster, and every
    cluster mean is drawn with ``cluster_star_obj``. Clusters whose mean lies
    within 1.0 of the origin are rejected.

    Parameters
    ----------
    site_number
        Site identifier, used only in the generic-object names.
    w_positions_np
        Sequence/array of 3-D positions; at least two are required.

    Returns
    -------
    tuple
        ``(fitted_gmm, cluster_assignments)`` with the assignments as a list
        of ints.

    Raises
    ------
    ValueError
        If fewer than two positions are given, so that no model can be fitted.
    """
    lambda_c = 15  # 3 too few

    def optimize_n(positions_np, n_data):
        """Return the component count with the highest penalised score."""
        bic = {}
        for n in range(1, 21):
            if n >= len(positions_np):
                break
            gmm = mixture.GMM(n_components=n, covariance_type='spherical', n_iter=20)
            gmm.fit(positions_np)
            score = sum(gmm.score(positions_np))
            bic[n] = score - lambda_c * 0.5 * math.log(n_data) * n
        for key in bic:
            print(" water bic", key, bic[key])
        if not bic:
            raise ValueError(
                "need at least two water positions to cluster, got %d"
                % len(positions_np))
        return max(bic, key=bic.get)

    n_components = optimize_n(w_positions_np, len(w_positions_np))
    print("optimize_n for water:::::::::::::", n_components)

    dpgmm = mixture.GMM(n_components, covariance_type='spherical', n_iter=40)
    dpgmm.fit(w_positions_np)
    cluster_assignments = dpgmm.predict(w_positions_np)

    # The palette is cycled by modulo indexing when there are more clusters
    # than colours.
    color_list = [
        'green', 'greentint', "sea", 'yellow', "yellowtint", "aquamarine",
        "forestgreen", "goldenrod", "orangered", "orange", "cyan", 'red',
        "blue"
    ]

    means = dpgmm.means_
    cvs = dpgmm._get_covars()

    def dist_sq_from_origin(point):
        return point[0] * point[0] + point[1] * point[1] + point[2] * point[2]

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " selected waters")
    for i, pos in enumerate(w_positions_np):
        cluster = cluster_assignments[i]
        # reject spheres at the origin - (from DPGMM strangeness)
        if dist_sq_from_origin(means[cluster]) > 1.0:
            col = color_list[cluster % len(color_list)]
            coot.to_generic_object_add_point(obj, col, 10, pos[0], pos[1], pos[2])
        else:
            print("reject prediction", i, "for cluster", cluster)
    # set_display_generic_object(obj, 1)

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " water cluster means")
    for i, cv in enumerate(cvs):
        mean = means[i]
        v, w = linalg.eigh(cv)
        if dist_sq_from_origin(mean) > 1.0:
            thick = 2
            cluster_star_obj(obj, mean, thick, v[0])
        else:
            print("reject", mean, v)
    coot.set_display_generic_object(obj, 1)

    cluster_assignments_as_list = [int(x) for x in cluster_assignments]
    return (dpgmm, cluster_assignments_as_list)
print("Testing accuracy: ", clf.score(X_test, y_test_bin)) print("Zero: ", sum(y_test_bin == np.zeros(y_test_bin.shape[0])) / y_test_bin.shape[0]) print("One: ", sum(y_test_bin == np.ones(y_test_bin.shape[0])) / y_test_bin.shape[0]) scores = cross_val_score(clf, X_train, X_test, cv=5) print("Cross Val: ", scores) print("#################") ##### GMM ####### print("GMM") clf = mixture.GMM(n_components=4) clf.fit(X_train) print("GMM score") print(clf.score_samples(X_test)) print("#################") ##### Naive Bayes ####### print("Gaussian Naive Bayes") clf = GaussianNB() clf.fit(X_train, y_train) print("Training accuracy: ", clf.score(X_train, y_train)) print("Testing accuracy: ", clf.score(X_test, y_test))
def do_system_training(dataset,
                       model_path,
                       feature_normalizer_path,
                       feature_path,
                       classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm',
                       overwrite=False):
    """System training

    model container format:

    {
        'normalizer': normalizer class
        'models' :
            {
                'office' : mixture.GMM class
                'home' : mixture.GMM class
                ...
            }
    }

    In 'dnn' mode the 'models' entry stays empty; the trained network is
    written separately to 'dnn/dnnmodel1'.

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    classifier_params : dict
        parameter dict, passed as keyword arguments to the classifier
        constructor.

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode, 'full' all material available is considered to
        belong to one fold.
        (Default value='folds')

    classifier_method : str ['gmm', 'dnn']
        classifier method: one GMM per scene label ('gmm') or a single DNN
        classifier over all labels ('dnn').
        (Default value='gmm')

    overwrite : bool
        overwrite existing models
        (Default value=False)

    Returns
    -------
    nothing

    Raises
    -------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """
    if classifier_method not in ('gmm', 'dnn'):
        raise ValueError("Unknown classifier method [" + classifier_method + "]")

    # Check that target path exists, create if not
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        if os.path.isfile(current_model_file) and not overwrite:
            # Model for this fold already exists.
            continue

        # Load normalizer
        feature_normalizer_filename = get_feature_normalizer_filename(
            fold=fold, path=feature_normalizer_path)
        if not os.path.isfile(feature_normalizer_filename):
            raise IOError("Feature normalizer not found [%s]" %
                          feature_normalizer_filename)
        normalizer = load_data(feature_normalizer_filename)

        # Initialize model container
        model_container = {'normalizer': normalizer, 'models': {}}

        # Collect training examples
        train_items = dataset.train(fold)
        file_count = len(train_items)
        data = {}
        for item_id, item in enumerate(train_items):
            progress(title_text='Collecting data',
                     fold=fold,
                     percentage=(float(item_id) / file_count),
                     note=os.path.split(item['file'])[1])

            # Load features
            feature_filename = get_feature_filename(
                audio_file=item['file'], path=feature_path)
            if not os.path.isfile(feature_filename):
                raise IOError("Features not found [%s]" % (item['file']))
            feature_data = load_data(feature_filename)['feat']

            # Scale features
            feature_data = model_container['normalizer'].normalize(
                feature_data)

            # Store features per class label
            label = item['scene_label']
            if label not in data:
                data[label] = feature_data
            else:
                data[label] = numpy.vstack((data[label], feature_data))

        if classifier_method == 'gmm':
            # Train one model per class
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                model_container['models'][label] = mixture.GMM(
                    **classifier_params).fit(data[label])
        else:
            # 'dnn': stack all classes into one training set with a label column
            tot_data = {}
            for label in data:
                progress(title_text='Train models', fold=fold, note=label)
                label_column = numpy.repeat(label, len(data[label]), axis=0)
                if 'x' not in tot_data:
                    tot_data['x'] = data[label]
                    tot_data['y'] = label_column
                else:
                    tot_data['x'] = numpy.vstack((tot_data['x'], data[label]))
                    tot_data['y'] = numpy.hstack((tot_data['y'], label_column))

            clf = skflow.TensorFlowDNNClassifier(**classifier_params)
            le = pp.LabelEncoder()
            tot_data['y'] = le.fit_transform(tot_data['y'])
            clf.fit(tot_data['x'], tot_data['y'])
            # NOTE(review): every fold writes to the same path, so only the
            # last trained fold's network survives.
            clf.save('dnn/dnnmodel1')

        # Save models
        save_data(current_model_file, model_container)
def fit_samples(samples, ncomponents):
    """Fit a full-covariance Gaussian mixture to ``samples``.

    Parameters
    ----------
    samples
        Training data handed unchanged to ``GMM.fit``.
    ncomponents : int
        Number of mixture components.

    Returns
    -------
    The fitted ``mixture.GMM`` instance.
    """
    model = mixture.GMM(covariance_type='full', n_components=ncomponents)
    model.fit(samples)
    return model
def latent_cluster_centers(SAMObject,
                           X=None,
                           labels=None,
                           center='gaussian',
                           plot=True,
                           which_indices=(0, 1),
                           randSeed=None,
                           ax=None):
    """
    Find centers for the clusters identified in param. labels for the latent space.
    Centers can be a gaussian density (so, mean and covar.) or (not implemented
    yet) mean and median, as controlled by the param. center.

    Parameters
    ----------
    SAMObject
        Object providing ``_get_latent()``; only consulted when ``X`` is None.
    X : array, optional
        Latent points, one row per sample.
    labels : array
        Cluster label per row of ``X`` (required).
    center : str
        'gaussian' fits a single full-covariance Gaussian per cluster.
        'median' and 'mean' are reserved and raise NotImplementedError.
    plot : bool
        Scatter the clusters and draw one covariance ellipse per cluster.
    which_indices : tuple
        The two latent dimensions used for plotting.
    randSeed
        ``random_state`` passed to the GMM.
    ax
        Axes for the ellipses; the current axes are used when None.

    Returns
    -------
    (cntr, covars)
        Cluster means, shape (K, Q), and covariances, shape (K, Q, Q).

    Raises
    ------
    NotImplementedError
        For ``center`` in ('median', 'mean').
    ValueError
        For any other unknown ``center``.
    """
    assert labels is not None, "labels must be provided"

    # Validate up front so a bad `center` fails even when there are no clusters.
    if center in ('median', 'mean'):
        raise NotImplementedError("This is not implemented yet")
    if center != 'gaussian':
        raise ValueError('Not known center type: %r' % (center,))

    from sklearn import mixture

    if X is None:
        X = SAMObject._get_latent()

    cluster_labels = np.unique(labels)
    K = len(cluster_labels)
    Q = X.shape[1]

    # NaN-filled so that any entry left unset is easy to spot.
    cntr = np.zeros((K, Q)) * np.nan
    covars = np.zeros((K, Q, Q)) * np.nan

    for i in range(K):
        g = mixture.GMM(covariance_type='full',
                        init_params='wmc',
                        min_covar=0.001,
                        n_components=1,
                        n_init=1,
                        n_iter=300,
                        params='wmc',
                        random_state=randSeed,
                        thresh=None,
                        tol=0.001,
                        verbose=0)
        g.fit(X[labels == cluster_labels[i], :])
        cntr[i, :] = g.means_
        covars[i] = g.covars_

    if plot:
        # At least 20 colours as before, but never fewer than the number of
        # clusters, so that no cluster is silently left out of the scatter.
        palette = cm.rainbow(np.linspace(0, 1, max(K, 20)))
        palette = palette[np.random.permutation(palette.shape[0]), :]
        marker_iter = itertools.cycle((',', '+', '.', 'o', '*', 'v', 'x', '>'))

        pb.subplot(1, 1, 1)
        for label, color, marker in zip(cluster_labels, palette, marker_iter):
            members = labels == label
            pb.scatter(X[members, which_indices[0]],
                       X[members, which_indices[1]],
                       s=40,
                       color=color,
                       marker=marker)

        if ax is None:
            ax = pb.gca()
        for i in range(K):
            util_plot_cov_ellipse(cntr[i, :],
                                  covars[i],
                                  ax=ax,
                                  which_indices=which_indices)

    return cntr, covars
# <codecell> # %pdoc sklmix.GMM # %psource sklmix.GMM #sklmix.GMM? #sklmix.GMM?? # <headingcell level=4> # Pick your favorite clustering algorithm # <codecell> gmm_model = sklmix.GMM(n_components=3, covariance_type='full') gmm_model.fit(iris[['PW', 'PL', 'SW']]) yhat = gmm_model.predict(iris[['PW', 'PL', 'SW']]) crosstab = pd.crosstab(iris['Type'], yhat, rownames=['true'], colnames=['predicted']) print crosstab # <headingcell level=4> # Align the confusion matrix with a non-standard package # <codecell> import munkres
def cluster_weights(links, threshold):
    """
    Cluster link weights with a Gaussian mixture and average them per cluster.

    For more than two links, mixtures with 2 .. n-1 components are fitted for
    every covariance type and the model with the lowest BIC is kept. Every
    link is then overwritten, in place, by the average of its cluster.

    Parameters
    ----------
    links : numpy array
        One-dimensional array of weights. It is indexed with boolean masks and
        modified in place.
    threshold : number
        Clusters whose summed weight is below this value are treated as weak.
        This only affects the "strong" clusters that are printed.

    Returns
    -------
    (links, ids)
        The (possibly modified) ``links`` object and the cluster id of each
        link. With exactly two links the ids are ``[0, 1]``; with fewer they
        are all zero.

    Raises
    ------
    RuntimeError
        If no fitted mixture produced a usable BIC.
    """
    # The selected model is published as a module-level global, as before.
    global best_gmm

    n = len(links)
    MIN_NUM_SAMPLES = 2
    if n == MIN_NUM_SAMPLES:
        return links, np.array([0, 1])
    if n < MIN_NUM_SAMPLES:
        return links, np.zeros(n)

    # Column vector copy of the weights: one sample per row.
    weights = np.transpose(np.array([links]))

    # Fit a mixture of Gaussians with EM and keep the lowest-BIC model.
    lowest_bic = np.inf
    best_gmm = None
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in range(2, n):
            # The covariance type being iterated is now actually used; the
            # original always passed 'full', refitting the same models.
            gmm = mixture.GMM(n_components=n_components,
                              covariance_type=cv_type)
            gmm.fit(weights)
            # Bayesian information criterion
            current_bic = gmm.bic(weights)
            if current_bic < lowest_bic:
                lowest_bic = current_bic
                best_gmm = gmm
    if best_gmm is None:
        raise RuntimeError("no mixture model could be selected by BIC")

    # Averaging
    ids = best_gmm.predict(weights)
    print(best_gmm.n_components)
    total_weights = []
    unique_ids = list(set(ids))
    for i in unique_ids:
        c_ids = ids == i
        members = links[c_ids]
        links[c_ids] = np.sum(members) / len(members)
        total_weights.append(np.sum(links[c_ids]))
    print('averaged : ', links)

    weak_cluster_ids = {
        i for w, i in zip(total_weights, unique_ids) if w < threshold
    }
    strong_clusters = [
        links[ids == i] for i in unique_ids if i not in weak_cluster_ids
    ]

    print("output", links)
    print("clusters", ids)
    print("strong", strong_clusters)
    return links, ids
def process_subfold(sf, fold):
    """
    Train one UBM (Gaussian mixture) per mixture size and save it to disk.

    The dataset is loaded from the module-level ``featPath``/``filetype`` and
    split into train/devel/test. All devel-set feature matrices are stacked
    and, for every ``m`` in the module-level ``mixtures``, a GMM with ``m``
    components is fitted and dumped to ``<curUbmsPath>/<m>/ubm_<sf>``. The
    matching ``<curSupervecPath>/<m>`` directory is created as well.

    Parameters
    ----------
    sf :
        Sub-fold identifier; only used in the name of the dumped UBM file.
    fold :
        Fold identifier; only used in progress messages.
    """
    print("Fold " + str(fold))
    t0 = time.time()  # start timestamp (not read within this block)

    snoring_dataset = dm.load_ComParE2017(featPath, filetype)  # load dataset
    # Build the splits (the original note says the train split is meant for
    # computing the mean/variance used to normalise snoring_dataset).
    trainset, develset, testset = dm.split_ComParE2017_simple(snoring_dataset)

    # Read the feature dimension from the first training item.
    a = trainset[0][1].shape
    if filetype == 'npy':
        nfeat = a[0]
    else:
        nfeat = a[1]

    # Gather all feature matrices, then stack them once. The empty leading
    # float64 block keeps the dtype promotion of the original accumulator and
    # yields a (0, nfeat) matrix when the devel set is empty.
    # for seq in trainset:
    chunks = [np.empty((0, nfeat))]
    for seq in develset:
        if filetype == 'npy':
            chunks.append(seq[1].transpose())
        else:
            chunks.append(seq[1])
    # This matrix is what gmm.fit receives to adapt the UBM.
    trainFeat = np.vstack(chunks)
    print("DONE!")
    # trainFeat = trainFeat.astype(dtype='float32')

    for m in mixtures:
        # Train the UBM
        print("Fold " + str(fold) + "-->Mixture: " + str(m) + " ")
        sys.stdout.flush()
        gmm = mixture.GMM(n_components=m, n_iter=1000, random_state=1)
        gmm.fit(trainFeat)

        ubmPath = os.path.join(curUbmsPath, str(m))
        _makedirs_tolerant(ubmPath)
        if not gmm.converged_:
            print("Fold " + str(fold) + "-->Convergence not reached with "
                  + str(m) + " mixtures")
        # Save the UBM; one file per sub-fold (e.g. ubm_1_02) so it can be
        # reused later, for instance when debugging.
        joblib.dump(gmm, os.path.join(ubmPath, "ubm_" + str(sf)))

        # Extract trainset supervectors
        curSupervecSubPath = os.path.join(curSupervecPath, str(m))
        _makedirs_tolerant(curSupervecSubPath)


def _makedirs_tolerant(path):
    """Create ``path`` unless it exists, tolerating a concurrent creation."""
    import errno  # local import: the file's import block is not in this block

    if os.path.exists(path):
        return
    try:
        # Several processes may try to create the same folder simultaneously.
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        print("OSError.errno 17 ignored")
# ,[[4,7], [1,2], [3,7], [4,8], [2,7], [3,8], [1,7], [5,6], [2,1], [4,8]]
# ,[[6,8], [3,4], [4,6], [8,6], [4,5], [5,7],[4,1], [4,3], [7,2],[2,1]]];
# unseen = [[[4,1],[1,2],[8,5],[1,2],[1,2], [6,1],[2,8],[3,4]]
# ,[[4,1], [1,2], [4,3], [2,3], [3,1], [1,3], [2,8], [4,7]]
# ,[[6,5], [3,8], [8,5], [4,8], [5,6],[4,6], [7,6],[2,5]]];

# Each entry of `unseen` is a list of [class, class] pairs; the class numbers
# are 1-based (they are used below as `number - 1`).
unseen = [
    [[2, 3], [1, 3], [3, 7], [5, 7], [2, 3],
     [4, 8], [4, 3], [2, 1], [1, 6], [8, 5]],
    [[2, 3], [1, 3], [7, 1], [1, 3], [2, 3],
     [4, 8], [2, 8], [5, 2], [5, 1], [8, 5]],
    [[5, 6], [7, 6], [8, 4], [6, 8], [4, 5],
     [6, 7], [4, 3], [7, 3], [1, 6], [2, 6]],
]
# unseen=[]

# Fit one full-covariance Gaussian per class and remember its parameters.
for i in range(n_class):
    g = mixture.GMM(n_components=1, covariance_type='full')
    g.fit(data[i])
    mean.append(g.means_[0])
    covar.append(g.covars_[0])
    gaudist.append(g)

for i, pairs in enumerate(unseen):
    me = []
    # Built without a comprehension: `[0 for i in range(n_class)]` reused the
    # outer index `i`, which Python 2 overwrites when the comprehension runs.
    s = [0] * n_class
    support = 0
    for j, (first, second) in enumerate(pairs):
        # Average, at position j, the fitted means of the two paired classes.
        me.append(np.array((mean[first - 1][j] + mean[second - 1][j]) / 2))
        s[first - 1] = 1
dataset = dataframe.values

# Split the dataset into input (X) and output (Y) variables.
# X keeps the columns from index 10 up to (not including) the last three;
# the leading columns are described as identifiers and population_count0.
# Y marks hotel existence: 1 when the last column of a row is positive
# (a hotel exists in that geohash), otherwise 0.
X = dataset[:, 10:-3].astype(float)
# Optional standardisation, currently disabled:
# X = StandardScaler().fit_transform(X)
Y = [1 if dataset[row, -1] > 0 else 0 for row in range(len(X))]

clf = mixture.GMM(n_components=6, covariance_type='full', random_state=7)
clusters = clf.fit(X)
cluster_means = clusters.means_
print(cluster_means)
cluster_predict = clusters.predict(X)

# Collect the coordinates (columns 3 and 2) of the rows assigned to
# component 5.
rows_in_cluster = [
    row for row, component in enumerate(cluster_predict) if component == 5
]
lons = [dataset[row, 3] for row in rows_in_cluster]
lats = [dataset[row, 2] for row in rows_in_cluster]
def do_system_training(dataset, model_path, feature_normalizer_path,
                       feature_path, feature_params, classifier_params,
                       dataset_evaluation_mode='folds',
                       classifier_method='gmm', clean_audio_errors=False,
                       overwrite=False):
    """Train one model per scene label for every fold and save the result.

    The saved model container has the format::

        {
            'normalizer': normalizer class
            'models': {
                'office': mixture.GMM class
                'home': mixture.GMM class
                ...
            }
        }

    Parameters
    ----------
    dataset : class
        dataset class

    model_path : str
        path where the models are saved.

    feature_normalizer_path : str
        path where the feature normalizers are saved.

    feature_path : str
        path where the features are saved.

    feature_params : dict
        parameter dict; 'hop_length_seconds' is read when audio errors are
        removed.

    classifier_params : dict
        parameter dict, expanded as keyword arguments of the classifier.

    dataset_evaluation_mode : str ['folds', 'full']
        evaluation mode, 'full' all material available is considered to
        belong to one fold.
        (Default value='folds')

    classifier_method : str ['gmm']
        classifier method, currently only GMM supported
        (Default value='gmm')

    clean_audio_errors : bool
        Remove audio errors from the training data
        (Default value=False)

    overwrite : bool
        overwrite existing models
        (Default value=False)

    Returns
    -------
    nothing

    Raises
    -------
    ValueError
        classifier_method is unknown.

    IOError
        Feature normalizer not found.
        Feature file not found.

    """
    if classifier_method != 'gmm':
        raise ValueError("Unknown classifier method [" + classifier_method + "]")

    # Make sure the target directory exists.
    check_path(model_path)

    for fold in dataset.folds(mode=dataset_evaluation_mode):
        current_model_file = get_model_filename(fold=fold, path=model_path)
        # Keep an existing model unless the caller asked to overwrite it.
        if os.path.isfile(current_model_file) and not overwrite:
            continue

        # Load the normalizer of this fold.
        feature_normalizer_filename = get_feature_normalizer_filename(
            fold=fold, path=feature_normalizer_path)
        if not os.path.isfile(feature_normalizer_filename):
            raise IOError("Feature normalizer not found [%s]" % feature_normalizer_filename)
        normalizer = load_data(feature_normalizer_filename)

        model_container = {'normalizer': normalizer, 'models': {}}

        # Collect the training examples, grouped by scene label.
        file_count = len(dataset.train(fold))
        data = {}
        for item_id, item in enumerate(dataset.train(fold)):
            audio_file = item['file']
            progress(title_text='Collecting data',
                     fold=fold,
                     percentage=(float(item_id) / file_count),
                     note=os.path.split(audio_file)[1])

            # Load features
            feature_filename = get_feature_filename(audio_file=audio_file,
                                                    path=feature_path)
            if not os.path.isfile(feature_filename):
                raise IOError("Features not found [%s]" % (audio_file))
            feature_data = load_data(feature_filename)['feat']

            # Scale features
            feature_data = model_container['normalizer'].normalize(feature_data)

            # Drop the frames covered by annotated audio errors.
            if clean_audio_errors:
                current_errors = dataset.file_error_meta(audio_file)
                if current_errors:
                    frame_count = feature_data.shape[0]
                    hop = feature_params['hop_length_seconds']
                    removal_mask = numpy.ones((frame_count), dtype=bool)
                    for error_event in current_errors:
                        onset_frame = int(
                            numpy.floor(error_event['event_onset'] / hop))
                        offset_frame = int(
                            numpy.ceil(error_event['event_offset'] / hop))
                        # Clamp events that run past the last frame.
                        offset_frame = min(offset_frame, frame_count)
                        removal_mask[onset_frame:offset_frame] = False
                    feature_data = feature_data[removal_mask, :]

            # Append the features to those already stored for this label.
            scene_label = item['scene_label']
            if scene_label in data:
                data[scene_label] = numpy.vstack(
                    (data[scene_label], feature_data))
            else:
                data[scene_label] = feature_data

        # Train one model per label. The method was validated on entry, so
        # only the GMM case can be reached here.
        for label in data:
            progress(title_text='Train models', fold=fold, note=label)
            model_container['models'][label] = mixture.GMM(
                **classifier_params).fit(data[label])

        # Save models
        save_data(current_model_file, model_container)
def cluster_and_display_chemical_features(site_number, type,
                                          chemical_features_list):
    """
    Cluster the positions of one type of chemical feature and display them.

    The position of every feature is taken from element 0 of its entry in
    ``chemical_features_list``. Six extra points, offset by +/-0.25 along each
    axis, are added per feature before a spherical Gaussian mixture is fitted.
    The component means are drawn into a new coot generic object.

    Returns ``[type, features, means_as_list]`` where ``features`` pairs every
    original feature with its cluster index, or ``False`` when the chosen
    number of components exceeds the number of features.
    """

    def optimize_n(type, positions_np, n_data):
        """Return the component count (1..10, below n_data) scoring best."""
        print("cluster_and_display_chemical_features.optimize_n called "
              "with n_data = ", n_data)
        penalty = 20 if type == 'Aromatic' else 15
        scores = {}
        for n in range(1, 11):
            if n >= n_data:
                continue
            model = mixture.GMM(n_components=n, covariance_type='spherical',
                                n_iter=20)
            model.fit(positions_np)
            log_likelihood = sum(model.score(positions_np))
            scores[n] = log_likelihood - penalty * 0.5 * math.log(n_data) * n
        if len(scores) <= 1:
            return 1
        best_n, _ = max(scores.items(), key=lambda entry: entry[1])
        return best_n

    def analyse_bic(type, positions_np, n_data):
        """Print the penalised score for 1..14 components (diagnostic)."""
        penalty = 3000 if type == 'Aromatic' else 3
        for n in range(1, 15):
            model = mixture.GMM(n_components=n, covariance_type='spherical',
                                n_iter=20)
            model.fit(positions_np)
            score = sum(model.score(positions_np))
            bic = score - penalty * 0.5 * n_data * n
            print(type, len(positions_np), n, "converged?", model.converged_,
                  "score:", score, "bic", bic)

    def get_cfc_col(type):
        """Return the display colour for a feature type."""
        colours = {
            "Donor": "blue",
            "Acceptor": "red",
            "Hydrophobe": "yellow",
            "Aromatic": "orange",
        }
        return colours.get(type, "grey")

    # --- main line ----

    # Real positions first, so that index i of the cluster assignments still
    # refers to chemical_features_list[i]; the fake neighbours follow.
    delta = 0.25
    ext_chemical_features_list = [entry[0] for entry in chemical_features_list]
    for entry in chemical_features_list:
        position = entry[0]
        x, y, z = position[0], position[1], position[2]
        ext_chemical_features_list.extend([
            [x, y, z + delta],
            [x, y, z - delta],
            [x, y + delta, z],
            [x, y - delta, z],
            [x + delta, y, z],
            [x - delta, y, z],
        ])
    positions_np = np.array(ext_chemical_features_list)

    # analyse_bic(type, positions_np, len(chemical_features_list))

    n_data = len(chemical_features_list)
    n = optimize_n(type, positions_np, n_data) if n_data > 1 else 1

    if n > len(chemical_features_list):
        # oops too many parameters for the model
        return False

    gmm = mixture.GMM(n_components=n, covariance_type='spherical', n_iter=20)
    gmm.fit(positions_np)
    print(type, len(positions_np), n, "converged? ", gmm.converged_,
          "score:", sum(gmm.score(positions_np)))

    cluster_assignments = gmm.predict(positions_np)
    features = [[cf, int(cluster_assignments[i])]
                for i, cf in enumerate(chemical_features_list)]
    means_as_list = [[m[0], m[1], m[2]] for m in gmm.means_]

    obj_name = "CFC Site " + str(site_number) + " " + type + " pharmacophore-clusters"
    cfc_obj = coot.new_generic_object_number(obj_name)
    cfc_col = get_cfc_col(type)
    for centre in means_as_list:
        coot.to_generic_object_add_pentakis_dodecahedron(
            cfc_obj, cfc_col, 2.3, 0.1, centre[0], centre[1], centre[2])
    coot.set_display_generic_object(cfc_obj, 1)

    return [type, features, means_as_list]
n_samples = 500 # Generate random sample, two components np.random.seed(0) C = np.array([[0., -0.1], [1.7, .4]]) X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] lowest_bic = np.infty bic = [] n_components_range = range(1, 7) cv_types = ['spherical', 'tied', 'diag', 'full'] for cv_type in cv_types: for n_components in n_components_range: # Fit a mixture of Gaussians with EM gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type) gmm.fit(X) bic.append(gmm.bic(X)) if bic[-1] < lowest_bic: lowest_bic = bic[-1] best_gmm = gmm bic = np.array(bic) color_iter = itertools.cycle( ['navy', 'turquoise', 'cornflowerblue', 'darkorange']) clf = best_gmm bars = [] # Plot the BIC scores spl = plt.subplot(2, 1, 1) for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
def find_the_sites(self, file_name_comp_id_list):
    """
    Cluster the atoms of the given ligands into sites; store in ``self.sites``.

    Parameters
    ----------
    file_name_comp_id_list : iterable
        Entries whose element 0 is a coordinates file name and element 1 a
        comp-id. Every file is read into coot and the atoms of all residues
        with that comp-id are collected.

    Returns
    -------
    False when fewer than 3 atom positions were collected (``self.sites`` is
    then left untouched); otherwise None, after setting ``self.sites``.
    """
    # Each entry is [position, imol, residue spec].
    coords_with_spec = []
    for fn_comp_id in file_name_comp_id_list:
        fn = fn_comp_id[0]
        comp_id = fn_comp_id[1]
        imol = coot.handle_read_draw_molecule_with_recentre(fn, 0)
        # what are the residue specs for the given comp_ids?
        residue_specs = coot.get_residue_specs_in_mol_py(imol, comp_id)
        print(fn, residue_specs)
        for spec in residue_specs:
            chain_id = rsu.residue_spec_to_chain_id(spec)
            res_no = rsu.residue_spec_to_res_no(spec)
            # The residue is always looked up with an empty insertion code.
            res_info = coot.residue_info_py(imol, chain_id, res_no, '')
            for atom in res_info:
                coords_with_spec.append(
                    [rsu.residue_atom_to_position(atom), imol, spec])

    # now cluster coords. There will be 1 (usually), maybe 2 possibly 3 sites
    if len(coords_with_spec) < 3:
        return False

    positions_np = np.array([entry[0] for entry in coords_with_spec])
    n_components = self.optimize_n(positions_np, len(positions_np))
    print("optimize_n for sites::::::::::::", n_components)

    gmm = mixture.GMM(n_components, covariance_type='full', n_iter=40)
    gmm.fit(positions_np)
    cluster_assignments = gmm.predict(positions_np)
    means = gmm.means_
    weights = gmm.weights_
    print(cluster_assignments)
    print(means)
    print(weights)
    print("cluster_assignments", cluster_assignments)

    merge_map = self.find_mergeable_clusters(means, weights)

    # convert to a list of ints (not <type 'numpy.int64'>) (because, on
    # decoding Python->C++ object we do a PyInt_Check for the site_idx (and a
    # <type 'numpy.int64'> fails that test)
    new_cluster_assignments = [
        int(x) for x in self.merge_clusters(cluster_assignments, merge_map)
    ]
    print("new cluster_assignments", new_cluster_assignments)

    specs = [entry[1:] for entry in coords_with_spec]
    # Materialise the pairs: on Python 3 `zip` is a one-shot iterator, whereas
    # this object is handed to a coot binding.
    # NOTE(review): assumes the binding wants a real list - confirm.
    cluster_assignments_with_specs = list(zip(new_cluster_assignments, specs))
    sites = coot.chemical_feature_clusters_accept_site_clusters_info_py(
        cluster_assignments_with_specs)

    # show me them (debug): one star per fitted cluster mean
    o = coot.new_generic_object_number("site clusters")
    for mean in means:
        cluster_star_obj(o, mean, 2, 2)
    # coot.set_display_generic_object(o, 1) this is for debugging

    self.sites = sites
def DPF_distrib_histogram(ax, x):
    """
    Plot the histogram of ``x`` with a fitted mixture and a threshold line.

    A one- and a two-component Gaussian mixture are fitted and the one with
    the lowest BIC is kept. Its density is drawn on a twin axis over [-2, 2].
    With one component the threshold is the mean minus two standard
    deviations; with two it is the sample between the two means at which the
    two component curves are closest.

    Parameters
    ax: contains the reference to the plot
    x: array containing all the pits DPF values in the considered areal

    Returns the threshold.
    """
    X = x.reshape(-1, 1)

    # Find the mixture with lowest BIC (Best information criterion)
    lowest_bic = np.inf
    for n_components in (1, 2):
        candidate = mixture.GMM(n_components=n_components)
        candidate.fit(X)  # train it!
        candidate_bic = candidate.bic(X)
        if candidate_bic < lowest_bic:
            lowest_bic = candidate_bic
            best_gmm = candidate
            print("Best gmm with components " + str(n_components) + " and BIC " + str(candidate_bic))
    gmm = best_gmm

    hist_color, hist_label, hist_alpha, hist_normed = 'blue', 'pits', 1, False
    # One bin per 16 values, but never fewer than one bin.
    number_bins = max(int(x.shape[0] / 16), 1)
    print(number_bins)
    a, b = min(x), max(x)

    # Twin axis for the density curve, with all of its decorations hidden.
    ax2 = ax.twinx()
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.xaxis.set_ticks_position('none')
    ax2.yaxis.set_ticks_position('none')
    ax2.spines['left'].set_visible(False)
    for tick_label in ax2.get_yticklabels():
        tick_label.set_visible(False)
        tick_label.set_fontsize(0.0)

    n, bins, patches = ax.hist(X, number_bins, facecolor=hist_color,
                               alpha=hist_alpha, range=(a, b),
                               label=hist_label + ": " + str(x.shape[0]),
                               normed=hist_normed)
    x_bins = (bins[1:] + bins[:-1]) / 2  # bin centres (not used further)

    grid = np.linspace(-2, 2, 1000).reshape(-1, 1)
    ax2.plot(grid, np.exp(gmm.score_samples(grid)[0]), 'r')

    mu1 = gmm.means_[0]
    std1 = np.sqrt(gmm.covars_[0])
    if gmm.n_components == 1:
        threshold = mu1 - 2 * std1
    elif gmm.n_components == 2:
        mu2 = gmm.means_[1]
        std2 = np.sqrt(gmm.covars_[1])
        # NOTE(review): the weights are read but not passed to gauss(), which
        # is called with a last argument of 1 for both components - confirm
        # that this is intended.
        A2 = gmm.weights_[1]
        A1 = gmm.weights_[0]
        # 250 samples going from mu2 towards mu1.
        x_samples = [mu2 + (mu1 - mu2) / 250.0 * p for p in range(250)]
        gauss1 = gauss(x_samples, mu1, std1, 1)
        gauss2 = gauss(x_samples, mu2, std2, 1)
        threshold = x_samples[np.argmin(np.abs(gauss1 - gauss2))]

    # Vertical threshold line; its colour tells how many components were kept.
    line_colors = {1: 'magenta', 2: 'green'}
    if gmm.n_components in line_colors:
        ax.plot(np.repeat(threshold, 200),
                np.linspace(min(n), max(n), num=200),
                color=line_colors[gmm.n_components], lw=3, label='threshold')

    ax.set_xlim([-2, 2])
    return threshold
max_i = 20.5
len_test = len(test)
# Number of test rows whose first column falls outside [min_i, max_i].
test_below = len(test[test[:, 0] < min_i])
test_above = len(test[test[:, 0] > max_i])

if not CACHED:
    # Search for the number of mixture components that maximises the summed
    # validation score over all bins. The search starts at 1: the original
    # `range(60)` began with a zero-component mixture.
    for n in range(1, 60):
        print("testing Gaussian with components: %s" % n)
        score = 0.
        for bin_start in np.arange(min_i, max_i, diff):
            print("processing bin %s" % bin_start)
            bin_end = bin_start + diff
            # Rows whose first column lies strictly inside the bin.
            train_bin = train[train[:, 0] > bin_start]
            train_bin = train_bin[train_bin[:, 0] < bin_end]
            g = mixture.GMM(n_components=n, covariance_type='full')
            g.fit(train_bin[:, 1:])
            val_bin = val[val[:, 0] > bin_start]
            val_bin = val_bin[val_bin[:, 0] < bin_end]
            score += np.sum(g.score(val_bin[:, 1:]))
        print("score %s" % score)
        if score > max_score:
            max_score = score
            max_n = n
    print("best n is: %s" % max_n)

# NOTE(review): this unconditionally overrides the search result above;
# presumably it is the cached outcome of an earlier run - confirm.
max_n = 25