Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GMM  # old scikit-learn API (removed in 0.20); use GaussianMixture in newer versions

def demo():
    np.random.seed(1)
    gmm = GMM(3, n_iter=1)
    gmm.means_ = np.array([[-1], [0], [3]])
    gmm.covars_ = np.array([[1.5], [1], [0.5]]) ** 2
    gmm.weights_ = np.array([0.3, 0.5, 0.2])
    return gmm.sample(1000)
def main():
    pi = np.array([0.3, 0.5, 0.2])
    mu = np.array([[1,1], [-1,-1], [-1,1]])*3
    sigma = np.array([
        [[1,0], [0,1]],
        [[2,0], [0,2]],
        [[0.5,0], [0, 0.5]],
    ])
    X, C = generate_data(pi, mu, sigma, 1000)
    plt.scatter(X[:,0], X[:,1], c=C, s=100, alpha=0.5)
    plt.show()


    # sklearn
    gmm = GMM(n_components=3, covariance_type='full')
    gmm.fit(X)
    print "pi:", gmm.weights_
    print "mu:", gmm.means_
    print "sigma:", gmm.covars_

    pi2, mu2, sigma2, L = expectation_maximization(X, len(pi))
    print "pi:", pi2
    print "mu:", mu2
    print "sigma:", sigma2
    plt.plot(L)
    plt.show()
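
# A minimal sketch (an assumption, not part of the original snippet) of the
# generate_data() helper that main() relies on: draw a component index for each
# point from pi, then sample the point from that component's Gaussian.
def generate_data(pi, mu, sigma, n):
    components = np.random.choice(len(pi), size=n, p=pi)
    X = np.array([np.random.multivariate_normal(mu[c], sigma[c]) for c in components])
    return X, components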
Example #3
def fit_gmix(data, ngauss, n_iter, min_covar=MIN_COVAR):
    """
        gtot, T, flux
        etatot, log10(T), log10(flux)
    
    data is shape
        [npoints, ndim]
    """
    from sklearn.mixture import GMM


    print("ngauss:   ",ngauss)
    print("n_iter:   ",n_iter)
    print("min_covar:",min_covar)

    gmm=GMM(n_components=ngauss,
            n_iter=n_iter,
            min_covar=min_covar,
            covariance_type='full')

    gmm.fit(data)

    if not gmm.converged_:
        print("DID NOT CONVERGE")

    return gmm
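
# Hypothetical usage sketch (an assumption, not from the original project). Note that
# MIN_COVAR must already be defined at module level for the default argument above.
import numpy as np
toy_data = np.random.randn(2000, 3)                     # shape [npoints, ndim]
gmix = fit_gmix(toy_data, ngauss=3, n_iter=100, min_covar=1.0e-6)
print(gmix.means_)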
Example #4
def main():
	optparser = OptionParser()
	optparser.add_option('-e', '--key_embedder', action='store', type = 'str', dest='key_embedder')
	opts, args = optparser.parse_args()

	fname_embedder = 'data/dataset/model/%s_embedder.pkl'%(opts.key_embedder)
	embedder = WordEmbedder.load(fname_embedder)

	print >> sys.stderr, 'ubm_builder: [info] preparing x'

	iterator = DBTextIterator(50000000)
	x = []
	for seq in iterator:
		if len(seq) == 0:
			continue
		x.append(np.mean(embedder.embed(seq), axis = 0))


	for n in [8, 4, 16, 32]:
		print >> sys.stderr, 'ubm_builder: [info] fitting model for n = %d ...'%(n),
		st = time.time()

		ubm = GMM(n_components = n)
		ubm.fit(x)

		print >> sys.stderr, ' OK (%.2f sec)'%(time.time() - st)

		cPickle.dump(ubm, open('data/dataset/gmmubm/db_%s_%d.pkl'%(opts.key_embedder, n), 'wb'))  # binary mode for pickle
Example #5
File: gmm.py Project: kuntzer/sclas
class GaussianMixtureModel(method.Method):
	
	def __init__(self, params):

		self.params = dict(params)

		del params['features']
		del params['labels']
		
		self._set_default(params, 'covariance_type', 'full')
		#self._set_default(params, 'n_iter', 200)

		self.classifier = GMM(**params)
		
	def __str__(self):
		return "Gaussian Mixture Model from scikit-learn.org"
		
	def train(self, catalog):
		featuresdata = catalog[:,self.params['features']]	
		idlabel = np.array(self.params['features'])[-1] + self.params['labels'] + 1
		labelsdata = catalog[:,idlabel]
		
		labelsdata = labelsdata.reshape(len(labelsdata))
		self.all_labels = np.unique(labelsdata)

		self.classifier.fit(featuresdata, labelsdata)
		
	def predict(self, data):
		outcat = self.classifier.predict(data)
		outcat = np.unique(self.all_labels)[outcat]
		return outcat, 0.
Example #6
def algo_gmm(previmage,objmask,nextimage):
    ''' 1. form a mixture model using obj pixels
        2. Classify every pixel in nextimage
        3. Threshold it and classify'''
    import sklearn
    rows = previmage.shape[0]
    cols = previmage.shape[1]
    print previmage.shape
    objpixels = previmage[objmask]
    bgpix = np.ones((rows,cols))
    bgpix[objmask]=0
    bgmask = np.where(bgpix==1)
    bgpixels = previmage[bgmask]
    print objpixels.shape
    obj_gmm_model = GMM(n_components=3)
    obj_gmm_model.fit(objpixels)
    bg_gmm_model = GMM(n_components=3)
    #bg_gmm_model.fit(bgpixels)
    print obj_gmm_model.means_
    next_ = nextimage.reshape((rows*cols,3))
    print next_.shape
    nextlabels_obj = obj_gmm_model.predict_proba(next_)
    #nextlabels_bg = bg_gmm_model.predict_proba(next_)
    nextlabels_obj = nextlabels_obj.reshape(rows,cols,3) 
    nextlabels = obj_gmm_model.predict(next_)    
    print nextlabels_obj.shape
    return nextlabels_obj
def extract_gmm_feature( data, max_length_sec = 10 ):
    try:
        filename, lbl = data
        sr,signal = read(filename)
        if len(signal.shape) > 1:
            signal = signal[:,0]

        signal = signal - signal.mean()
        signal = signal[:max_length_sec*sr]
        signal = np.array(remove_silence( signal, 0.005 ))
        if np.sum(signal) == 0.0:
            print "Empty", filename
            return filename, None, None

        mfcc = librosa.feature.mfcc( signal, n_fft = gmm_fft_points, hop_length = gmm_fft_overlap, n_mfcc = gmm_mfcc_coefficients, fmax = 5000 )
        #mfcc = preprocess_mfcc(mfcc)
        delta_mfcc_1 = delta( mfcc, order = 1 )
        delta_mfcc_2 = delta( mfcc, order = 2 )
        total_features = np.vstack( [ mfcc, delta_mfcc_1, delta_mfcc_2 ] )
        total_features = np.transpose( total_features )
        total_features = preprocess_mfcc( total_features )        
        #total_features = StandardScaler().fit_transform( total_features )
        gmm = GMM(n_components=1)
        gmm.fit( total_features )
        res_features = np.hstack( [gmm.means_[0], gmm.covars_[0]] )
        #print gmm.means_.shape
        #result_features = np.vstack( [ gmm. ] )
        return filename, lbl, res_features
    except Exception,e:
        print e
        return filename, None, None
    def predict(self, author_id):
        author = self.db.get_author(author_id, reduced=True)

        descriptor = self.get_matrix([author], True)
        if self.scaler:
            descriptor = self.scaler.transform(descriptor)
        if self.pca:
            descriptor = self.pca.transform(descriptor)
        descriptor = descriptor[0]

        unknown_descriptor = self.get_matrix([author], False)
        if self.scaler:
            unknown_descriptor = self.scaler.transform(unknown_descriptor)
        if self.pca:
            unknown_descriptor = self.pca.transform(unknown_descriptor)
        ud = unknown_descriptor[0]

        ws = self.bg_classifier.weights_
        ms = self.bg_classifier.means_
        cvs = self.bg_classifier.covars_

        agm = GMM(n_components=self.components, covariance_type=self.tp)
        agm.weights_, agm.means_, agm.covars_ = \
                self.em(ws, ms, cvs,  [descriptor], self.r)

        if agm.score(ud) / self.bg_classifier.score(ud) < self.threshold:
            return 1.0
        else:
            return 0.0
    def _accumulate_sufficient_statistics(self, stats, obs, framelogprob,
                                          posteriors, fwdlattice, bwdlattice,
                                          params):
        super(GMMHMM, self)._accumulate_sufficient_statistics(
            stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice,
            params)

        for state, g in enumerate(self.gmms_):
            _, lgmm_posteriors = g.score_samples(obs)
            lgmm_posteriors += np.log(posteriors[:, state][:, np.newaxis]
                                      + np.finfo(np.float).eps)
            gmm_posteriors = np.exp(lgmm_posteriors)
            tmp_gmm = GMM(g.n_components, covariance_type=g.covariance_type)
            n_features = g.means_.shape[1]
            tmp_gmm._set_covars(
                distribute_covar_matrix_to_match_covariance_type(
                    np.eye(n_features), g.covariance_type,
                    g.n_components))
            norm = tmp_gmm._do_mstep(obs, gmm_posteriors, params)

            if np.any(np.isnan(tmp_gmm.covars_)):
                raise ValueError

            stats['norm'][state] += norm
            if 'm' in params:
                stats['means'][state] += tmp_gmm.means_ * norm[:, np.newaxis]
            if 'c' in params:
                if tmp_gmm.covariance_type == 'tied':
                    stats['covars'][state] += tmp_gmm.covars_ * norm.sum()
                else:
                    cvnorm = np.copy(norm)
                    shape = np.ones(tmp_gmm.covars_.ndim)
                    shape[0] = np.shape(tmp_gmm.covars_)[0]
                    cvnorm.shape = shape
                    stats['covars'][state] += tmp_gmm.covars_ * cvnorm
Example #10
def clusterDataSpec(data, k, algorithm):
    '''
    Cluster the given data into k clusters with the requested algorithm.
    @param data: 2D numpy array holding our data.
    @param k: number of clusters to fit.
    @param algorithm: either "k-means" or "GMM".
    @raise LogicalError if algorithm is other than "k-means" or "GMM".
    @return The predicted labels (clusters) for every example.
    '''
    
    if algorithm not in ["k-means", "GMM"]:
        raise LogicalError, "Method %s: Clustering is made only through K-means or GMM." %(stack()[0][3])
    
    print "Clustering for k=%d." %(k) 
    if algorithm == "k-means":
        data = whiten(data)  # whiten() returns the scaled array; it does not modify in place
        codebook, _distortion = kmeans(data, k, 10) # 10 iterations only to make it faster
    else:
        g = GMM(n_components=k,thresh = 1e-05, covariance_type='diag', n_iter=10)
        g.fit(data)
            
    #print "Optimal number of clusters according to BIC: %d." %(optimalK)
    
    # Return predicted labels
    if algorithm == "k-means":
        return vq(data, codebook)[0] # predictions on the same data
    else:
        return g.predict(data) # predictions on the same data
Example #11
class PcaGmm(BaseEstimator):
    def __init__(self, X_all,
                 pca_components = 12, gmm_components = 4,
                 covariance_type = "full", min_covar = 0.1,
                 gamma = 0, C = 1.0):
        self.pca_components = pca_components
        self.gmm_components = gmm_components
        self.covariance_type = covariance_type
        self.min_covar = min_covar
        self.gamma = gamma
        self.C = C
        self.X_all = X_all
        X_all = X_all[:, :pca_components]
        self.gmm = GMM(n_components = gmm_components,
                       covariance_type = covariance_type,
                       min_covar = min_covar)
        self.gmm.fit(X_all)
    def fit(self, X, y):
        X = X[:, :self.pca_components]
        X = self.gmm.predict_proba(X)
        self.svm = SVC(C = self.C, gamma = self.gamma)
        self.svm.fit(X, y)
    def predict(self, X):
        X = X[:, :self.pca_components]
        return self.svm.predict(self.gmm.predict_proba(X))
    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)
    def transform(self, X, y = None):
        X = X[:, :self.pca_components]
        return self.gmm.predict_proba(X)
    def __str__(self):
        return "PCA(%d)-GMM(%d, %s, %f)-SVM(C=%f, gamma=%f)" % (self.pca_components, self.gmm_components,self.covariance_type, self.min_covar,self.C, self.gamma)
Example #12
class Event(object):

    def __init__(self):
        self.clusters = []
        self.gmm = None

    def draw(self):
        self.fig = plt.figure(figsize=(10,10))
        colors = 'rbgcm'
        for i, cluster in enumerate(self.clusters):
            color = colors[ i % len(colors) ]
            cluster.draw(color)
        if self.gmm:
            for icircle in range(self.gmm.n_components):
                mean = self.gmm.means_[icircle]
                covar = self.gmm.covars_[icircle]
                sigma = np.sqrt(covar[0])
                g = Gaussian(mean, sigma)           
                g.draw()

    def reconstruct(self, nclusters=None):
        if nclusters is None:
            nclusters = len(self.clusters)
        self.gmm = GMM(n_components=nclusters,
                       covariance_type='spherical',
                       init_params='wc', n_iter=10)
        self.gmm.fit( self.samples )
Example #13
def create_random_gmm(n_mix, n_features, covariance_type, prng=0):
    prng = check_random_state(prng)
    g = GMM(n_mix, covariance_type=covariance_type)
    g.means_ = prng.randint(-20, 20, (n_mix, n_features))
    g.covars_ = make_covar_matrix(covariance_type, n_mix, n_features)
    g.weights_ = normalized(prng.rand(n_mix))
    return g
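
# normalized() and make_covar_matrix() are helpers not shown here; a minimal sketch of
# what normalized() is assumed to do (make_covar_matrix() must return covariances whose
# shape matches the requested covariance_type):
def normalized(x):
    x = np.asarray(x, dtype=float)
    return x / x.sum()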
Example #14
def CVK(X, KRange, covar_type="diag", reps=10):
    N, M = X.shape
    T = len(KRange)

    CVE = np.zeros((T, 1))

    # K-fold crossvalidation
    CV = cross_validation.KFold(N, 5, shuffle=True)

    for t, K in enumerate(KRange):
        print ("Fitting model for K={0}\n".format(K))

        # Fit Gaussian mixture model
        gmm = GMM(n_components=K, covariance_type=covar_type, n_init=reps, params="wmc").fit(X)

        # For each crossvalidation fold
        for train_index, test_index in CV:

            # extract training and test set for current CV fold
            X_train = X[train_index]
            X_test = X[test_index]

            # Fit Gaussian mixture model to X_train
            gmm = GMM(n_components=K, covariance_type=covar_type, n_init=reps, params="wmc").fit(X_train)

            # compute negative log likelihood of X_test
            CVE[t] += -gmm.score(X_test).sum()
            # print CVE[t]

    # Plot results
    return CVE
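
# Hypothetical usage sketch (an assumption, not from the original project): pick the K
# with the lowest cross-validated negative log-likelihood returned by CVK above.
import numpy as np
X_toy = np.random.randn(500, 3)
KRange = range(1, 8)
CVE = CVK(X_toy, KRange, covar_type="diag", reps=3)
best_K = list(KRange)[int(np.argmin(CVE))]
print("Best K by cross-validation:", best_K)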
Example #15
def gmm(X, y, M, C, K=4, cov_type="diag", reps=10):

    # Fit Gaussian mixture model
    gmm = GMM(n_components=K, covariance_type=cov_type, n_init=reps, params="wmc").fit(X)

    cls = gmm.predict(X)  # extract cluster labels
    cds = gmm.means_  # extract cluster centroids (means of gaussians)
    covs = gmm.covars_  # extract cluster shapes (covariances of gaussians)

    if cov_type == "diag":
        # expand each diagonal covariance vector into a full M x M matrix
        new_covs = np.zeros([K, M, M])
        count = 0
        for elem in covs:
            temp_m = np.zeros([M, M])
            for i in range(len(elem)):
                temp_m[i][i] = elem[i]
            new_covs[count] = temp_m
            count += 1
        covs = new_covs

    clusterPlot(X, cls, K, C, y, cds, covs)
Example #16
def _gmm_from_memberships(data, memberships, covariance_type):
    clusters = set(memberships)
    n_clusters = len(clusters)
    gmm = GMM(n_components=n_clusters, params='m')
    gmm.weights_ = np.ones([n_clusters])/n_clusters
    gmm.means_ = np.zeros([n_clusters, data.shape[1]]) 
    if covariance_type == 'diag':
        gmm.covars_ = np.zeros([n_clusters, data.shape[1]])
    if covariance_type == 'spherical':
        gmm.covars_ = np.zeros([n_clusters])
    if covariance_type == 'full':
        gmm.covars_ = np.zeros([n_clusters, data.shape[1], data.shape[1]])

    for cluster in clusters:
        cluster = int(cluster)
        indices = (memberships == cluster)
        gmm.means_[cluster, :] = data[indices, :].mean(axis=0)
        if covariance_type in ['diag', 'spherical']:
            #TODO Fix covariance calculation, for now, return cov=1
            #D = np.diag(np.cov(data[indices, :].T))
            D = np.ones([data.shape[1]])
            if covariance_type == 'spherical':
                gmm.covars_[cluster] = D.mean()
            else:
                gmm.covars_[cluster] = D
        if covariance_type == 'full':
            cov_estimator = OAS()
            cov_estimator.fit(data[indices, :])
            gmm.covars_[cluster] = cov_estimator.covariance_
    return gmm
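
# Hypothetical usage sketch (an assumption, not from the original project): seed a GMM
# from hard K-Means assignments and inspect the resulting component means.
import numpy as np
from sklearn.cluster import KMeans

toy = np.random.randn(300, 4)
memberships = KMeans(n_clusters=3).fit_predict(toy)
seeded = _gmm_from_memberships(toy, memberships, covariance_type='diag')
print(seeded.means_.shape)   # (3, 4)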
Example #17
 def plot_elbow(self,start,end):
     '''
     Fit GMM and plot elbow using AIC & BIC
     '''
     from sklearn.mixture import GMM,DPGMM
     obs = self.X_hmm
     aics = []
     bics = []
     for i in range(start,end+1):
         n_iter=1000
         for j in range(1,11):
             g = GMM(n_components=i,n_iter=n_iter)
             g.fit(obs)
             print i
             converged =  g.converged_
             if converged:
                 print 'j:%d'%(j)
                 break
             n_iter += 1000
         aics.append(g.aic(obs))
         bics.append(g.bic(obs))
     if not converged:
         print 'Not Converged!!'
     fig = plt.figure()
     ax = fig.add_subplot(111)
     ax.plot(range(start,end+1),aics,label='AIC')
     ax.plot(range(start,end+1),bics,label='BIC')
     ax.set_xlabel("No. of Clusters")
     ax.set_ylabel("Information Loss")
     ax.set_xticks(range(start,end+1),minor=True)
     ax.legend()
     ax.grid(True,which='both')
     plt.show()
Example #18
def clusterEM(train_data, test_data, max):
    best = -1
    best_k = 0
    scores = []
    best_rs = -1
    for rs in range(20):
        for k in range(2, max+1):
            em = GMM(n_components=k, random_state=rs)
            score, dur = score_clustering(em, train_data, test_data)
            if score > best:
                best = score
                best_k = k
                best_rs = rs
                print('local best k=%d, rs=%d, score=%.3f' % (best_k, best_rs, best))

    print('EM k=%d, rs=%d, score=%.3f' % (best_k, best_rs, best))

    em = GMM(n_components=best_k, random_state=best_rs).fit(train_data)
    clusters = em.predict(test_data)
    plot_clusters(test_data, clusters, 'EM Clusters c=%d' % best_k)

    for k in range(2, max+1):
        em = GMM(n_components=k, random_state=rs)
        score, dur = score_clustering(em, train_data, test_data)
        scores.append(score)
        if k == best_k:
            print('EM duration: %d' % dur)

    print('EM k=%d, score=%.3f' % (best_k, best))
    return best_rs, scores
Example #19
def cluster_and_learn_nn(train_data, train_target, test_data, test_target,):
    # get cluster assignments for training and test data
    # 2 was the best k per earlier experiments
    km = KMeans(n_clusters=2, random_state=1).fit(train_data)
    train_clusters = km.predict(train_data)
    test_clusters = km.predict(test_data)
    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate((train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate((test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)

    print('KMeans cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)

    # repeat with EM
    # 4 = best c per earlier experiments
    em = GMM(n_components=4, random_state=1)
    em.fit(train_data)
    train_clusters = em.predict(train_data)
    test_clusters = em.predict(test_data)
    # add the cluster assignment as a feature
    train_with_cluster = np.concatenate((train_data, train_clusters.reshape(len(train_clusters), 1)), axis=1)
    test_with_cluster = np.concatenate((test_data, test_clusters.reshape(len(test_clusters), 1)), axis=1)

    print('EM cluster NN')
    learn_nn(train_with_cluster, train_target, test_with_cluster, test_target)
Example #20
class OneClassGMM2(BaseClassifier):
    _predict_params = []
    _fit_params = []
    
    def __init__(self, *args, **kwargs):
        pass
    
    def fit(self, data, **kwargs):
        self.gmm = GMM_SKL(2,covariance_type='full')
        self.gmm.fit(data)
        pred = self.gmm.predict(data)
        bcnt = numpy.bincount(pred)
        self.majority_class_index = numpy.argmax(bcnt)
        self.direct_threshold = 0.5
    
    def predict(self, data):
        score = self.gmm.score(data)
        pred = self.gmm.predict(data)
        
        tmp = numpy.ones(pred.shape) * -1
        tmp[pred == self.majority_class_index] = 1
        
        self.score = score
        return tmp
    
    def decision_function(self, data):
        return -self.score
Example #21
    def train(self,obs):
        obs = numpy.array(obs)
        
        obs = obs[:,self.attr]
        
        num_components = 10
        try:
            gmm = GMM(n_components=num_components, covariance_type='diag')
        except TypeError:
            # very old sklearn versions used the 'cvtype' keyword instead of 'covariance_type'
            gmm = GMM(n_components=num_components, cvtype='diag')

        gmm.fit(obs)
        predictions = gmm.predict(obs)
        
        for n in range(num_components):
            indexes = numpy.where(predictions==n)[0]
            if len(indexes)>2:
                s_obs = obs[indexes]
                self.data.append(s_obs.mean(0))        
        
        X = numpy.array(self.data)
        
        try:
            self.model = svm.OneClassSVM(nu=self.nu,gamma=self.gamma)
            #self.model = svm.OneClassSVM(nu=0.1,gamma=Gamma)
            self.model.fit(X)
        except:
            print "exception in EmoModelOneClassClassifier.train()"
def profile_gmm(cache_dir, group_name, ncomponents=50, filter=None, 
                ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True, filter=filter)

    keys = group.keys()
    subsamples = subsample(cache_dir, [group[g] for g in keys], ipython_profile)

    subsampled = np.vstack(subsamples)
    meanvector = np.mean(subsampled, 0)
    mean_centered = subsampled - meanvector

    #perform PCA
    U, s, V = linalg.svd(mean_centered, full_matrices=False)
    percvar_expl = s ** 2 / np.sum(s ** 2)
    scores = np.dot(U, np.diag(s))
    loadings = np.transpose(V)

    # Find the number of PCs required to explain x% of variance
    cutoffpercentage = 80
    percvar_cum = np.cumsum(percvar_expl)
    npc = np.nonzero(percvar_cum > float(cutoffpercentage) / 100)[0][0]
    if npc < 20: 
        npc = 20
   
    # GMM
    gmm = GMM(ncomponents, cvtype='full')
    gmm.fit(scores[:, :npc], n_iter=100000, thresh=1e-7)

    parameters = [(cache_dir, group[g], gmm, meanvector, loadings[:, :npc])
                  for g in keys]
    variables = ['Component %d' % i for i in range(ncomponents)]
    return Profiles.compute(keys, variables, _compute_mixture_probabilities, 
                            parameters, ipython_profile, group_name=group_name)
	def cluster_and_label(name, data_to_cluster):
		
		data_to_cluster = scale(data_to_cluster)
		"""
		km_labels_store_file = folder + '%s_%s_%s_classifier.pickle'%(n_clusters, 'km', name)
		if load_pickled_labels:
			with open(km_labels_store_file, 'rb') as fid:
				km_labels = cPickle.load(fid)
		else:
			km = KMeans(n_clusters=n_clusters)
			km.fit(data_to_cluster)
			km_labels = km.predict(data_to_cluster)
			with open(km_labels_store_file, 'wb') as fid:
				cPickle.dump(km_labels, fid)
		make_plots('km', name, km_labels)
		make_tables('km', name, km_labels)
		"""

		gmm_labels_store_file = folder + '%s_%s_%s_classifier.pickle'%(n_clusters, 'gmm', name)
		if load_pickled_labels:
			with open(gmm_labels_store_file, 'rb') as fid:
				gmm_labels = cPickle.load(fid)
		else:
			gmm = GMM(n_components = n_clusters, covariance_type = 'full')
			gmm.fit(data_to_cluster)
			gmm_labels = gmm.predict(data_to_cluster)
			with open(gmm_labels_store_file, 'wb') as fid:
				cPickle.dump(gmm_labels, fid)
		#make_plots('gmm', name, gmm_labels)
		make_tables('gmm', name, gmm_labels)
 def __init__(self, n_components=1, covariance_type='diag',
              random_state=None, thresh=1e-2, min_covar=1e-3,
              n_iter=1000, n_init=1, params='', init_params=''):
     
     GMM.__init__(self, n_components, covariance_type,
              random_state, thresh, min_covar,
              n_iter, n_init, params, init_params)
 def test_GMM(self):
   X=[0.9,1.,1.9,2.,2.1,1.1]
   gmm=GMM(n_components=2, covariance_type='spherical', init_params='wc', n_iter=20)
   gmm.fit(X)
   y_train_predict=gmm.predict(X)
   assert list(y_train_predict)==[1,1,0,0,0,1] or list(y_train_predict)==[0,0,1,1,1,0]
   assert gmm.means_.mean()>1.45 and gmm.means_.mean()<1.55
Example #26
    def gmm_component_filter(self, nc=20, threshold=0.72, show=True):
        clf = GMM(nc, n_iter=500, random_state=3).fit(self.fiter.y)
        ss = clf.predict(self.fiter.y)

        self.fiter.df['p_rk_cg'] = self.fiter.df['profit_cg'].rank()
        self.fiter.df['ss'] = ss

        win_top = len(self.fiter.df['profit_cg']) - len(self.fiter.df['profit_cg']) * 0.25
        loss_top = len(self.fiter.df['profit_cg']) * 0.25
        self.fiter.df['rk'] = 0
        self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] > win_top, 1, self.fiter.df['rk'])
        self.fiter.df['rk'] = np.where(self.fiter.df['p_rk_cg'] < loss_top, -1, self.fiter.df['rk'])

        xt = pd.crosstab(self.fiter.df['ss'], self.fiter.df['rk'])
        xt_pct = xt.div(xt.sum(1).astype(float), axis=0)

        if show:
            xt_pct.plot(
                figsize=(16, 8),
                kind='bar',
                stacked=True,
                title=str('ss') + ' -> ' + str('result'))
            plt.xlabel(str('ss'))
            plt.ylabel(str('result'))

            ZLog.info(xt_pct[xt_pct[-1] > threshold])
            ZLog.info(xt_pct[xt_pct[1] > threshold])

        self.top_loss_ss = xt_pct[xt_pct[-1] > threshold].index
        self.top_win_ss = xt_pct[xt_pct[1] > threshold].index
        return xt, xt_pct
Example #27
def get_space_color_clusters(img_df, alpha, n_components):
    img_df = img_df.copy()
    img_df[['l', 'a', 'b']] = img_df[['l', 'a', 'b']] * alpha
    mm = GMM(n_components=n_components)

    img_pred = mm.fit(img_df).predict(img_df)
    if not mm.converged_:
        LOGGER.warning(
            'Space-color mixture model did not converge for parameters alpha = {}, n_components = {}'\
            .format(alpha, n_components))

    img_all = pd.concat([img_df, pd.Series(img_pred)], axis=1)
    img_all = img_all.rename(columns={0: 'c'})
    img_all.index.name = 'order'

    color_clusters = img_all.groupby('c')[['l', 'a', 'b']].mean()

    n_before = len(img_all)
    img_merged = img_all.reset_index()\
        .merge(color_clusters.reset_index(), on='c', suffixes=['', '_m'])\
        .sort('order').set_index('order')
    assert n_before == len(img_merged),\
        'Some rows were somehow lost during join: size before = {}, size after = {}'.format(n_before, len(img_merged))
    assert np.all(img_merged.apply(np.isfinite).apply(np.all)), 'Merged data frame has NA values somehow'

    return {
        'img_df': img_merged,
        'alpha': alpha,
        'n_components': n_components,
        'model': mm
    }
Example #28
def GetFeatures(driverID, j):
    #print driverID
    driverDir = '../Kaggle/drivers/'+str(driverID)
    
    cur_driver_df = pd.DataFrame(np.zeros((200,4000)))
   
    for index,tripID in enumerate(tripFiles):    
        #print tripID
        trip = Trip(driverID,tripID,pd.read_csv(driverDir+'/' + str(tripID) + '.csv'))
        X = trip.features
        X=X[(X.v<vlim[1]) & (X.v>vlim[0])]
        X=X[(X.acc<clim[1]) & (X.acc>clim[0])]
        X.index = range(X.shape[0])    
        xN = np.asanyarray(X)
    
        #train GMM
        #gmms = [GMM(n_components=n, covariance_type='full').fit(xN) for n in n_components]
        #BICs = [gmm.bic(xN) for gmm in gmms]
        #i_min = np.argmin(BICs)
        #clf=gmms[i_min]
        #print '%s components' %(n_components[i_min])
        try:
            clf = GMM(n_components=5, covariance_type='full').fit(xN)

            X_, Y_ = np.meshgrid(np.linspace(clim[0], clim[1], num=80), np.linspace(vlim[0], vlim[1], num=40))
            XX = np.array([X_.ravel(), Y_.ravel()]).T
            Z = np.exp(clf.score(XX))
            cur_driver_df.loc[tripID] = Z
        except:
            print 'exception driver %d trip %d' %(driverID,tripID)

    cur_driver_df.loc[1:].to_csv(featuresDir+'/' + str(driverID) + '.csv', index=False)
        
    return 0
  def _find_best_split(self):
    print "OpenCL Regression split"
    #X=[0.9,1.,1.9,2.,2.1,1.1]
    X=[prediction for (feature_value,prediction) in enumerate(self._first_feature())]
    gmm=GMM(n_components=2, covariance_type='spherical', init_params='wc', n_iter=10)
    gmm.fit(X)
    classes=numpy.array(gmm.predict(X)).astype(numpy.float32)
    
    y_dim=self._seen_samples()
    x_dim=self.number_of_decision_functions

    A=numpy.empty((y_dim,x_dim)).astype(numpy.float32)
    for i, feature in enumerate(self.randomly_selected_features):
      for j, (feature_value, prediction) in enumerate(self.samples[feature]):
        A[j,i]=feature_value

    gini_matrix=RegressionTreeSecretOpenCL.opencl_calc.opencl_gini_matrix(A, classes)
    argmax=gini_matrix.argmax()

    y_max=argmax/x_dim
    x_max=argmax%x_dim

    feature_value=A[y_max,x_max]
    feature=self.randomly_selected_features[x_max]
    best_split={
          'left': numpy.array([x[1] for x in self.samples[feature] if x[0]<=feature_value]),
          'right': numpy.array([x[1] for x in self.samples[feature] if x[0]>feature_value]),
          'threshold':feature_value,
          'feature':feature
        }

    best_split_score=gini_matrix[y_max, x_max]

    return (best_split, best_split_score)
 def build_dictionary(self, features):
     """
     :param features: numpy array of shape [n_samples, n_features]
     """
     # compute mean and covariance matrix for the PCA
     pca_mean = features.mean(axis=0)
     features = features - pca_mean
     cov = np.dot(features.T, features)
     
     # compute PCA matrix and keep only pca_dimension dimensions
     eigvals, eigvecs = np.linalg.eig(cov)
     perm = eigvals.argsort()
     pca_transform = eigvecs[:, perm[-self.pca_dimension:]]
     
     # transform sample with PCA
     features = np.dot(features, pca_transform)
     
     # train GMM
     gmm = GMM(n_components=self.dictionary_size)
     gmm.fit(features)
     
     self.pca_mean = pca_mean
     self.pca_transform = pca_transform
     self.weights = gmm.weights_
     self.means = gmm.means_
     self.covariance = np.sqrt(1 / gmm.covars_)
     self.gmm = gmm
Example #31
# Liver-function abnormality indicators
# From the existing features, use "*天门冬氨酸氨基转换酶" (AST), "*丙氨酸氨基转换酶" (ALT), "*碱性磷酸酶" (ALP), "*r-谷氨酰基转换酶" (GGT), "白蛋白" (albumin) and "*球蛋白" (globulin) as the feature columns
# Cluster on these features to split the samples into normal vs. abnormal liver function
def temp_feature(x,Value):
    if x < Value:
        return 0
    else:
        return 1
temp_df = df[["*天门冬氨酸氨基转换酶","*丙氨酸氨基转换酶","*碱性磷酸酶","*r-谷氨酰基转换酶","白蛋白","*球蛋白"]]
temp_df["temp_lable0"] = temp_df.loc[:,"*天门冬氨酸氨基转换酶"].apply(lambda x: temp_feature(x,40))
temp_df["temp_lable1"] = temp_df.loc[:,"*丙氨酸氨基转换酶"].apply(lambda x: temp_feature(x,40))
temp_df["temp_lable2"] = temp_df.loc[:,"*碱性磷酸酶"].apply(lambda x: temp_feature(x,185))
temp_df["temp_lable3"] = temp_df.loc[:,"*r-谷氨酰基转换酶"].apply(lambda x: temp_feature(x,35))
temp_df["temp_lable4"] = temp_df.loc[:,"白蛋白"].apply(lambda x: temp_feature(x,55))
temp_df["temp_lable5"] = temp_df.loc[:,"*球蛋白"].apply(lambda x: temp_feature(x,35))
gmm = GMM(n_components=2).fit(temp_df)
labels = gmm.predict(temp_df)
df["liver_trouble_feature"] = labels

# Experts point out that blood glucose and uric acid (kidney function) are correlated; beyond a certain point, diabetes affects kidney function
# So an extra feature is added here to flag people with high uric acid
def temp_feature(x,Value):
    if x < Value:
        return 0
    else:
        return 1
df["high_feature_UA"] = df["尿酸"].apply(lambda x: temp_feature(x,420))

# Data normalization
# The means of the different features still differ quite a bit, so apply feature scaling here, compressing the data into the range [-1, 1]
# Only process features whose column names do not contain "feature"
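# A possible implementation of the scaling step described above (an assumption, not
# part of the original snippet), assuming the relevant columns are numeric:
scale_cols = [c for c in df.columns if "feature" not in c]
for c in scale_cols:
    cmin, cmax = df[c].min(), df[c].max()
    if cmax > cmin:
        df[c] = 2.0 * (df[c] - cmin) / (cmax - cmin) - 1.0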
Example #32
    sr, audio = read(source + path)
    #audio = (audio/ 32767).astype(int)
    #print(audio)
    #print(sr)
    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

    # when features of 5 files of speaker are concatenated, then do model training
    if count == 5:
        print(np.mean(features))
        gmm = GMM(n_components=16,
                  max_iter=200,
                  covariance_type='diag',
                  n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model
        picklefile = path.split("-")[0] + ".gmm"
        print(picklefile)
        cPickle.dump(gmm, open(dest + picklefile, 'wb'))
        print('+ modeling completed for speaker:', picklefile,
              " with data point = ", features.shape)
        features = np.asarray(())
        count = 0
    count = count + 1
Example #33
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1), map(os.path.split,
                                    map(os.path.dirname,
                                        labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            [embeddings.shape[1], 500, labelsNum[-1:][0] + 1
             ],  # i/p nodes, hidden nodes, o/p nodes
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # no of iternation
            # dropouts = 0.25, # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
Example #34
def parcellate_region_1(roilist,
                        sub,
                        nClusters,
                        scan,
                        scan_type,
                        savepng=0,
                        session=1,
                        algo=0,
                        type_cor=0):
    p_dir = '/home/ajoshi/HCP_data'
    r_factor = 3
    ref_dir = os.path.join(p_dir, 'reference')
    ref = '100307'
    fn1 = ref + '.reduce' + str(r_factor) + '.LR_mask.mat'
    fname1 = os.path.join(ref_dir, fn1)
    msk = scipy.io.loadmat(fname1)  # h5py.File(fname1);
    dfs_left = readdfs(
        os.path.join(p_dir, 'reference',
                     ref + '.aparc.a2009s.32k_fs.reduce3.' + 'left' + '.dfs'))
    dfs_left_sm = readdfs(
        os.path.join(
            p_dir, 'reference', ref + '.aparc.\
a2009s.32k_fs.reduce3.very_smooth.' + 'left' + '.dfs'))
    data = scipy.io.loadmat(
        os.path.join(
            p_dir, sub, sub + '.rfMRI_REST' + str(session) + scan +
            '.reduce3.ftdata.NLM_11N_hvar_25.mat'))

    LR_flag = msk['LR_flag']
    LR_flag = np.squeeze(LR_flag) > 0
    data = data['ftdata_NLM']
    temp = data[LR_flag, :]
    m = np.mean(temp, 1)
    temp = temp - m[:, None]
    s = np.std(temp, 1) + 1e-16
    temp = temp / s[:, None]
    msk_small_region = np.in1d(dfs_left.labels, roilist)
    #    (dfs_left.labels == 46) | (dfs_left.labels == 28) \
    #       | (dfs_left.labels == 29)  # % motor
    d = temp[msk_small_region, :]
    rho = np.corrcoef(d)
    rho[~np.isfinite(rho)] = 0
    # rho=np.abs(rho)
    d_corr = temp[~msk_small_region, :]
    rho_1 = np.corrcoef(d, d_corr)
    rho_1 = rho_1[range(d.shape[0]), d.shape[0]:]
    rho_1[~np.isfinite(rho_1)] = 0
    if type_cor == 1:
        # f_rho=np.arctanh(rho_1)
        # f_rho[~np.isfinite(f_rho)]=0
        B = np.corrcoef(rho_1)
        B[~np.isfinite(B)] = 0
        affinity_matrix = affinity_mat(B)
        affinity_matrix[~np.isfinite(affinity_matrix)] = 0
        # B = np.abs(B)

    # SC = DBSCAN()
    if algo == 0:
        SC = SpectralClustering(n_clusters=nClusters, affinity='precomputed')
        # SC=SpectralClustering(n_clusters=nClusters,gamma=0.025)
        if type_cor == 0:
            affinity_matrix = affinity_mat(rho)
            labels = SC.fit_predict(affinity_matrix)
        if type_cor == 1:
            labels = SC.fit_predict(affinity_matrix)
            # affinity_matrix=SC.fit(np.abs(d))
    elif algo == 1:
        g = nx.Graph()
        g.add_edges_from(dfs_left.faces[:, (0, 1)])
        g.add_edges_from(dfs_left.faces[:, (1, 2)])
        g.add_edges_from(dfs_left.faces[:, (2, 0)])
        Adj = nx.adjacency_matrix(g)
        AdjS = Adj[(msk_small_region), :]
        AdjS = AdjS[:, (msk_small_region)]
        AdjS = AdjS.todense()
        np.fill_diagonal(AdjS, 1)
        SC = AgglomerativeClustering(n_clusters=nClusters, connectivity=AdjS)
        labels = SC.fit_predict(rho)
    elif algo == 2:
        GM = GMM(n_components=nClusters, covariance_type='full', n_iter=100)
        GM.fit(rho)
        labels = GM.predict(rho)

    elif algo == 3:
        neighbour_correlation(rho, dfs_left_sm.faces, dfs_left_sm.vertices,
                              msk_small_region)

    if savepng > 0:
        r = dfs_left_sm
        r.labels = np.zeros([r.vertices.shape[0]])
        r.labels[msk_small_region] = labels + 1

        cent = separate(labels, r, r.vertices, nClusters)

        manual_order = np.array([0 for x in range(nClusters)])
        save = np.array([0 for x in range(nClusters)])

        for i in range(0, nClusters):
            if nClusters > 1:
                choose_vector = np.argmax(cent.transpose(), axis=1)
                save[i] = cent[choose_vector[1]][1]
                correspondence_point = find_location_smallmask(
                    r.vertices, cent[choose_vector[1]], msk_small_region)
                cent[choose_vector[1]][1] = -np.Inf
                manual_order[i] = choose_vector[1]
                if i == 0:
                    # change
                    correlation_within_precuneus_vector = sp.array(
                        rho[correspondence_point])
                    correlation_with_rest_vector = sp.array(
                        rho_1[correspondence_point])
                else:
                    correlation_within_precuneus_vector = sp.vstack([
                        correlation_within_precuneus_vector,
                        [rho[correspondence_point]]
                    ])
                    correlation_with_rest_vector = sp.vstack([
                        correlation_with_rest_vector,
                        [rho_1[correspondence_point]]
                    ])
            else:
                choose_vector = 0
                correspondence_point = find_location_smallmask(
                    r.vertices, cent, msk_small_region)
                manual_order[i] = choose_vector
                if i == 0:
                    # change
                    correlation_within_precuneus_vector = sp.array(
                        rho[correspondence_point])
                    correlation_with_rest_vector = sp.array(
                        rho_1[correspondence_point])

        manual_order = change_order(manual_order, nClusters)
        r.labels = change_labels(r.labels, manual_order, nClusters)

        new_cent = separate(r.labels, r, temp, nClusters)

        if nClusters > 1:
            for i in range(0, nClusters):
                cent[manual_order[i]][1] = save[i]
        '''mlab.triangular_mesh(r.vertices[:, 0], r.vertices[:, 1], r.vertices[:,
                                                                 2], r.faces, representation='surface',
                             opacity=1, scalars=np.float64(r.labels))

        for i in range(nClusters):
            mlab.points3d(new_cent[i][0], new_cent[i][1], new_cent[i][2])

        mlab.gcf().scene.parallel_projection = True
        mlab.view(azimuth=0, elevation=90)
        mlab.colorbar(orientation='horizontal')
        mlab.draw()
        mlab.savefig(filename='clusters_' + str(nClusters) + '_rois_' + str(roilist) + 'subject_' +
                              sub + 'session' + str(session) + '_labels.png')

        mlab.close()'''

    # return (r,correspondence_vector,msk_small_region)
    return (r, correlation_within_precuneus_vector,
            correlation_with_rest_vector, msk_small_region, new_cent)
Example #35
 def fit_new(self, x, label):
     self.y.append(label)
     gmm = GMM(self.gmm_order)
     gmm.fit(x)
     self.gmms.append(gmm)
Example #36
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    aic = defaultdict(dict)  # AIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    homo = defaultdict(lambda: defaultdict(dict))  # homogeneity scores
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    print('DATASET: %s' % name)
    for k in clusters:
        print('K: %s' % k)
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km.predict(X))
        acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X))
        adjmi[k][name]['km'] = ami(y, km.predict(X))
        adjmi[k][name]['gmm'] = ami(y, gmm.predict(X))

        homo[k][name]['km'] = homogeneity_score(y, km.predict(X))
        homo[k][name]['gmm'] = homogeneity_score(y, gmm.predict(X))

        # calculate silhouette score for K-Means
        km_silhouette = silhouette_score(X, km.predict(X))
        silhouette[k][name] = km_silhouette

        # calculate BIC for EM
        bic[k][name] = gmm.bic(X)
        aic[k][name] = gmm.aic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    aic = pd.DataFrame(aic).T
    aic.rename(columns={name: 'aic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')
    homo = pd.Panel(homo)
    homo = homo.loc[:, :, name].T.rename(lambda x: '{}_homo'.format(x),
                                         axis='columns')

    # concatenate all results
    dfs = (sse, silhouette, logl, bic, aic, acc, adjmi, homo)
    metrics = pd.concat(dfs, axis=1)
    print(metrics)
    resfile = get_abspath('{}_train_metrics.csv'.format(name), rdir)
    metrics.to_csv(resfile, index_label='k')
import argos.io as io
import argos.plot as tplot
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.mixture import GMM

traj_list = io.load("1_traj_seg.dt")
traj_list = traj_list[:1000]

X = np.fromfile("gaussian_representation.dat", dtype=float)
D = io.load_distance_matrix("distance1.npz")

no_of_cluster = 12
gmm = GMM(n_components=no_of_cluster, n_iter=1000)
labels = gmm.fit_predict(X)

#  Postprocessing

clusters = [[] for i in range(no_of_cluster)]
no = len(traj_list)
for i in range(no):
    label = int(labels[i])
    clusters[label].append(traj_list[i])

silhouette_avg = metrics.silhouette_score(D, labels, sample_size=1000)
print("Silhouette Coefficient : %.3f" % silhouette_avg)

#  Plotting Clustered Trajectories
color_list = plt.rcParams['axes.prop_cycle'].by_key()['color']
for i in range(no_of_cluster):
Example #38
T_x, T_y = get_graph_segments(model.X_train_, model.full_tree_)
T_trunc_x, T_trunc_y = get_graph_segments(model.X_train_, model.cluster_graph_)

#------------------------------------------------------------
# Fit a GMM to each individual cluster
Nx = 100
Ny = 250
Xgrid = np.vstack(
    map(np.ravel,
        np.meshgrid(np.linspace(xmin, xmax, Nx), np.linspace(ymin, ymax,
                                                             Ny)))).T
density = np.zeros(Xgrid.shape[0])

for i in range(n_components):
    ind = (labels == i)
    gmm = GMM(4).fit(X[ind])
    dens = np.exp(gmm.score(Xgrid))
    dens /= dens.max()
    density += dens

density = density.reshape((Ny, Nx))

#----------------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(7, 8))
fig.subplots_adjust(hspace=0, left=0.1, right=0.95, bottom=0.1, top=0.9)

ax = fig.add_subplot(311, aspect='equal')
ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
Example #39
for d in dirs:
    features = []
    print d
    for i in range(2):
        f = choice(glob.glob(d + "/*.wav"))
        fs, signal = wavfile.read(f)
        mfcc = extractor.extract_differential(signal)
        features.extend(mfcc)
    mfccs.append(features)

print "start training"
gmms = []
for idx, mfcc in enumerate(mfccs):
    print idx
    gmm = GMM(32, n_iter=1000, thresh=0.001)
    gmm.fit(mfcc)
    gmms.append(gmm)

print "done training"


def cal_score(model, mfcc):
    return np.exp(sum(model.score(mfcc)) / 1000)


def pred_label(mfcc):
    scores = [cal_score(gmm, mfcc) for gmm in gmms]
    return max(enumerate(scores), key=operator.itemgetter(1))[0]

Example #40
for path in file_paths:
    path = path.strip()
    print(path)

    # read the audio
    sr, audio = read(source + path)

    # extract 40 dimensional MFCC & delta MFCC features
    vector = extract_features(audio, sr)

    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))
    # when features of 5 files of speaker are concatenated, then do model training
    if count == 3:
        gmm = GMM(n_components=16,
                  n_iter=200,
                  covariance_type='diag',
                  n_init=3)
        gmm.fit(features)

        # dumping the trained gaussian model
        temp_path = path.strip(".wav")
        picklefile = temp_path.strip("3") + ".gmm"
        cpk.dump(gmm, open(dest + picklefile, 'wb'))
        print('+ modeling completed for speaker:', picklefile,
              " with data point = ", features.shape)
        features = np.asarray(())
    count = count + 1
Example #41
import matplotlib
import matplotlib.pyplot as plt
textsize = 15
matplotlib.rcParams.update({'font.size': textsize})
plotdir = '../../plot/unsupervised/'
datadir = '../../data/unsupervised/'
preprossdatadir = '../../data/preprocess/'
source = "Cs137"  #"Co60"

with open(datadir + source + 'featuretrain.dat', 'rb') as f:
    feature = pickle.load(f)

X = feature[:, 4:]

gmm = GMM(n_components=2,
          covariance_type='full',
          max_iter=100,
          random_state=20).fit(X)
glabels = gmm.predict(X)

kmeans = KMeans(n_clusters=2, n_init=20).fit(X)
klabels = kmeans.predict(X)

density = DBSCAN(eps=0.5, min_samples=10).fit(X)
dlabels = density.labels_

# save glabel result
with open(preprossdatadir + source + 'normedwaveform0.dat', 'rb') as f:
    data = pickle.load(f)
paradf = data['para']
paradf['glabels'] = glabels
with open(datadir + source + 'testresultlabel.dat', 'wb') as f:
Example #42
# TODO: Apply a PCA transformation to the sample log-data
pca_samples = pca.transform(log_samples)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data,
                            columns=['Dimension 1', 'Dimension 2'])

# Display sample log-data after applying PCA transformation in two dimensions
display(
    pd.DataFrame(np.round(pca_samples, 4),
                 columns=['Dimension 1', 'Dimension 2']))

# TODO: Apply your clustering algorithm of choice to the reduced data
from sklearn.mixture import GMM
clusterer = GMM(n_components=2, covariance_type='full', random_state=42)
clusterer.fit(reduced_data)

# TODO: Predict the cluster for each data point
preds = clusterer.predict(reduced_data)

# TODO: Find the cluster centers
centers = clusterer.means_

# TODO: Predict the cluster for each transformed sample data point
sample_preds = clusterer.predict(pca_samples)

# TODO: Calculate the mean silhouette coefficient for the number of clusters chosen
from sklearn.metrics import silhouette_score
score = silhouette_score(reduced_data, preds, random_state=42)
print score
train_index, test_index = next(iter(indices))

# Extract training data and labels
X_train = iris.data[train_index]
y_train = iris.target[train_index]

# Extract testing data and labels
X_test = iris.data[test_index]
y_test = iris.target[test_index]

# Extract the number of classes
num_classes = len(np.unique(y_train))

# Build GMM
classifier = GMM(n_components=num_classes,
                 covariance_type='full',
                 init_params='wc',
                 n_iter=20)

# Initialize the GMM means
classifier.means_ = np.array(
    [X_train[y_train == i].mean(axis=0) for i in range(num_classes)])

# Train the GMM classifier
classifier.fit(X_train)

plt.figure()
colors = 'bgr'

for i, color in enumerate(colors):
    # Extract eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eigh(
Example #44
 def __init__(self, Xpoints, numMixtures):
     print "Scikits Learn Implementation Chosen"
     LikelihoodEvaluator.__init__(self, Xpoints, numMixtures)
     from sklearn.mixture import GMM as GMMEval
     self.evaluator = GMMEval(n_components=numMixtures)
     self.Xpoints = Xpoints
################## GMM #####################


# computes accuracy given the predictions and real labels
def accuracy(predictions, labels):
    batch_size = predictions.shape[0]
    sum = np.sum(predictions == labels)
    acc = (100.0 * sum) / batch_size
    return acc


n_classes = 10  # 10 genre classes

# Try GMMs using different covariance types. Only 'full' is used here since it performs better, but other types can be added to try
classifiers = dict((covar_type, GMM(n_components=n_classes,
                                    covariance_type=covar_type, init_params='wc', n_iter=5))
                   for covar_type in ['full'])


print("Training GMM")

for index, (name, classifier) in enumerate(classifiers.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    classifier.means_ = np.array([train_data[train_labels == i].mean(axis=0)
                                  for i in range(n_classes)])

    # Train the other parameters using the EM algorithm.
    classifier.fit(train_data)

    # getting predictions of training set
 def training(self):
    self.gmm = GMM(n_components = 2, covariance_type = 'diag',
                    verbose = False )
    self.gmm.fit(self.train)
Example #47
                   columns=[
                       'job', 'marital', 'education', 'default', 'housing',
                       'loan', 'contact', 'month', 'day_of_week', 'poutcome'
                   ])
X = X.dropna()
X['y'].value_counts()
X['y'] = X['y'].map({'yes': 1, 'no': 0})
y = X['y']
X = X.drop(['y'], axis=1)
pca = LinearDiscriminantAnalysis(n_components=1)
X = pca.fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=0)

# n_components = np.arange(1, 2)
# models = [GMM(n, covariance_type='full', random_state=0).fit(X_train)
#           for n in n_components]
#
# plt.plot(n_components, [m.bic(X_train) for m in models], label='BIC')
# plt.plot(n_components, [m.aic(X_train) for m in models], label='AIC')
# plt.legend(loc='best')
# plt.xlabel('n_components')
model = GMM(3, covariance_type='full', random_state=0).fit(X)
cluster_labels = model.predict(X)
print('NMI: {}'.format(metrics.normalized_mutual_info_score(y,
                                                            cluster_labels)))
print('Homogeneity: {}'.format(metrics.homogeneity_score(y, cluster_labels)))
print('Completeness: {}'.format(metrics.completeness_score(y, cluster_labels)))
#plt.savefig('ds2_gmm_rp.png')
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Neural Network Lib
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture as GMM

# Get Data

l = ['FA', 'ICA', 'PCA', 'RP']

o = []

for al in l:
    print(al)

    data = pd.read_csv('../datasets/{}_credit.csv'.format(al))

    y = data.default
    X = data.drop('default', axis=1)

    model = GMM(n_components=2).fit(X)

    o = model.predict(X)

    data['cluster_labels'] = o

    data.to_csv('../datasets/reduced_clustered_dataset_gmm_{}.csv'.format(al))
from sklearn.mixture import GMM
import numpy as np
from math import *
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats, integrate
import seaborn as sns
sns.set(color_codes=True)

#Data Generation
gmm = GMM(2,covariance_type='diag')
gmm.means_ = np.array([[1], [4]])
gmm.weights_ = np.array([0.5, 0.5])
gmm.covars_ = np.array([[1], [1]]) 
X = gmm.sample(1000)

#Histogram
num_bins =50
n, bins, patches = plt.hist(X, num_bins, normed=1, facecolor='green', alpha=0.5)
plt.show()

#################################################
# Gibbs Sampling Algorithm
#################################################
poids = 0.5  # mixing weight ('poids' is French for weight)


#Initialization
theta_p=[]
theta_p.append(-0.2)
Example #50
count = 1
# Extracting features for each speaker (5 files per speakers)
features = np.asarray(())
for path in file_paths:    
    path = path.strip()   
    print path
    
    # read the audio
    sr,audio = read(source + path)
    
    # extract 40 dimensional MFCC & delta MFCC features
    vector   = extract_features(audio,sr)
    
    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))
    # when features of 5 files of speaker are concatenated, then do model training
    # originally "if count == 5"; edited below to use 15 files per speaker
    if count == 15:
        gmm = GMM(n_components = 16, covariance_type='diag',n_init = 3)
        gmm.fit(features)
        
        # dumping the trained gaussian model
        picklefile = path.split("-")[0]+".gmm"
        cPickle.dump(gmm, open(dest + picklefile, 'wb'))
        print '+ modeling completed for speaker:',picklefile," with data point = ",features.shape    
        features = np.asarray(())
        count = 0
    count = count + 1
Example #51
def get_gmm(data, tdata, num_classes):
    gmm = GMM(n_components=num_classes).fit(data)
    lout = gmm.predict(data)
    lout2 = gmm.predict(tdata)
    return lout.reshape(lout.shape[0], 1), lout2.reshape(lout2.shape[0], 1)
def run_all_classifiers(X_train, X_test, y_train, y_test, print_output_scores_to_csv=False, output_scores_csv_file_suffix='', print_only_table=False):
    """
    The list of all classifiers was generated by running the following commented code.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True, the Precision, Recall, F1-Score and Support for both classes
            will be printed to a file named with the current date and time.
        output_scores_csv_file_suffix: Suffix added to the csv file just before the .csv extension, normally
            describing the run being performed.

    Returns:
        dataset: The output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test,  pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test,  pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration           import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble              import AdaBoostClassifier
    from sklearn.ensemble              import BaggingClassifier
    from sklearn.ensemble              import ExtraTreesClassifier
    from sklearn.ensemble              import GradientBoostingClassifier
    from sklearn.ensemble              import RandomForestClassifier
    from sklearn.gaussian_process      import GaussianProcessClassifier
    from sklearn.linear_model          import LogisticRegression
    from sklearn.linear_model          import LogisticRegressionCV
    from sklearn.linear_model          import SGDClassifier

    from sklearn.mixture               import BayesianGaussianMixture
    from sklearn.mixture               import DPGMM
    from sklearn.mixture               import GaussianMixture
    from sklearn.mixture               import GMM
    from sklearn.mixture               import VBGMM
    from sklearn.naive_bayes           import BernoulliNB
    from sklearn.naive_bayes           import GaussianNB
    from sklearn.neighbors             import KNeighborsClassifier
    from sklearn.neural_network        import MLPClassifier
    from sklearn.semi_supervised       import LabelPropagation
    from sklearn.semi_supervised       import LabelSpreading
    from sklearn.svm                   import SVC
    from sklearn.tree                  import DecisionTreeClassifier
    #from xgboost                       import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier',            AdaBoostClassifier()))
    models.append(('BaggingClassifier',             BaggingClassifier()))
    models.append(('BayesianGaussianMixture',       BayesianGaussianMixture()))
    models.append(('BernoulliNB',                   BernoulliNB()))
    models.append(('CalibratedClassifierCV',        CalibratedClassifierCV()))
    models.append(('DPGMM',                         DPGMM()))
    models.append(('DecisionTreeClassifier',        DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier',          ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM',                           GMM()))
    models.append(('GaussianMixture',               GaussianMixture()))
    models.append(('GaussianNB',                    GaussianNB()))
    models.append(('GaussianProcessClassifier',     GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier',    GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier',          KNeighborsClassifier()))
    models.append(('LabelPropagation',              LabelPropagation()))
    models.append(('LabelSpreading',                LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis',    LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression',            LogisticRegression()))
    models.append(('LogisticRegressionCV',          LogisticRegressionCV()))
    models.append(('MLPClassifier',                 MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier',        RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier',                 SGDClassifier()))
    models.append(('SVC',                           SVC()))
    models.append(('VBGMM',                         VBGMM()))
    #models.append(('XGBClassifier',                 XGBClassifier()))
    
    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test, models, print_only_table)

    if print_output_scores_to_csv:
        output_scores_df.to_csv(time.strftime('output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: if True, print details of all models and save the csv table;
                       if False, print only a summary table of the models.
    Returns:
        dataset: Returns output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
                                                'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
                                         columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if (name == 'MultinomialNB' or name == 'NuSVC' or name == 'RadiusNeighborsClassifier' or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        #Fitting the model.
        model.fit(X_train, y_train)

        #Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(y_train, y_train_pred, output_scores_dataset,
                                                       ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(y_test, y_test_pred, output_scores_dataset,
                                                       ['Accuracy on the test set', name], print_details)

        #Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(y_test, y_test_pred, output_scores_dataset, name, print_details)

        #Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """
    Selects X and y from the dataset, where the target column has been renamed to *label*.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test:  {}'.format(X_test.shape))
    log_print('y_test:  {}'.format(y_test.shape))
    return(X_train, X_test, y_train, y_test)
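
# Hedged usage sketch (an assumption, not part of the original module): tie the
# helpers above together on a labelled dataframe; the 0.3 test size is arbitrary.
def _demo_classification_run(dataset, label='default'):
    X_train, X_test, y_train, y_test = train_test_split_for_classification(dataset, label, test_size=0.3)
    return run_all_classifiers(X_train, X_test, y_train, y_test, print_details=False)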
Example #53
0
def gmm_entropy(points, n_est=None, n_components=None):
    r"""
    Use sklearn.mixture.BayesianGaussianMixture to estimate entropy.

    *points* are the data points in the sample.

    *n_est* are the number of points to use in the estimation; default is
    10,000 points, or 0 for all the points.

    *n_components* are the number of Gaussians in the mixture. Default is
    $5 \sqrt{d}$ where $d$ is the number of dimensions.

    Returns estimated entropy and uncertainty in the estimate.

    This method uses BayesianGaussianMixture from scikit-learn to build a
    model of the point distribution, then uses Monte Carlo sampling to
    determine the entropy of that distribution. The entropy uncertainty is
    computed from the variance in the MC sample scaled by the number of
    samples. This does not incorporate any uncertainty in the sampling that
    generated the point distribution or the uncertainty in the GMM used to
    model that distribution.
    """
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5 * sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(
        n_components=n_components,
        covariance_type='full',
        #verbose=True,
        max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma))   # if standardized
    dH = np.std(weight_x, ddof=1) / sqrt(n)
    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)
    return H / LN2, dH / LN2
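
# Hedged usage sketch (an addition, not from the original module): sanity-check
# the estimator on a standard normal draw, whose differential entropy is known
# analytically, H = (d/2) * log2(2*pi*e) bits.  Sample size, dimension and seed
# are arbitrary choices.
def _demo_gmm_entropy(n=20000, d=3, seed=0):
    rng = np.random.RandomState(seed)
    sample = rng.randn(n, d)
    H_est, dH_est = gmm_entropy(sample, n_est=5000)
    H_true = 0.5 * d * np.log2(2 * np.pi * np.e)
    return H_est, dH_est, H_true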
Example #54
0
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *k* is the number of neighbours to consider, with default $k=n^{1/3}$

    *n_est* is the number of points to use for estimating the entropy;
    default is 10,000 points, or 0 for all the points.

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model.  Default is 0, and the points represent an
    empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    DOI:10.1214/18-AOS1688 https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1 / 3)), 3 * d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)
    #print("weights", weights, sum(weights))

    # select knn algorithm
    algorithm = 'auto'
    #algorithm = 'kd_tree'
    #algorithm = 'ball_tree'
    #algorithm = 'brute'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    # H^2 = 1/n sum
    Psi = digamma(np.arange(1, k + 1))
    logVd = d / 2 * log(pi) - gammaln(1 + d / 2)
    logC = -Psi + logVd + log(n - 1)

    # TODO: standardizing points doesn't work.
    # Standardize the data so that distances conform.  This is equivalent to
    # a u-substitution u = sigma x + mu, so the integral needs to be corrected
    # for dU = det(sigma) dx.  Since the standardization squishes the dimensions
    # independently, sigma is a diagonal matrix, with the determinant equal to
    # the product of the diagonal elements.
    #x, mu, sigma = standardize(x)  # Note: sigma may be zero
    #detDU = np.prod(sigma)
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=gmm, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        #weight_x = predictor.score_samples(eval_x)
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        # Choose a subset for evaluating the entropy estimate, if desired
        #print(n_est, n)
        #eval_x = x if n_est >= n else x[permutation(n)[:n_est]]
        eval_x = x
        #weight_x = 1
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k + skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x,
                                 n_neighbors=k + skip,
                                 return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]
    # Find log distances.  This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d * np.nanmean(logdist, axis=0)
    H = np.dot(H_unweighted, weights)[0]
    Hsq_k = np.nanmean((logC[-1] + d * logdist[:, -1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2) / n_est)
    #print("unweighted", H_unweighted)
    #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2)
    return H * detDU / LN2, dH * detDU / LN2
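
# Hedged usage sketch (an addition): the same standard-normal check for the
# weighted kNN estimator, once on the empirical draw and once modelling the
# draw with a 5-component GMM (the gmm=5 choice is arbitrary).
def _demo_wnn_entropy(n=20000, d=3, seed=0):
    rng = np.random.RandomState(seed)
    sample = rng.randn(n, d)
    H_true = 0.5 * d * np.log2(2 * np.pi * np.e)
    return wnn_entropy(sample), wnn_entropy(sample, gmm=5), H_true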
Example #55
0
from sklearn.model_selection import GridSearchCV
import sys
import time
out = '../results/clustering/'
perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing

# np.reshape(perm_y, 30000, order='F')

# raise Exception('Remove this line to run code')

SSE = defaultdict(dict)  # sum of squared errors
ll = defaultdict(dict)  # log likelihood
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = time.time()
print(len(clusters))
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(perm_x)
    gmm.fit(perm_x)
    SSE[k]['perm'] = km.score(perm_x)
    ll[k]['perm'] = gmm.score(perm_x)

    acc[k]['perm']['Kmeans'] = cluster_acc(perm_y, km.predict(perm_x))
    acc[k]['perm']['GMM'] = cluster_acc(perm_y, gmm.predict(perm_x))
    adjMI[k]['perm']['Kmeans'] = ami(perm_y, km.predict(perm_x))
    adjMI[k]['perm']['GMM'] = ami(perm_y, gmm.predict(perm_x))
Example #56
0
    def train_from_images(self, filenames):
        raw_patches, raw_unspread_patches, raw_unspread_patches_padded, raw_originals = self.random_patches_from_images(
            filenames)
        if len(raw_patches) == 0:
            raise Exception(
                "No patches found, maybe your thresholds are too strict?")
        # Also store these in "settings"

        mixtures = []
        llhs = []
        for i in range(1):
            mixture = ag.stats.BernoulliMixture(self.num_parts,
                                                raw_patches,
                                                init_seed=0 + i)
            mixture.run_EM(1e-8,
                           min_probability=self.settings['min_probability'])
            mixtures.append(mixture)
            llhs.append(mixture.loglikelihood)

        best_i = np.argmax(llhs)
        mixture = mixtures[best_i]

        ag.info("Done.")

        counts = np.bincount(mixture.mixture_components(),
                             minlength=self.num_parts)
        print(counts)
        print('Total', np.sum(counts))
        from scipy.stats.mstats import mquantiles
        print(mquantiles(counts))

        # Reject weak parts
        scores = np.empty(self.num_parts)
        for i in range(self.num_parts):
            part = mixture.templates[i]
            sh = part.shape
            p = part.reshape((sh[0] * sh[1], sh[2]))

            pec = p.mean(axis=0)

            N = np.sum(p * np.log(p / pec) + (1 - p) * np.log((1 - p) /
                                                              (1 - pec)))
            D = np.sqrt(
                np.sum(np.log(p / pec * (1 - pec) / (1 - p))**2 * p * (1 - p)))
            # Old:
            #D = np.sqrt(np.sum(np.log(p/(1-p))**2 * p * (1-p)))

            scores[i] = N / D

            # Require at least 20 occurrences
            #if counts[i] < 5:
            #scores[i] = 0

        # Only keep with a certain score
        if not self.settings['bedges']['contrast_insensitive']:

            visparts = mixture.remix(raw_originals)
        else:
            visparts = np.empty((self.num_parts, ) + raw_originals.shape[1:])

            #self.extra['originals'] = []

            # Improved visparts
            comps = mixture.mixture_components()
            for i in range(self.num_parts):
                ims = raw_originals[comps == i].copy()

                #self.extra['originals'].append(ims)

                # Stretch them all out
                #for j in xrange(len(ims)):
                #ims[j] = (ims[j] - ims[j].min()) / (ims[j].max() - ims[j].min())

                # Now, run a GMM with NM components on this and take the most common
                NM = 2

                from sklearn.mixture import GMM
                gmix = GMM(n_components=NM)
                gmix.fit(ims.reshape((ims.shape[0], -1)))

                visparts[i] = gmix.means_[gmix.weights_.argmax()].reshape(
                    ims.shape[1:])

        # Unspread parts
        unspread_parts_all = mixture.remix(raw_unspread_patches)
        unspread_parts_padded_all = mixture.remix(raw_unspread_patches_padded)

        # The parts to keep
        ok = (scores > 1) & (counts >= 10)

        #if 'originals' in self.extra:
        #self.extra['originals'] = list(itr.compress(self.extra['originals'], ok))

        scores = scores[ok]
        counts = counts[ok]

        self.parts = mixture.templates[ok]
        self.unspread_parts = unspread_parts_all[ok]
        self.unspread_parts_padded = unspread_parts_padded_all[ok]
        self.visparts = visparts[ok]
        self.num_parts = self.parts.shape[0]

        # Update num_parts

        # Store the stuff in the instance
        #self.parts = mixture.templates
        #self.visparts = mixture.remix(raw_originals)

        # Sort the parts according to orientation, for better diagnostics
        if 1:
            E = self.parts.shape[-1]
            ang = np.array([[0, -1], [1, -1], [1, 0], [1, 1], [0, 1], [-1, 1],
                            [-1, 0], [-1, 1]])
            nang = ang / np.expand_dims(np.sqrt(ang[:, 0]**2 + ang[:, 1]**2),
                                        1)
            orrs = np.apply_over_axes(np.mean, self.parts, [1, 2]).reshape(
                (self.num_parts, -1))
            if E == 8:
                orrs = orrs[..., :4] + orrs[..., 4:]
            nang = nang[:4]
            norrs = orrs / np.expand_dims(orrs.sum(axis=1), 1)
            dirs = (np.expand_dims(norrs, -1) * nang).sum(axis=1)
            self.orientations = np.asarray(
                [math.atan2(x[1], x[0]) for x in dirs])
            II = np.argsort(self.orientations)

        II = np.argsort(scores)

        scores = scores[II]
        counts = counts[II]

        self.extra['scores'] = scores
        self.extra['counts'] = counts
        #self.extra['originals'] = [self.extra['originals'][ii] for ii in II]

        # Now resort the parts according to this sorting
        self.orientations = self.orientations[II]
        self.parts = self.parts[II]
        self.unspread_parts = self.unspread_parts[II]
        self.unspread_parts_padded = self.unspread_parts_padded[II]
        self.visparts = self.visparts[II]

        self._preprocess_logs()
Example #57
0
                lamb[i] = l
                break
            v_old = v_new

    return (V_est, lamb)


cov1 = 0.25 * np.identity(5)
cov2 = np.identity(5)
Kd = [[-0.5, 1.1, 0.2, -0.9, 0.2], [0.2, -0.1, 0.5, -0.8, 1.0],
      [-0.3, 0.2, 0.9, 0.7, 1.0], [0.2, 0.9, 0.1, -0.4, 0.5]]
dataset1 = datacreation(Kd, cov1)
dataset2 = datacreation(Kd, cov2)
components = 4
#fitting gaussian mixtures using EM algorithm
gmm1 = GMM(n_components=4)
gmm1.fit(dataset1)
gmm2 = GMM(n_components=4)
gmm2.fit(dataset2)

print(' Predicted means and covariance of 1st mixture = \n', gmm1.means_)
print('\n')
print(gmm1.covars_)
print('Predicted means and covariances of 2nd mixture = \n', gmm2.means_)
print('\n')
print(gmm2.covars_)

#tensor method
X = datacreation(Kd, cov1)
mu = calculate_first_moment(X)
Sigma = calculate_second_moment(X)
Example #58
0
X_2D = model.transform(X_iris)

iris['PCA1'] = X_2D[:, 0]
iris['PCA2'] = X_2D[:, 1]
sns.lmplot("PCA1", "PCA2", hue='species', data=iris, fit_reg=False)

print("#---------------------------------------#")
print("               Clustering                ")
print("#---------------------------------------#")
print("\n")

# Gaussian mixture model (GMM)
# covariance_type='full': each component gets its own unconstrained covariance matrix

from sklearn.mixture import GMM
model = GMM(n_components=3, covariance_type='full')
model.fit(X_iris)
y_gmm = model.predict(X_iris)

iris['cluster'] = y_gmm
sns.lmplot("PCA1",
           "PCA2",
           data=iris,
           hue='species',
           col='cluster',
           fit_reg=False)
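
# Hedged follow-up sketch (an addition, not part of the original example):
# quantify how well the GMM clusters recover the species labels; the adjusted
# Rand index ignores the arbitrary numbering of the clusters.
from sklearn.metrics import adjusted_rand_score
print("ARI(species, GMM clusters):", adjusted_rand_score(iris['species'], y_gmm))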

print("#---------------------------------------#")
print("           Hand-written digits           ")
print("#---------------------------------------#")
print("\n")
Example #59
0
import cv2
from matplotlib import pyplot as plt

#Use plant cells to demo the GMM on 2 components
#Use BSE_Image to demo it on 4 components
#USe alloy.jpg to demonstrate bic and how 2 is optimal for alloy

img = cv2.imread("images/BSE.tif")
plt.imshow(img)
# Convert MxNx3 image into Kx3 where K=MxN
img2 = img.reshape((-1, 3))  #-1 reshape means, in this case MxN

from sklearn.mixture import GaussianMixture as GMM

#covariance choices, full, tied, diag, spherical
gmm_model = GMM(n_components=4,
                covariance_type='tied').fit(img2)  #tied works better than full
gmm_labels = gmm_model.predict(img2)

#Put numbers back to original shape so we can reconstruct segmented image
original_shape = img.shape
segmented = gmm_labels.reshape(original_shape[0], original_shape[1])
plt.imshow(segmented)
#cv2.imwrite("images/segmented.jpg", segmented)
##############################################################
#How to know the best number of components?
#Using Bayesian information criterion (BIC) to find the best number of components
import numpy as np
import cv2

img = cv2.imread("images/BSE.tif")
img2 = img.reshape((-1, 3))
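
#Hedged sketch (the original example is truncated here): the usual loop over
#candidate component counts, keeping an eye on BIC (lower is better).  The
#range 1..9 and the 'tied' covariance choice mirror the segmentation above but
#are assumptions.
from sklearn.mixture import GaussianMixture as GMM
from matplotlib import pyplot as plt

n_components = np.arange(1, 10)
gmm_models = [GMM(n, covariance_type='tied').fit(img2) for n in n_components]
bic_values = [m.bic(img2) for m in gmm_models]

plt.plot(n_components, bic_values, label='BIC')
plt.xlabel('n_components')
plt.legend(loc='best')
plt.show()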
Example #60
0
    def __do_perform(self,
                     custom_out=None,
                     main_experiment=None
                     ):  # ./output/ICA/clustering//{}', ICAExperiment
        if custom_out is not None:
            # if not os.path.exists(custom_out):
            #     os.makedirs(custom_out)
            self._old_out = self._out  # './output/ICA/{}'
            self._out = custom_out  # ./output/ICA/clustering//{}'
        elif self._old_out is not None:
            self._out = self._old_out

        if main_experiment is not None:
            self.log("Performing {} as part of {}".format(
                self.experiment_name(),
                main_experiment.experiment_name()))  # 'clustering', 'ICA'
        else:
            self.log("Performing {}".format(self.experiment_name()))

        # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
        # %% Data for 1-3
        sse = defaultdict(list)
        ll = defaultdict(list)
        bic = defaultdict(list)
        sil = defaultdict(lambda: defaultdict(list))
        sil_s = np.empty(shape=(2 * len(self._clusters) *
                                self._details.ds.training_x.shape[0], 4),
                         dtype='<U21')
        acc = defaultdict(lambda: defaultdict(float))
        adj_mi = defaultdict(lambda: defaultdict(float))
        km = kmeans(random_state=self._details.seed)
        gmm = GMM(random_state=self._details.seed)

        st = clock()
        j = 0
        for k in self._clusters:
            km.set_params(n_clusters=k)
            gmm.set_params(n_components=k)
            km.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using kMeans with varying K
            gmm.fit(
                self._details.ds.training_x
            )  # cluster the ICA-transformed input features using GMM with varying k

            km_labels = km.predict(
                self._details.ds.training_x
            )  # give each ICA-transformed input feature a label
            gmm_labels = gmm.predict(self._details.ds.training_x)

            sil[k]['Kmeans'] = sil_score(
                self._details.ds.training_x, km_labels
            )  # compute mean silhouette score for all ICA-transformed input features
            sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

            km_sil_samples = sil_samples(
                self._details.ds.training_x, km_labels
            )  # compute silhouette score for each ICA-transformed input feature
            gmm_sil_samples = sil_samples(self._details.ds.training_x,
                                          gmm_labels)
            # There has got to be a better way to do this, but I can't brain right now
            for i, x in enumerate(km_sil_samples):
                sil_s[j] = [
                    k, 'Kmeans', round(x, 6), km_labels[i]
                ]  # record the silhouette score x for each instance i given its label kn_labels[i] by kMeans with value k
                j += 1
            for i, x in enumerate(gmm_sil_samples):
                sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
                j += 1

            sse[k] = [
                km.score(self._details.ds.training_x)
            ]  # km.score is the negative of the k-means objective (sum of squared distances to the nearest centroid)
            ll[k] = [gmm.score(self._details.ds.training_x)
                     ]  # per-sample average log-likelihood
            bic[k] = [
                gmm.bic(self._details.ds.training_x)
            ]  # Bayesian information criterion on the input X (lower is better)

            acc[k]['Kmeans'] = cluster_acc(
                self._details.ds.training_y, km_labels
            )  # compute the accuracy of the clustering algorithm on the ICA-transformed data (against the original y-label) if it predicted the majority y-label for each cluster
            acc[k]['GMM'] = cluster_acc(self._details.ds.training_y,
                                        gmm_labels)

            adj_mi[k]['Kmeans'] = ami(
                self._details.ds.training_y, km_labels
            )  # compute the adjusted mutual information between the true labels and the cluster predicted labels (how well does clustering match truth)
            adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

            self.log("Cluster: {}, time: {}".format(k, clock() - st))

        sse = (-pd.DataFrame(sse)).T
        sse.index.name = 'k'
        sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)
                       ]  # Bank sse (left)

        ll = pd.DataFrame(ll).T
        ll.index.name = 'k'
        ll.columns = [
            '{} log-likelihood'.format(self._details.ds_readable_name)
        ]  # Bank log-likelihood

        bic = pd.DataFrame(bic).T
        bic.index.name = 'k'
        bic.columns = ['{} BIC'.format(self._details.ds_readable_name)
                       ]  # Bank BIC

        sil = pd.DataFrame(sil).T
        sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score',
                                             'label']).set_index('k')  #.T
        # sil_s = sil_s.T
        acc = pd.DataFrame(acc).T
        adj_mi = pd.DataFrame(adj_mi).T

        sil.index.name = 'k'
        sil_s.index.name = 'k'
        acc.index.name = 'k'
        adj_mi.index.name = 'k'

        # write scores to files
        sse.to_csv(self._out.format('{}_sse.csv'.format(
            self._details.ds_name)))
        ll.to_csv(
            self._out.format('{}_logliklihood.csv'.format(
                self._details.ds_name)))
        bic.to_csv(self._out.format('{}_bic.csv'.format(
            self._details.ds_name)))
        sil.to_csv(
            self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
        sil_s.to_csv(
            self._out.format('{}_sil_samples.csv'.format(
                self._details.ds_name)))
        acc.to_csv(self._out.format('{}_acc.csv'.format(
            self._details.ds_name)))
        adj_mi.to_csv(
            self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

        # %% NN fit data (2,3)
        # train a NN on clustered data
        grid = {
            'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline(
            [('km', km), ('NN', mlp)], memory=experiments.pipeline_memory
        )  # inside a Pipeline, KMeans acts as a transformer, so the NN is trained on the distance-to-centroid features
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='kmeans')  # write the best NN to file
        self.log("KMmeans Grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_kmeans.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_kmeans.csv

        grid = {
            'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch
        }
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(
            pipe, grid, type='gmm')  # write the best NN to file
        self.log("GMM search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(
            self._out.format('{}_cluster_GMM.csv'.format(
                self._details.ds_name))
        )  # write grid search results --> bank_cluster_GMM.csv

        # %% For chart 4/5
        # project the training data to 2D with t-SNE for the cluster visualization charts
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(
            np.hstack((self._details.ds.training_x2D,
                       np.atleast_2d(self._details.ds.training_y).T)),
            columns=['x', 'y', 'target']
        )  # prepare NN-learnable data using TSNE D.R'd input features + label
        ds_2d.to_csv(
            self._out.format('{}_2D.csv'.format(
                self._details.ds_name)))  # --> bank_2D.csv
        self.log("Done")