Example #1
def kmeans(tsne_model, vz,data,cat):
    num_clusters = 2
    kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                             init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
    kmeans = kmeans_model.fit(vz)
    kmeans_clusters = kmeans.predict(vz)
    kmeans_distances = kmeans.transform(vz)
    sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(num_clusters):
        print("Cluster %d:" % i, end='')
        for j in sorted_centroids[i, :10]:
            print(' %s' % terms[j], end='')
        print()
    tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:10000])
   
    output_file(cat+".html", title="Euro 2016")
    plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="Euro 2016 (k-means)",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)
    
    plot_kmeans.scatter(x=tsne_kmeans[:,0], y=tsne_kmeans[:,1], 
                        color=colormap[kmeans_clusters][:10000], 
                        source=bp.ColumnDataSource({
                            "tweet": data['text'][:10000], 
                            "processed": data['processed'][:10000],
                            "cluster": kmeans_clusters[:10000]
                        }))
    hover = plot_kmeans.select(dict(type=HoverTool))
    hover.tooltips={"tweet": "@tweet (processed: \"@processed\" - cluster: @cluster)"}
    show(plot_kmeans)
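The function above relies on globals defined elsewhere in the original script (`vectorizer`, `vz`, `colormap`, and the Bokeh imports `bp`, `output_file`, `show`, `HoverTool`). A minimal setup sketch, assuming a TF-IDF pipeline over a tweet DataFrame with 'text' and 'processed' columns; names and file paths are placeholders:

import numpy as np
import pandas as pd
import bokeh.plotting as bp
from bokeh.plotting import output_file, show
from bokeh.models import HoverTool
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

data = pd.read_csv("tweets.csv")                  # assumed columns: 'text', 'processed'
vectorizer = TfidfVectorizer(max_features=10000)
vz = vectorizer.fit_transform(data['processed'])  # sparse document-term matrix
colormap = np.array(["#1f77b4", "#ff7f0e"])       # one color per cluster (num_clusters == 2)
tsne_model = TSNE(n_components=2, random_state=0)

kmeans(tsne_model, vz, data, "euro2016")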
def test_predict_minibatch_dense_sparse(init):
    # check that models trained on sparse input also work for dense input at
    # predict time
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init,
                                 n_init=10, random_state=0).fit(X_csr)

    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
Example #3
def mbkm_wrapper(full_dissimilarity_matrix, n_clusters, streamlines_ids):
    """Wrapper of MBKM with API compatible to the Manipulator.

    streamlines_ids can be set or list.
    """
    sids = np.array(list(streamlines_ids))
    dissimilarity_matrix = full_dissimilarity_matrix[sids]

    print "MBKM clustering time:",
    init = 'random'
    mbkm = MiniBatchKMeans(init=init, n_clusters=n_clusters, batch_size=1000,
                          n_init=10, max_no_improvement=5, verbose=0)
    t0 = time.time()
    mbkm.fit(dissimilarity_matrix)
    t_mini_batch = time.time() - t0
    print(t_mini_batch)

    print "exhaustive smarter search of the medoids:",
    medoids_exhs = np.zeros(n_clusters, dtype=np.int)
    t0 = time.time()
    idxs = []
    for i, centroid in enumerate(mbkm.cluster_centers_):
        idx_i = np.where(mbkm.labels_==i)[0]
        if idx_i.size == 0: idx_i = [0]
        tmp = full_dissimilarity_matrix[idx_i] - centroid
        medoids_exhs[i] = sids[idx_i[(tmp * tmp).sum(1).argmin()]]
        idxs.append(set(sids[idx_i].tolist()))
        
    t_exhs_query = time.time() - t0
    print(t_exhs_query, "sec")
    clusters = dict(zip(medoids_exhs, idxs))
    return clusters
Example #4
File: odr.py  Project: caoym/odr
class DocDescriptor(object):

    def __init__(self, word_descriptor, n_clusters = 1000):
        self._n_clusters = n_clusters
        self._cluster = MiniBatchKMeans(n_clusters=n_clusters,verbose=1,max_no_improvement=None,reassignment_ratio=1.0)
        self._word_descriptor = word_descriptor

    def get_word_descriptor(self, img):
        X = get_features_from_image(img)
        words = []
        for i in X:
            words.append(self._word_descriptor.transform(i))
        return words

    def partial_fit(self, img):
        X = self.get_word_descriptor(img)
        self._cluster.partial_fit(X)

    def transform(self, img):
        X = self.get_word_descriptor(img)
        Y = self._cluster.predict(X)
        desc = [0]*self._n_clusters
        unit = 1.0/self._n_clusters
        for i in range(0, len(Y)):
            desc[Y[i]] += unit
        return desc
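The class above depends on `get_features_from_image` and a `word_descriptor` object from the surrounding odr project. A rough usage sketch, assuming the word descriptor exposes a `transform` method for a single local feature and that `training_images` / `some_image` are already-loaded images (all placeholders):

doc_descriptor = DocDescriptor(word_descriptor, n_clusters=1000)

# Online training: the visual-word codebook is refined one image at a time.
for img in training_images:
    doc_descriptor.partial_fit(img)

# A document is then a normalized histogram over the n_clusters visual words;
# the entries sum to (roughly) one.
desc = doc_descriptor.transform(some_image)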
Example #5
def _run_cluster(origin_list, cluster_num = 8, batch_size=100,resize=(64,64)):
    clf = MiniBatchKMeans(n_clusters=cluster_num,batch_size=batch_size)
    def next_batch(allfiles,batch_size):
        imgs = []
        inds = []
        for ind,(path,label) in enumerate(allfiles):
            img = Image.open(path).convert("L")
            img = img.resize(resize, Image.ANTIALIAS)
            img = np.reshape(np.array(img),(1,-1)).astype(np.float32) / 255.0
            imgs.append(img)
            inds.append(ind)
            if len(imgs) >= batch_size:
                yield  np.vstack(imgs), inds
                imgs = []
                inds = []
        if len(inds) > 0:
            yield np.vstack(imgs), inds
    for batch, _ in next_batch(origin_list, batch_size):
        clf.partial_fit(batch)

    cluster_dict = defaultdict(list)
    for batch, inds in next_batch(origin_list, batch_size):
        Ys = clf.predict(batch)
        for y, ind in zip(Ys, inds):
            path,label = origin_list[ind]
            cluster_dict.setdefault(y,[]).append((path,label))
    return cluster_dict
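A hypothetical driver for `_run_cluster` above; it only needs a list of `(image_path, label)` pairs and assumes the snippet's own imports (`PIL.Image`, `numpy`, `MiniBatchKMeans`, `defaultdict`) are present at module level:

import os

origin_list = [(os.path.join("images", name), 0) for name in os.listdir("images")]
cluster_dict = _run_cluster(origin_list, cluster_num=8, batch_size=100)
for cluster_id, members in cluster_dict.items():
    print(cluster_id, len(members))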
Example #6
def correct_y(X,Y):
	# Correct wrongly assigned ZIP codes
	print "Correcting wrong ZIP codes..."
	[N, Nfeats]=X.shape
	NZIP=857
	# use K-means clustering to make it faster
	cluster=MiniBatchKMeans(NZIP,init_size=2000,max_iter=500)
	cluster_distance = cluster.fit_transform(X)
	cluster_values = cluster.predict(X)
	clstr=np.zeros((N,2))
	min_dist=1000*np.ones(NZIP)
	Y_min=np.zeros(NZIP)
	# clstr contains for each line cluster and cluster distance to center
	for i in range(N):
		idx = int(cluster_values[i])	
		clstr[i][0]=idx
		clstr[i][1]=cluster_distance[i][idx]
		if (clstr[i][1]<min_dist[idx]) :
			min_dist[idx]=clstr[i][1]
			Y_min[idx]=Y[i]
	counter=0
	for i in range(N):
		idx = int(clstr[i][0])
		if ((clstr[i][1]<1.5) & (int(Y[i]/1000)==int(Y_min[idx]/1000))) :	
			Y[i]= Y_min[idx]
			counter+=1
	print "%s ZIP codes corrected.", counter
	return(Y)
Example #7
class ClusteringEnsemble(BaseEstimator):

    def __init__(self, estimator_const=LinearRegression, n_clusters=2):
        self.estimator_const_ = estimator_const
        self.n_clusters_ = n_clusters
        self.clustering = MiniBatchKMeans(n_clusters=self.n_clusters_)

    def get_params(self, deep=True):
        return { "n_clusters": self.n_clusters_}

    def fit(self, X, y):
        print("Training KMeans")
        colors = self.clustering.fit_predict(X).reshape(X.shape[0])

        print("Training Estimators")
        # each estimator is assigned to one cluster
        self.estimators = [self.estimator_const_() for i in range(self.n_clusters_)]
        for i in range(self.n_clusters_):
            rows = colors == i
            self.estimators[i].fit(X[rows], y[rows])

    def predict(self, X):
        y = np.zeros(X.shape[0])
        print("Predicting clusters")
        colors = self.clustering.predict(X)

        print("Estimating results")
        for i in range(self.n_clusters_):
            rows = colors == i
            y[rows] = self.estimators[i].predict(X[rows])

        return y
def clusterSurfFeatures(surf_all_hist, n_clusters):
	#
	all_hists = []
	for imagename in surf_all_hist:		
		all_hists.append(surf_all_hist[imagename])
	#
	X_train_surf_features = np.concatenate(all_hists)
	#		
	print('Clustering', len(X_train_surf_features), 'features (k=' + str(n_clusters) + ')')
	estimator = MiniBatchKMeans(n_clusters=n_clusters)
	estimator.fit_transform(X_train_surf_features)
	#	
	final_features = {}
	for imagename in surf_all_hist:
		instance = surf_all_hist[imagename]
		#
		clusters = estimator.predict(instance)
		features = np.bincount(clusters)
		#
		if len(features) < n_clusters:
			features = np.append(features, np.zeros((1, n_clusters-len(features))))
		#print features
		#		
		final_features[imagename] = features		
	return final_features
Example #9
def extract_spatial_pyramid(images, dataset, vq=None, n_words=1000):
    descriptors, locations = sift_descriptors(images, dataset)
    if vq is None:
        vq = MiniBatchKMeans(n_clusters=n_words, verbose=1, init='random',
                             batch_size=2 * n_words, compute_labels=False,
                             reassignment_ratio=0.0, random_state=1, n_init=3)
        #vq = KMeans(n_clusters=n_words, verbose=10, init='random')
        vq.fit(shuffle(np.vstack(descriptors)))
    else:
        n_words = vq.n_clusters

    pyramids = []
    for descr, locs in zip(descriptors, locations):
        words = vq.predict(descr)
        global_ = np.bincount(words, minlength=n_words).astype(float)
        global_ /= max(global_.sum(), 1)
        third_of_image = locs[1].max() // 3 + 1
        stripe_indicator = locs[1] // third_of_image
        inds = np.vstack([stripe_indicator, words])
        stripe_hists = sparse.coo_matrix((np.ones(len(words)), inds),
                                         shape=(3, n_words)).toarray()

        stripe_hists = [x / max(x.sum(), 1) for x in stripe_hists]
        pyramids.append(np.hstack([np.hstack(stripe_hists), global_]))

    return vq, np.vstack(pyramids)
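The stripe histograms above pack `(stripe, word)` pairs into a sparse 3 x n_words count matrix; a small self-contained illustration of the same construction on toy data (not part of the original code):

import numpy as np
from scipy import sparse

n_words = 5
words = np.array([0, 2, 2, 4, 1])             # visual word assigned to each descriptor
stripe_indicator = np.array([0, 0, 1, 2, 2])  # horizontal third each descriptor falls in

inds = np.vstack([stripe_indicator, words])
stripe_hists = sparse.coo_matrix((np.ones(len(words)), inds),
                                 shape=(3, n_words)).toarray()
# stripe_hists[s, w] counts how often word w occurs in stripe s:
# [[1. 0. 1. 0. 0.]
#  [0. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 1.]]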
Example #10
    def generateCodebook(self, features):
        """ Generate codebook using extracted features """
    
        
        codebook = None
        
        if self._codebookGenerateMethod == 'k-means':
#             # Codebook generation using scipy k-means
#             while run:
#                 try:
#                     # Set missing = 'raise' to raise exception 
#                     # when one of the clusters is empty
#                     whitenedFeatures = whiten(features)
#                     codebook, _ = kmeans2(whitenedFeatures, 
#                                           self._codebookSize, 
#                                           missing = 'raise')
#                     
#                     # No empty clusters
#                     run = False
#                 except ClusterError:
#                     # If one of the clusters is empty, re-run k-means
#                     run = True
            
            # Codebook generation using sklearn k-means
            whitenedFeatures = whiten(features)
            kmeans = MiniBatchKMeans(n_clusters = config.codebookSize)
            kmeans.fit(whitenedFeatures)
            codebook = kmeans.cluster_centers_
        else:
            pass
        
        self._codebook = codebook
Example #11
    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto `ndim` dimensions

        Parameters
        ----------
        ndim : int
            The number of dimensions we want to project the data on.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data

        Example
        -------
        >>> tri = KMeansTri(data)
        >>> datatri = tri.project(5)
        """
        import scipy.spatial.distance as scidist
        from sklearn.cluster import MiniBatchKMeans
        from htmd.metricdata import MetricData

        datconcat = np.concatenate(self.data.dat)
        mb = MiniBatchKMeans(n_clusters=ndim)
        mb.fit(datconcat)

        # TODO: Could make it into a loop to waste less memory
        dist = scidist.cdist(datconcat, mb.cluster_centers_)
        dist = np.mean(dist, axis=1)[:, np.newaxis] - dist
        dist[dist < 0] = 0

        return MetricData(dat=self.data.deconcatenate(dist), ref=self.data.ref, simlist=self.data.simlist,
                          fstep=self.data.fstep, parent=self.data)
def cluster_function(user_np):
    ##############################################################################
    # Compute clustering with Means
    if len(user_np) < 10 :
        n_cl = 2
    elif len(user_np) <= 100 :
        n_cl = 10
    elif len(user_np) <= 500 :
        n_cl = 15
    elif len(user_np) <= 1000 :
        n_cl = 20
    else :
        n_cl = 30

    k_means = MiniBatchKMeans(n_clusters=n_cl, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, 
                              random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)

    t0 = time.time()
    k_means.fit(user_np)
    
    t_batch = time.time() - t0
    print "Batch running time : ", t_batch
    
    k_means_labels = k_means.labels_
    
    #prediction = k_means.predict(user_np)
    return k_means_labels
Example #13
def make_cluster(datasets):
    num_clusters = 5
    lsa_dim = 500
    max_df = 0.8
    max_features = 10000
    minibatch = True
    print("datasets are %(datasets)s" % locals())

    km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                         batch_size=1000, n_init=10, max_no_improvement=10, verbose=True)
    km.fit(datasets)
    labels = km.labels_

    transformed = km.transform(datasets)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    clusters = []
    for i in range(num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(datasets[int(j)])
        clusters.append(cluster)

    return clusters
Example #14
def getCluster(X,k,M,opts):
    # M: knnNum
#    t0 = time()
#    print("knn graph")
    knn_graph = None
    #    knn_graph = kneighbors_graph(X, M)
#    print("knn graph done in %0.3fs" % (time() - t0))
#    outfile.write("knn graph done in %0.3fs\n" % (time() - t0))
#    aggl = AgglomerativeClustering(linkage='ward', connectivity=knn_graph, n_clusters=k)
    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=50,
                             init_size=1000, batch_size=1000, verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=50,
                    verbose=opts.verbose)
    #aggl = AgglomerativeClustering(linkage='ward', n_clusters=k)
    print("Clustering sparse data with %s" % km)
#    outfile.write("Clustering sparse data with %s\n" % aggl)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
#    outfile.write("clustering done in %0.3fs\n" % (time() - t0))
    print()
    
    labels = km.labels_
    clus2doc = {}
    for i in range(len(labels)):
        clus2doc[labels[i]] = clus2doc.get(labels[i],set())
        clus2doc[labels[i]].add(i)    
    return (km,clus2doc,knn_graph)
Example #15
def initializeWeight(D, type, N_OUT):
	# Here we first whiten the data (PCA or ZCA) and then optionally run k-means
	# on this whitened data.
	import numpy as np
	if D.shape[0] < N_OUT:
		print( "  Not enough data for '%s' estimation, using elwise"%type )
		return np.random.normal(0, 1, (N_OUT,D.shape[1]))
	D = D - np.mean(D, axis=0, keepdims=True)
	# PCA, ZCA, K-Means
	assert type in ['pca', 'zca', 'kmeans', 'rand'], "Unknown initialization type '%s'"%type
	C = D.T.dot(D)
	s, V = np.linalg.eigh(C)
	# order the eigenvalues
	ids = np.argsort(s)[-N_OUT:]
	s = s[ids]
	V = V[:,ids]
	s[s<1e-6] = 0
	s[s>=1e-6] = 1. / np.sqrt(s[s>=1e-6]+1e-3)
	S = np.diag(s)
	if type == 'pca':
		return S.dot(V.T)
	elif type == 'zca':
		return V.dot(S.dot(V.T))
	# Whiten the data
	wD = D.dot(V.dot(S))
	wD /= np.linalg.norm(wD, axis=1)[:,None]
	if type == 'kmeans':
		# Run k-means
		from sklearn.cluster import MiniBatchKMeans
		km = MiniBatchKMeans(n_clusters = wD.shape[1], batch_size=10*wD.shape[1]).fit(wD).cluster_centers_
	elif type == 'rand':
		km = wD[np.random.choice(wD.shape[0], wD.shape[1], False)]
	C = km.dot(S.dot(V.T))
	C /= np.std(D.dot(C.T), axis=0, keepdims=True).T
	return C
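A rough usage sketch for `initializeWeight`; the activation matrix `D` and the output size are placeholders, and the random data is only there to make the call runnable:

import numpy as np

D = np.random.randn(5000, 64).astype(np.float32)  # (n_samples, n_features) activations
W = initializeWeight(D, 'kmeans', N_OUT=32)       # whitened k-means initialization
print(W.shape)                                    # (32, 64)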
def init_all(K,X,DT):
    km = MiniBatchKMeans(n_clusters=K, init='k-means++', n_init=10,init_size=1000,
            batch_size=1000,verbose=True)

#    km = KMeans(n_clusters=K, init='k-means++', max_iter=100, n_init=50)
    km.fit(X)
    labels = km.labels_
    centers = km.cluster_centers_
    # print number of doc in each cluster
    clus2doc = {}
    for i in range(len(labels)):
        clus2doc[labels[i]] = clus2doc.get(labels[i],set())
        clus2doc[labels[i]].add(i)    
    if len(clus2doc) < K:
        K_ = len(clus2doc)
        print (str(K_)+" clusters")
        print("kmeans reduce K to "+str(K_))
        return init_all(K_,X,DT)
        #print("kmeans reduce K to "+str(K-1))
        #return init_all(K-1,X,DT)
    for i in clus2doc:
        print (str(i+1)+"\t"+str(len(clus2doc[i])))        
    # init                                                                                
    nDocs,nWords = X.shape
    Pz_d_km = np.zeros((K,nDocs))
    for i in range(nDocs):
        Pz_d_km[labels[i],i] = 1
    Pz_d_km = Pz_d_km +0.01;
    Pz_d_km = Pz_d_km / np.tile(sum(Pz_d_km),(K,1))
    C = centers.T+1/nWords/nWords
    Pw_z_km = C/np.tile(sum(C),(nWords,1))
    mu_km, sigma_km= inittime(DT,K,labels)
    return (K,[Pz_d_km,Pw_z_km,mu_km, sigma_km])
Example #17
    def train(self, featurefiles, k=100, subsampling=10):
        nbr_images = len(featurefiles)
        descr = []
        descr.append(sift.read_features_from_file(featurefiles[0])[1])
        descriptors = descr[0]
        print "begin loading image feature files..."
        for i in np.arange(1, nbr_images):
            descr.append(sift.read_features_from_file(featurefiles[i])[1])
#                descriptors = np.vstack((descriptors, descr[i]))
            descriptors = np.vstack((descriptors, descr[i][::subsampling,:]))
            if i%100 == 0:
                print i, "images have been loaded..."
        print "finish loading image feature files!"

#        self.voc, distortion = cluster.kmeans(descriptors[::subsampling,:], k, 1)
        print "begin MiniBatchKMeans cluster....patient"
        mbk = MiniBatchKMeans(k, init="k-means++", compute_labels=False, n_init=3, init_size=3*k)
#        mbk.fit(descriptors[::subsampling,:])
        mbk.fit(descriptors)
        self.voc = mbk.cluster_centers_
        print "cluster finish!"
        self.nbr_word = self.voc.shape[0]
        imwords = np.zeros((nbr_images, self.nbr_word))
        for i in range(nbr_images):
            imwords[i] = self.project(descr[i])

        nbr_occurences = np.sum((imwords > 0)*1, axis=0)
        self.idf = np.log( (1.0*nbr_images) / (1.0*nbr_occurences+1) )
        self.traindata = featurefiles
Example #18
def main():

    with open("aas/corpus.json") as f:
        corpus = json.loads(f.read())

    corpus = [(k,v) for k,v in corpus.items() if v > 5]
    corpus = sorted(corpus, key=lambda x: x[1])
    corpus = corpus[:-6]
    Ncorpus = len(corpus)

    with open("aas/abstracts.json") as f:
        abstracts = json.loads(f.read())

    X = np.zeros((len(abstracts),Ncorpus))
    for jj,abstract in enumerate(abstracts):
        for ii in range(Ncorpus):
            try:
                X[jj,ii] = abstract['counts'][corpus[ii][0]]
            except KeyError:
                continue
    X = bsr_matrix(X)
    
    print("Initializing k-means")
    km = MiniBatchKMeans(n_clusters=50, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=True)
    print("fitting")
    t0 = time.time()
    km.fit(X) # X is nsamples, nfeatures
    print("Took {} seconds".format(time.time()-t0))

    return km
Example #19
   def aggregate(self, track_dataset):
       """
       An example implementation of the k-means algorithm implemented in 
       DSI Studio.  This function is automatically applied to all 
       TrackDatasets returned from a query.
 
       Parameters:
       -----------
       track_dataset:dsi2.streamlines.track_dataset.TrackDataset
       """
       # extract the streamline data
       tracks = track_dataset.tracks
       
       # Make a matrix of downsampled streamlines
       points = np.array([ downsample(trk, 3).flatten() \
                                   for trk in tracks])
 
       # Calculate the length of each streamline
       lengths = np.array([len(trk) for trk in tracks]).reshape(-1,1)
       
       # Concatenate the points and the track lengths
       features = np.hstack((points, lengths))
       
       # Initialize the k-means algorithm
       kmeans = MiniBatchKMeans(n_clusters=self.k, compute_labels=True)
       kmeans.fit(features)
 
       # Return the labels
       return kmeans.labels_      
Example #20
def cluster_tfidf(tfidf):
    kmeans = MiniBatchKMeans(n_clusters=10, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000)

    kmeans.fit(tfidf)

    return kmeans.cluster_centers_
Example #21
   def obtainCodebook(self, sampled_x, x):

      print('Obtaining codebook using online k-means...')
      
      sampled_x = np.array(sampled_x)
      sampled_x = sampled_x.astype(float)
      x = np.array(x)
      x = x.astype(float)

      #normalize
      scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
      scaled_x = StandardScaler().fit_transform(x)
      
      des_vector_suffled = scaled_x_sampled
       
      #shuffle list of descriptors
      np.random.shuffle(des_vector_suffled)
      
      minibatch = MiniBatchKMeans(n_clusters=self.size, init='k-means++', batch_size=self.batch_size, n_init=10, max_no_improvement=10, verbose=0, random_state=0)
      
      codebook = minibatch.fit(des_vector_suffled, y=None)
      
      #for n in range(0,len(des_vector_suffled)/batchsize+1):
         #if n!=len(des_vector_suffled)/batchsize:  
            #data = des_vector_suffled[n*batchsize:n*batchsize+batchsize]
         #else:
            #data = des_vector_suffled[n*batchsize:]
         #kmeans.partial_fit(data)
          
      projections = minibatch.predict(scaled_x)
      
      print('Codebook obtained.')
      
      return codebook.cluster_centers_, projections
Example #22
class MiniCluster:
    def __init__(self, nclusters=1000, psize=16):
        self.psize = psize
        self.patch_size = (self.psize, self.psize)
        self.nclusters = nclusters
        self.rng = np.random.RandomState(0)
        self.kmeans = MiniBatchKMeans(n_clusters=nclusters, random_state=self.rng, verbose=True)
        
    def fit(self, images):
        buffer = []
        index = 1
        t0 = time.time()

        # The online learning part: cycle over the whole dataset several times
        index = 0
        passes = 10
        for _ in range(passes):
            for img in images:
                data = extract_patches_2d(img, self.patch_size, max_patches=15,
                                          random_state=self.rng)
                data = np.reshape(data, (len(data), -1))
                #This casting is only needed for RGB data
                #buffer.append(data.astype(float))
                buffer.append(data)
                index += 1
                #if index % 1000 == 0:
                if index % (self.nclusters * 2) == 0:
                    data = np.concatenate(buffer, axis=0)
                    data = gcn(data)
                    data = whiten(data)
                    self.kmeans.partial_fit(data)
                    buffer = []
                          
        dt = time.time() - t0
        print('done in %.2fs.' % dt)
Example #23
    def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
        '''
        Cluster the data with k-means.
        '''
        
        if MINIBATCH:
            km = MiniBatchKMeans(n_clusters = NUM_CLUSTERS,
                                 init='k-means++', batch_size=1000,
                                 n_init=10, max_no_improvement=10)
        else:
            km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
        
        km.fit(X)
        transformed = km.transform(X) # distance from each item to each cluster center
        labels = km.labels_
        
        dists = []
        for i in range(len(labels)):
            dists.append(transformed[i, labels[i]]) # distance from each item to the center of its assigned cluster

        labels = DataFrame(labels)
        dists = DataFrame(dists)
        labels.columns = ['label']
        dists.columns = ['dists']
        self.data = pd.concat([labels, dists, self.data], axis=1) # append the labels and distances to the original data
        
        return km
Example #24
    def clusterize(self):
        X = np.ndarray((len(self.real_power_data), 2))
        X[:, 0] = self.real_power_data
        X[:, 1] = self.reac_power_data

        clustering = MiniBatchKMeans(self.spinbox_cluster.value())
        clustering.fit(X)

        # Identify the cluster centroids
        centroids = clustering.cluster_centers_.tolist()

        # Count how many elements each cluster contains
        predictions = clustering.predict(X)
        occurrences = Counter(predictions)

        # Identify clusters that contain only one element
        # (they will be treated as transition clusters)
        transition_clusters = [k for k, v in occurrences.items()
                                if v < 2]

        # Remove the centroids of transition clusters
        centroids = [e for i, e in enumerate(centroids)
                        if i not in transition_clusters]

        predictions = [-1 if v in transition_clusters else v
                        for v in predictions]
        
        self.prototypes = centroids
        self.plot_clusterized(predictions)
Example #25
def main():
    if len(sys.argv) != 4:
        print(__doc__)
        return 1

    infiles = glob(sys.argv[1])
    outfile = sys.argv[2]
    K = int(sys.argv[3])

    print("Reading in", len(infiles), "files")
    fullarr = np.loadtxt(fileinput.input(infiles), delimiter = '\t')[:,:-7]

    summary_stats = None
    stats_file = '/n/fs/gcf/dchouren-repo/COS513-Finance/summary_stats/stats2'
    with open(stats_file, 'rb') as inf:
        summary_stats = np.loadtxt(inf)
    stds = summary_stats[:len(summary_stats) // 2]
    means = summary_stats[len(summary_stats) // 2:]

    fullarr = (fullarr - means) / stds

    print("Learning MiniBatchKMeans with K =", K)

    km = MiniBatchKMeans(n_clusters = K, verbose = True) # TODO max_iter
    km.fit(fullarr)

    print("KMeans trained, saving")

    with open(outfile, 'wb') as out_model:
        pickle.dump(km, out_model)

    print("Score:", km.score(fullarr))
    
    return 0
Example #26
def color_quantization_sk(image, clusters):
    # load the image and grab its width and height
    (h, w) = image.shape[:2]
     
    # convert the image from the BGR color space to the L*a*b*
    # color space -- since we will be clustering using k-means
    # which is based on the euclidean distance, we'll use the
    # L*a*b* color space where the euclidean distance implies
    # perceptual meaning
    image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
     
    # reshape the image into a feature vector so that k-means
    # can be applied
    image = image.reshape((image.shape[0] * image.shape[1], 3))
     
    # apply k-means using the specified number of clusters and
    # then create the quantized image based on the predictions
    clt = MiniBatchKMeans(n_clusters = clusters)
    labels = clt.fit_predict(image)
    quant = clt.cluster_centers_.astype("uint8")[labels]
     
    # reshape the feature vectors to images
    quant = quant.reshape((h, w, 3))
     
    # convert from L*a*b* to RGB
    quant = cv2.cvtColor(quant, cv2.COLOR_LAB2BGR)
    return quant
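A short hypothetical driver for the quantization helper above; the image path is a placeholder:

import cv2

image = cv2.imread("photo.jpg")                        # BGR image, as OpenCV loads it
quantized = color_quantization_sk(image, clusters=16)  # reduce to 16 colors
cv2.imwrite("photo_16colors.jpg", quantized)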
Example #27
def do_clustering():
    keys = request.get_json()

    points = []
    values = []
    types = set(CLUSTER_SENSOR_TYPES)
    for id, timestamp in keys:
        time = datetime.fromtimestamp(timestamp)
        point = MeasurePoint.query.get((id, time))
        value = {v.type_name: v.value for v in point.values.all()}
        if not set(value.keys()) >= types:
            continue
        # Normalization:
        value['OxidizingGas'] /= 1000
        value['ReducingGas'] /= 10000
        points.append(point)
        values.append(value)

    X = [[v[t] for t in CLUSTER_SENSOR_TYPES]
         for v in values]
    X = np.array(X)
    k = KMeans(n_clusters=2, init='k-means++')
    k.fit(X)
    score = silhouette_score(X, k.labels_)

    groups = [[_point_to_json_dict(p) for p, l in
              zip(points, k.labels_) if l == i] for i in
              range(max(k.labels_) + 1)]

    return jsonify(score=score, groups=groups)
Example #28
def define_clusters(projections):
    """
    Creates several different clusterings of the data in projections.

    :param projections: dict(string, (2 x Num_Samples) numpy.ndarray)
        dictionary mapping the projection type (e.g. "tSNE") to an array containing
        the two-dimensional coordinates for each sample in the projection.

    :return: dict of string (projection name) =>
        (dict of string (cluster technique) => np.ndarray of size N_Samples (cluster assignments))
    """

    pbar = ProgressBar(4 * len(projections));

    out_clusters = dict();

    for key in projections:

        proj_data = projections[key];
        proj_clusters = dict();

        # K-means for k = 2-5
        for k in range(2, 6):
            clust_name = "K-Means, k=" + str(k);
            kmeans = MiniBatchKMeans(n_clusters=k);
            clust_assignments = kmeans.fit_predict(proj_data.T);
            proj_clusters.update({clust_name: clust_assignments});
            pbar.update();

        out_clusters.update({key: proj_clusters});

    pbar.complete();

    return out_clusters;
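A quick way to exercise `define_clusters`; the projections here are random placeholders (2 x N arrays keyed by projection name), and `ProgressBar` / `MiniBatchKMeans` are assumed to be imported in the module:

import numpy as np

projections = {
    "tSNE": np.random.rand(2, 500),
    "PCA": np.random.rand(2, 500),
}
clusters = define_clusters(projections)
# clusters["tSNE"]["K-Means, k=3"] is an array of 500 cluster labels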
Example #29
def test_minibatch_reassign():
    # Give a perfect initialization, but a large reassignment_ratio,
    # as a result all the centers should be reassigned and the model
    # should no longer be good
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1,
                                     random_state=42)
        mb_k_means.fit(this_X)
        centers_before = mb_k_means.cluster_centers_.copy()
        try:
            old_stdout = sys.stdout
            sys.stdout = StringIO()
            # Turn on verbosity to smoke test the display code
            _mini_batch_step(this_X, (X ** 2).sum(axis=1),
                             mb_k_means.cluster_centers_,
                             mb_k_means.counts_,
                             np.zeros(X.shape[1], np.double),
                             False, random_reassign=True, random_state=42,
                             reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = old_stdout
        centers_after = mb_k_means.cluster_centers_.copy()
        # Check that all the centers have moved
        assert_greater(((centers_before - centers_after)**2).sum(axis=1).min(),
                       .2)
Example #30
def VideoFrameReaders(VideoDirectory):
    cap = cv2.VideoCapture(VideoDirectory)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    fgbg = cv2.createBackgroundSubtractorMOG2()
    timestamp = []
    count = 0
    try:
        while cap.isOpened():
            ret,frame = cap.read()
            time = cap.get(0) # current position in the video, in milliseconds
            timestamp.append(time)

            print(timestamp)

            if frame is None:
                break
           # frame = cv2.cvtColor(frame,cv2.COLOR_RGB2GRAY)
            image = frame.reshape((frame.shape[0]*frame.shape[1],3))
            K = 4
            clf = MiniBatchKMeans(K)

            #predict cluster labels and quanitize each color based on the labels

            cls_labels = clf.fit_predict(image)
            print(cls_labels)
            cls_quant = clf.cluster_centers_.astype("uint8")[cls_labels]


    except EOFError:
        pass
Example #31
        print(dirname)
        for (direcpath, direcnames, files) in os.walk(path + "\\" + dirname):
            for file in files:
                actual_path = path + "\\\\" + dirname + "\\\\" + file
                print(actual_path)
                des = func(actual_path)
                img_descs.append(des)
                y.append(label)
        label = label + 1

#finding indexes of test train and validate
y = np.array(y)
training_idxs, test_idxs, val_idxs = train_test_val_split_idxs(
    len(img_descs), 0.4, 0.0)

#creating histogram using kmeans minibatch cluster model
X, cluster_model = cluster_features(img_descs, training_idxs,
                                    MiniBatchKMeans(n_clusters=150))

#splitting data into test, train, validate using the indexes
X_train, X_test, X_val, y_train, y_test, y_val = perform_data_split(
    X, y, training_idxs, test_idxs, val_idxs)

#using classification methods
predict_knn(X_train, X_test, y_train, y_test)
#predict_mlp(X_train, X_test,y_train, y_test)
predict_svm(X_train, X_test, y_train, y_test)

predict_lr(X_train, X_test, y_train, y_test)
predict_nb(X_train, X_test, y_train, y_test)
Example #32
print('-----feature size-----')
print(TFIDFvectorizer.get_feature_names())
print(len(TFIDFvectorizer.get_feature_names()))

print('-----feature mapping-----')

print('-----start SVD-----')
svd = TruncatedSVD(n_components=20, n_iter = 30, random_state = 50)
normalizer = Normalizer(copy = False)
U = svd.fit_transform(x)  # fit: create V in SVD; transform: create U*S in SVD, which is what we need
U = normalizer.fit_transform(U)
print(svd.explained_variance_)


print('-----start kmeans-----')
minikmeans = MiniBatchKMeans(n_clusters = 22, init = 'k-means++', n_init = 1,init_size = 500)
minikmeans.fit(U)
#print minikmeans.cluster_centers_
print(minikmeans.inertia_)

label = minikmeans.labels_
'''
print '-----save labels-----'
fout = open('checklabel','w')
for i in label:
	fout.write(' '+str(i)+' \n')
fout.close()
'''
print('-----loading check_index.csv-----')
f = open(str(path)+'check_index.csv','r')
index = []
Example #33
    kmeans.fit(word_embeddings)

    y_kmeans = kmeans.predict(word_embeddings)

    #generateCSV(y_kmeans,etichette)

    pca(y_kmeans)

if (algo == 'GaussianMM'):
    gmm = GaussianMixture(n_components=n_cluster).fit(word_embeddings)
    labels = gmm.predict(word_embeddings)
    pca(labels)

if (algo == 'MiniBatch100'):
    kmeans = MiniBatchKMeans(
        n_clusters=n_cluster,
        batch_size=100,
    ).fit(word_embeddings)

    kmeans.fit(word_embeddings)

    y_kmeans = kmeans.predict(word_embeddings)
    print(y_kmeans)
    pca(y_kmeans)

if (algo == 'MiniBatch250'):
    kmeans = MiniBatchKMeans(
        n_clusters=n_cluster,
        batch_size=250,
    ).fit(word_embeddings)

    kmeans.fit(word_embeddings)
class RecommendMovie():
    remarks = None
    randomSeed = None
    kmeansClf = None
    numU = None  # number of users
    numM = None  # number of movies
    userGroup = None  # records the cluster assignment of every user
    userNeighSize = 100
    movieNeighSize = 10
    reductSize = 50
    zeroImputer = None
    movieTitle = None
    
    # initialize some shared parameters
    def __init__(self, sparseMatrix_file, movieTitle, randomSeed = None):
        self.remarks = sparse.load_npz(sparseMatrix_file)
        self.remarks = self.remarks.tocsr()
        self.randomSeed = randomSeed
        self.numU,self.numM = self.remarks.shape
        self.movieTitle = movieTitle    
        
    # update the collaborative-filtering parameters
    def tunningCollParams(self, userNeighSize, movieNeighSize, reductSize):
        self.userNeighSize = userNeighSize
        self.movieNeighSize = movieNeighSize
        self.reductSize = reductSize
        
    # initialize the k-means model
    def createKMeans(self, n_clusters,max_iter):
        self.kmeansClf = MiniBatchKMeans(n_clusters= n_clusters, max_iter = max_iter, random_state=self.randomSeed)
        
    # train the k-means model
    def fitKMeans(self, trainsetsize, modelSavePath=None):  
        # sample the subset used to train k-means
        np.random.seed(self.randomSeed)
        trainset_dense = self.remarks[np.random.choice(np.arange(self.numU), size = trainsetsize)].toarray()
        # impute missing values with the mean and standardize each user (row-wise)
        self.zeroImputer = Imputer(missing_values=0, strategy='mean', axis=1, copy = False)
        trainset_dense = self.zeroImputer.fit_transform(trainset_dense)
        trainset_dense = scale(X=trainset_dense, axis=1)
        self.kmeansClf.fit(trainset_dense)
        if modelSavePath!=None:
            joblib.dump(self.kmeansClf, filename=modelSavePath)
    
   # def updateKMeans(self, updatesetsize, modelSavePath=None)
   
    # given a sparse matrix, predict the cluster each row belongs to
    def __predictGroup(self, sparseArr):
        denseArr = self.zeroImputer.fit_transform(sparseArr.toarray())
        denseArr = scale(denseArr, axis=1)
        return self.kmeansClf.predict(denseArr)
    
    # predict the cluster of every user
    def findUsersGroup(self):
        userGroup = np.array([])
        for i in range(int(self.numU/10000)):
            temp = self.__predictGroup(self.remarks[(i*10000):((i+1)*10000)])
            userGroup = np.hstack([userGroup, temp])
        temp = self.__predictGroup(self.remarks[((i+1)*10000):])
        self.userGroup = np.hstack([userGroup,temp])
        print(self.userGroup, len(self.userGroup))
        print(pd.Series(self.userGroup).value_counts())
    
    # standardize a single user's ratings
    def __userRemarkScaler(self, aUser):
        tempaUser = self.zeroImputer.fit_transform(aUser.toarray())
        return scale(tempaUser,axis = 1).flatten()
    
    # find members of the same cluster and draw a subsample
    def __findNeigbor(self, aUser, neighborsize=1000):
        g = self.__predictGroup(aUser)[0]
        np.random.seed(self.randomSeed)
        gindex = np.random.choice(np.where(self.userGroup==g)[0],size=neighborsize)
        groupMember = self.remarks[gindex].toarray()
        groupMember = self.zeroImputer.fit_transform(groupMember)
        groupMember = scale(groupMember, axis=1)
        return groupMember,gindex
    
    # select user neighbors by Pearson correlation
    def __pearsonRNeigh(self, aUser):
        groupMember, gindex = self.__findNeigbor(aUser,neighborsize=self.userNeighSize*10)
        pearsonDis = np.zeros(shape=self.userNeighSize*10)
        tempaUser = self.__userRemarkScaler(aUser)
        for i in range(self.userNeighSize*10):
            pearsonDis[i] = stats.pearsonr(tempaUser.flatten(),groupMember[i])[0]
        cutpoint = np.percentile(pearsonDis,90)
        maxindex = np.where(pearsonDis>=cutpoint)
        return groupMember[maxindex], gindex[maxindex]

    # given the user-neighbor submatrix and a target movie ID, compute movie-to-movie distances
    def __getScoreAndDist(self, movieindex, subRemarkMat, ratedIndex):
        u,s,vt = slinalg.svds(subRemarkMat, k=self.reductSize, which='LM')
        movieScore = vt.transpose()[movieindex,:].reshape(1,-1)
        dist = np.array([distance.cosine(movieScore, vt.transpose()[i]) for i in list(ratedIndex)])
        return dist
    
    # given the movie-to-movie distances, predict the user's rating for a movie
    def __scorePredict(self, aUser, dist, ratedIndex):
        distRated = dist
        distSort = np.argsort(distRated)[1:min(self.movieNeighSize+1,len(distRated))]
        userSort = ratedIndex[distSort]
        similarity = 1-(dist[distSort])
        similarity[similarity<0]=0
        userRemarkP = (aUser.toarray()[0][userSort].dot(similarity.reshape(-1,1)))/(np.sum(similarity)+0.001)
        return userRemarkP

    # predict the score for a given user ID and movie ID
    def __user2Movie(self, userID, movieID,subRemarkMat, gindex, ratedIndex):
        #aUser = self.remarks[userID]
        #subRemarkMat,gindex = self.__pearsonRNeigh(aUser)
        dist = self.__getScoreAndDist(movieID, subRemarkMat, ratedIndex)
        #ratedIndex = np.where(aUser[0].toarray().flatten()!=0)[0]  # move this to the outer scope later
        score = self.__scorePredict(self.remarks[userID], dist, ratedIndex)
        return score
    
    # predict a given user's ratings for a list of candidate movies
    def recommend2User(self, userID, toPredict):
        #toPredict=a list of movieID
        aUser = self.remarks[userID]
        subRemarkMat,gindex = self.__pearsonRNeigh(aUser)
        ratedIndex=np.where(self.remarks[userID].toarray().flatten()!=0)[0]
        movieScoreP = []
        for movieID in toPredict:
            movieScoreP.append(self.__user2Movie(userID, movieID,subRemarkMat,gindex,ratedIndex))
        movieRecommend = pd.DataFrame(index=toPredict, 
                                   data={'Movie Title':self.movieTitle[toPredict],
                                         'Estimate':movieScoreP})
        movieRecommend.sort_values(by='Estimate', ascending=False, inplace=True)
        return movieRecommend
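A rough usage sketch for the recommender class above; file paths, cluster count, and movie IDs are placeholders, and `pd` (pandas) is assumed to be imported alongside the class:

movie_titles = pd.read_csv("movie_titles.csv", index_col=0)["title"]

rec = RecommendMovie("ratings_sparse.npz", movie_titles, randomSeed=42)
rec.createKMeans(n_clusters=20, max_iter=100)
rec.fitKMeans(trainsetsize=50000)   # fit k-means on a sampled, imputed, scaled subset
rec.findUsersGroup()                # assign every user to a cluster
print(rec.recommend2User(userID=123, toPredict=[10, 20, 30]))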
Example #36
def eval_batch(x_train, y_train, x_test, y_test, classifier, components,
               no_clusters, dimensionality):

    cluster_finder = cluster.KMeans(n_clusters=no_clusters)

    if classifier == 'mbk':
        cluster_finder = MiniBatchKMeans(init='k-means++',
                                         n_clusters=no_clusters,
                                         batch_size=16,
                                         n_init=10,
                                         max_no_improvement=10,
                                         verbose=0)
        cluster_finder.fit(x_train)
        cddd = str(cluster_finder.score)
        clll = str(cluster_finder)
        log = str(
            components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str(
                components) + dimensionality
        labels = cluster_finder.labels_

    else:
        cluster_finder = cluster.KMeans(n_clusters=no_clusters)
        cluster_finder.fit(x_train)
        cddd = str(cluster_finder.score)
        clll = str(cluster_finder)
        log = str(
            components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str(
                components) + dimensionality
        labels = cluster_finder.labels_

    clustered_x = []
    clustered_y = []
    test_clustered_x = []
    test_clustered_y = []

    for c in range(0, no_clusters):
        clustered_x.append([])
        clustered_y.append([])
        test_clustered_x.append([])
        test_clustered_y.append([])

    for i, item in enumerate(x_train):
        item = item.reshape(1, item.shape[0])
        #		print('item', item)
        #		sys.exit(0)
        predicted = cluster_finder.predict(item)
        clustered_x[predicted[0]].append(item)
        clustered_y[predicted[0]].append(y_train[i])

    for i, item in enumerate(x_test):
        item = item.reshape(1, item.shape[0])
        predicted = cluster_finder.predict(item)
        test_clustered_x[predicted[0]].append(item)
        test_clustered_y[predicted[0]].append(y_test[i])

    print(len(clustered_x))
    print(len(clustered_y))
    print(len(test_clustered_x))
    print(len(test_clustered_y))

    for j, jtem in enumerate(clustered_x):
        file_name = "./clusters/all_train/cluster" + str(j) + ".txt"
        c_file = open(file_name, 'w')
        for m, mtem in enumerate(clustered_x[j]):
            ii = str(decode_sequence(mtem[0]))
            oo = str(decode_sequence(clustered_y[j][m]))
            c_file.writelines(ii + "===" + oo + "\n")
        c_file.close()

    for j, jtem in enumerate(test_clustered_x):
        file_name = "./clusters/all_test/cluster" + str(j) + ".txt"
        c_file = open(file_name, 'w')
        for m, mtem in enumerate(test_clustered_x[j]):
            ii = str(decode_sequence(mtem[0]))
            oo = str(decode_sequence(test_clustered_y[j][m]))
            c_file.writelines(ii + "===" + oo + "\n")
        c_file.close()


#	print(test_clustered_x)

    sys.exit(0)
    '''
    plt.yticks([])
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    if inertia:
        label = label + ", inertia={0:0.2f}".format(inertia)
    plt.title(label)
    plt.show()


plot_clustering('Ground truth', y, centers, inertia=inertia(X, centers))

##############################################################################
# We run a regular MiniBatchKMeans. KMeans would be better suited to this kind
# of small dataset, but we are aiming at using Incremental KMeans on large
# datasets, so its implementation relies on MiniBatchKMeans.

kmeans = MiniBatchKMeans(n_clusters=8, random_state=2)
kmeans.fit(X)
plot_clustering('KMeans',
                kmeans.predict(X),
                kmeans.cluster_centers_,
                inertia=kmeans.inertia_)

##############################################################################
# We now consider that we are aware of 4 of the 8 clusters. We fix them in
# the IncrementalMiniBatchKMeans so that they are strictly enforced.

ikmeans = IncrementalMiniBatchKMeans(n_clusters=8, random_state=2)
ikmeans.fit(X, fixed_cluster_centers=centers[:n_fixed_clusters])
plot_clustering('Incremental KMeans',
                ikmeans.predict(X),
                centers[n_fixed_clusters:],
import pandas as pd

from sklearn.cluster import MiniBatchKMeans

if __name__ == "__main__":

    dataset = pd.read_csv('./data/candy.csv')
    print(dataset.head(10))

    X = dataset.drop('competitorname', axis=1)

    kmeans = MiniBatchKMeans(n_clusters=4, batch_size=8).fit(X)
    print("Total de centros: ", len(kmeans.cluster_centers_))
    print("="*64)
    print(kmeans.predict(X))

    dataset['group'] = kmeans.predict(X)

    print(dataset)
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 batch_size=10,
                                 random_state=42,
                                 tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
Example #40
    def _init_classifier(self, opt):
        if "base_estimator" in opt:
            b_est = self._init_classifier(opt["base_estimator"])
        else:
            b_est = None

        if "n_estimators" in opt:
            n_estimators = opt["n_estimators"]
        else:
            n_estimators = 200

        if "max_iter" in opt:
            max_iter = opt["max_iter"]
        else:
            max_iter = 100000

        if "num_parallel_tree" in opt:
            num_parallel_tree = opt["num_parallel_tree"]
        else:
            num_parallel_tree = 5

        if "layer_structure" in opt:
            layer_structure = opt["layer_structure"]
        else:
            layer_structure = (100, )

        if "n_clusters" in opt:
            n_clusters = opt["n_clusters"]
        else:
            n_clusters = 8

        if opt["type"] in ["random_forrest", "rf"]:
            return RandomForestClassifier(n_estimators=n_estimators,
                                          class_weight="balanced",
                                          n_jobs=-1)
        elif opt["type"] == "ada_boost":
            return AdaBoostClassifier(base_estimator=b_est,
                                      n_estimators=n_estimators)
        elif opt["type"] in ["logistic_regression", "lr"]:
            return LogisticRegression(class_weight='balanced',
                                      max_iter=max_iter)
        elif opt["type"] == "sgd":
            return SGDClassifier(class_weight='balanced', max_iter=max_iter)
        elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]:
            return GaussianNB()
        elif opt["type"] in ["support_vector_machine", "svm"]:
            return SVC(kernel='rbf', class_weight='balanced', gamma="scale")
        elif opt["type"] in ["multilayer_perceptron", "mlp"]:
            return MLPClassifier(hidden_layer_sizes=layer_structure,
                                 max_iter=max_iter)
        elif opt["type"] in ["decision_tree", "dt", "tree"]:
            return DecisionTreeClassifier()
        elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]:
            return DecisionTreeClassifier(class_weight="balanced")
        elif opt["type"] in ["neighbours", "knn"]:
            return KNeighborsClassifier(n_neighbors=opt["n_neighbours"])
        elif opt["type"] == "extra_tree":
            return ExtraTreesClassifier(n_estimators=n_estimators,
                                        class_weight="balanced",
                                        n_jobs=-1)
        elif opt["type"] == "xgboost":
            return XGBClassifier(objective='binary:logistic',
                                 n_estimators=n_estimators,
                                 num_parallel_tree=num_parallel_tree,
                                 tree_method="hist",
                                 booster="gbtree",
                                 n_jobs=-1)
        elif opt["type"] in ["b_random_forrest", "b_rf"]:
            return BalancedRandomForestClassifier(n_estimators=n_estimators,
                                                  n_jobs=-1)
        elif opt["type"] == "b_bagging":
            return BalancedBaggingClassifier(base_estimator=b_est,
                                             n_estimators=n_estimators)
        elif opt["type"] == "b_boosting":
            return RUSBoostClassifier(base_estimator=b_est,
                                      n_estimators=n_estimators)
        elif opt["type"] == "kmeans":
            return MiniBatchKMeans(n_clusters=n_clusters)
        else:
            raise ValueError("type: {} not recognised".format(opt["type"]))
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42,
                                 n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_mini_match_k_means_invalid_init():
    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++",
                                 n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42,
                                 n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #45
# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
data = get_data()
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=100,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

# #############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++",
                                 n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
Example #49
class KMeansSMOTE(BaseSMOTE):
    """Apply a KMeans clustering before to over-sample using SMOTE.

    This is an implementation of the algorithm described in [1]_.

    Read more in the `User Guide <https://imbalanced-learn.org/stable/over_sampling.html#smote-adasyn>`_.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    k_neighbors : int or object, default=2
        If ``int``, number of nearest neighbours used to construct synthetic
        samples.  If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    {n_jobs}

    kmeans_estimator : int or object, default=None
        A KMeans instance or the number of clusters to be used. By default,
        we use a :class:`~sklearn.cluster.MiniBatchKMeans` which tends to be
        better with a large number of samples.

    cluster_balance_threshold : "auto" or float, default="auto"
        The threshold at which a cluster is called balanced and where samples
        of the class selected for SMOTE will be oversampled. If "auto", this
        will be determined by the ratio for each class, or it can be set
        manually.

    density_exponent : "auto" or float, default="auto"
        This exponent is used to determine the density of a cluster. Leaving
        this to "auto" will use a feature-length based exponent.

    Attributes
    ----------
    kmeans_estimator_ : estimator
        The fitted clustering method used before to apply SMOTE.

    nn_k_ : estimator
        The fitted k-NN estimator used in SMOTE.

    cluster_balance_threshold_ : float
        The threshold used during ``fit`` for calling a cluster balanced.

    See Also
    --------
    SMOTE : Over-sample using SMOTE.
    
    SVMSMOTE : Over-sample using SVM-SMOTE variant.

    BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.

    ADASYN : Over-sample using ADASYN.

    Notes
    -----
    See the original paper [1]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used.

    References
    ----------
    .. [1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for
       Imbalanced Learning Based on K-Means and SMOTE"
       https://arxiv.org/abs/1711.00837

    Examples
    --------
    >>> import numpy as np
    >>> from imbalanced_ensemble.sampler.over_sampling import KMeansSMOTE
    >>> from sklearn.datasets import make_blobs
    >>> blobs = [100, 800, 100]
    >>> X, y  = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
    >>> # Add a single 0 sample in the middle blob
    >>> X = np.concatenate([X, [[0, 0]]])
    >>> y = np.append(y, 0)
    >>> # Make this a binary classification problem
    >>> y = y == 1
    >>> sm = KMeansSMOTE(random_state=42)
    >>> X_res, y_res = sm.fit_resample(X, y)
    >>> # Find the number of new samples in the middle blob
    >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
    >>> print("Samples in the middle blob: %s" % n_res_in_middle)
    Samples in the middle blob: 801
    >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
    Middle blob unchanged: True
    >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
    More 0 samples: True
    """
    @_deprecate_positional_args
    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        k_neighbors=2,
        n_jobs=None,
        kmeans_estimator=None,
        cluster_balance_threshold="auto",
        density_exponent="auto",
    ):
        super().__init__(
            sampling_strategy=sampling_strategy,
            random_state=random_state,
            k_neighbors=k_neighbors,
            n_jobs=n_jobs,
        )
        self.kmeans_estimator = kmeans_estimator
        self.cluster_balance_threshold = cluster_balance_threshold
        self.density_exponent = density_exponent

    def _validate_estimator(self):
        super()._validate_estimator()
        if self.kmeans_estimator is None:
            self.kmeans_estimator_ = MiniBatchKMeans(
                batch_size=4096,
                random_state=self.random_state,
            )
        elif isinstance(self.kmeans_estimator, int):
            self.kmeans_estimator_ = MiniBatchKMeans(
                batch_size=4096,
                n_clusters=self.kmeans_estimator,
                random_state=self.random_state,
            )
        else:
            self.kmeans_estimator_ = clone(self.kmeans_estimator)

        # validate the parameters
        for param_name in ("cluster_balance_threshold", "density_exponent"):
            param = getattr(self, param_name)
            if isinstance(param, str) and param != "auto":
                raise ValueError(
                    f"'{param_name}' should be 'auto' when a string is passed."
                    f" Got {repr(param)} instead.")

        self.cluster_balance_threshold_ = (
            self.cluster_balance_threshold
            if self.kmeans_estimator_.n_clusters != 1 else -np.inf)

    def _find_cluster_sparsity(self, X):
        """Compute the cluster sparsity."""
        euclidean_distances = pairwise_distances(X,
                                                 metric="euclidean",
                                                 n_jobs=self.n_jobs)
        # zero out the diagonal (self-distances)
        for ind in range(X.shape[0]):
            euclidean_distances[ind, ind] = 0

        non_diag_elements = (X.shape[0]**2) - X.shape[0]
        mean_distance = euclidean_distances.sum() / non_diag_elements
        exponent = (math.log(X.shape[0], 1.6)**1.8 * 0.16 if
                    self.density_exponent == "auto" else self.density_exponent)
        return (mean_distance**exponent) / X.shape[0]

    def _fit_resample(self, X, y, sample_weight=None):
        self._validate_estimator()
        X_resampled = X.copy()
        y_resampled = y.copy()
        total_inp_samples = sum(self.sampling_strategy_.values())

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue

            # target_class_indices = np.flatnonzero(y == class_sample)
            # X_class = _safe_indexing(X, target_class_indices)

            X_clusters = self.kmeans_estimator_.fit_predict(X)
            valid_clusters = []
            cluster_sparsities = []

            # identify the clusters which satisfy the requirements
            for cluster_idx in range(self.kmeans_estimator_.n_clusters):

                cluster_mask = np.flatnonzero(X_clusters == cluster_idx)
                X_cluster = _safe_indexing(X, cluster_mask)
                y_cluster = _safe_indexing(y, cluster_mask)

                cluster_class_mean = (y_cluster == class_sample).mean()

                if self.cluster_balance_threshold_ == "auto":
                    balance_threshold = n_samples / total_inp_samples / 2
                else:
                    balance_threshold = self.cluster_balance_threshold_

                # skip the cluster: it is already considered balanced for this class
                if cluster_class_mean < balance_threshold:
                    continue

                # not enough samples to apply SMOTE
                anticipated_samples = cluster_class_mean * X_cluster.shape[0]
                if anticipated_samples < self.nn_k_.n_neighbors:
                    continue

                X_cluster_class = _safe_indexing(
                    X_cluster, np.flatnonzero(y_cluster == class_sample))

                valid_clusters.append(cluster_mask)
                cluster_sparsities.append(
                    self._find_cluster_sparsity(X_cluster_class))

            cluster_sparsities = np.array(cluster_sparsities)
            cluster_weights = cluster_sparsities / cluster_sparsities.sum()
            cluster_n_samples_list = np.zeros_like(cluster_weights)

            # if class_sample == 1:
            #     print (n_samples)
            #     print (cluster_weights)
            #     print ([math.ceil(
            #         n_samples * cluster_weights[valid_cluster_idx] - 1e-3
            #         ) for valid_cluster_idx, _ in enumerate(valid_clusters)])
            #     print (cluster_n_samples)

            if not valid_clusters:
                raise RuntimeError(
                    f"No clusters found with sufficient samples of "
                    f"class {class_sample}. Try lowering the "
                    f"cluster_balance_threshold or increasing the number of "
                    f"clusters.")

            for valid_cluster_idx, valid_cluster in enumerate(valid_clusters):
                X_cluster = _safe_indexing(X, valid_cluster)
                y_cluster = _safe_indexing(y, valid_cluster)

                X_cluster_class = _safe_indexing(
                    X_cluster, np.flatnonzero(y_cluster == class_sample))

                self.nn_k_.fit(X_cluster_class)
                nns = self.nn_k_.kneighbors(X_cluster_class,
                                            return_distance=False)[:, 1:]

                # give the last valid cluster the remaining samples so the total is exact
                if valid_cluster_idx == len(valid_clusters) - 1:
                    cluster_n_samples = int(n_samples -
                                            sum(cluster_n_samples_list))
                else:
                    cluster_n_samples = math.floor(
                        n_samples * cluster_weights[valid_cluster_idx])
                cluster_n_samples_list[valid_cluster_idx] = cluster_n_samples

                X_new, y_new = self._make_samples(
                    X_cluster_class,
                    y.dtype,
                    class_sample,
                    X_cluster_class,
                    nns,
                    cluster_n_samples,
                    1.0,
                )

                stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))]
                X_resampled = stack((X_resampled, X_new))
                y_resampled = np.hstack((y_resampled, y_new))

        # If given sample_weight
        if sample_weight is not None:
            # sample_weight is already validated in self.fit_resample()
            sample_weight_new = \
                np.empty(y_resampled.shape[0] - y.shape[0], dtype=np.float64)
            sample_weight_new[:] = np.mean(sample_weight)
            sample_weight_resampled = np.hstack(
                [sample_weight, sample_weight_new]).reshape(-1, 1)
            sample_weight_resampled = \
                np.squeeze(normalize(sample_weight_resampled, axis=0, norm='l1'))
            return X_resampled, y_resampled, sample_weight_resampled
        else:
            return X_resampled, y_resampled
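The `_validate_estimator` logic above accepts either nothing, an integer, or a clusterer instance for `kmeans_estimator`. A minimal sketch of the three equivalent ways to configure it (the blobs data mirrors the docstring example; sizes and cluster counts are illustrative):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs
from imbalanced_ensemble.sampler.over_sampling import KMeansSMOTE

X, y = make_blobs([100, 800, 100], centers=[(-10, 0), (0, 0), (10, 0)], random_state=0)
y = (y == 1)  # binary problem, as in the docstring example above

sm_default = KMeansSMOTE(random_state=0)                      # internal MiniBatchKMeans(batch_size=4096)
sm_by_int = KMeansSMOTE(kmeans_estimator=10, random_state=0)  # internal MiniBatchKMeans(n_clusters=10)
sm_by_obj = KMeansSMOTE(                                      # any KMeans-like estimator, cloned internally
    kmeans_estimator=MiniBatchKMeans(n_clusters=10, batch_size=2048, random_state=0),
    random_state=0,
)

X_res, y_res = sm_by_obj.fit_resample(X, y)
print(X_res.shape, np.bincount(y_res.astype(int)))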
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

dataset = pd.read_csv('studentinfoCHK.csv')
X = dataset.iloc[10000:20000, [6, 7]].values
mb_clustering = MiniBatchKMeans(n_clusters=2)
y_mb_clustering = mb_clustering.fit_predict(X)
plt.scatter(X[y_mb_clustering == 0, 0],
            X[y_mb_clustering == 0, 1],
            s=20,
            c='red',
            label='high score and high study credits')
plt.scatter(X[y_mb_clustering == 1, 0],
            X[y_mb_clustering == 1, 1],
            s=20,
            c='blue',
            label='high score and low study credits')
plt.title('Clusters of STUDENTS')
plt.xlabel('SCORE')
plt.ylabel('STUDIED CREDITS')
plt.legend()
plt.show()
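The snippet above hard-codes n_clusters=2. A quick elbow check on the same two columns (a sketch, assuming the same studentinfoCHK.csv slice loaded into X above) can help sanity-check that choice:

import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans

inertias = []
k_range = range(1, 11)
for k in k_range:
    km = MiniBatchKMeans(n_clusters=k, random_state=0)
    km.fit(X)  # X as prepared above
    inertias.append(km.inertia_)
plt.plot(list(k_range), inertias, marker='o')
plt.title('Elbow check for MiniBatchKMeans')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()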
     True),

    ('Sparse comp. - MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                      n_iter=100, batch_size=3,
                                      random_state=rng),
     True),

    ('MiniBatchDictionaryLearning',
        decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                  n_iter=50, batch_size=3,
                                                  random_state=rng),
     True),

    ('Cluster centers - MiniBatchKMeans',
        MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                        max_iter=50, random_state=rng),
     True),

    ('Factor Analysis components - FA',
     decomposition.FactorAnalysis(n_components=n_components, max_iter=20),
     True),
]


# #############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

# #############################################################################
# Do the estimation and plot it
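The fragment stops right after the "Do the estimation and plot it" header. In the scikit-learn faces-decomposition example this is followed by a loop of roughly the shape below; the names estimators, faces, faces_centered, n_components and plot_gallery are assumed to come from the truncated part of the script, and this is a sketch rather than the verbatim original:

import matplotlib.pyplot as plt

for name, estimator, center in estimators:
    print("Extracting the top %d %s..." % (n_components, name))
    data = faces_centered if center else faces
    estimator.fit(data)
    # MiniBatchKMeans exposes cluster_centers_ instead of components_
    if hasattr(estimator, 'cluster_centers_'):
        components_ = estimator.cluster_centers_
    else:
        components_ = estimator.components_
    plot_gallery(name, components_[:n_components])
plt.show()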
import numpy as np
import tensorflow as tf  # the TF 1.x API (tf.Session, make_initializable_iterator) is used below
import joblib
from sklearn.cluster import MiniBatchKMeans
# ReadImage, CSV and getFeatureVector are project-specific helpers assumed to be in scope.


def partialAddingLearn(feature_extracted_model,
                       n_cluster,
                       channal,
                       ex_epoches,
                       batch_size=141,
                       epoches=32):
    # fetch the training data
    Kmeans_ds = ReadImage.getKmeansDataSet(batch_size)
    print('data extraction finished')
    # fetch the pre-trained model
    # feature_extracted_model = getPre_trainedModel(checkpoint_path, image_height, image_weight)
    # print('pre-trained model loaded')
    # feed the data batch by batch through tensorflow
    iterator = Kmeans_ds.make_initializable_iterator()
    data_element = iterator.get_next()
    sess = tf.Session()
    sess.run(iterator.initializer)
    Kmeans_label = []  # incremental learning does not need the full feature set up front
    Kmeans_feature = []
    print('starting feature extraction + incremental k-means learning')
    kmeans_mode = MiniBatchKMeans(n_clusters=n_cluster,
                                  batch_size=batch_size * channal,
                                  random_state=0)  # initialise the incremental learning model
    for cur_epoch in range(ex_epoches):
        for i in range(epoches):
            Kmeans_image, curKmeans_label = sess.run(data_element)
            cur_image_feature = feature_extracted_model.predict(
                Kmeans_image)  # 2-D deep-feature matrix for one batch, fed to the incremental learner
            cur_image_vectors = getFeatureVector(cur_image_feature)  # flatten to 1-D vectors
            if cur_epoch == 0:  # collect the sample information only during the first pass
                Kmeans_feature.extend(cur_image_vectors)
                Kmeans_label.extend(curKmeans_label)
            kmeans_mode.partial_fit(cur_image_vectors)
            print('incremental learning round %d finished' % i)
    sess.close()
    # save the kmeans model
    Kmeans_feature = np.asarray(Kmeans_feature)
    Kmeans_label = np.asarray(Kmeans_label)
    CSV.csvWrite('./BoW/data_csv/train_label.csv', enumerate(Kmeans_label))
    joblib.dump(kmeans_mode,
                filename='./BoW/result/kmeans_' + str(n_cluster) +
                '_.pkl')  # save to file
    print('incremental learning finished, kmeans_%d saved' % n_cluster)
    # get the corresponding feature labels
    print('getting the corresponding feature labels')
    feature_labels = kmeans_mode.predict(Kmeans_feature)
    print('feature labels obtained')
    # start the feature-encoding step
    print('starting feature encoding')
    m = len(Kmeans_label)  # number of samples
    print(m)
    histogram_code = []
    for cur_image in range(m):
        cur_code = np.zeros((1, n_cluster))
        for cur_feature in range(cur_image * channal,
                                 (cur_image + 1) * channal):
            cur_cluster = feature_labels[cur_feature]
            cur_code[0, cur_cluster] += 1
        cur_code = cur_code / np.sum(cur_code)  # normalise
        histogram_code.append(cur_code[0])  # arrays from zeros/ones are 2-D, so keep row 0
    histogram_code = np.asarray(histogram_code)
    print(np.shape(histogram_code))
    print('histogram encoding of the training set finished')
    CSV.csvWrite('./BoW/data_csv/train_code_' + str(n_cluster) + '.csv',
                 histogram_code)
    print('histogram encoding finished')
    return kmeans_mode, histogram_code, Kmeans_label
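The nested encoding loop above builds one normalized histogram of cluster assignments per image. The same bag-of-visual-words step can be written more compactly with np.bincount; a standalone sketch on random features (hypothetical sizes, not the pipeline above):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

n_cluster, channal, n_images, feat_dim = 64, 49, 10, 128
rng = np.random.RandomState(0)
features = rng.rand(n_images * channal, feat_dim)  # `channal` local descriptors per image

kmeans = MiniBatchKMeans(n_clusters=n_cluster, batch_size=256, random_state=0)
kmeans.fit(features)
labels = kmeans.predict(features)

# one normalized histogram of visual-word counts per image
histograms = np.stack([
    np.bincount(labels[i * channal:(i + 1) * channal], minlength=n_cluster) / channal
    for i in range(n_images)
])
print(histograms.shape)  # (n_images, n_cluster)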
Beispiel #53
0
from sklearn.cluster import KMeans, MiniBatchKMeans
"""
######### K MEANS ################################################################
"""
num_clusters = 5
km = MiniBatchKMeans(
    n_clusters=num_clusters,
    max_iter=300,
    n_init=10,
    init="k-means++",
    batch_size=100,
    compute_labels=True,
)
result = km.fit_predict(soft)
soft_label_pred = km.labels_
centroids = km.cluster_centers_
inertia = km.inertia_
    print("fatto in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

# #############################################################################
# Do the actual clustering

if opts.minibatch:
    print("*******MINI BATCH KMEANS***********")
    km = MiniBatchKMeans(n_clusters=num_cluster,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=opts.verbose)
#elif opts.use_spectral:
#print("*******SPECTRAL CLUSTERING***********")
#km = SpectralClustering(n_clusters=num_cluster, affinity='precomputed', n_init=100, assign_labels = 'discretize')
#elif opts.use_agglomerative:
# print("************AGGLOMERATIVE CLUSTERING*********")
#km = AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
#     connectivity=None, linkage='ward', memory=None, n_clusters=num_cluster,
#   pooling_func='deprecated')
else:
    print("*******K-MEANS***********")
    km = KMeans(n_clusters=num_cluster,
                init='k-means++',
                max_iter=100,
Beispiel #55
0
import numpy as np
from sklearn.cluster import MiniBatchKMeans

# cluster the word vectors at several granularities: 8, 20 and 40 clusters
# are used below (100 and 400 clusters were also tried)
cluster_names = [(8, 'cl0'), (20, 'cl1'), (40, 'cl2')]
ckeys = [c[1] for c in cluster_names]
kms = {}
for nclust, key in cluster_names:
    km = MiniBatchKMeans(n_clusters=nclust)
    X = None
    count = 0
    for e in event_info.find():
        if X is None:
            X = np.array(e['words'])
        else:
            X = np.vstack((X, e['words']))
        count += 1

        if count % 10000 == 0:
            km.partial_fit(X)
            X = None
            print(count)

    kms[key] = km

event_clusters = {}
for e in event_info.find():
    clusters = {key: int(km.predict(e['words'])[0]) for key, km in kms.items()}
    event_info.update({'id': e['id']}, {'$set': clusters})

# load database data, for fast access
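One caveat in the loop above: when the total row count is not a multiple of 10000, the last partial chunk left in X is never passed to partial_fit. A generic helper along these lines (a sketch, assuming an iterable of 1-D feature vectors, not the original code) flushes the remainder as well:

import numpy as np
from sklearn.cluster import MiniBatchKMeans


def partial_fit_in_chunks(km, rows, chunk_size=10000):
    """Incrementally fit `km` on an iterable of 1-D feature vectors."""
    buffer = []
    for row in rows:
        buffer.append(row)
        if len(buffer) == chunk_size:
            km.partial_fit(np.vstack(buffer))
            buffer = []
    if buffer:  # flush the final, smaller chunk
        km.partial_fit(np.vstack(buffer))
    return km


# usage sketch on random data
rng = np.random.RandomState(0)
km = partial_fit_in_chunks(MiniBatchKMeans(n_clusters=8, random_state=0),
                           (rng.rand(16) for _ in range(25000)),
                           chunk_size=10000)
print(km.predict(rng.rand(3, 16)))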
Beispiel #56
0
import numpy as np
import pandas as pd
import pymysql
from joblib import dump  # `dump(qt, 'qt.pickle')` below is assumed to be joblib.dump
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
from sklearn.preprocessing import QuantileTransformer


class Offbeatr(object):
    def __init__(self, random_state=823):
        self.rng = np.random.RandomState(random_state)
        self.keepers = ['danceability', 'energy', 'loudness', 'speechiness','acousticness', \
                        'liveness', 'valence', 'tempo']

    def get_songs(self,
                  songfile=None,
                  host='35.196.88.209',
                  user='******',
                  password='******',
                  database='SPOTIFY'):
        """As a security measure, IP must be whitelisted in Google cloud prior to 
        getting song data"""
        if not songfile:
            conn = pymysql.connect(host=host, user=user,
                                   password=password, database=database)
            query = """
                    SELECT * 
                    FROM songs
                    """
            print('fetching songs from database')
            self.songs = pd.read_sql(query, conn)
            conn.close()
        else:
            print('reading songs from local file')
            self.songs = pd.read_csv(songfile, skiprows=[1])
        self.N = self.songs.shape[0]
        self.songs_labeled_ = self.songs[['song_id']].copy()
        qt = QuantileTransformer(output_distribution='normal',
                                 random_state=self.rng)
        self.raw_data = qt.fit_transform(np.array(self.songs[self.keepers]))
        dump(qt, 'qt.pickle')
        print("Saved transformer to file: 'qt.pickle'")

    def get_starting_clusters(self,
                              mb_kmeans_n_clusters=25000,
                              random_state=0,
                              batch_size=100000,
                              verbose=0):
        print('computing starting clusters')
        self.mb_kmeans = MiniBatchKMeans(n_clusters=mb_kmeans_n_clusters, \
                                         random_state=random_state, batch_size=batch_size, \
                                         verbose=verbose)
        self.preds = self.mb_kmeans.fit_predict(self.raw_data)

    def agglom_cluster(self, cluster_sizes=[3800, 2528, 1264, 632]):
        """Run the aglomerative clustering algorithm"""
        #inits
        num_levels = len(cluster_sizes)
        centroids = np.zeros((sum(cluster_sizes), len(self.keepers)))
        fit_list = []
        colnames = []

        #calculate the agglomerative cluster labels
        print('performing agglomerative clustering')
        for i in range(num_levels):
            agglom = AgglomerativeClustering(n_clusters=cluster_sizes[i])
            fit = agglom.fit_predict(self.mb_kmeans.cluster_centers_)
            fit_list.append(fit + sum(cluster_sizes[:i]))
            level_labels = [
                fit_list[i][self.preds[j]] for j, _ in enumerate(self.preds)
            ]
            colnames.append("level" + str(i))
            self.songs_labeled_[colnames[i]] = level_labels
        print("DataFrame created: 'songs_labeled_'")

        #calculate centroids
        for i in range(num_levels):
            colname = colnames[i]
            val = sum(cluster_sizes[:i])
            for j in range(val, val + cluster_sizes[i]):
                centroids[j,:] = np.mean(self.raw_data[self.songs_labeled_[ \
                    self.songs_labeled_[colname]==j].index], axis=0)
        self.centroids_ = pd.DataFrame(centroids)
        self.centroids_.columns = self.keepers
        print("DataFrame created: 'centroids_'")

    def export_csv(self, save_songs=True, save_centroids=True, num_parts=20):
        #save transformer object as pickle
        if save_songs == True and save_centroids == True:
            print("creating 'songs_labeled_{id}.csv' and 'centroids.csv'")
            for id, df_i in enumerate(
                    np.array_split(self.songs_labeled_, num_parts)):
                df_i.to_csv('songs_labeled_{id}.csv'.format(id=id),
                            index=False)
            self.centroids_.to_csv('centroids.csv', index=False)
        elif save_songs == True:
            print("creating songs_labeled_{id}.csv")
            for id, df_i in enumerate(
                    np.array_split(self.songs_labeled_, num_parts)):
                df_i.to_csv('songs_labeled_{id}.csv'.format(id=id),
                            index=False)
        elif save_centroids == True:
            print("Creating 'centroids.csv'")
            self.centroids_.to_csv('centroids.csv', index=False)
        else:
            print('nothing to do')

    def beat_master(self, songfile=None, host='35.196.88.209', user='******', password='******', \
                    database='SPOTIFY',mb_kmeans_n_clusters=25000, random_state=0, \
                    batch_size=100000, verbose=0, cluster_sizes=[3800,2528,1264,632], \
                    save_songs=True, save_centroids=True, num_parts=20):
        "This self contained script takes ~2.5 hours to run"
        self.get_songs(songfile, host, user, password, database)
        self.get_starting_clusters(mb_kmeans_n_clusters, random_state,
                                   batch_size, verbose)
        self.agglom_cluster(cluster_sizes)
        self.export_csv(save_songs, save_centroids, num_parts)
        print('done!')
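The class above first compresses the catalogue with MiniBatchKMeans and then runs AgglomerativeClustering on the resulting centroids only. A minimal standalone sketch of that two-stage idea on synthetic data (hypothetical sizes, not the Spotify pipeline):

import numpy as np
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=20000, centers=12, n_features=8, random_state=0)

# stage 1: heavy lifting with MiniBatchKMeans on the raw points
mbk = MiniBatchKMeans(n_clusters=500, batch_size=4096, random_state=0)
fine_labels = mbk.fit_predict(X)

# stage 2: agglomerative clustering on the 500 centroids only
agg = AgglomerativeClustering(n_clusters=12)
coarse_of_fine = agg.fit_predict(mbk.cluster_centers_)

# map every point to its coarse cluster through its fine cluster
coarse_labels = coarse_of_fine[fine_labels]
print(np.bincount(coarse_labels))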
Beispiel #57
0
        _, p = np.unique(true_labels, return_inverse=True)
        # Counts the number of each index
        counts = np.bincount(p)
        # Gets the index with the highest count
        maxpos = counts.argmax()
        mistakes += (p != maxpos).sum()
    return mistakes / datapoints


#print "Preparing Tfidf vectorizer"

# prepare features
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=1000,
                             min_df=2,
                             stop_words='english',
                             use_idf=True)

X = vectorizer.fit_transform(data)
#print "Fitting K-means for clusters 1 through 20"
#print "___________________"
#print '% 9s' % 'clusters    time    inertia    homo    compl    v-meas    ARI    AMI    Mistake Rate'
print("numc,homo,comp,v-meas,mr")
for numc in range(1, true_k + 1):
    mbkm = MiniBatchKMeans(n_clusters=numc,
                           init='k-means++',
                           max_iter=100,
                           n_init=5,
                           verbose=False)
    assess_mbkm(mbkm, numc, X, labels)
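assess_mbkm is defined elsewhere in the original script. A sketch of what such a helper could report for this loop (homogeneity, completeness and V-measure from sklearn.metrics, plus a purity-style mistake rate like the fragment at the top of this snippet); the function body is an assumption, not the original:

import numpy as np
from sklearn import metrics


def assess_mbkm(estimator, numc, X, labels):
    # fit, then print one CSV row matching the "numc,homo,comp,v-meas,mr" header above
    estimator.fit(X)
    pred = estimator.labels_
    homo = metrics.homogeneity_score(labels, pred)
    comp = metrics.completeness_score(labels, pred)
    vmeas = metrics.v_measure_score(labels, pred)
    mistakes = 0
    for c in np.unique(pred):
        true_in_c = np.asarray(labels)[pred == c]
        _, counts = np.unique(true_in_c, return_counts=True)
        mistakes += counts.sum() - counts.max()  # everything outside the majority class
    mr = mistakes / len(labels)
    print("%d,%.3f,%.3f,%.3f,%.3f" % (numc, homo, comp, vmeas, mr))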
Beispiel #58
0
    print('Loading data and getting representations')
    model = BoAW(100)
    model.load(data_dir)
    model.fit()

    print('Building lookup')
    lkp = model.toLookup()

    print('Loading auditory space')
    asp = AggSpace(lkp, 'mean')

    the_data, labels_true = [], []
    for instrument in instruments:
        the_data.append(asp.space[instrument])
        labels_true.append(instclass[instrument])
    the_data = np.array(the_data)

    mbk = MiniBatchKMeans(n_clusters=len(classes),
                          batch_size=2,
                          verbose=True,
                          compute_labels=True,
                          max_iter=10000,
                          n_init=25)
    mbk.fit(the_data)
    centroids = mbk.cluster_centers_
    #labels_pred, _ = vq(the_data, centroids)
    score = v_measure_score(labels_true, mbk.labels_)
    print('V-measure:', score)
    for instrument, label in zip(instruments, mbk.labels_):
        print('Instrument=%s,\tcluster=%d' % (instrument, label))
Beispiel #59
0
    sigma = np.zeros(K)
    for i in range(K):
        ts = np.array(DT)[labels == i]
        mu[i] = np.mean(ts)
        sigma[i] = np.std(ts)
    return mu, sigma


# input args: K display
with open('test30.pickle', 'rb') as f:
    [X,Xp,Xl,Xo,X_all,K,Learn,Pz_d_km,Pw_z_km,Pw_z,Pz_d,Pd,Li,\
            labels,terms,termsp,termsl,termso,terms_all,DT,ind2obj,clusModel]=pickle.load(f)
if K != int(sys.argv[1]):
    K = int(sys.argv[1])  # re-cluster with the number of clusters requested on the command line
    km = MiniBatchKMeans(n_clusters=K,
                         init='k-means++',
                         n_init=100,
                         init_size=1000,
                         batch_size=1000,
                         verbose=True)
    km.fit(X)
    labels = km.labels_
    centers = km.cluster_centers_
    clus2doc = {}
    for i in range(len(labels)):
        clus2doc[labels[i]] = clus2doc.get(labels[i], set())
        clus2doc[labels[i]].add(i)
## print number of docs in each cluster
    for i in clus2doc:
        print(str(i + 1) + "\t" + str(len(clus2doc[i])))

t0 = time()
Learn = (1, 10)
Beispiel #60
0
import logging
from collections import defaultdict

import numpy as np
import tqdm
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer


class TopicDocs:
    def __init__(self,
                 ndim=128,
                 random_seed=1965123,
                 topic_tokens=8196,
                 verbose=True):
        """
        Class initialization method.

        :param ndim: Number of latent dimensions
        :param targets: The target vector
        :param random_seed: The random seed used
        :param ed_cutoff: Cutoff for fuzzy string matching when comparing documents
        :param doc_limit: The max number of documents to be considered.
        :param verbose: Whether to have the printouts
        
        """

        self.ndim = int(np.sqrt(ndim))
        self.verbose = verbose
        self.random_seed = random_seed
        self.topic_tokens = topic_tokens

    def fit(self, text_list):
        """
        The fit method.

        :param text_list: List of input texts
        
        """

        if not type(text_list) == list:
            text_list = text_list.values.tolist()
        self.clx = TfidfVectorizer(max_features=self.topic_tokens)
        docspace = self.clx.fit_transform(text_list).T
        fnames = [(x, y) for x, y in self.clx.vocabulary_.items()]
        fnames = [x[0] for x in sorted(fnames, key=lambda x: x[1])]
        self.clustering_algo = MiniBatchKMeans(n_clusters=self.ndim)
        clusters = self.clustering_algo.fit(docspace)
        assert len(clusters.labels_) == docspace.shape[0]
        cluster_assignments = clusters.labels_
        assert len(clusters.labels_) == len(fnames)
        self.topic_features = defaultdict(set)
        for k, v in zip(fnames, cluster_assignments):
            self.topic_features[v].add(k)

    def transform(self, new_documents):
        """
        Transform method.

        :param new_documents: The new set of documents to be transformed.
        :return all_embeddings: The final embedding matrix
        
        """

        if not type(new_documents) == list:
            new_documents = new_documents.values.tolist()

        if self.verbose:
            logging.info("Transforming new documents.")

        new_features = np.zeros((len(new_documents), self.ndim))
        for enx, doc in tqdm.tqdm(enumerate(new_documents),
                                  total=len(new_documents)):
            parts = set(doc.lower().strip().split())
            for k, v in self.topic_features.items():
                denominator = len(v)
                overlap = len(parts.intersection(v)) / denominator
                if overlap is not None:
                    new_features[enx, k] = overlap

        return new_features

    def fit_transform(self, documents, b=None):
        """
        The sklearn-like fit-transform method.

        """

        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self):
        """
        Get feature names.
        """

        return list(["topic_" + str(x) for x in range(self.ndim)])