Example #1
def __hieclu(self):
    # use hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    # fit_predict fits the model and returns the cluster labels
    result = ac.fit_predict(self.data_matrix)
    return result
Example #2
def hieclu(data_matrix, k):
    # use hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=k)
    # fit_predict fits the model and returns the cluster labels
    result = ac.fit_predict(data_matrix)
    return result
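Note: the Ward class used throughout these examples was deprecated and later removed from scikit-learn; AgglomerativeClustering with linkage='ward' is its replacement. A minimal sketch of the same step against the current API (the function name hieclu_modern is illustrative, not from any original project):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def hieclu_modern(data_matrix, k):
    # linkage='ward' (the default) reproduces the Ward merging criterion
    ac = AgglomerativeClustering(n_clusters=k, linkage='ward')
    # fit_predict fits the model and returns the cluster labels in one call
    return ac.fit_predict(data_matrix)

labels = hieclu_modern(np.random.rand(20, 5), k=3)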
Example #3
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array(
        [
            (0.014, 0.120),
            (0.014, 0.099),
            (0.014, 0.097),
            (0.017, 0.153),
            (0.017, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.153),
            (0.018, 0.152),
            (0.018, 0.149),
            (0.018, 0.144),
        ]
    )
    nn = NearestNeighbors(n_neighbors=10).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
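The same connectivity-constrained fit can be sketched against the current API: kneighbors_graph builds the sparse connectivity matrix, and AgglomerativeClustering accepts it through the same connectivity parameter (the random X here is a stand-in for the test data):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

X = np.random.rand(15, 2)
# sparse k-nearest-neighbors graph restricting which samples may merge
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
ward = AgglomerativeClustering(n_clusters=4, connectivity=connectivity)
ward.fit(X)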
Example #4
def compute_clusters(dataset, features_vector):
    """
    Apply clustering method
    """

    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper(
    )
    print

    if (cmd_options.clust_method == "hclust"):
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)

        return ward

    if (cmd_options.clust_method == "kmeans"):
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=1000,
                    verbose=1)
        km.fit(features_vector)

        return km
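The toarray() call in the hclust branch is deliberate: Ward required a dense array (as does today's AgglomerativeClustering), while KMeans consumes a sparse feature matrix directly. A small self-contained sketch of that contrast, with a random sparse matrix standing in for features_vector:

import numpy as np
from scipy import sparse
from sklearn.cluster import AgglomerativeClustering, KMeans

features_vector = sparse.random(50, 20, density=0.3, format='csr')
true_k = 4

# KMeans handles sparse input natively
km = KMeans(n_clusters=true_k, init='k-means++', n_init=10).fit(features_vector)

# hierarchical clustering needs a dense array, hence .toarray()
ward = AgglomerativeClustering(n_clusters=true_k).fit(features_vector.toarray())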
Example #5
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import kneighbors_graph

    X = np.array([
        (.014, .120),
        (.014, .099),
        (.014, .097),
        (.017, .153),
        (.017, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .152),
        (.018, .149),
        (.018, .144),
    ])
    connectivity = kneighbors_graph(X, 10)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #6
def __hieclu(self):
    # use hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    # fit_predict fits the model and returns the cluster labels
    result = ac.fit_predict(self.data_matrix)
    return result
Example #7
def hieclu(data_matrix, k):
    # use hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=k)
    # fit_predict fits the model and returns the cluster labels
    result = ac.fit_predict(data_matrix)
    return result
Example #8
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array([
        (.014, .120),
        (.014, .099),
        (.014, .097),
        (.017, .153),
        (.017, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .152),
        (.018, .149),
        (.018, .144),
    ])
    nn = NearestNeighbors(n_neighbors=10, warn_on_equidistant=False).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #9
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
Example #10
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = np.random.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
Example #11
def test_connectivity_fixing_non_lil():
    """
    Check non regression of a bug if a non item assignable connectivity is
    provided with more than one component.
    """
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = Ward(connectivity=c)
    w.fit(x)
Example #12
def test_connectivity_fixing_non_lil():
    """
    Check non regression of a bug if a non item assignable connectivity is
    provided with more than one component.
    """
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = Ward(connectivity=c)
    w.fit(x)
Example #13
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    clustering = Ward(n_clusters=10, connectivity=connectivity,
                      memory=mkdtemp())
    clustering.fit(X)
    labels = clustering.labels_
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10,
                      connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
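The caching and early-stopping paths exercised above carry over to the modern class: AgglomerativeClustering still accepts memory (a cache directory or joblib Memory object) and compute_full_tree. A hedged sketch of the equivalent calls:

import numpy as np
from tempfile import mkdtemp
from sklearn.cluster import AgglomerativeClustering

X = np.random.RandomState(0).randn(100, 50)
# cache the merge tree so refits with a different n_clusters are cheap
clustering = AgglomerativeClustering(n_clusters=10, memory=mkdtemp())
clustering.fit(X)
# stop growing the tree once n_clusters subtrees remain
early = AgglomerativeClustering(n_clusters=10, compute_full_tree=False)
early.fit(X)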
Example #14
def spectral_cluster(data, n_clusters, method='sl'):
    # get the graph Laplacian matrix
    if method == 'NJW':
        lap_matrix = get_lap_matrix_njw(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

    elif method == 'self-tuning':
        lap_matrix = get_lap_matrix_self_tuning(data)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

    else:
        lap_matrix = get_lap_matrix_sl(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

    #print(eigenvalues)
    # take the first n_clusters eigenvectors
    x_matrix = eigenvectors[:, 0:n_clusters]
    # row-normalize the eigenvector matrix
    y_matrix = normal_eigen(x_matrix)

    # call the hand-written k_means function
    """
    k_dist_dic, k_centers_dic, cluster_group = kmeans.k_means(y_matrix, n_clusters)
    mat_plot_cluster_sample(data, cluster_group, method)
    """
    # call the hand-written bi_k_means function
    """center_list, cluster_assign = bikmeans.exe_bi_k_means(y_matrix, n_clusters)
    labels = cluster_assign[:, 0]
    mat_plot_cluster_sample(data, labels, method)

    # call sklearn's KMeans; it works far better than the hand-written one
    k_means = KMeans(n_clusters)
    k_means.fit(y_matrix)
    #k_centers = k_means.cluster_centers_
    #mat_plot_cluster_sample(data, k_means.labels_, method)
    """
    # cluster with sklearn's hierarchical (Ward) clustering
    hie_cluster = Ward(n_clusters)
    hie_cluster.fit(y_matrix)
    mat_plot_cluster_sample(data, hie_cluster.labels_, method)
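The hand-rolled pipeline above (graph Laplacian, eigendecomposition, taking the first n_clusters eigenvectors, row normalization, then clustering the embedding) is what scikit-learn's SpectralClustering packages into one estimator. A minimal sketch, with random data standing in for the real samples:

import numpy as np
from sklearn.cluster import SpectralClustering

data = np.random.rand(60, 2)
# build a nearest-neighbors affinity graph, embed the samples with the
# Laplacian's eigenvectors, then cluster the embedding
sc = SpectralClustering(n_clusters=3, affinity='nearest_neighbors', n_neighbors=10)
labels = sc.fit_predict(data)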
Example #15
File: cluster.py Project: zaycev/n7
def ward(self, X, n_clusters, plot=True):
    ward = Ward(n_clusters=n_clusters, copy=False, compute_full_tree=True, memory="cache")
    ward.fit(X)
    labels = ward.labels_

    pl.close('all')
    pl.figure(1)
    pl.clf()

    if plot:
        colors = "rbgcmybgrcmybgrcmybgrcm" * 10
        X2d = RandomizedPCA(n_components=2).fit_transform(X)
        for i in xrange(len(X2d)):
            x = X2d[i]
            pl.plot(x[0], x[1], "o", markerfacecolor=colors[labels[i]],
                    markeredgecolor=colors[labels[i]], alpha=0.035)
        pl.show()

    return ward.labels_
Example #16
def cluster_ward(self, calpha=True):
    '''
    Cluster the positively predicted residues using the Ward method.
    Returns a dict mapping cluster index to the residue numbers of that
    cluster, ordered by decreasing cluster size.
    '''
    if calpha:
        data_atoms = self.positive_surface_residues.ca
    # the sidechain variant is disabled, so data_atoms is only bound
    # when calpha is True
    #else:
    #    data_atoms = self.positive_surface_residues.select('ca or sidechain').copy()
    if data_atoms.getCoords().shape[0] < 4:
        print self.pdbid, data_atoms.getCoords().shape
        return {}
    connectivity = kneighbors_graph(data_atoms.getCoords(), 5)
    ward = Ward(n_clusters=self.WARD_N_CLUSTERS, connectivity=connectivity)
    ward.fit(data_atoms.getCoords())
    resnums = data_atoms.getResnums()
    reslabels = ward.labels_
    clusters = sorted([resnums[reslabels == i] for i in set(reslabels)],
                      key=len, reverse=True)
    return dict(enumerate(clusters))
Example #17
def compute_clusters(dataset, features_vector):
    """
    Apply clustering method
    """

    labels = dataset.target
    true_k = np.unique(labels).shape[0]
    
    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper()
    print

    if(cmd_options.clust_method == "hclust"):
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result) 

        return ward

    if(cmd_options.clust_method == "kmeans"):
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, verbose=1)
        km.fit(features_vector)

        return km
Example #18
print("Homogeneity k-means: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness k-means: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure k-means: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Silhouette Coefficient k-means: %0.3f" % metrics.silhouette_score(clustering, km.labels_, sample_size = 8000))

# DBSCAN
# Structured hierarchical clustering
db = DBSCAN()
db.fit(clustering)
print 'DBSCAN clusters created..'

print("Homogeneity DBSCAN: %0.3f" % metrics.homogeneity_score(labels, db.labels_))
print("Completeness DBSCAN: %0.3f" % metrics.completeness_score(labels, db.labels_))
print("V-measure DBSCAN: %0.3f" % metrics.v_measure_score(labels, db.labels_))
print("Silhouette Coefficient DBSCAN: %0.3f" % metrics.silhouette_score(clustering, db.labels_, sample_size = 5000))

# Structured hierarchical clustering
ward = Ward(n_clusters = 9)
ward.fit(clustering)
print 'Hierarchical clusters created..'

print("Homogeneity hierarchical: %0.3f" % metrics.homogeneity_score(labels, ward.labels_))
print("Completeness hierarchical: %0.3f" % metrics.completeness_score(labels, ward.labels_))
print("V-measure hierarchical: %0.3f" % metrics.v_measure_score(labels, ward.labels_))
print("Silhouette Coefficient hierarchical: %0.3f" % metrics.silhouette_score(clustering, ward.labels_, sample_size = 5000))




Example #19
        print i
        train = pd.concat([train, pd.get_dummies(raw_train[i])], axis=1)
        
freq = train.groupby('Report ID').sum()
freq = freq.drop('Has Combined Queries', 1)


# Train Model #############################

num_cluster = 12

kmean = KMeans(n_clusters=num_cluster, max_iter=400, verbose = 0, n_jobs = 2, n_init=20, tol=1e-6)
model_kmean = kmean.fit(freq)
        
ward = Ward(n_clusters=num_cluster)
model_ward = ward.fit(freq)


from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(freq, n_neighbors=4)

#ward = Ward(n_clusters=num_cluster, connectivity = connectivity)
#model_ward = ward.fit(freq)

# Visualization #####################################################

import mpl_toolkits.mplot3d.axes3d as p3
import pylab as pl
from sklearn.datasets.samples_generator import make_friedman3

def plot(model, data, name):
Example #20
    def encode(self, interm_rep, neighborhood_size = 26,
               clust_ratio=10,
               encoding='geometrical',
               similarity_measure='pearson',
               threshold=0.3, n_jobs=1, **kwds):
        """
        Parameters
        ----------
        interm_rep: IntermRep
            IntermRep object containing the arr_xyz and arr_voxel matrices.
        neighborhood_size: int
            Number of neighbors each voxel will be connected to.
        clust_ratio: int
            The number of clusters will be equal to n/clust_ratio, where n is
            the number of voxels.
        encoding: string
            Type of encoding. 'geometrical' and 'functional' are allowed.
        similarity_measure: string
            Similarity measure used to compare the representative value of each
            parcel (cluster). 'pearson' or the measures available in scikit-learn
            are allowed.
        threshold: float
            Threshold applied to the similarity values in order to define the
            edges in the graph.

        Returns
        -------
        g: Graph
            Networkx graph representing the graph encoding of the data.
        """

        #computing the connectivity matrix, each voxel is connected to
        #"neighborhood_size" neighbors.
        #
        conn = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
#        conn_n = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
#        conn_r = radius_neighbors_graph(interm_rep.arr_xyz, radius=10)
#        conn = conn_n * conn_r

        #Hierarchical clustering algorithm. The number of clusters is defined
        #according to the parameter "clust_ratio".
        ward = Ward(n_clusters=len(interm_rep.arr_xyz)/clust_ratio, connectivity=conn)
        #ward = Ward(n_clusters=60, connectivity=conn)

        #Type of encoding: geometrical (only xyz data is used) or
        # functional (voxel time series is used).
        if encoding=='geometrical':
            ward.fit(interm_rep.arr_xyz)
        elif encoding=='functional':
            ward.fit(interm_rep.arr_voxels)

        labels = ward.labels_

        #Plotting the voxels with the cluster labels.
        #pp.plot_clustering_intermediate_representation(interm_rep, labels*10)


        #Computing the unique cluster identifiers
        l_unique = np.unique(labels)

        mean_voxels = np.zeros((len(l_unique), interm_rep.arr_voxels.shape[1]))
        mean_xyz = np.zeros((len(l_unique), interm_rep.arr_xyz.shape[1]))

        cont = 0
        for i in l_unique:
            #Taking the positions corresponding to the same cluster.
            pos = np.where(labels == i)[0]
            #Taking data from these positions and computing the mean time series
            m_voxel = interm_rep.arr_voxels[pos].mean(0)
            #Taking the xyz from these positions and computing the mean value
            m_xyz = interm_rep.arr_xyz[pos].mean(0)

            mean_voxels[cont] = m_voxel
            mean_xyz[cont] = m_xyz

            cont += 1


        #plotting the voxels time series for each cluster
        #pp.plot_interm_representation_time_series(ir.IntermRep(mean_voxels, mean_xyz))

        #The new intermediate representation is given by mean_voxels and
        # mean_xyz.

        #Computing similarity matrix and applying the threshold
        adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
                           dtype = np.byte)
        for j in range(len(mean_voxels) - 1):
            for k in range(j + 1, len(mean_voxels)):
                if similarity_measure == 'pearson':
                    aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
                else:
                    aux = skpw.pairwise_kernels(mean_voxels[j], mean_voxels[k],
                                                metric=similarity_measure,
                                                n_jobs=n_jobs)
                if aux >= threshold:
                    adj_mat[j,k] = 1
                    adj_mat[k,j] = 1


#        #Weighted encoding (for graph kernels that work with weighted graphs)
#        #------------------------------------
#        adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
#                           dtype = np.float)
#        for j in range(len(mean_voxels) - 1):
#            for k in range(j + 1, len(mean_voxels)):
#                if similarity_measure == 'pearson':
#                    aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
#                else:
#                    aux = skpw.pairwise_kernel(mean_voxels[j], mean_voxels[k],
#                                               metric = similarity_measure,
#                                               n_jobs = n_jobs)
##                if aux >= threshold:
##                    adj_mat[j,k] = aux
##                    adj_mat[k,j] = aux
#                adj_mat[j,k] = adj_mat[k,j] = aux
#        adj_mat = (adj_mat - np.mean(adj_mat))/np.std(adj_mat)
#        adj_mat = (adj_mat - np.min(adj_mat))/(np.max(adj_mat) - np.min(adj_mat))
#        adj_mat = np.where(adj_mat>=threshold, 1, 0)
#        #------------------------------------


        #Building the graph from the adjacency matrix
        g = nx.from_numpy_matrix(adj_mat)

        #Splitting the node degrees into some categories and using them as node labels.
#        num_lab = 5
        deg = g.degree()
#        for k in deg:
#            deg[k]/= num_lab
        nx.set_node_attributes(g, 'node_label', deg)

        ############
        #Storing the mean time-series of each parcel as a node attribute
        ts_att = {}
        mv = mean_voxels.tolist()
        for pos in range(len(mv)):
            ts_att[pos] = mv[pos]
        nx.set_node_attributes(g, 'time_series', ts_att)



        #Saving the graphs for CLFR subject (the one for which I have the structural data)
#        if interm_rep.subj_name == 'CLFR':
#            nx.write_gexf(g, 'graph_gephi_format.gexf')
#            np.savetxt('CLFR_clusters_xyz.txt', mean_xyz, fmt='%1d', delimiter=' ')
#            edges = np.array(np.where(adj_mat==1)).T
#            np.savetxt('CLFR_clusters_timeseries_cond%s.txt' %(interm_rep.cls), edges, fmt='%1d', delimiter=' ')


        #Plot Graphs
        #pp.plot_graph(mean_xyz, g)

        return g
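The double loop that thresholds pairwise Pearson correlations into an adjacency matrix can be vectorized; a sketch under the assumption that mean_voxels is an (n_parcels, n_timepoints) array, using np.corrcoef and networkx's from_numpy_array (networkx >= 2.0):

import numpy as np
import networkx as nx

mean_voxels = np.random.rand(30, 100)  # stand-in for the parcel time series
threshold = 0.3

# np.corrcoef computes all pairwise Pearson correlations at once
corr = np.corrcoef(mean_voxels)
adj_mat = (corr >= threshold).astype(np.byte)
np.fill_diagonal(adj_mat, 0)  # no self-loops

g = nx.from_numpy_array(adj_mat)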
Example #21
def ward(X, n_clust):
    "H"

    ward = Ward(n_clusters=n_clust)
    ward.fit(X)
    return ward
Example #22
def HierarchicalClustering(X, Expect_ext):
    from sklearn.cluster import Ward
    HC = Ward(n_clusters=Expect_ext)
    HC.fit(X)
    return HC.labels_
Example #23
def ward(X, n_clust):
    "H"

    ward = Ward(n_clusters=n_clust)
    ward.fit(X)
    return ward
Example #24
class VisualVocabulary:
    """ Creates a visual vocabulary and quantises visual features """
    def __init__(self, pathFile=None, flagVerbose=False):

        self.mbk = None
        self.ward = None

        # If a path file is provided...
        if pathFile is not None:
            # ...read from disk
            self.loadFromDisk(pathFile)

        self.flagVerbose = 1 if flagVerbose else 0

    def readImageIdsFromTxtFile(self, pathTxtFile):
        """ Read the image IDs contained in a text file """
        print pathTxtFile
        if not os.path.exists(pathTxtFile):
            print 'File not found ' + pathTxtFile
            return []

        # Read the file containing the image IDs
        fileDataset = open(pathTxtFile, 'r')

        # Read lines from the text file, stripping the end of line character
        imageIds = [line.strip() for line in fileDataset]

        # Close file
        fileDataset.close()

        return imageIds

    def buildFromImageCollection(self,
                                 pathTxtFile,
                                 pathDirImages,
                                 fileImageExtension,
                                 vocabularySize=4096,
                                 maxNumImages=sys.maxint):

        # Read the image IDs
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)

        # If there are more images than the considered ones...
        if (len(imageIds) > maxNumImages):
            imageIds = random.sample(imageIds, maxNumImages)

        # Extract the SURF descriptors from a collection of images and save in dictionary
        surfExtractor = SurfExtractor(True, True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)

        # Create a numpy array from the descriptors
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))

        #        if( self.flagRunOnServer == True):
        #            # K-means: The amount of clusters is specified with 'k' in the sci-kit version
        #            # in the GPI computation service
        #            self.mbk = MiniBatchKMeans(init='k-means++',
        #                                        k=vocabularySize,
        #                                        init_size=3*vocabularySize,
        #                                        max_no_improvement=10,
        #                                        verbose=1)
        #        else:
        # K-means: The amount of clusters is specified in 'n_clusters' in latest scikit-learn version
        self.mbk = MiniBatchKMeans(init='k-means++',
                                   n_clusters=vocabularySize,
                                   init_size=3 * vocabularySize,
                                   max_no_improvement=10,
                                   verbose=self.flagVerbose)

        self.mbk.fit(arr_descriptor)

    def buildFromImageCollectionWard(self,
                                     pathTxtFile,
                                     pathDirImages,
                                     fileImageExtension,
                                     vocabularySize,
                                     maxNumImages=sys.maxint):
        # vocabularySize could be 4096
        # Read the image IDs
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)

        # If there are more images than the considered ones...
        if (len(imageIds) > maxNumImages):
            imageIds = random.sample(imageIds, maxNumImages)

        # Extract the SURF descriptors from a collection of images and save in dictionary
        surfExtractor = SurfExtractor(True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)

        # Create a numpy array from the descriptors
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))

        #self.mbk = MiniBatchKMeans(init='k-means++',
        #                                k=vocabularySize,
        #                                n_init=10,
        #                                max_no_improvement=10,
        #                                verbose=0)
        self.ward = Ward(n_clusters=vocabularySize)

        self.ward.fit(arr_descriptor)

    def loadFromDisk(self, pathFile):

        if not os.path.exists(pathFile):
            print "File not found " + pathFile
            return

        self.mbk = pickle.load(open(pathFile, "rb"))

    def saveToDisk(self, pathFile):

        # Save the mini-batch K-Means model to disk using Pickle
        # (note that the Ward vocabulary, if built, is not persisted)
        pickle.dump(self.mbk, open(pathFile, "wb"))

    def quantizeVector(self, descriptors):

        #        if len(descriptors)<128:
        #            descriptors

        # Vector quantization with the visual vocabulary
        quant = self.mbk.predict(descriptors)

        # Build Histogram
        histogram = np.histogram(quant, bins=self.mbk.n_clusters)
        #        histogram = np.histogram(quant, bins=self.mbk.k)

        return histogram
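One caveat in quantizeVector: np.histogram(quant, bins=k) spreads its k bins over the observed min and max of quant, so images that never use the extreme visual words yield misaligned histograms. A fixed-length alternative, assuming quant holds integer word indices in [0, k); unlike np.histogram, bincount returns only the counts, not a (counts, bin_edges) pair:

import numpy as np

k = 4096  # vocabulary size (assumed)
quant = np.random.randint(0, k, size=500)  # stand-in for mbk.predict output

# bincount with minlength always yields exactly k aligned counts
histogram = np.bincount(quant, minlength=k)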
Example #25
from sklearn.cluster import Ward

ward = Ward(n_clusters=15)

n_samples = np.logspace(.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples,
                                    n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time/scipy_time

pl.clf()
pl.imshow(np.log(ratio), aspect='auto', origin="lower")
pl.colorbar()
pl.contour(ratio, levels=[1, ], colors='k')
pl.yticks(range(len(n_features)), n_features.astype(np.int))
pl.ylabel('N features')
pl.xticks(range(len(n_samples)), n_samples.astype(np.int))
pl.xlabel('N samples')
Example #26
from sklearn.cluster import Ward

ward = Ward(n_clusters=3)

n_samples = np.logspace(.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples,
                                    n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time / scipy_time

pl.figure("scikit-learn Ward's method benchmark results")
pl.imshow(np.log(ratio), aspect='auto', origin="lower")
pl.colorbar()
pl.contour(ratio, levels=[1, ], colors='k')
pl.yticks(range(len(n_features)), n_features.astype(np.int))
pl.ylabel('N features')
pl.xticks(range(len(n_samples)), n_samples.astype(np.int))
pl.xlabel('N samples')