Ejemplo n.º 1
0
    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None
    ):
        """
        Build an EM clusterer from its starting parameters, its convergence
        threshold and the vector pre-processing options.

        :param  initial_means: the means of the gaussian cluster centers
        :type   initial_means: [seq of] numpy array or seq of SparseArray
        :param  priors: the prior probability for each cluster
        :type   priors: numpy array or seq of float
        :param  covariance_matrices: the covariance matrix for each cluster
        :type   covariance_matrices: [seq of] numpy array
        :param  conv_threshold: maximum change in likelihood before the
                    clusterer is deemed to have converged
        :type   conv_threshold: int or float
        :param  bias: variance bias used to ensure non-singular covariance
                      matrices
        :type   bias: float
        :param  normalise: should vectors be normalised to length 1
        :type   normalise: boolean
        :param  svd_dimensions: number of dimensions to use when reducing
                                vector dimensionality with SVD
        :type   svd_dimensions: int
        """
        # The base class keeps the normalisation / SVD settings.
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

        # The means are copied into a float64 array; the number of clusters
        # is simply the number of means that were supplied.
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)

        # Remaining model parameters are stored exactly as given.
        self._priors = priors
        self._covariance_matrices = covariance_matrices

        # Settings controlling the iterations.
        self._bias = bias
        self._conv_threshold = conv_threshold
Ejemplo n.º 2
0
    def cluster(self, vectors, assign_clusters=False, ClusterNum=None,
                DisType='cos', Stype='avg', trace=False):
        """
        Cluster ``vectors`` after caching every pairwise distance.

        The cache ``self._distMap`` is emptied and rebuilt on each call, a
        new ``Dendrogram`` (which stores the merge order) is created from the
        vectors, and the clustering itself is delegated to
        ``VectorSpaceClusterer.cluster``. If the first vector has exactly two
        components, the result is also handed to ``self.draw_2D``.

        :param vectors: the vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param ClusterNum: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'``
                        selects ``euclidean_distance``; with any other value
                        no distances are cached
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: the value returned by ``VectorSpaceClusterer.cluster``
        """
        # The cache must be refreshed before every run, because each run may
        # cluster a different set of samples.
        self._distMap.clear()

        # Choose the distance function once instead of repeating the loops.
        if DisType == 'cos':
            metric = cosine_distance
        elif DisType == 'euc':
            metric = euclidean_distance
        else:
            metric = None

        if metric is not None:
            count = len(vectors)
            # Only pairs (first, second) with first < second are stored.
            for first in range(count):
                for second in range(first + 1, count):
                    self._distMap[(first, second)] = metric(
                        vectors[first], vectors[second])

        # The dendrogram stores the merge order.
        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendrogram = Dendrogram(leaves)

        result = VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, ClusterNum, Stype, trace)

        # Two-dimensional samples get a visualisation of the result.
        if len(vectors[0]) == 2:
            self.draw_2D(vectors, result)

        return result
Ejemplo n.º 3
0
    def cluster(self,
                vectors,
                assign_clusters=False,
                ClusterNum=None,
                DisType='euc',
                Stype='mean',
                trace=False):
        """
        Cluster ``vectors`` after caching every pairwise distance.

        ``self._distMap`` is emptied and rebuilt on each call, then the
        clustering is delegated to ``VectorSpaceClusterer.cluster``. If the
        first vector has exactly two components, the result is also passed
        to ``draw_2D_cluster``.

        :param vectors: the vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param ClusterNum: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'``
                        selects ``euclidean_distance``; with any other value
                        no distances are cached
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: an empty list when ``vectors`` is empty, otherwise the
                 value returned by ``VectorSpaceClusterer.cluster``
        """
        # The cache must be refreshed before every run, because each run may
        # cluster a different set of samples.
        self._distMap.clear()

        count = len(vectors)
        if count == 0:
            return []

        # Choose the distance function once instead of repeating the loops.
        if DisType == 'cos':
            metric = cosine_distance
        elif DisType == 'euc':
            metric = euclidean_distance
        else:
            metric = None

        if metric is not None:
            # Only pairs (first, second) with first < second are stored.
            for first in range(count):
                for second in range(first + 1, count):
                    self._distMap[(first, second)] = metric(
                        vectors[first], vectors[second])

        result = VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, ClusterNum, Stype, trace)

        # Debugging aid: print the values held in self._distMap at this
        # point to inspect the cached pairwise distances.

        # Two-dimensional samples get a visualisation of the result.
        if len(vectors[0]) == 2:
            draw_2D_cluster(vectors, result)

        return result
Ejemplo n.º 4
0
 def __init__(self, num_means, distance, repeats=1, conv_test=1e-6,
              initial_means=None, normalise=False, svd_dimensions=None,
              rng=None, avoid_empty_clusters=False):
     """
     Configure the clusterer.

     :param  num_means:  the number of means to use (may use fewer)
     :type   num_means:  int
     :param  distance:   measure of distance between two vectors
     :type   distance:   function taking two vectors and returning a float
     :param  repeats:    number of randomised clustering trials to use;
                         must be at least 1, and exactly 1 when
                         initial_means is supplied
     :type   repeats:    int
     :param  conv_test:  maximum variation in mean differences before
                         deemed convergent
     :type   conv_test:  number
     :param  initial_means: set of k initial means; when supplied its
                            length must equal num_means
     :type   initial_means: sequence of vectors
     :param  normalise:  should vectors be normalised to length 1
     :type   normalise:  boolean
     :param svd_dimensions: number of dimensions to use when reducing
                            vector dimensionality with SVD
     :type svd_dimensions: int
     :param  rng:        random number generator; a new random.Random()
                         is created when this is None (or otherwise falsy)
     :type   rng:        Random
     :param avoid_empty_clusters: include current centroid in computation
                                  of next one; avoids undefined behavior
                                  when clusters become empty
     :type avoid_empty_clusters: boolean
     """
     # The base class keeps the normalisation / SVD settings.
     VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

     # Sanity checks on the arguments, evaluated in their original order.
     assert not initial_means or len(initial_means) == num_means
     assert repeats >= 1
     # Fixed starting means make repeated randomised trials meaningless.
     assert not initial_means or repeats <= 1

     self._num_means = num_means
     self._distance = distance
     self._max_difference = conv_test
     self._means = initial_means
     self._repeats = repeats
     self._rng = rng if rng else random.Random()
     self._avoid_empty_clusters = avoid_empty_clusters
Ejemplo n.º 5
0
    def __init__(self,
                 num_means,
                 distance,
                 repeats=1,
                 conv_test=1e-6,
                 initial_means=None,
                 normalise=False,
                 svd_dimensions=None,
                 rng=None,
                 avoid_empty_clusters=False):
        """
        Store the clustering settings on the instance.

        :param  num_means:  the number of means to use (may use fewer)
        :type   num_means:  int
        :param  distance:   measure of distance between two vectors
        :type   distance:   function taking two vectors and returning a float
        :param  repeats:    number of randomised clustering trials to use;
                            asserted to be >= 1, and not > 1 when
                            initial_means is given
        :type   repeats:    int
        :param  conv_test:  maximum variation in mean differences before
                            deemed convergent
        :type   conv_test:  number
        :param  initial_means: set of k initial means; asserted to contain
                               exactly num_means entries when given
        :type   initial_means: sequence of vectors
        :param  normalise:  should vectors be normalised to length 1
        :type   normalise:  boolean
        :param svd_dimensions: number of dimensions to use when reducing
                               vector dimensionality with SVD
        :type svd_dimensions: int
        :param  rng:        random number generator; a falsy value (such as
                            None) is replaced by a new random.Random()
        :type   rng:        Random
        :param avoid_empty_clusters: include current centroid in computation
                                     of next one; avoids undefined behavior
                                     when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

        # Argument checks, kept in their original order.
        assert not initial_means or len(initial_means) == num_means
        assert repeats >= 1
        # Several trials only make sense with randomly chosen means.
        assert not (initial_means and repeats > 1)

        # Means: how many to use and (optionally) where to start.
        self._num_means = num_means
        self._means = initial_means

        # Distance measure and convergence criterion.
        self._distance = distance
        self._max_difference = conv_test

        # Trial control and randomness source.
        self._repeats = repeats
        self._rng = rng or random.Random()

        self._avoid_empty_clusters = avoid_empty_clusters
Ejemplo n.º 6
0
    def __init__(self, initial_means, priors=None, covariance_matrices=None,
                 conv_threshold=1e-6, bias=0.1, normalise=False,
                 svd_dimensions=None):
        """
        Create an EM clusterer from the given starting parameters,
        convergence threshold and vector pre-processing options.

        :param  initial_means: the means of the gaussian cluster centers
        :type   initial_means: [seq of] numpy array or seq of SparseArray
        :param  priors: the prior probability for each cluster
        :type   priors: numpy array or seq of float
        :param  covariance_matrices: the covariance matrix for each cluster
        :type   covariance_matrices: [seq of] numpy array
        :param  conv_threshold: maximum change in likelihood before the
                    clusterer is deemed to have converged
        :type   conv_threshold: int or float
        :param  bias: variance bias used to ensure non-singular covariance
                      matrices
        :type   bias: float
        :param  normalise: should vectors be normalised to length 1
        :type   normalise: boolean
        :param  svd_dimensions: number of dimensions to use when reducing
                                vector dimensionality with SVD
        :type   svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

        # One cluster per supplied mean; the means are held as float64.
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)

        # Stored unchanged.
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._conv_threshold = conv_threshold
        self._bias = bias
Ejemplo n.º 7
0
    def cluster(self, vectors, assign_clusters=False, ClusterNum=None,
                DisType='euc', Stype='mean', trace=False):
        """
        Cluster ``vectors`` after caching every pairwise distance.

        ``self._distMap`` is emptied and rebuilt on each call, then the
        clustering is delegated to ``VectorSpaceClusterer.cluster``. If the
        first vector has exactly two components, the result is also passed
        to ``draw_2D_cluster``.

        :param vectors: the vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param ClusterNum: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'``
                        selects ``euclidean_distance``; with any other value
                        no distances are cached
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: an empty list when ``vectors`` is empty, otherwise the
                 value returned by ``VectorSpaceClusterer.cluster``
        """
        # The cache must be refreshed before every run, because each run may
        # cluster a different set of samples.
        self._distMap.clear()

        total = len(vectors)
        if total == 0:
            return []

        # Pick the distance function once, then fill the cache in one loop.
        if DisType == 'cos':
            measure = cosine_distance
        elif DisType == 'euc':
            measure = euclidean_distance
        else:
            measure = None

        if measure is not None:
            # Only pairs (a, b) with a < b are stored.
            for a in range(total):
                for b in range(a + 1, total):
                    self._distMap[(a, b)] = measure(vectors[a], vectors[b])

        result = VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, ClusterNum, Stype, trace)

        # Debugging aid: print the values held in self._distMap at this
        # point to inspect the cached pairwise distances.

        # Two-dimensional samples get a visualisation of the result.
        if len(vectors[0]) == 2:
            draw_2D_cluster(vectors, result)

        return result
Ejemplo n.º 8
0
    def cluster(self,
                vectors,
                assign_clusters=False,
                DisType='cos',
                Stype='avg',
                trace=False):
        """
        Cluster ``vectors`` after caching every pairwise distance.

        ``self._distMap`` is emptied and rebuilt on each call, a new
        ``Dendrogram`` (which stores the merge order) is created from the
        vectors, and the clustering itself is delegated to
        ``VectorSpaceClusterer.cluster``.

        :param vectors: the vectors to cluster
        :param assign_clusters: forwarded to ``VectorSpaceClusterer.cluster``
        :param DisType: ``'cos'`` selects ``cosine_distance``, ``'euc'``
                        selects ``euclidean_distance``; with any other value
                        no distances are cached
        :param Stype: forwarded to ``VectorSpaceClusterer.cluster``
        :param trace: forwarded to ``VectorSpaceClusterer.cluster``
        :return: the value returned by ``VectorSpaceClusterer.cluster``
        """
        # The cache must be refreshed before every run, because each run may
        # cluster a different set of samples.
        self._distMap.clear()

        # Choose the distance function once instead of repeating the loops.
        if DisType == 'cos':
            metric = cosine_distance
        elif DisType == 'euc':
            metric = euclidean_distance
        else:
            metric = None

        if metric is not None:
            count = len(vectors)
            # Only pairs (first, second) with first < second are stored.
            for first in range(count):
                for second in range(first + 1, count):
                    self._distMap[(first, second)] = metric(
                        vectors[first], vectors[second])

        # The dendrogram stores the merge order.
        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendrogram = Dendrogram(leaves)

        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, Stype, trace)
Ejemplo n.º 9
0
 def __init__(self,
              num_clusters=None,
              normalise=True,
              svd_dimensions=None):
     """
     Initialise the clusterer's state.

     :param num_clusters: stored as ``self._num_clusters``
     :param normalise: forwarded to ``VectorSpaceClusterer.__init__``
     :param svd_dimensions: forwarded to ``VectorSpaceClusterer.__init__``
     """
     VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
     # Starts out as an empty dict.
     self._distMap = {}
     self._groups_values = None
     self._num_clusters = num_clusters
Ejemplo n.º 10
0
 def __init__(self, num_clusters=None, normalise=True,
              svd_dimensions=None):
     """
     Set up an instance with an empty distance map and no group values.

     :param num_clusters: kept on the instance as ``_num_clusters``
     :param normalise: passed on to ``VectorSpaceClusterer.__init__``
     :param svd_dimensions: passed on to ``VectorSpaceClusterer.__init__``
     """
     # Let the base class record the normalisation / SVD settings first.
     VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)

     self._num_clusters = num_clusters

     # Both start out empty.
     self._distMap = {}
     self._groups_values = None