Example #1
0
 def cluster(self):
     """Seed one random Gaussian unit-vector mean per cluster, then run EM.

     Appends the generated means to ``self.means`` and returns the cluster
     assignments produced by ``EMClusterer.cluster`` over ``self.vectors``.
     """
     dimensions = len(self.vectors[0])
     for _ in range(self.numberOfClusters):
         seed_mean = VectorGenerator.getRandomGaussianUnitVector(
             dimensions, 4, 1).values()
         self.means.append(seed_mean)
     em = cluster.EMClusterer(self.means, bias=0.1)
     return em.cluster(self.vectors, True, trace=True)
Example #2
0
def batch_em_cluster(read_directory, write_directory1, write_directory2,
                     cluster_number=8, init_mu=0.1, init_sigma=1.0):
    """EM-cluster every numbered matrix file in *read_directory*.

    For each file ``<i>.txt`` (1-based, one per file found under
    *read_directory*), loads the vector-space matrix, initialises
    *cluster_number* random Gaussian means, runs EM clustering, and writes:

    - the per-row cluster tags to ``write_directory1/<i>.txt``
    - the learned cluster centers to ``write_directory2/<i>.txt``

    Parameters
    ----------
    read_directory : str
        Directory containing ``1.txt`` .. ``N.txt`` matrices (np.loadtxt format).
    write_directory1 : str
        Output directory for cluster-tag files.
    write_directory2 : str
        Output directory for cluster-center files.
    cluster_number : int, optional
        Number of clusters (default 8).
    init_mu : float, optional
        Mean of the random initial cluster centers (default 0.1).
    init_sigma : float, optional
        Std-dev of the random initial cluster centers (default 1.0).
    """
    # NOTE(review): counts files recursively; assumes they are named 1.txt..N.txt
    file_number = sum(
        len(files) for _root, _dirs, files in os.walk(read_directory))

    for i in range(file_number):
        vsm = np.loadtxt(read_directory + '/' + str(i + 1) + '.txt')
        data_dimension = vsm.shape[1]

        # Random Gaussian initial means, one per cluster.
        init_means = [
            init_sigma * np.random.randn(data_dimension) + init_mu
            for _ in range(cluster_number)
        ]

        cluster_model = cluster.EMClusterer(init_means, bias=0.1)

        cluster_tag = cluster_model.cluster(vsm, True, trace=False)

        cluster_tag_to_string = [str(x) for x in cluster_tag]
        # _means is a private attribute of EMClusterer; no public accessor exists.
        center_data = cluster_model._means

        quick_write_list_to_text(cluster_tag_to_string,
                                 write_directory1 + '/' + str(i + 1) + '.txt')
        write_matrix_to_text(center_data,
                             write_directory2 + '/' + str(i + 1) + '.txt')
Example #3
0
def gmm_cluster_docs(docs, nclusters=3, svd_d=5):
    """Cluster documents with a Gaussian mixture model (EM).

    Converts *docs* (a mapping ``doc_id -> [(fname, count), ...]``) into
    numeric count vectors, drops all-zero documents, runs EM clustering
    with random initial means, and classifies each document.

    Returns a 3-tuple ``(dv, classes_by_jid, kmc)``:

    - ``dv``: list of ``(doc_id, feature_vector)`` pairs actually clustered
    - ``classes_by_jid``: dict mapping doc_id to its assigned cluster
    - ``kmc``: the fitted ``EMClusterer`` instance
    """
    import random

    # Python 3 port: the original used dict.iteritems() and xrange(),
    # which do not exist on the Python 3 interpreter the rest of this
    # file targets (print(..., end=' ') is used elsewhere).
    dv = [(doc_id, array([count for fname, count in dfreq]))
          for doc_id, dfreq in docs.items()
          if sum(count for fname, count in dfreq) > 0]

    n_features = len(dv[0][1])
    rand_means = [
        array([random.random() for _ in range(n_features)])
        for _ in range(nclusters)
    ]

    kmc = cluster.EMClusterer(
        rand_means,
        normalise=True)  ## ,svd_dimensions=svd_d) ## svd is horribly

    kmc.cluster([fv for _doc_id, fv in dv])
    classes_by_jid = {doc_id: kmc.classify(fv) for doc_id, fv in dv}
    return dv, classes_by_jid, kmc
Example #4
0
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    vectors = [numpy.array(point) for point in ([0.5, 0.5], [1.5, 0.5], [1, 3])]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:       ', clusters)
    print()

    # Dump the learned parameters of each of the two mixture components.
    for idx in range(2):
        print('Cluster:', idx)
        print('Prior:  ', clusterer._priors[idx])
        print('Mean:   ', clusterer._means[idx])
        print('Covar:  ', clusterer._covariance_matrices[idx])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print('classification_probdist(%s):' % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        probability = pdist.prob(sample) * 100
        print('%s => %.0f%%' % (sample, probability))
Example #5
0
"""
Non-interactive demonstration of the clusterers with simple 2-D data.
"""

from nltk import cluster
import numpy

# example from figure 14.10, page 519, Manning and Schutze

vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
means = [[4, 2], [4, 2.01]]

clusterer = cluster.EMClusterer(means, bias=0.1)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:       ', clusters)
print()

for c in range(2):
    print('Cluster:', c)
    print('Prior:  ', clusterer._priors[c])
    print('Mean:   ', clusterer._means[c])
    print('Covar:  ', clusterer._covariance_matrices[c])
    print()

# classify a new vector
vector = numpy.array([2, 2])
print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))