def demo():
    """
    Non-interactive demonstration of the GAAC clusterer with simple 2-D data.

    Clusters six hard-coded 2-D vectors into four groups, prints the
    clusterer, the input vectors and the cluster assigned to each vector,
    shows the resulting dendrogram, and finally classifies one extra vector.
    """

    from nltk import cluster

    # use a set of tokens with 2D indices
    vectors = [
        numpy.array(f)
        for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    ]

    # test the GAAC clusterer with 4 clusters
    clusterer = cluster.GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    # Every line is formatted into a single string before printing so that
    # print(...) emits the same text whether it is the Python 2 statement
    # (parenthesised expression) or the Python 3 function.
    print('Clusterer: %s' % (clusterer,))
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (clusters,))
    print('')

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s): %s' % (vector, clusterer.classify(vector)))
    print('')
Beispiel #2
0
def main():
    """
    Cluster the word vectors in ``small_clusters.dat`` and classify stdin input.

    Each non-blank line of ``small_clusters.dat`` is a word followed by
    whitespace-separated numeric components. Words whose vector is entirely
    zero are skipped. The remaining vectors are clustered with a GAAC
    clusterer; the words of every cluster are printed and also written to
    ``resultados_cluster.dat``. Afterwards, whitespace-separated numbers are
    read from standard input and the cluster they are classified into is
    printed.

    Exits with status -1 if the input file cannot be opened.
    """
    try:
        with open("small_clusters.dat", "r") as f:
            lines = f.read().splitlines()
    except IOError:
        print("Couldn't find contexts file.")
        sys.exit(-1)

    # Load data to memory, this can take a while
    contexts = []
    for line in lines:
        fields = line.split()
        if not fields:
            # A blank line carries no word; indexing fields[0] would fail.
            continue
        values = [np.float32(v) for v in fields[1:]]
        if any(values):
            contexts.append((fields[0], np.array(values)))
    only_values = [vector for _, vector in contexts]
    print("Finish loading files.")

    # Begin clustering
    clusterer = cluster.GAAClusterer(1000)
    clusters = clusterer.cluster(only_values, True)
    final_clusters = [
        (word, cluster_id)
        for (word, _), cluster_id in zip(contexts, clusters)
    ]

    # groupby only merges *adjacent* items with equal keys, so the pairs must
    # be sorted by cluster id first or one cluster is reported in fragments.
    by_cluster = itemgetter(1)
    ordered = sorted(final_clusters, key=by_cluster)
    with open("resultados_cluster.dat", "w") as out:
        for clus, group in groupby(ordered, by_cluster):
            header = str(clus) + ":"
            print(header)
            out.write(header + "\n")
            # Collect every word of the group before printing; duplicates
            # are collapsed and the order is made deterministic.
            for w in sorted(set(word for word, _ in group)):
                print(w)
                out.write(w + "\n")

    print("Terminado procesamiento. Comenzando con input.")
    lin = sys.stdin.read()
    while lin:
        tokens = lin.split()
        if tokens:
            # Build the query vector from this input only; it must not be
            # appended to the values left over from loading the file.
            query = np.array([np.float32(v) for v in tokens])
            print(clusterer.classify(query))
        lin = sys.stdin.read()
Beispiel #3
0
def _random_cluster_of_clusters(number=20, cnumber=5, csize=10):
    """Return random 3-D points: one broad cloud plus ``cnumber - 1`` blobs.

    The broad cloud has ``number`` standard-normal points scaled by
    ``csize``. Each extra blob holds between 5 and ``number / 2`` unscaled
    standard-normal points shifted to a random centre (a standard-normal
    offset scaled by ``csize``). The result has one row per point.
    """
    # Note that the way the clusters are positioned is Gaussian randomness.
    # Two columns are drawn (only the first is used) so the sequence of
    # random numbers consumed stays the same as before.
    rnum = np.random.rand(cnumber, 2)
    rn = (rnum[:, 0] * number).astype(int)
    rn[rn < 5] = 5
    rn[rn > number / 2.] = round(number / 2., 0)

    cls = np.random.randn(number, 3) * csize

    # Random multipliers for central point of cluster
    rxyz = np.random.randn(cnumber - 1, 3)
    for i in range(cnumber - 1):
        blob = np.random.randn(rn[i + 1], 3) + rxyz[i] * csize
        cls = np.vstack([cls, blob])
    return cls


def _plot_dendrogram_heatmap(points, filename='scipy_352_ex1.pdf'):
    """Save a figure with two dendrograms and the reordered distance matrix.

    Pairwise distances use only the first two columns of ``points``.
    """
    # linkage() expects the condensed distance vector; a square matrix would
    # be interpreted as a set of observation vectors instead of distances.
    condensed = pdist(points[:, 0:2])
    D = squareform(condensed)

    # Compute and plot first dendrogram.
    fig = plt.figure(figsize=(8, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y1 = hy.linkage(condensed, method='complete')
    cutoff = 0.3 * np.max(Y1[:, 2])
    Z1 = hy.dendrogram(Y1, orientation='right', color_threshold=cutoff)
    ax1.xaxis.set_visible(False)
    ax1.yaxis.set_visible(False)

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Y2 = hy.linkage(condensed, method='average')
    cutoff = 0.3 * np.max(Y2[:, 2])
    Z2 = hy.dendrogram(Y2, color_threshold=cutoff)
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)

    # Plot distance matrix, rows/columns ordered like the dendrogram leaves.
    ax3 = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    D = D[Z1['leaves'], :]
    D = D[:, Z2['leaves']]
    ax3.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    ax3.xaxis.set_visible(False)
    ax3.yaxis.set_visible(False)

    # Save the figure; 'bbox_inches' is the savefig keyword for tight bounds.
    fig.savefig(filename, bbox_inches='tight')


def createCluster(data, cltype):
    """Build feature vectors from ``data`` and run the requested clustering.

    Parameters:
        data: iterable of rows where ``row[0]`` is a dict mapping feature
            names to values and ``row[1]`` is a label (``'unfair'`` marks a
            negative example). The values of ``hasPrivacy``,
            ``hasDeveloperEmail`` and ``hasDeveloperWebsite`` are reduced to
            0/1; all other values are used as they are.
        cltype: ``'GAAC'`` clusters the vectors into 4 groups and shows the
            dendrogram; ``'kmeans'`` runs scipy k-means with k=3 and prints
            the assignments, centroids and distortion; ``'hy'`` ignores
            ``data`` and saves a dendrogram/heat-map figure of randomly
            generated points to ``scipy_352_ex1.pdf``. Any other value only
            builds and pretty-prints the vectors.

    Returns:
        None. Results are printed, shown or saved to disk.
    """
    # pprint(data)

    boolean_features = ('hasPrivacy', 'hasDeveloperEmail',
                        'hasDeveloperWebsite')
    vectors = []
    # One label per row, collected across the whole loop. The labels are not
    # used by any of the (unsupervised) clustering branches below.
    labels = []

    for row in data:
        labels.append(row[1] != 'unfair')

        rowval = []
        # NOTE(review): columns follow dict iteration order; this assumes
        # every row's dict yields its keys in the same order - confirm.
        for k, v in row[0].items():
            if k in boolean_features:
                v = int(bool(v))
            rowval.append(v)

        vectors.append(np.array(rowval))

    pprint(vectors)

    if cltype == 'GAAC':
        clusterer = cluster.GAAClusterer(num_clusters=4)
        clusterer.cluster(vectors, True)
        clusterer.dendrogram().show()
    elif cltype == 'kmeans':
        # Stack only here: it is the one branch that needs a 2-D matrix.
        matrix = np.vstack(vectors)
        centroids, variance = vq.kmeans(matrix, 3)
        identified, _ = vq.vq(matrix, centroids)

        print(identified)
        print(centroids)

        print(variance)

    elif cltype == 'hy':
        # Generate a cluster of clusters and plot its dendrograms.
        _plot_dendrogram_heatmap(_random_cluster_of_clusters())
Beispiel #4
0
# Report the means of the clusterer built earlier in this script, then
# re-cluster the same vectors with GAAC. Output lines are formatted into a
# single string so print(...) behaves the same on Python 2 and Python 3.
print('Means: %s' % (clusterer.means(),))
#print vectors

# classify a new vector
#vector = array([3, 3])
#print 'classify(%s):' % vector,
#print clusterer.classify(vector)
#print

#"""
print("GAAC Clustering")
# use a set of tokens with 2D indices
#vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# test the GAAC clusterer with 2 clusters
clusterer = cluster.GAAClusterer(2)
clusters = clusterer.cluster(vectors, True)

print('Clusterer: %s' % (clusterer,))
#print 'Clustered:', vectors
print('As:')  # clusters
# Numbering of the printed assignments starts at 2.
for i, clst in enumerate(clusters, 2):
    print('%s %s' % (i, clst))
# show the dendrogram
#print "The dendogram"
#clusterer.dendrogram().show()

# classify a new vector
#print "Classify the vector [3,3]"