def demo():
    """
    Non-interactive demonstration of the GAAC clusterer on simple 2-D data.

    Clusters six fixed points, prints the clusterer, the input vectors and
    the value returned by ``cluster()``, shows the dendrogram, and finally
    classifies one more vector.
    """
    from nltk import cluster

    # Six sample points with 2-D coordinates, one numpy array per point.
    coordinates = [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    vectors = [numpy.array(point) for point in coordinates]

    # Run the GAAC clusterer, asking for 4 clusters.
    gaac = cluster.GAAClusterer(4)
    assignments = gaac.cluster(vectors, True)

    # Each line is formatted into a single string first, so the output is
    # the same whether ``print`` is the statement or the function.
    print('Clusterer: %s' % (gaac,))
    print('Clustered: %s' % (vectors,))
    print('As: %s' % (assignments,))
    print('')

    # Show the dendrogram.
    gaac.dendrogram().show()

    # Classify a new vector and report it on one line.
    vector = numpy.array([3, 3])
    heading = 'classify(%s):' % vector
    print('%s %s' % (heading, gaac.classify(vector)))
    print('')
def _parse_contexts(lines):
    """Parse ``word v1 v2 ...`` lines into a list of ``(word, vector)`` pairs.

    Blank lines are skipped, and so is every word whose values are all zero.
    """
    contexts = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Nothing on this line; indexing fields[0] would raise.
            continue
        values = [np.float32(v) for v in fields[1:]]
        if any(values):
            contexts.append((fields[0], np.array(values)))
    return contexts


def _words_by_cluster(words, assignments):
    """Return ``[(cluster_id, sorted unique words), ...]`` ordered by id."""
    # groupby() only merges *adjacent* equal keys, so the pairs must be
    # sorted by cluster id first or one cluster is reported in pieces.
    pairs = sorted(zip(words, assignments), key=itemgetter(1))
    return [
        (cluster_id, sorted(set(word for word, _ in members)))
        for cluster_id, members in groupby(pairs, itemgetter(1))
    ]


def main():
    """Cluster the word contexts in ``small_clusters.dat``, then classify stdin.

    Steps:
      1. Read ``small_clusters.dat`` (one ``word v1 v2 ...`` entry per line);
         print a message and exit with status -1 if it cannot be opened.
      2. Cluster the non-zero context vectors with ``GAAClusterer(1000)``.
      3. Report every cluster id followed by its words, both on stdout and
         in ``resultados_cluster.dat``.
      4. Read whitespace-separated numbers from stdin and print the cluster
         each resulting vector is classified into.
    """
    try:
        with open("small_clusters.dat", "r") as f:
            lines = f.read().splitlines()
    except IOError:
        print("Couldn't find contexts file.")
        sys.exit(-1)

    # Load data to memory, this can take a while
    contexts = _parse_contexts(lines)
    only_values = [vector for _, vector in contexts]
    print("Finish loading files.")

    # Begin clustering
    clusterer = cluster.GAAClusterer(1000)
    clusters = clusterer.cluster(only_values, True)

    report = []
    words = [word for word, _ in contexts]
    for cluster_id, members in _words_by_cluster(words, clusters):
        report.append(str(cluster_id) + ":")
        report.extend(members)

    # The results file used to be opened and closed without being written;
    # it now receives the same report that goes to stdout.
    with open("resultados_cluster.dat", "w") as out:
        out.writelines(entry + "\n" for entry in report)
    for entry in report:
        print(entry)

    print("Terminado procesamiento. Comenzando con input.")
    lin = sys.stdin.read()
    while lin:
        # Build a fresh vector per query: reusing the loader's ``values``
        # list prefixed every query with stale numbers.
        values = [np.float32(v) for v in lin.split()]
        print(clusterer.classify(np.array(values)))
        lin = sys.stdin.read()
def _random_cluster_cloud(number=20, cnumber=5, csize=10):
    """Build a synthetic 3-D point cloud made of several Gaussian blobs.

    The first blob has ``number`` points; each of the remaining
    ``cnumber - 1`` blobs gets a random size clamped to
    ``[5, number / 2]`` and a random centre scaled by ``csize``.
    """
    rnum = np.random.rand(cnumber, 2)
    sizes = (rnum[:, 0] * number).astype(int)
    sizes[sizes < 5] = 5
    sizes[sizes > number / 2.] = round(number / 2., 0)

    cloud = np.random.randn(number, 3) * csize
    # Random multipliers for the central point of each extra blob.
    centres = np.random.randn(cnumber - 1, 3)
    for i in range(cnumber - 1):
        blob = np.random.randn(sizes[i + 1], 3)
        cloud = np.vstack([cloud, blob + centres[i] * csize])
    return cloud


def _plot_dendrogram_heatmap(points):
    """Save two dendrograms plus the reordered distance matrix as a PDF."""
    # linkage() wants the *condensed* distance vector; given the square
    # matrix it would treat each row as an observation instead.
    condensed = pdist(points[:, 0:2])
    D = squareform(condensed)

    # Compute and plot first dendrogram.
    fig = plt.figure(figsize=(8, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y1 = hy.linkage(condensed, method='complete')
    cutoff = 0.3 * np.max(Y1[:, 2])
    Z1 = hy.dendrogram(Y1, orientation='right', color_threshold=cutoff)
    ax1.xaxis.set_visible(False)
    ax1.yaxis.set_visible(False)

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Y2 = hy.linkage(condensed, method='average')
    cutoff = 0.3 * np.max(Y2[:, 2])
    Z2 = hy.dendrogram(Y2, color_threshold=cutoff)
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)

    # Plot distance matrix, rows/columns reordered to match the leaves.
    ax3 = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    D = D[Z1['leaves'], :]
    D = D[:, Z2['leaves']]
    ax3.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    ax3.xaxis.set_visible(False)
    ax3.yaxis.set_visible(False)

    # ``bbox_inches`` is the savefig keyword for a tight bounding box.
    fig.savefig('scipy_352_ex1.pdf', bbox_inches='tight')


def createCluster(data, cltype):
    """Vectorise ``data`` and run the clustering demo selected by ``cltype``.

    Parameters:
        data: iterable of rows whose first item is a dict of features.
            The values of 'hasPrivacy', 'hasDeveloperEmail' and
            'hasDeveloperWebsite' are coerced to 0/1; all other values are
            used as they are.  Further row items (such as a label) are
            ignored.
        cltype: 'GAAC' clusters the vectors into 4 groups and shows the
            dendrogram; 'kmeans' runs k-means with 3 centroids and prints
            the assignments, centroids and distortion; 'hy' ignores
            ``data`` and writes 'scipy_352_ex1.pdf' from a synthetic point
            cloud.  Any other value does nothing further.

    Returns:
        None.  Results are printed, shown or saved.
    """
    boolean_features = ('hasPrivacy', 'hasDeveloperEmail',
                        'hasDeveloperWebsite')
    vectors = []
    for row in data:
        rowval = []
        for key, value in row[0].items():
            if key in boolean_features:
                value = int(bool(value))
            rowval.append(value)
        vectors.append(np.array(rowval))
    pprint(vectors)
    matrix = np.vstack(vectors)

    if cltype == 'GAAC':
        clusterer = cluster.GAAClusterer(num_clusters=4)
        clusterer.cluster(vectors, True)
        clusterer.dendrogram().show()
    elif cltype == 'kmeans':
        centroids, variance = vq.kmeans(matrix, 3)
        identified, distance = vq.vq(matrix, centroids)
        print(identified)
        print(centroids)
        print(variance)
    elif cltype == 'hy':
        # Generate a cluster of clusters and plot it.
        _plot_dendrogram_heatmap(_random_cluster_cloud())
# Report the means of the clusterer built earlier in the script.
print('Means: %s' % (clusterer.means(),))
#print vectors

# classify a new vector
#vector = array([3, 3])
#print 'classify(%s):' % vector,
#print clusterer.classify(vector)
#print
#"""

print("GAAC Clustering")

# use a set of tokens with 2D indices
#vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# Run the GAAC clusterer with 2 clusters on the existing ``vectors``.
clusterer = cluster.GAAClusterer(2)
clusters = clusterer.cluster(vectors, True)

print('Clusterer: %s' % (clusterer,))
#print 'Clustered:', vectors
print('As:')  # clusters

# Number the entries starting from 2, one per output line.
i = 2
for clst in clusters:
    print('%s %s' % (i, clst))
    i += 1

# show the dendrogram
#print "The dendogram"
#clusterer.dendrogram().show()

# classify a new vector
#print "Classify the vector [3,3]"