import time
from nltk import cluster
from nltk.cluster import euclidean_distance, cosine_distance


def cluster_kmeans(vectors, num_clusters, distance_metric="cosine"):
    """
    Takes in vectors and clusters them using KMeans clustering.

    Inputs:
        vectors         -- matrix containing rows of vectors
        num_clusters    -- number of clusters to create
        distance_metric -- distance measure between vectors (default "cosine")
    """
    print "Starting KMeans clustering"
    start_time = time.time()

    # initialize the clusterer with the requested distance metric
    if distance_metric == "euclidean":
        clusterer = cluster.KMeansClusterer(num_clusters, euclidean_distance)
    elif distance_metric == "cosine":
        clusterer = cluster.KMeansClusterer(num_clusters, cosine_distance)
    else:
        raise ValueError("unsupported distance metric: %s" % distance_metric)

    assignment = clusterer.cluster(vectors, True)
    end_time = time.time()
    print "Clustering required", (end_time - start_time), "seconds"
    return assignment
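# A minimal usage sketch for cluster_kmeans (an illustration, not from the
# original source; the toy vectors below are made up):
import numpy

if __name__ == "__main__":
    toy_vectors = [numpy.array(f) for f in [[1, 0], [0, 1], [5, 5], [6, 4]]]
    print cluster_kmeans(toy_vectors, 2, distance_metric="euclidean")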
def demo():
    # example from figure 14.9, page 517, Manning and Schutze
    from nltk import cluster

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)

    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    # classify a new vector
    vector = numpy.array([3, 3])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)
    print
def kmeans_note_fvs(keys, nfvs, n=10, metric=cluster.euclidean_distance):
    # turn feature-value dicts into dense vectors, one entry per key
    # (the loop variable must not be 'n', which would shadow the cluster count)
    vectors = []
    for fv in nfvs:
        vectors.append(numpy.array([fv.get(k, 0) for k in keys]))
    clusterer = cluster.KMeansClusterer(n, metric)
    clusterer.cluster(vectors, True)
    return clusterer
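# Hypothetical usage sketch for kmeans_note_fvs (not from the original
# source): nfvs is assumed to be a list of {feature_name: value} dicts and
# keys the shared, ordered feature vocabulary.
note_keys = ["pitch", "duration", "velocity"]
note_fvs = [{"pitch": 60, "duration": 2},
            {"pitch": 64, "velocity": 80},
            {"pitch": 62, "duration": 1}]
note_clusterer = kmeans_note_fvs(note_keys, note_fvs, n=2)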
def cluster(self, assignAndReturnDetails=False, numberOfTopFeatures=5,
            algorithmSource='nltk', **kwargs):
    bestFeatures, error = {}, None
    if algorithmSource == 'nltk':
        clusterer = cluster.KMeansClusterer(self.numberOfClusters,
                                            euclidean_distance, **kwargs)
        clusters = clusterer.cluster(self.vectors, True)
        means = clusterer.means()
        for id, mean in zip(clusterer.cluster_names(), means):
            bestFeatures[id] = [
                (dimension, score)
                for dimension, score in sorted(
                    zip([self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i)
                         for i in range(len(mean))], mean),
                    key=itemgetter(1), reverse=True)[:numberOfTopFeatures]
                if score > 0
            ]
    elif algorithmSource == 'biopython':
        from Bio.Cluster import kcluster, clustercentroids
        clusters, error, _ = kcluster(self.vectors,
                                      nclusters=self.numberOfClusters,
                                      npass=kwargs['repeats'])
        means, _ = clustercentroids(self.vectors, self.masks, clusters)
        means = [unitVector(c) for c in means]
        for id, mean in zip(range(len(means)), means):
            bestFeatures[id] = [
                (dimension, score)
                for dimension, score in sorted(
                    zip([self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i)
                         for i in range(len(mean))], mean),
                    key=itemgetter(1), reverse=True)[:numberOfTopFeatures]
                if score > 0
            ]
    if assignAndReturnDetails:
        # sort by cluster id so groupby collects complete groups
        documentAssignments = sorted(
            [(docId, clusterId)
             for docId, clusterId in zip(self.docIds, clusters)],
            key=itemgetter(1))
        clusters = dict(
            (clusterId, [t[0] for t in documents])
            for clusterId, documents in groupby(documentAssignments,
                                                key=itemgetter(1)))
        return {'clusters': clusters, 'bestFeatures': bestFeatures,
                'error': error}
    return clusters
def km_cluster_docs(docs, nclusters=3, normalise=True,
                    distance=cluster.cosine_distance, svd_d=None):
    # first convert to numeric vectors, dropping all-zero documents
    dv = [(id, array([count for fname, count in dfreq]))
          for id, dfreq in docs.iteritems()
          if sum([count for fname, count in dfreq]) > 0]
    n_features = len(dv[0][1])
    rand_means = [array([random.random() for i in xrange(n_features)])
                  for j in xrange(nclusters)]
    if svd_d is not None:
        kmc = cluster.KMeansClusterer(nclusters, distance=distance,
                                      normalise=normalise,
                                      svd_dimensions=svd_d,
                                      initial_means=rand_means)  ## svd is horribly
    else:
        kmc = cluster.KMeansClusterer(nclusters, distance=distance,
                                      normalise=normalise,
                                      initial_means=rand_means)
    print "Documents: ", len(dv)
    for dv_ in dv:
        print dv_[1]
    kmc.cluster([dv_[1] for dv_ in dv])
    #print kmc.cluster(dv.values())
    classes_by_jid = dict([(id, kmc.classify(fv)) for id, fv in dv])
    return dv, classes_by_jid, kmc
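# Hypothetical usage sketch for km_cluster_docs (not from the original
# source): docs is assumed to map a document id to a list of
# (feature_name, count) pairs, matching the conversion above.
example_docs = {
    'doc1': [('cat', 2), ('dog', 0), ('fish', 1)],
    'doc2': [('cat', 0), ('dog', 3), ('fish', 1)],
    'doc3': [('cat', 1), ('dog', 1), ('fish', 0)],
}
dv, classes_by_jid, kmc = km_cluster_docs(example_docs, nclusters=2)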
def cluster_things(keys_to_use, gold_standard="normal", make_pickle=False):
    # Open the CSV file
    vectors = []
    gold_filter = []
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row_values = []
            gold_filter += [int(row['book_id'])]
            for key in row:
                if key != 'book_id' and key in keys_to_use:
                    row_values += [float(row[key])]
            vectors += [row_values]

    gold_clusters = []
    if gold_standard == "normal":
        gold_clusters = get_gold_standard(gold_filter)
    else:
        gold_clusters = get_kincaid_cluster(gold_filter)

    vectors = [array(f) for f in vectors]
    clusterer = cluster.KMeansClusterer(len(gold_clusters), euclidean_distance)
    clusters = clusterer.cluster(vectors, True)

    if make_pickle:
        # binary mode works for any pickle protocol
        pickle.dump(clusterer, open(PICKLE_FILE, 'wb'))

    # Attempt to classify the things again, so we know which vector they belong to
    results = []
    for i in range(0, len(gold_clusters)):
        results += [[]]
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row_values = []
            for key in row:
                if key != 'book_id' and key in keys_to_use:
                    row_values += [float(row[key])]
            # classify expects a single vector, not a list of 0-d arrays
            results[clusterer.classify(array(row_values))] += [row]

    book_ids = []
    for i, c in enumerate(results):
        t = []
        for row in c:
            t += [int(row['book_id'])]
        book_ids += [t]

    # Open the source files and find the correct things
    return score_clusters(gold_clusters, book_ids)
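# Hypothetical counterpart to the make_pickle branch above (not from the
# original source): reload the saved clusterer and classify one new feature
# row; the three feature values are placeholders.
saved_clusterer = pickle.load(open(PICKLE_FILE, 'rb'))
cluster_id = saved_clusterer.classify(array([0.5, 1.2, 0.0]))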
def go_cluster(topic_rows):
    print "Clustering", len(topic_rows), "Topics"
    topics = [t[1] for t in topic_rows]
    print "Getting topic words"
    words = get_words(topics)
    print "Vectorizing topics"
    vectorized = [vectorspaced(topic, words) for topic in topics]
    k = len(topics) / 3
    c = cluster.KMeansClusterer(k, cutil.euclidean_distance,
                                avoid_empty_clusters=True)
    print "Clustering into", k
    res = c.cluster(vectorized, assign_clusters=True, trace=False)
    #print res
    print "Clustering done, gathering"
    clusters = {}
    output_clusters = {}
    for (t, tid), cluster_id in zip(topic_rows, res):
        if cluster_id not in output_clusters:
            output_clusters[cluster_id] = []
            clusters[cluster_id] = []
        output_clusters[cluster_id].append(t)
        clusters[cluster_id].append(tid)
    #pprint.pprint(clusters)
    print "Saving clusters"
    save_clusters(output_clusters)
    return []
fig = pylab.figure(figsize=(100, 100))
linkageMatrix = hier.linkage(distSquareMatrix, method='ward')
dendro = hier.dendrogram(linkageMatrix, orientation='left', labels=scriptlist)
#fig.show()
fig.savefig('dendrogram.png')

print '\nlinkage matrix:'
print linkageMatrix
print '\ndendrogram:'
print dendro

answer = []
vectors = [array(f) for f in data]
clusterer = cluster.KMeansClusterer(8, euclidean_distance, repeats=10,
                                    avoid_empty_clusters=True)
print '\nK-means results using NLTK:'
answer = clusterer.cluster(vectors, True)
for j in range(8):
    print '\n cluster:'
    print j
    for i in range(len(answer)):
        if answer[i] == j:
            print scriptlist[i]

# classify a new vector
def main():
    print("good")
    #df = pd.read_csv("D:\\\document_vector.csv", delimiter=',')
    #print(len(df))
    '''
    my_randoms = []
    for i in range(6797):
        my_randoms.append(random.randrange(1, 101, 1))
    print(my_randoms)
    '''

    # extract the content column
    title_list = []
    content_list = []
    show_list = []
    content = "D:\\NTUST\\人工智慧\\final\\csv\\headfile.csv"
    content_file = open(content, 'r', encoding='utf-8')
    content_filecsvCursor = csv.reader(content_file)
    next(content_filecsvCursor, None)  # skip the headers
    for row in content_filecsvCursor:
        #print(row[0]+"~~"+row[1])
        #title_list.append(row[3])
        #content_list.append(row[5])
        # wrap the content every 25 characters for the hover text
        context = re.findall("(.{1,25})", row[5])
        rebuildcontext = ""
        for w in context:
            rebuildcontext += (w + "<br>")
        show_list.append(row[0] + "<br>" + row[3] + "<br>" + rebuildcontext)
        #print(rebuildcontext)
        #exit()

    # read document_vector.csv
    fileName = "D:\\NTUST\\人工智慧\\final\\csv\\document_vector.csv"
    file = open(fileName, 'r', encoding='utf-8')
    filecsvCursor = csv.reader(file)
    high_dim_data = []
    for row in filecsvCursor:
        rowlist = list(row)
        #print(rowlist)
        temp = []
        count = 0
        for item in rowlist:
            if count >= 2:
                temp.append(float(item))
                #print(float(item))
            count += 1
        high_dim_data.append(temp)

    # reduce to 2 dimensions with PCA
    pca = PCA(n_components=2)
    newData = pca.fit_transform(high_dim_data)
    print(newData)
    lx = [x for x, y in newData]
    print(lx)
    ly = [y for x, y in newData]
    print(ly)

    # Create a trace
    trace = Scatter(x=lx, y=ly, mode='markers',
                    marker=dict(size=10, color='rgba(255, 182, 193, .9)',
                                line=dict(width=2)),
                    text="good")
    data = [trace]
    #plotly.offline.plot(data)

    vectors = [array(f) for f in newData]

    # test k-means using the cosine distance metric, 5 means and repeat
    # clustering 10 times with random seeds
    k = 5
    clusterer = cluster.KMeansClusterer(k, cosine_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', end="")
    print(vectors)
    print('As:', end="")
    print(clusters)
    print('Means:', end="")
    print(clusterer.means())

    totaldata = array(vectors)
    print(totaldata)
    print(type(totaldata))
    print(type(totaldata[0]))
    label = array(clusters)
    print(label)
    center = [f.tolist() for f in clusterer.means()]

    # pick a random colour for each cluster
    trace_set = []
    colornow = []
    for i in range(k):
        #colornow.append(random.random(0,255))
        r = random.randrange(0, 255)
        g = random.randrange(0, 255)
        b = random.randrange(0, 255)
        colornow.append('rgba(' + str(r) + ', ' + str(g) + ', ' + str(b) + ', .9)')
    for i in range(k):
        ds = totaldata[np.where(label == i)]
        trace_now = Scatter(x=ds[:, 0], y=ds[:, 1], mode='markers',
                            marker=dict(size=10, color=colornow[i],
                                        line=dict(width=2)),
                            text=show_list)
        trace_set.append(trace_now)

    centerx = [x for x, y in center]
    print(lx)
    centery = [y for x, y in center]
    center_trace = Scatter(x=centerx, y=centery, mode='markers',
                           marker=dict(size=10, color="rgba(0, 0, 0, 1)",
                                       line=dict(width=30)))
    trace_set.append(center_trace)
    plotly.offline.plot(trace_set)
def BOW(document):
    #print type(document)
    document = nltk.Text(tweet_corpus.words(document))
    word_counts = []
    for word in unique_terms:
        word_counts.append(document.count(word))
    #print word_counts
    return word_counts


vectors = [numpy.array(BOW(f)) for f in tweet_corpus.fileids()]
print "Vectors created."
print "First 10 words are", unique_terms[:10]
print "First 10 counts for first document are", vectors[0][0:10]

CLUSTERS = 2
kmeans_clusterer = cluster.KMeansClusterer(CLUSTERS, cosine_distance)
print "Starting Clustering"
# assign_clusters=False, so cluster() only learns the means; per-document
# assignments come from classify() below
clusters = kmeans_clusterer.cluster(vectors, assign_clusters=False, trace=False)
#print 'Clustered:', vectors
#print 'As:', clusters
print "Number of clusters: ", kmeans_clusterer.num_clusters()
print "Means:", kmeans_clusterer.means()
print "Cluster names: ", str(kmeans_clusterer.cluster_names())

# Go through the docs in the same order as we did when we created feature vectors
# Create a dict of tweet => cluster ID
cluster_dict = {}
for i, fileid in enumerate(tweet_corpus.fileids()):
    cluster_dict[fileid] = kmeans_clusterer.classify(vectors[i])
#not sure why it is needed but value is not actually
#used. just needed to push down values to right spot
#once added to articledf
categories = pd.read_csv('categoriesIndex.txt')

fileIN = 'Output2.txt'
featwords = np.genfromtxt(fileIN, dtype='U300',
                          converters={0: lambda x: x.decode()})
articledf = pd.DataFrame(data=datamat[:, 1:], index=datamat[:, 0],
                         columns=featwords[1:])

clusterer = cluster.KMeansClusterer(6, euclidean_distance, repeats=1)
results = clusterer.cluster(datamat2, True)
means1 = clusterer.means()

clusterer2 = cluster.KMeansClusterer(6, cosine_distance, repeats=1)
results2 = clusterer2.cluster(datamat2, True)
means2 = clusterer2.means()

clusterer3 = cluster.KMeansClusterer(6, spatial.distance.jaccard, repeats=1,
                                     avoid_empty_clusters=True, conv_test=1)
results3 = clusterer3.cluster(datamat2, True)
means3 = clusterer3.means()
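# KMeansClusterer accepts any callable that takes two vectors and returns a
# distance, as the scipy jaccard metric above shows. A minimal sketch of a
# custom metric (hypothetical, not from the original source):
def manhattan_distance(u, v):
    return np.sum(np.abs(u - v))

clusterer4 = cluster.KMeansClusterer(6, manhattan_distance, repeats=1,
                                     avoid_empty_clusters=True)
results4 = clusterer4.cluster(datamat2, True)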
#clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means)
#clusters = clusterer.cluster(vectors, True, trace=True)
#print 'Clustered:', vectors
#print 'As:', clusters
#print 'Means:', clusterer.means()
#print

#vectors = []
#vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# test k-means using the euclidean distance metric and 2 means
clusterer = cluster.KMeansClusterer(2, euclidean_distance,
                                    avoid_empty_clusters=True)
clusters = clusterer.cluster(vectors, True)
#print 'Clustered:', vectors
print 'As:'  # clusters
i = 2
for clst in clusters:
    print i, clst
    i = i + 1
print 'Means:', clusterer.means()
#print vectors

# classify a new vector
#vector = array([3, 3])
#print 'classify(%s):' % vector,
from numpy import array
from nltk import cluster
from nltk.cluster import euclidean_distance

vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10)
print clusterer.cluster(vectors, True)
def cluster(self, feats_file, num_training, docs_file):
    """
    feats_file: file containing features
    num_training: number of random data points from the file to use for
        training. Because of limited computation power we have to train
        our clusters on a subset of the data.
    docs_file: file containing all original data with their ids.
    """
    lines = [line.strip() for line in open(feats_file)]
    feats = []
    for line in lines:
        parts = line.split()
        v = [float(x) for x in parts[1].split(",")]
        feats.append(v)
    feats = np.array(feats)

    # use nltk clustering because it has cosine distance
    self.clusterer = cluster.KMeansClusterer(self.n_clusters,
                                             cosine_distance,
                                             repeats=3,
                                             avoid_empty_clusters=True)
    # randomly select num_training data points
    self.clusters = self.clusterer.cluster(
        feats[np.random.choice(feats.shape[0], num_training,
                               replace=False), :], False, True)

    P = []
    for i in range(feats.shape[0]):
        p = self.clusterer.classify(feats[i, :])
        P.append(p)

    # load the docs
    lines = [line.strip() for line in open(docs_file)]
    docs = []
    for line in lines:
        docs.append(line)

    clean_docs = []
    # pass the docs through the pipeline to remove stopwords etc.
    for doc in docs:
        v = []
        words = self.pattern.sub(" ", doc.lower()).split()
        for word in words:
            if word in self.dictionary.token2id:
                v.append(word)
        #if (len(v) > 0):
        clean_docs.append(v)

    # find which words correspond to each cluster
    words_cluster = {}
    cluster_counter = {}
    for itr in range(len(clean_docs)):
        cn = P[itr]
        words = clean_docs[itr]
        if cn not in cluster_counter:
            cluster_counter[cn] = 1
        cluster_counter[cn] += 1
        if cn not in words_cluster:
            words_cluster[cn] = {}
        for w in words:
            if w not in words_cluster[cn]:
                words_cluster[cn][w] = 1
            else:
                words_cluster[cn][w] += 1

    # find a label for each cluster
    cluster2word = {}
    for cn in words_cluster:
        sorted_words = []
        for w in sorted(words_cluster[cn], key=words_cluster[cn].get,
                        reverse=True):
            sorted_words.append(w)
        cluster2word[cn] = sorted_words

    self.main_topics_id = []
    for i in sorted(cluster_counter, key=cluster_counter.get, reverse=True):
        self.main_topics_id.append(i)

    self.cluster_names = {}
    for ci in cluster2word:
        v = cluster2word[ci][:4]
        #print(ci, " --- ", v)
        self.cluster_names[ci] = "-".join(v)

    print("TOP TOPICS")
    for i in range(10):
        tid = self.main_topics_id[i]
        print(tid, " --- ", self.cluster_names[tid])
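# A sketch of the feats_file line format the parser above assumes (an
# illustration, not from the original source): an id token, whitespace,
# then a comma-separated feature vector.
#
#   doc_001 0.12,0.00,0.87,0.33
#   doc_002 0.05,0.44,0.10,0.91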
del v[5]   #remove ss
del v[-2]  #remove aidf
del v[-3]  #remove aidf
del v[1]
heads.append(tokens[0])
heads2values[tokens[0]] = v
values.append(v)

#print len(heads)
#print len(values)
print(key)
#print heads
#print values

vectors = [array(f) for f in values]
clusterer = cluster.KMeansClusterer(3, euclidean_distance)
clusters = clusterer.cluster(vectors, True)
print(clusters)
#print len(clusters)
#print vectors

cluster2head = defaultdict(list)
for i in range(0, len(clusters)):
    head = heads[i]
    cl = clusters[i]
    cluster2head[cl].append(head)

for cl in list(cluster2head.keys()):
    print("Cluster: {0}".format(cl))
    print("{0:31} {1}".format("head", key))
    for head in cluster2head[cl]:
        print("{0:31} {1}".format(head, heads2values[head]))
#generating the vectors for each text
tweetsBoW = [None] * len(tweets)
for tweet in range(0, len(tweets)):
    tweetsBoW[tweet] = numpy.zeros(len(vocab))
    for w in tweets[tweet]:
        for i, word in enumerate(vocab):
            if word == w:
                tweetsBoW[tweet][i] += 1

#clustering
nClustersStr = input("---> How many clusters do you want to use? ")
nClusters = int(nClustersStr)
kmeans = cluster.KMeansClusterer(nClusters, cosine_distance,
                                 avoid_empty_clusters=True, conv_test=1e-4)
clusters = kmeans.cluster(tweetsBoW, True, trace=True)
for doc, cls in zip(unchangedTweets, clusters):
    print(cls, doc)

#plotting clusters number of elements for analysis
labels = list(Counter(clusters))
print(labels)
values = list(Counter(clusters).values())
print(values)
y_pos = numpy.arange(len(labels))
plt.bar(y_pos, values, color=(0.5, 0.1, 0.5, 0.6))
plt.title('Number of clusters in total = ' + str(nClusters))
plt.xlabel('Clusters')
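# A faster variant of the bag-of-words loop above (a sketch, not from the
# original source): index the vocabulary once instead of scanning it for
# every token.
word_index = {word: i for i, word in enumerate(vocab)}
tweetsBoW = [numpy.zeros(len(vocab)) for _ in tweets]
for t, tweet in enumerate(tweets):
    for w in tweet:
        if w in word_index:
            tweetsBoW[t][word_index[w]] += 1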