import numpy
from nltk.cluster import GAAClusterer
from nltk.cluster.util import cosine_distance


def Clustering(orig, minclusters, maxclusters):
    '''returns (distortion score, number of clusters, cluster assignment)'''
    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []
    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig:
        wb -= cosine_distance(vector, centroid)
    lowerbound = minclusters
    if lowerbound < 2:
        lowerbound = 2
    for k in range(lowerbound, maxclusters + 1):
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig:
            maxcos = None
            for j in range(k):
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if maxcos is None or cdist > maxcos[0]:
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb / (k - 1)) / (ww / (len(orig) - k)), k, gaac))
    # pick the k where the change in the VRC score levels off
    # (smallest second difference between consecutive scores)
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1):
        dist = (vrc[k + 1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k - 1][0])
        if dist < khat[0]:
            khat = (dist, vrc[k][1], vrc[k][2])
    return khat
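# Hypothetical call of Clustering() on the 2-D toy vectors also used by the demo()
# snippet further below; assumes the imports shown above are in scope.
toy_vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
score, n_clusters, assignment = Clustering(toy_vectors, 2, 5)
print(n_clusters, assignment)  # chosen number of clusters and one label per input vector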
def extract_tweets_citedby_graph(df):
    global stemmer_func, words, stopwords
    stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
    stopwords = set(nltk.corpus.stopwords.words('english'))
    words = get_words(df[2].values)
    # pp.pprint(words[:10])

    # K-Means clustering:
    # cluster = KMeansClusterer(7, euclidean_distance, avoid_empty_clusters=True)
    # GAAClusterer
    cluster = GAAClusterer(21)
    cluster.cluster([vectorspaced(title) for title in df[2].values if title], True)
    classified_examples = [cluster.classify(vectorspaced(title)) for title in df[2].values]
    # for cluster_id, title in sorted(zip(classified_examples, df[2].values)):
    #     # print "{}\t{}\t{}\n".format(cluster_id, df[0].loc[df[2] == title].values, df[1].loc[df[2] == title].values)
    #     print "{}\t{}\t{}".format(cluster_id, df[1].loc[df[2] == title].values, title)

    # Display clusters / write to disk
    with open('Results/clustered_relevant_users.tsv', 'w') as f:
        for cluster_id, title in sorted(zip(classified_examples, df[2].values)):
            if cluster_id > 6:
                # save: docid tab userids
                f.write('{}\t{}\n'.format(df[0].loc[df[2] == title].values,
                                          df[1].loc[df[2] == title].values))
    if os.path.exists('Results/clustered_relevant_users.tsv'):
        print 'file saved: Results/clustered_relevant_users.tsv'
    return
def get_word_clusters(tweets):
    all_words = set()
    for tweet in tweets:
        for word in get_words(tweet[HEADER_DICT["text"]]):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(tweet[HEADER_DICT["text"]], all_words) for tweet in tweets])
    classified_examples = [cluster.classify(vectorspaced(tweet[HEADER_DICT["text"]], all_words))
                           for tweet in tweets]

    # print each tweet text next to the cluster it was assigned to
    texts = [tweet[HEADER_DICT["text"]] for tweet in tweets]
    for cluster_id, text in sorted(zip(classified_examples, texts)):
        print cluster_id, text
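# The tweet-clustering snippets above and below call a two-argument vectorspaced()
# helper that is not shown here. A minimal, hypothetical reconstruction: a binary
# bag-of-words vector over the all_words vocabulary. The real helper presumably
# tokenizes with the same get_words() as its caller; a lowercased split stands in.
import numpy

def vectorspaced(text, all_words):
    text_words = set(word.lower() for word in text.split())
    return numpy.array([word in text_words for word in all_words], numpy.short)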
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """
    import numpy
    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print('Clusterer:', clusterer)
    print('Clustered:', vectors)
    print('As:', clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def cluster_texts(texts, clustersNumber, distance):
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # To represent the texts as vectors of representative terms, take the unique terms.
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    # TF only measures how often each unique term appears in the document, not how
    # often it appears in the collection. Other measures, such as TF-IDF, are more
    # precise because they also take the whole collection into account.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")
    print(vectors)

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)

    # The commented lines below do the same thing with another library, scikit-learn.
    # clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                     linkage="average", affinity=distanceFunction)
    # clusters = clusterer.fit_predict(vectors)
    return clusters
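# Both cluster_texts() variants in this section call a TF() helper that is not shown
# in these snippets. A minimal sketch consistent with the call signature above (the
# body is an assumption): the relative frequency of each unique term in one tokenized
# text, computed with nltk.TextCollection.tf(). A TF-IDF variant could use
# collection.tf_idf(term, document) instead.
def TF(document, unique_terms, collection):
    return [collection.tf(term, document) for term in unique_terms]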
def get_word_clusters():
    all_words = set()
    for tweet in tweets.find():
        for word in get_words(tweet['text']):
            all_words.add(word)
    all_words = tuple(all_words)

    cluster = GAAClusterer(5)
    cluster.cluster([vectorspaced(tweet['text'], all_words) for tweet in tweets.find()])
    classified_examples = [
        cluster.classify(vectorspaced(tweet['text'], all_words))
        for tweet in tweets.find()
    ]

    # print each tweet text next to the cluster it was assigned to
    texts = [tweet['text'] for tweet in tweets.find()]
    for cluster_id, text in sorted(zip(classified_examples, texts)):
        print cluster_id, text
def clusterIt(kwnb, clusternb, keywords):
    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components and word not in stopwords
            for word in words], numpy.short)

    ret = list()
    if len(keywords) > 0:
        job_titles = [x.keyword for x in keywords]
        job_titles = [x.strip() for x in job_titles]
        words = get_words(job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(clusternb)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
        classified_examples = [cluster.classify(vectorspaced(title))
                               for title in job_titles]
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            if title != '':
                for keyword in keywords:
                    if title == keyword.keyword:
                        keyword.assignCluster(cluster_id)
                        ret.append(keyword)
    return ret
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score
from nltk.cluster import GAAClusterer


def Gaaclusterer_experiment(samples, k_cluster):
    silhouette = []
    davies_bouldin = []
    for i in range(2, k_cluster):
        gaaclusterer = GAAClusterer(num_clusters=i)
        assigned_cluster = gaaclusterer.cluster(samples, True)
        silhouette.append(metrics.silhouette_score(X=samples,
                                                   labels=np.array(assigned_cluster)))
        davies_bouldin.append(davies_bouldin_score(samples, assigned_cluster))

    plt.plot(np.arange(2, k_cluster), silhouette, c='r', label='silhouette')
    plt.plot(np.arange(2, k_cluster), davies_bouldin, c='g', label='davies_bouldin')
    plt.xlabel('number of clusters')
    plt.ylabel('Score')
    plt.title('GAACluster')
    plt.legend()
    plt.show()
    return assigned_cluster
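# Hypothetical call: score GAAC clusterings of 100 random 2-D points for k = 2..9.
# The offset keeps every vector non-zero, since GAAClusterer works with cosine similarity.
samples = [np.random.rand(2) + 0.1 for _ in range(100)]
labels = Gaaclusterer_experiment(samples, 10)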
def cluster_texts(texts, clustersNumber, distance):
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)

    # clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                     linkage="average", affinity=distanceFunction)
    # clusters = clusterer.fit_predict(vectors)
    return clusters
def vectorspaced(stemmer, title):
    title_components = [stemmer.stem(word.lower()) for word in title.split()]
    return numpy.array([
        word in title_components and not word in stopwords
        for word in words], numpy.short)


if __name__ == '__main__':
    filename = 'CSV/pridected_true_text_alldata.csv'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file.readlines()]
        words = get_words(stemmer, job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(30)
        cluster.cluster([vectorspaced(stemmer, title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(stemmer, title)) for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print cluster_id, title
def vectorspaced(title):
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        [word in title_components and not word in stopwords
         for word in words], numpy.short)


if __name__ == '__main__':
    filename = 'example.txt'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file.readlines()]
        words = get_words(job_titles)

        # cluster = KMeansClusterer(5, euclidean_distance)
        cluster = GAAClusterer(5)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            print cluster_id, title
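# The two __main__ scripts above rely on helpers (normalize_word, get_words,
# stopwords, words) defined earlier in their source files. A plausible, hypothetical
# reconstruction, pieced together from the clusterIt() and
# extract_tweets_citedby_graph() snippets in this section:
import numpy
import nltk.stem
import nltk.corpus
from nltk import decorators

stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
stopwords = set(nltk.corpus.stopwords.words('english'))

@decorators.memoize
def normalize_word(word):
    return stemmer_func(word.lower())

def get_words(titles):
    words = set()
    for title in titles:
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)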
def cluster3(index, k):
    from nltk.cluster import GAAClusterer
    clusterer = GAAClusterer(k)
    clusters = clusterer.cluster(index, True)
    return clusters
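# Hypothetical call, assuming `index` is a list of numpy feature vectors:
import numpy
vectors = [numpy.array(v) for v in [[3, 3], [1, 2], [4, 2], [4, 0]]]
print(cluster3(vectors, 2))  # one cluster id per input vector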
return numpy.array(w_vec, numpy.float)


if __name__ == '__main__':
    corpus_dir = '/Users/dehao/github/Lydata/ckip/corpus_res/'
    # corpus_dir = 'D:/python_workspace/corpus_test/'
    words = get_words(corpus_dir)
    vec_space = []
    indf_dict = {}
    index = 0
    # file for the index-to-filename mapping
    fw1 = open('/Users/dehao/github/Lydata/ckip/index_file.txt', 'w')
    # file for the clustering result
    fw2 = open('/Users/dehao/github/Lydata/ckip/cluser_res.txt', 'w')
    for f in os.listdir(corpus_dir):
        index += 1
        print 'already handled file:', index
        indf_dict[index] = f
        with codecs.open(corpus_dir + f, 'r', encoding='utf-8') as title:
            vec_space.append(vectorspaced(title))
    # How should the number of clusters be chosen from the histogram?
    cluster = GAAClusterer(40)
    clustered = cluster.cluster(vec_space, True)
    # cluster.dendrogram().show()
    for k, v in indf_dict.items():
        fw1.write('%s\t%s\n' % (str(k), str(v)))
    fw1.close()
    fw2.write('%s\n' % ','.join([str(v) for v in clustered]))
    fw2.close()
silhouette_score(tfidf, array1, metric='euclidean',
                 sample_size=None, random_state=None)
# 0.031277350000072916

## clustering only on the training data
km2 = KMeans(n_clusters=num_clusters, random_state=42)
km2.fit(X_all)
clusters2 = km2.labels_.tolist()
array2 = np.array(clusters2)
silhouette_score(X_all, array2, metric='euclidean',
                 sample_size=None, random_state=None)
# 0.037444797109297122

from nltk.cluster import GAAClusterer
clusterer = GAAClusterer(4)
clusters_agg = clusterer.cluster(X_all.toarray(), True)
array3 = np.array(clusters_agg)

# Evaluating the NLTK agglomerative clustering
silhouette_score(X_all, array3, metric='cosine',
                 sample_size=None, random_state=None)