def main(distance_metric, threshold, testmode=False): starttime = time.time() #make this flexible in case there are no subfolders folders = [i for i in os.listdir(pathi) if not i.startswith(".")] print ", ".join(folders) print "Items in folders", ", ".join( [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders]) #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102'] print "We have {} folders".format(len(folders)) featuredict = dictmaker(folders, threshold, remove_stopwords=True, remove_punct=True) wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine( folders, featuredict, testmode, "category1") wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats( wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=True, outlier_threshold=2, median_metric='median') #apply to wordmatrix with cats x = clustermachine(wordmatrix_without_cat, distance_metric, 4) #print [(i.name, i.no_of_clusters) for i in x] excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats'] print "These clusterings have less than 2 clusters\n{}\n\n".format( "\n".join([str(c.name) for c in x if c.no_of_clusters < 2])) #PRINTING STUFF headline = "\n\n-----------\n\n" print "Working with {} distance metric".format(distance_metric) #CROSS CLUSTERING COMPARISON for clustering in [c for c in x if c.no_of_clusters > 1]: cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels) sili = ct.Clusteringstats( wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric) #GENERAL STATS print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format( clustering.getname()[1], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(sili)) stats = 
ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters() catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster() for cluster in stats: print "\nCluster {} contains {} items, {} % of the total".format( cluster, stats[cluster], round( float(stats[cluster]) / len(wordmatrix_without_cat) * 100)) for cat in [i for i in catstats[cluster] if not i in excludelist]: print "{} items of category {} make up {} % of this cluster".format( catstats[cluster][cat], "".join( [i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100)) cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories() #STATS PER CAT print headline, "Statistics per category" for cat in [i for i in cats if not i in excludelist]: print "\nCategory {} has {} items".format( "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total']) for entry in [ i for i in cats[cat]['cat_per_cluster'] if not i in excludelist ]: print "{} items or {} percent in cluster {}".format( cats[cat]['cat_per_cluster'][entry], round( float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry) #PREDICTIVE FEATURES print headline, "Strongly predictive features are" cents = ct.Centroidstats( wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict) if cents: for diff in cents: print "\nRaw Scores" print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format( diff[0], diff[1], ", ".join([ " : ".join(map(unicode, i[::-1])) for i in cents[diff]['raw_diff'] ][:10])) print "Zscores" print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format( diff[0], diff[1], ", ".join([ " : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff'] ][:10])) #PROTOTYPES print headline, 
"Here is a typical document for each cluster" distance = distance_metric if distance_metric == 'manhattan': distance = 'cityblock' print "We set the distance metric to {}".format(distance) docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents( wordmatrix_with_cat, filedicti) if docs: for cluster in docs: print "\nCLUSTER {} \n".format(cluster) with open(docs[cluster][distance][0]) as f: print f.read() if len(docs[cluster][distance]) > 8: print "\nOther files close by in cluster {}:\n".format( cluster) print("{}\n" * 8).format(*docs[cluster][distance][1:9]) print headline, "Comparing clusterings" for clustering in [c for c in x if c.no_of_clusters > 1]: print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format( clustering.getname()[0], clustering.no_of_clusters) print "Its silhouette score is {}".format( str( ct.Clusteringstats( wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric))) #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" + str(i.no_of_clusters), i) for i in x] simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input) options = [ 'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim' ] for o in options: print "\n---\n" ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o) print "\n---\n" endtime = time.time() process = endtime - starttime print headline, "This took us {} minutes".format(process / 60) #or do we want to do predictive features and typical document per cluster as well???? os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False): starttime=time.time() with codecs.open('clusterskmeans_54_19_10_07_30.json', 'r', 'utf-8') as jsoninput: wordtovecclusters=json.load(jsoninput) print "pre length", len(wordtovecclusters) wordtovecclusters={k:v for k,v in wordtovecclusters.items() if k not in ['1','25','30','37','49']} print "post length", len(wordtovecclusters) #make this flexible in case there are no subfolders folders=[i for i in os.listdir(pathi) if not i.startswith(".")] print ", ".join(folders) print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi,f)))) for f in folders]) #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102'] print "We have {} folders".format(len(folders)) wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, wordtovecclusters, testmode, "category1") wordmatrix_without_cat, wordmatrix_with_cat=ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold = 2, median_metric='median') np.savetxt('wordmatrix_without_cat.gz',wordmatrix_without_cat ) np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat) x=clustermachine(wordmatrix_without_cat,distance_metric,4) #print [(i.name, i.no_of_clusters) for i in x] excludelist=['total','no_of_categories', 'no_of_clusters', 'no_of_cats'] print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2])) #PRINTING STUFF headline="\n\n-----------\n\n" print "Working with {} distance metric".format(distance_metric) #CROSS CLUSTERING COMPARISON for clustering in [c for c in x if c.no_of_clusters > 1]: cati=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels) print "Categorystats established" 
sili=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric) print "Clusteringstats established" #GENERAL STATS print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". format(clustering.getname()[1], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(sili)) stats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters() catstats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster() for cluster in stats: print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster])/len(wordmatrix_without_cat)*100)) for cat in [i for i in catstats[cluster] if not i in excludelist]: print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat]/catstats[cluster]['total']*100)) cats=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories() #STATS PER CAT print headline,"Statistics per category" for cat in [i for i in cats if not i in excludelist]: print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total']) for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]: print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry])/float(cats[cat]['total'])*100), entry) #PREDICTIVE FEATURES print headline, "Strongly predictive features are" cents=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(wordtovecclusters) if cents: for diff in cents: print "\nRaw Scores" print "Cluster {} and cluster {} are differentiated by 
\n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['raw_diff']][:10])) print "Zscores" print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10])) #PROTOTYPES print headline, "Here is a typical document for each cluster" distance=distance_metric if distance_metric=='manhattan': distance='cityblock' print "We set the distance metric to {}".format(distance) docs=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti) if docs: for cluster in docs: print "\nCLUSTER {} \n".format(cluster) with open(docs[cluster][distance][0]) as f: print f.read() if len(docs[cluster][distance]) > 8: print "\nOther files close by in cluster {}:\n".format(cluster) print ("{}\n"*8).format(*docs[cluster][distance][1:9]) print headline, "Comparing clusterings" for clustering in [c for c in x if c.no_of_clusters > 1]: print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". 
format(clustering.getname()[0], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric))) #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement input=[(str(type(i.name)).split(".")[3].rstrip("'>")+"--"+str(i.no_of_clusters), i) for i in x] simi=ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input) options=['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'] for o in options: print "\n---\n" ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input).similarity_matrix(o) print "\n---\n" endtime=time.time() process=endtime-starttime print headline, "This took us {} minutes".format(process/60) #or do we want to do predictive features and typical document per cluster as well???? os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False): starttime=time.time() #here we read in the featuredict #create the featuredict from a text file featuredict={} with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext: for line in inputtext.readlines(): featuredict[line.rstrip("\n")]=0 print "pre length", len(featuredict) folders=[i for i in os.listdir(pathi) if not i.startswith(".")] print ", ".join(folders) print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi,f)))) for f in folders]) print "We have {} folders".format(len(folders)) #here we input the featuredict wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, featuredict, testmode, "category1") wordmatrix_without_cat, wordmatrix_with_cat=ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold = 2, median_metric='median') np.savetxt('wordmatrix_without_cat.gz',wordmatrix_without_cat ) np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat) x=clustermachine(wordmatrix_without_cat,distance_metric,4) #print [(i.name, i.no_of_clusters) for i in x] excludelist=['total','no_of_categories', 'no_of_clusters', 'no_of_cats'] print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2])) #PRINTING STUFF headline="\n\n-----------\n\n" print "Working with {} distance metric".format(distance_metric) #CROSS CLUSTERING COMPARISON for clustering in [c for c in x if c.no_of_clusters > 1]: cati=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels) print "Categorystats established" sili=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric) print "Clusteringstats established" #GENERAL STATS print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". 
format(clustering.getname()[1], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(sili)) stats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters() catstats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster() for cluster in stats: print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster])/len(wordmatrix_without_cat)*100)) for cat in [i for i in catstats[cluster] if not i in excludelist]: print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat]/catstats[cluster]['total']*100)) cats=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories() #STATS PER CAT print headline,"Statistics per category" for cat in [i for i in cats if not i in excludelist]: print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total']) for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]: print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry])/float(cats[cat]['total'])*100), entry) #PREDICTIVE FEATURES print headline, "Strongly predictive features are" cents=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict) if cents: for diff in cents: print "\nRaw Scores" print diff #the two below should return the same results; one was worked over because of unicode issues. 
print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(unicode(diff[0]), diff[1], ", ".join([" : ".join([i,unicode(k)]) for i,k in cents[diff]['raw_diff'][:10]])) print "Zscores" #this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10])) #PROTOTYPES print headline, "Here is a typical document for each cluster" distance=distance_metric if distance_metric=='manhattan': distance='cityblock' print "We set the distance metric to {}".format(distance) docs=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti) if docs: for cluster in docs: print "\nCLUSTER {} \n".format(cluster) with open(docs[cluster][distance][0]) as f: print f.read() if len(docs[cluster][distance]) > 8: print "\nOther files close by in cluster {}:\n".format(cluster) print ("{}\n"*8).format(*docs[cluster][distance][1:9]) print headline, "Comparing clusterings" for clustering in [c for c in x if c.no_of_clusters > 1]: print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". 
format(clustering.getname()[0], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric))) #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement input=[(str(type(i.name)).split(".")[3].rstrip("'>")+"--"+str(i.no_of_clusters), i) for i in x] simi=ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input) options=['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'] for o in options: print "\n---\n" ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input).similarity_matrix(o) print "\n---\n" endtime=time.time() process=endtime-starttime print headline, "This took us {} minutes".format(process/60) #or do we want to do predictive features and typical document per cluster as well???? os.system('say "your program has finished"')
# NOTE(review): truncated fragment — the statement whose argument list ends in
# 'featuredict, verbose="csv", limit=100)' lost its opening part (and the
# enclosing def header) in this view; the code is kept verbatim apart from
# line breaks and these comments, and cannot be made syntactically valid
# without recovering the missing header.
featuredict, verbose="csv", limit=100)
##ZSCORES?
#zscored matrix
# z-score only the count columns; columns 0-1 of t appear to hold the
# category/uniq identifiers — TODO confirm against matrixmachine()
wordmatrix_without_cat = scipy.stats.zscore(t[:, 2:], axis=0)
wordmatrix_with_cat = np.column_stack(
    [category1, uniqs, scipy.stats.zscore(t[:, 2:], axis=0)])
#print "ayayay", wordmatrix_with_cat
##TFIDF?
#textfreq inverse doc freq
#tfidf=sklearn.feature_extraction.text.TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
#wordmatrix_without_cat=tfidf.fit_transform(t[:,2:]).toarray()
#wordmatrix_with_cat=np.column_stack([category1, uniqs, wordmatrix_without_cat])
#print "settings from tfidf", tfidf.get_params()
print "matrix with cat and uniq", wordmatrix_with_cat.shape
print "matrix w/out cat and uniq", wordmatrix_without_cat.shape
ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat)
print "feature dict", featuredict, len(featuredict)
#print listi
##ADD SPELLING!!!!TO DO
#print wordmatrix_with_cat
print wordmatrix_with_cat.shape
completeend = time.time()
os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False): starttime = time.time() #here we read in the featuredict #create the featuredict from a text file featuredict = {} with codecs.open( '/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext: for line in inputtext.readlines(): featuredict[line.rstrip("\n")] = 0 print "pre length", len(featuredict) folders = [i for i in os.listdir(pathi) if not i.startswith(".")] print ", ".join(folders) print "Items in folders", ", ".join( [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders]) print "We have {} folders".format(len(folders)) #here we input the featuredict wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine( folders, featuredict, testmode, "category1") wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats( wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold=2, median_metric='median') np.savetxt('wordmatrix_without_cat.gz', wordmatrix_without_cat) np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat) x = clustermachine(wordmatrix_without_cat, distance_metric, 4) #print [(i.name, i.no_of_clusters) for i in x] excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats'] print "These clusterings have less than 2 clusters\n{}\n\n".format( "\n".join([str(c.name) for c in x if c.no_of_clusters < 2])) #PRINTING STUFF headline = "\n\n-----------\n\n" print "Working with {} distance metric".format(distance_metric) #CROSS CLUSTERING COMPARISON for clustering in [c for c in x if c.no_of_clusters > 1]: cati = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels) print "Categorystats established" sili = ct.Clusteringstats( wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric) print "Clusteringstats established" #GENERAL STATS print headline, headline, "CLUSTERING CALLED {} HAS {} 
CLUSTERS".format( clustering.getname()[1], clustering.no_of_clusters) print "Its silhouette score is {}".format(str(sili)) stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters() catstats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster() for cluster in stats: print "\nCluster {} contains {} items, {} % of the total".format( cluster, stats[cluster], round( float(stats[cluster]) / len(wordmatrix_without_cat) * 100)) for cat in [i for i in catstats[cluster] if not i in excludelist]: print "{} items of category {} make up {} % of this cluster".format( catstats[cluster][cat], "".join( [i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat] / catstats[cluster]['total'] * 100)) cats = ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories() #STATS PER CAT print headline, "Statistics per category" for cat in [i for i in cats if not i in excludelist]: print "\nCategory {} has {} items".format( "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total']) for entry in [ i for i in cats[cat]['cat_per_cluster'] if not i in excludelist ]: print "{} items or {} percent in cluster {}".format( cats[cat]['cat_per_cluster'][entry], round( float(cats[cat]['cat_per_cluster'][entry]) / float(cats[cat]['total']) * 100), entry) #PREDICTIVE FEATURES print headline, "Strongly predictive features are" cents = ct.Centroidstats( wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict) if cents: for diff in cents: print "\nRaw Scores" print diff #the two below should return the same results; one was worked over because of unicode issues. 
print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format( unicode(diff[0]), diff[1], ", ".join([ " : ".join([i, unicode(k)]) for i, k in cents[diff]['raw_diff'][:10] ])) print "Zscores" #this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format( diff[0], diff[1], ", ".join([ " : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff'] ][:10])) #PROTOTYPES print headline, "Here is a typical document for each cluster" distance = distance_metric if distance_metric == 'manhattan': distance = 'cityblock' print "We set the distance metric to {}".format(distance) docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents( wordmatrix_with_cat, filedicti) if docs: for cluster in docs: print "\nCLUSTER {} \n".format(cluster) with open(docs[cluster][distance][0]) as f: print f.read() if len(docs[cluster][distance]) > 8: print "\nOther files close by in cluster {}:\n".format( cluster) print("{}\n" * 8).format(*docs[cluster][distance][1:9]) print headline, "Comparing clusterings" for clustering in [c for c in x if c.no_of_clusters > 1]: print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format( clustering.getname()[0], clustering.no_of_clusters) print "Its silhouette score is {}".format( str( ct.Clusteringstats( wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric))) #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" + str(i.no_of_clusters), i) for i in x] simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input) options = [ 'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 
'homogeneity_sim', 'silhouette_score_sim' ] for o in options: print "\n---\n" ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat, input).similarity_matrix(o) print "\n---\n" endtime = time.time() process = endtime - starttime print headline, "This took us {} minutes".format(process / 60) #or do we want to do predictive features and typical document per cluster as well???? os.system('say "your program has finished"')