def main(distance_metric, threshold, testmode=False):
    starttime = time.time()
    #make this flexible in case there are no subfolders
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders,
                            threshold,
                            remove_stopwords=True,
                            remove_punct=True)

    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(
        folders, featuredict, testmode, "category1")

    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(
        wordmatrix_without_cat,
        wordmatrix_with_cat,
        distance_metric,
        zscores=False,
        outlier_removal=True,
        outlier_threshold=2,
        median_metric='median')
    #apply to wordmatrix with cats

    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format(
        "\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)

    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        sili = ct.Clusteringstats(
            wordmatrix_with_cat, wordmatrix_without_cat, clustering.name,
            clustering.labels).cluster_silhouette(distance_metric)

        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(
            wordmatrix_without_cat, clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['raw_diff']
                    ][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name,
                                clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(
                        cluster)
                    print("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(
            str(
                ct.Clusteringstats(
                    wordmatrix_with_cat, wordmatrix_without_cat,
                    clustering.name,
                    clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" +
              str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False):
	starttime=time.time()
	with codecs.open('clusterskmeans_54_19_10_07_30.json', 'r', 'utf-8') as jsoninput:
		wordtovecclusters=json.load(jsoninput)
	print "pre length", len(wordtovecclusters)
	wordtovecclusters={k:v for k,v in wordtovecclusters.items() if k not in ['1','25','30','37','49']}
	print "post length", len(wordtovecclusters)
	#make this flexible in case there are no subfolders
	folders=[i for i in os.listdir(pathi) if not i.startswith(".")]
	print ", ".join(folders)
	print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi,f)))) for f in folders])
	#folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102'] 
	print "We have {} folders".format(len(folders))
	
	wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, wordtovecclusters, testmode, "category1")
	
	wordmatrix_without_cat, wordmatrix_with_cat=ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold = 2, median_metric='median')
	np.savetxt('wordmatrix_without_cat.gz',wordmatrix_without_cat )
	np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat)
	x=clustermachine(wordmatrix_without_cat,distance_metric,4)
	#print [(i.name, i.no_of_clusters) for i in x]
	excludelist=['total','no_of_categories', 'no_of_clusters', 'no_of_cats']
	print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
	#PRINTING STUFF
	headline="\n\n-----------\n\n"
	print "Working with {} distance metric".format(distance_metric)
		
	#CROSS CLUSTERING COMPARISON
	for clustering in [c for c in x if c.no_of_clusters > 1]:
		cati=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
		print "Categorystats established"
		sili=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)
		print "Clusteringstats established"
		#GENERAL STATS
		print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". format(clustering.getname()[1], clustering.no_of_clusters)
		print "Its silhouette score is {}".format(str(sili))
		stats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
		catstats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
		for cluster in stats:
			print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster])/len(wordmatrix_without_cat)*100))
			for cat in [i for i in catstats[cluster] if not i in excludelist]:
				print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat]/catstats[cluster]['total']*100))
		cats=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
		
		#STATS PER CAT
		print headline,"Statistics per category"
		for cat in [i for i in cats if not i in excludelist]:
			print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
			for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
				print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry])/float(cats[cat]['total'])*100), entry)

		#PREDICTIVE FEATURES
		print headline, "Strongly predictive features are"
		cents=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(wordtovecclusters)
		if cents:
			for diff in cents:
				print "\nRaw Scores"
				print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['raw_diff']][:10])) 
				print "Zscores"
				print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))	
			
		
		#PROTOTYPES
		print headline, "Here is a typical document for each cluster"
		distance=distance_metric
		if distance_metric=='manhattan':
			distance='cityblock'
		print "We set the distance metric to {}".format(distance)
		docs=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
		if docs:
			for cluster in docs:
				print "\nCLUSTER {} \n".format(cluster)
				with open(docs[cluster][distance][0]) as f:
					print f.read()
				if len(docs[cluster][distance]) > 8:
					print "\nOther files close by in cluster {}:\n".format(cluster)
					print ("{}\n"*8).format(*docs[cluster][distance][1:9])
	print headline, "Comparing clusterings"
	for clustering in [c for c in x if c.no_of_clusters > 1]:
		print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". format(clustering.getname()[0], clustering.no_of_clusters)
		print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)))
	#all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
	input=[(str(type(i.name)).split(".")[3].rstrip("'>")+"--"+str(i.no_of_clusters), i) for i in x]
	simi=ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input)
	options=['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
	for o in options:
		print "\n---\n"
		ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input).similarity_matrix(o)
	
	
	print "\n---\n"
	endtime=time.time()
	process=endtime-starttime
	print headline, "This took us {} minutes".format(process/60)
		#or do we want to do predictive features and typical document per cluster as well????	
	os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False):
	starttime=time.time()
	#here we read in the featuredict
	#create the featuredict from a text file
	featuredict={}
	with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext:
		for line in inputtext.readlines():
			featuredict[line.rstrip("\n")]=0
	print "pre length", len(featuredict)
	folders=[i for i in os.listdir(pathi) if not i.startswith(".")]
	print ", ".join(folders)
	print "Items in folders", ", ".join([str(len(os.listdir(os.path.join(pathi,f)))) for f in folders])
	print "We have {} folders".format(len(folders))
	#here we input the featuredict
	wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(folders, featuredict, testmode, "category1")
	
	wordmatrix_without_cat, wordmatrix_with_cat=ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat, distance_metric, zscores=False, outlier_removal=False, outlier_threshold = 2, median_metric='median')
	np.savetxt('wordmatrix_without_cat.gz',wordmatrix_without_cat )
	np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat)
	x=clustermachine(wordmatrix_without_cat,distance_metric,4)
	#print [(i.name, i.no_of_clusters) for i in x]
	excludelist=['total','no_of_categories', 'no_of_clusters', 'no_of_cats']
	print "These clusterings have less than 2 clusters\n{}\n\n".format("\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
	#PRINTING STUFF
	headline="\n\n-----------\n\n"
	print "Working with {} distance metric".format(distance_metric)
		
	#CROSS CLUSTERING COMPARISON
	for clustering in [c for c in x if c.no_of_clusters > 1]:
		cati=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels)
		print "Categorystats established"
		sili=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)
		print "Clusteringstats established"
		#GENERAL STATS
		print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". format(clustering.getname()[1], clustering.no_of_clusters)
		print "Its silhouette score is {}".format(str(sili))
		stats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).size_of_clusters()
		catstats=ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cats_per_cluster()
		for cluster in stats:
			print "\nCluster {} contains {} items, {} % of the total".format(cluster, stats[cluster], round(float(stats[cluster])/len(wordmatrix_without_cat)*100))
			for cat in [i for i in catstats[cluster] if not i in excludelist]:
				print "{} items of category {} make up {} % of this cluster".format(catstats[cluster][cat], "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), round(catstats[cluster][cat]/catstats[cluster]['total']*100))
		cats=ct.Categorystats(wordmatrix_with_cat, clustering.name, clustering.labels).size_of_categories()
		
		#STATS PER CAT
		print headline,"Statistics per category"
		for cat in [i for i in cats if not i in excludelist]:
			print "\nCategory {} has {} items".format("".join([i[0] for i in catdicti.items() if i[1] == int(cat)]), cats[cat]['total'])
			for entry in [i for i in cats[cat]['cat_per_cluster'] if not i in excludelist]:
				print "{} items or {} percent in cluster {}".format(cats[cat]['cat_per_cluster'][entry], round(float(cats[cat]['cat_per_cluster'][entry])/float(cats[cat]['total'])*100), entry)

		#PREDICTIVE FEATURES
		print headline, "Strongly predictive features are"
		cents=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).cluster_predictors(featuredict)
		if cents:
			for diff in cents:
				print "\nRaw Scores"
				print diff
				#the two below should return the same results; one was worked over because of unicode issues. 
				print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(unicode(diff[0]), diff[1], ", ".join([" : ".join([i,unicode(k)]) for i,k in cents[diff]['raw_diff'][:10]]))
				print "Zscores"
				#this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation
				print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(diff[0], diff[1], ", ".join([" : ".join(map(unicode, i[::-1])) for i in cents[diff]['zscores_diff']][:10]))	
			
		
		#PROTOTYPES
		print headline, "Here is a typical document for each cluster"
		distance=distance_metric
		if distance_metric=='manhattan':
			distance='cityblock'
		print "We set the distance metric to {}".format(distance)
		docs=ct.Centroidstats(wordmatrix_without_cat, clustering.name, clustering.labels, clustering.centroids).central_documents(wordmatrix_with_cat, filedicti)
		if docs:
			for cluster in docs:
				print "\nCLUSTER {} \n".format(cluster)
				with open(docs[cluster][distance][0]) as f:
					print f.read()
				if len(docs[cluster][distance]) > 8:
					print "\nOther files close by in cluster {}:\n".format(cluster)
					print ("{}\n"*8).format(*docs[cluster][distance][1:9])
	print headline, "Comparing clusterings"
	for clustering in [c for c in x if c.no_of_clusters > 1]:
		print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS". format(clustering.getname()[0], clustering.no_of_clusters)
		print "Its silhouette score is {}".format(str(ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat, clustering.name, clustering.labels).cluster_silhouette(distance_metric)))
	#all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
	input=[(str(type(i.name)).split(".")[3].rstrip("'>")+"--"+str(i.no_of_clusters), i) for i in x]
	simi=ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input)
	options=['adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim', 'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim']
	for o in options:
		print "\n---\n"
		ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat ,input).similarity_matrix(o)
	
	
	print "\n---\n"
	endtime=time.time()
	process=endtime-starttime
	print headline, "This took us {} minutes".format(process/60)
		#or do we want to do predictive features and typical document per cluster as well????	
	os.system('say "your program has finished"')
# Esempio n. 4 ("Example no. 4" -- scraped-example separator; original index: 0)
               featuredict,
               verbose="csv",
               limit=100)

##ZSCORES?
#zscored matrix
# NOTE(review): this fragment depends on `t`, `category1`, `uniqs` and
# `featuredict` defined in the truncated part of its script -- presumably
# the raw count matrix and its identifier columns; confirm against the
# full file.  Columns 0-1 of `t` appear to be id columns, so only the
# feature columns t[:, 2:] are z-scored (per feature, axis=0).
wordmatrix_without_cat = scipy.stats.zscore(t[:, 2:], axis=0)
wordmatrix_with_cat = np.column_stack(
    [category1, uniqs, scipy.stats.zscore(t[:, 2:], axis=0)])
#print "ayayay", wordmatrix_with_cat

##TFIDF?
#textfreq inverse doc freq
# Alternative weighting kept for reference; disabled in favour of z-scores.
#tfidf=sklearn.feature_extraction.text.TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
#wordmatrix_without_cat=tfidf.fit_transform(t[:,2:]).toarray()
#wordmatrix_with_cat=np.column_stack([category1, uniqs, wordmatrix_without_cat])
#print "settings from tfidf", tfidf.get_params()

print "matrix with cat and uniq", wordmatrix_with_cat.shape
print "matrix w/out cat and uniq", wordmatrix_without_cat.shape
ct.matrixstats(wordmatrix_without_cat, wordmatrix_with_cat)

print "feature dict", featuredict, len(featuredict)
#print listi
##ADD SPELLING!!!!TO DO
#print wordmatrix_with_cat
print wordmatrix_with_cat.shape

# end-of-run timestamp; presumably paired with a start time defined in the
# truncated part of the script -- TODO confirm
completeend = time.time()
os.system('say "your program has finished"')
def main(distance_metric, threshold, testmode=False):
    starttime = time.time()
    #here we read in the featuredict
    #create the featuredict from a text file
    featuredict = {}
    with codecs.open(
            '/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt',
            "r", "utf-8") as inputtext:
        for line in inputtext.readlines():
            featuredict[line.rstrip("\n")] = 0
    print "pre length", len(featuredict)
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    print "We have {} folders".format(len(folders))
    #here we input the featuredict
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(
        folders, featuredict, testmode, "category1")

    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(
        wordmatrix_without_cat,
        wordmatrix_with_cat,
        distance_metric,
        zscores=False,
        outlier_removal=False,
        outlier_threshold=2,
        median_metric='median')
    np.savetxt('wordmatrix_without_cat.gz', wordmatrix_without_cat)
    np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat)
    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format(
        "\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)

    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        print "Categorystats established"
        sili = ct.Clusteringstats(
            wordmatrix_with_cat, wordmatrix_without_cat, clustering.name,
            clustering.labels).cluster_silhouette(distance_metric)
        print "Clusteringstats established"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(
            wordmatrix_without_cat, clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print diff
                #the two below should return the same results; one was worked over because of unicode issues.
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    unicode(diff[0]), diff[1], ", ".join([
                        " : ".join([i, unicode(k)])
                        for i, k in cents[diff]['raw_diff'][:10]
                    ]))
                print "Zscores"
                #this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name,
                                clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(
                        cluster)
                    print("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(
            str(
                ct.Clusteringstats(
                    wordmatrix_with_cat, wordmatrix_without_cat,
                    clustering.name,
                    clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" +
              str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    os.system('say "your program has finished"')