Ejemplo n.º 1
0
def main():
	folders=[i for i in os.listdir(pathi) if not i.startswith(".")]
	folders=['files9_output_0102']
	print "We have {} folders".format(len(folders))
	featuredict=dictmaker(folders)
	wordmatrix_without_cat, wordmatrix_with_cat, catdicti = matrixmachine(folders, featuredict, "category1")
	x=clustermachine(wordmatrix_without_cat, scipy.cluster.vq.kmeans2)
	print x
	f=[(i.name, i.no_of_clusters) for i in x]
	g=[ct.Clusteringstats(wordmatrix_with_cat, type(i), i.labels).size_of_clusters() for i in x]
	#print g
	h=[len(ct.Clusteringstats(wordmatrix_with_cat, type(i), i.labels).cluster_features()) for i in x]
	print "no of clusters",  h
	g=[ct.Centroidstats(wordmatrix_with_cat, i.name, i.labels, i.centroids, i.centroids)._centroiddictmaker() for i in x]
	test=x[0]
	#print test
	#g=ct.Centroidstats(wordmatrix_with_cat, test.name, test.labels,i.centroids )
	#print g
	#t=g.cluster_predictors(featuredict)
	# t=Partitionsimilarity(x[0], x[0])
# 	print t.compare_partitions()
	t=[Categorystats(wordmatrix_with_cat, type(i), i.labels).size_of_categories() for i in x]
	print t
def main(distance_metric, threshold, testmode=False):
    starttime = time.time()
    #make this flexible in case there are no subfolders
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders,
                            threshold,
                            remove_stopwords=True,
                            remove_punct=True)

    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(
        folders, featuredict, testmode, "category1")

    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(
        wordmatrix_without_cat,
        wordmatrix_with_cat,
        distance_metric,
        zscores=False,
        outlier_removal=True,
        outlier_threshold=2,
        median_metric='median')
    #apply to wordmatrix with cats

    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format(
        "\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)

    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        sili = ct.Clusteringstats(
            wordmatrix_with_cat, wordmatrix_without_cat, clustering.name,
            clustering.labels).cluster_silhouette(distance_metric)

        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(
            wordmatrix_without_cat, clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['raw_diff']
                    ][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name,
                                clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(
                        cluster)
                    print("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(
            str(
                ct.Clusteringstats(
                    wordmatrix_with_cat, wordmatrix_without_cat,
                    clustering.name,
                    clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" +
              str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    os.system('say "your program has finished"')
Ejemplo n.º 3
0
def main():
    starttime = time.time()
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    folders = [
        'files9_output_0102'
    ]  #, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders, 5000)
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti = matrixmachine(
        folders, featuredict, "category1")
    #self.matrix_with_cats=matrix_with_cats  #data frame including "gold labels"
    #self.matrix_without_cats=matrix_with_cats[:,1:] #data frame without "gold labels"
    x = clustermachine(wordmatrix_without_cat, 2)
    print[(i.name, i.no_of_clusters) for i in x]
    #print [i.name for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    for clustering in x:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        sili = ct.Clusteringstats(wordmatrix_with_cat, clustering.name,
                                  clustering.labels).cluster_silhouette()
        # 	print cati.size_of_categories()

        print "\n\n-----------\n\nClustering called {} has {} clusters".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()
        print "\n\n-----------\n\nStatistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        print "\n\n-----------\n\nStronly predictive features are"
        cents = ct.Centroidstats(
            clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        for diff in cents:
            print "\n Raw Scores"
            print "{} differentiate {} and {}\n".format(
                ", ".join([
                    " : ".join(map(str, i[::-1]))
                    for i in cents[diff]['raw_diff']
                ]), diff[0], diff[1])
            print "Zscores"
            print "{} differentiate {} and {}".format(
                ", ".join([
                    " : ".join(map(str, i[::-1]))
                    for i in cents[diff]['zscores_diff']
                ]), diff[0], diff[1])
        "We can also add equivalent features if we want"
        "And stems and whatnot"

        print "\n\n-----------\n\nHere is a typical document for each cluster"
    endtime = time.time()
    process = endtime - starttime
    print "This took us {} minutes".format(process / 60)
Ejemplo n.º 4
0
def main(distance_metric, testmode=False):
    starttime = time.time()

    #x=ct.clustermachine(wordmatrix_without_cat,distance_metric,4)
    print "These clusterings have less than 2 clusters\n{}\n\n".format(
        "\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)
    #v is a number, k a word
    excludelist = [
        'total', 'no_of_categories', 'no_of_clusters', 'no_of_cats'
    ] + [
        v for k, v in catdicti.items()
        if wordmatrix_with_cat[wordmatrix_with_cat[:, 0] == v].shape[0] < 100
    ]
    print "excludelist", excludelist
    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        print "Categorystats done"
        sili = ct.Clusteringstats(
            wordmatrix_with_cat, wordmatrix_without_cat, clustering.name,
            clustering.labels).cluster_silhouette(distance_metric)
        print "Clusteringstats done"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        print "stats done"
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        print "catstats done"
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        # #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(
            wordmatrix_without_cat, clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['raw_diff']
                    ][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name,
                                clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                print docs[cluster][distance]
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(
                        cluster)
                    print("{}\n" * 8).format(*docs[cluster][distance][1:9])
    #COMPARING CLUSTERINGS
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(
            str(
                ct.Clusteringstats(
                    wordmatrix_with_cat, wordmatrix_without_cat,
                    clustering.name,
                    clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" +
              str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
def main(distance_metric, threshold, testmode=False):
    starttime = time.time()
    #here we read in the featuredict
    #create the featuredict from a text file
    featuredict = {}
    with codecs.open(
            '/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt',
            "r", "utf-8") as inputtext:
        for line in inputtext.readlines():
            featuredict[line.rstrip("\n")] = 0
    print "pre length", len(featuredict)
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print "Items in folders", ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    print "We have {} folders".format(len(folders))
    #here we input the featuredict
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(
        folders, featuredict, testmode, "category1")

    wordmatrix_without_cat, wordmatrix_with_cat = ct.matrixstats(
        wordmatrix_without_cat,
        wordmatrix_with_cat,
        distance_metric,
        zscores=False,
        outlier_removal=False,
        outlier_threshold=2,
        median_metric='median')
    np.savetxt('wordmatrix_without_cat.gz', wordmatrix_without_cat)
    np.savetxt('wordmatrix_with_cat.gz', wordmatrix_with_cat)
    x = clustermachine(wordmatrix_without_cat, distance_metric, 4)
    #print [(i.name, i.no_of_clusters) for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']
    print "These clusterings have less than 2 clusters\n{}\n\n".format(
        "\n".join([str(c.name) for c in x if c.no_of_clusters < 2]))
    #PRINTING STUFF
    headline = "\n\n-----------\n\n"
    print "Working with {} distance metric".format(distance_metric)

    #CROSS CLUSTERING COMPARISON
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        print "Categorystats established"
        sili = ct.Clusteringstats(
            wordmatrix_with_cat, wordmatrix_without_cat, clustering.name,
            clustering.labels).cluster_silhouette(distance_metric)
        print "Clusteringstats established"
        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[1], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        #PREDICTIVE FEATURES
        print headline, "Strongly predictive features are"
        cents = ct.Centroidstats(
            wordmatrix_without_cat, clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print diff
                #the two below should return the same results; one was worked over because of unicode issues.
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    unicode(diff[0]), diff[1], ", ".join([
                        " : ".join([i, unicode(k)])
                        for i, k in cents[diff]['raw_diff'][:10]
                    ]))
                print "Zscores"
                #this is python slice notation: "a[::-1] to reverse a string" ; http://stackoverflow.com/questions/509211/explain-pythons-slice-notation
                print u"Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(unicode, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = distance_metric
        if distance_metric == 'manhattan':
            distance = 'cityblock'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(wordmatrix_without_cat, clustering.name,
                                clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                with open(docs[cluster][distance][0]) as f:
                    print f.read()
                if len(docs[cluster][distance]) > 8:
                    print "\nOther files close by in cluster {}:\n".format(
                        cluster)
                    print("{}\n" * 8).format(*docs[cluster][distance][1:9])
    print headline, "Comparing clusterings"
    for clustering in [c for c in x if c.no_of_clusters > 1]:
        print headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(
            str(
                ct.Clusteringstats(
                    wordmatrix_with_cat, wordmatrix_without_cat,
                    clustering.name,
                    clustering.labels).cluster_silhouette(distance_metric)))
    #all input does it just concatenate name + cluster # and supply clustering object to similarity measurement
    input = [(str(type(i.name)).split(".")[3].rstrip("'>") + "--" +
              str(i.no_of_clusters), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)
    #or do we want to do predictive features and typical document per cluster as well????
    os.system('say "your program has finished"')
Ejemplo n.º 6
0
def main():
    starttime = time.time()
    folders = [i for i in os.listdir(pathi) if not i.startswith(".")]
    print ", ".join(folders)
    print ", ".join(
        [str(len(os.listdir(os.path.join(pathi, f)))) for f in folders])
    #folders=['files9_output_0102']#, 'files9_output_0102', 'files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102','files9_output_0102', 'files9_output_0102', 'files9_output_0102']
    print "We have {} folders".format(len(folders))
    featuredict = dictmaker(folders, 10000)
    wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedicti = matrixmachine(
        folders, featuredict, "category1")
    x = clustermachine(wordmatrix_without_cat, 4)
    print[(i.name, i.no_of_clusters) for i in x]
    #print [i.name for i in x]
    excludelist = ['total', 'no_of_categories', 'no_of_clusters', 'no_of_cats']

    for clustering in x:
        cati = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels)
        sili = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                  clustering.name,
                                  clustering.labels).cluster_silhouette()
        # 	print cati.size_of_categories()
        headline = "\n\n-----------\n\n"

        #GENERAL STATS
        print headline, headline, "CLUSTERING CALLED {} HAS {} CLUSTERS".format(
            clustering.getname()[0], clustering.no_of_clusters)
        print "Its silhouette score is {}".format(str(sili))
        stats = ct.Clusteringstats(wordmatrix_with_cat, wordmatrix_without_cat,
                                   clustering.name,
                                   clustering.labels).size_of_clusters()
        catstats = ct.Clusteringstats(wordmatrix_with_cat,
                                      wordmatrix_without_cat, clustering.name,
                                      clustering.labels).cats_per_cluster()
        for cluster in stats:
            print "\nCluster {} contains {} items, {} % of the total".format(
                cluster, stats[cluster],
                round(
                    float(stats[cluster]) / len(wordmatrix_without_cat) * 100))
            for cat in [i for i in catstats[cluster] if not i in excludelist]:
                print "{} items of category {} make up {} % of this cluster".format(
                    catstats[cluster][cat], "".join(
                        [i[0] for i in catdicti.items() if i[1] == int(cat)]),
                    round(catstats[cluster][cat] / catstats[cluster]['total'] *
                          100))
        cats = ct.Categorystats(wordmatrix_with_cat, clustering.name,
                                clustering.labels).size_of_categories()

        #STATS PER CAT
        print headline, "Statistics per category"
        for cat in [i for i in cats if not i in excludelist]:
            print "\nCategory {} has {} items".format(
                "".join([i[0] for i in catdicti.items() if i[1] == int(cat)]),
                cats[cat]['total'])
            for entry in [
                    i for i in cats[cat]['cat_per_cluster']
                    if not i in excludelist
            ]:
                print "{} items or {} percent in cluster {}".format(
                    cats[cat]['cat_per_cluster'][entry],
                    round(
                        float(cats[cat]['cat_per_cluster'][entry]) /
                        float(cats[cat]['total']) * 100), entry)

        #PREDICTIVE FEATURES
        print headline, "Stronly predictive features are"
        cents = ct.Centroidstats(
            clustering.name, clustering.labels,
            clustering.centroids).cluster_predictors(featuredict)
        if cents:
            for diff in cents:
                print "\nRaw Scores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(str, i[::-1]))
                        for i in cents[diff]['raw_diff']
                    ][:10]))
                print "Zscores"
                print "Cluster {} and cluster {} are differentiated by \n{}\n\n\n".format(
                    diff[0], diff[1], ", ".join([
                        " : ".join(map(str, i[::-1]))
                        for i in cents[diff]['zscores_diff']
                    ][:10]))
                print "We can also add equivalent features if we want"
                print "And stems and whatnot"

        #PROTOTYPES
        print headline, "Here is a typical document for each cluster"
        distance = 'euclidean'
        print "We set the distance metric to {}".format(distance)
        docs = ct.Centroidstats(clustering.name, clustering.labels,
                                clustering.centroids).central_documents(
                                    wordmatrix_with_cat, filedicti)
        if docs:
            for cluster in docs:
                print "\nCLUSTER {} \n".format(cluster)
                f = open(docs[cluster][distance][0]).read()
                print f

    #CROSS CLUSTERING COMPARISON
    print headline, "Comparing clusterings"
    #
    input = [(str(type(i.name)).split(".")[3].rstrip("'>"), i) for i in x]
    simi = ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                   input)
    options = [
        'adjustedrand_sim', 'adjustedmutualinfo_sim', 'jaccard_sim', 'v_sim',
        'completeness_sim', 'homogeneity_sim', 'silhouette_score_sim'
    ]
    for o in options:
        print "\n---\n"
        ct.Clusteringsimilarity(wordmatrix_with_cat, wordmatrix_without_cat,
                                input).similarity_matrix(o)

    print "\n---\n"
    endtime = time.time()
    process = endtime - starttime
    print headline, "This took us {} minutes".format(process / 60)