Ejemplo n.º 1
0
    def evaluateClusters(self, clase_points):
        """Evaluate the quickDBSCAN clusters stored in Mongo against the
        ground-truth classes in ``clase_points``.

        Builds a class-by-cluster contingency table, prints purity, entropy,
        rand index and adjusted rand index, and appends the same figures to
        ``rezultate_evaluare.txt``.

        :param clase_points: mapping class label -> iterable of point tuples
        """
        evaluation_dict = {}  # contingency table: class idx -> {cluster idx: count}
        point2cluster = {}
        point2class = {}

        # One Mongo document per cluster; its "bucket" field lists the points.
        cluster_points = collections.defaultdict(list)
        cursor = self.mongoConnectInstance.getRecords("quickDBSCAN", {},
                                                      {"bucket"})
        # enumerate replaces the hand-rolled clusterId counter; the unused
        # coordsInDocument list and np.random color of the original were dropped.
        for clusterId, document in enumerate(cursor, start=1):
            for pair in document["bucket"]:
                cluster_points[clusterId].append((pair[0], pair[1], clusterId))

        # Assign every ground-truth point to a class index.
        for idx, elem in enumerate(clase_points):
            evaluation_dict[idx] = {}
            for point in clase_points[elem]:
                point2class[point] = idx

        # Assign every clustered point (keyed by its first no_dims coordinates)
        # to a cluster index and zero its column in every class row.
        for idx, elem in enumerate(cluster_points):
            for point in cluster_points[elem]:
                point2cluster[tuple(point[:self.no_dims])] = idx
            for c in evaluation_dict:
                evaluation_dict[c][idx] = 0

        # Fill the contingency table.
        for point in point2cluster:
            evaluation_dict[point2class[point]][point2cluster[point]] += 1

        print('Purity:  ', evaluation_measures.purity(evaluation_dict))
        print('Entropy: ', evaluation_measures.entropy(
            evaluation_dict))  # perfect results have entropy == 0
        print('RI       ', evaluation_measures.rand_index(evaluation_dict))
        print('ARI      ', evaluation_measures.adj_rand_index(evaluation_dict))

        # "with" guarantees the handle is closed even if a write raises
        # (the original leaked it on exception).
        with open("rezultate_evaluare.txt", "a") as f:
            f.write('quickDBSCAN' + "\n")
            f.write('Purity:  ' +
                    str(evaluation_measures.purity(evaluation_dict)) + "\n")
            f.write('Entropy:  ' +
                    str(evaluation_measures.entropy(evaluation_dict)) + "\n")
            f.write('RI:  ' +
                    str(evaluation_measures.rand_index(evaluation_dict)) + "\n")
            f.write('ARI:  ' +
                    str(evaluation_measures.adj_rand_index(evaluation_dict)) +
                    "\n")
Ejemplo n.º 2
0
    def evaluate_cluster(self, clase_points, cluster_points, filename,
                         nume_algoritm, nume_set_date):
        """Evaluate ``cluster_points`` against the ground-truth ``clase_points``.

        Builds a class-by-cluster contingency table, prints purity, entropy,
        rand index and adjusted rand index, and appends them to a per-algorithm
        results file ``rezultate_evaluare_<algoritm>_<set_date>.txt``.

        :param clase_points: mapping class label -> iterable of point tuples
        :param cluster_points: mapping cluster label -> iterable of point tuples
        :param filename: dataset name written into the results file
        :param nume_algoritm: algorithm name used in the results file name
        :param nume_set_date: dataset name used in the results file name
        """
        evaluation_dict = {}  # contingency table: class idx -> {cluster idx: count}
        point2cluster = {}
        point2class = {}

        # Assign every ground-truth point to a class index.
        for idx, elem in enumerate(clase_points):
            evaluation_dict[idx] = {}
            for point in clase_points[elem]:
                point2class[point] = idx

        # Assign every clustered point (keyed by its first no_dims coordinates)
        # to a cluster index and zero its column in every class row.
        for idx, elem in enumerate(cluster_points):
            for point in cluster_points[elem]:
                point2cluster[tuple(point[:self.no_dims])] = idx
            for c in evaluation_dict:
                evaluation_dict[c][idx] = 0

        # Fill the contingency table.
        for point in point2cluster:
            evaluation_dict[point2class[point]][point2cluster[point]] += 1

        print('Purity: ', evaluation_measures.purity(evaluation_dict))
        print('Entropy: ', evaluation_measures.entropy(
            evaluation_dict))  # perfect results have entropy == 0
        print('RI ', evaluation_measures.rand_index(evaluation_dict))
        print('ARI ', evaluation_measures.adj_rand_index(evaluation_dict))

        # "with" guarantees the handle is closed even if a write raises
        # (the original leaked it on exception).
        with open(
                "rezultate_evaluare_" + nume_algoritm + "_" + nume_set_date +
                ".txt", "a") as f:
            f.write("Rezultate evaluare pentru setul de date " + str(filename) +
                    "\n")
            f.write('Purity: ' + str(evaluation_measures.purity(evaluation_dict)) +
                    "\n")
            f.write('Entropy: ' +
                    str(evaluation_measures.entropy(evaluation_dict)) + "\n")
            f.write('RI: ' + str(evaluation_measures.rand_index(evaluation_dict)) +
                    "\n")
            f.write('ARI: ' +
                    str(evaluation_measures.adj_rand_index(evaluation_dict)) +
                    "\n")
            f.write("\n")
Ejemplo n.º 3
0
    def evaluateCluster(self, dataset, clusterElements):
        """Evaluate ``clusterElements`` against the class labels carried in
        ``dataset``.

        Each dataset row holds ``noDims`` coordinates followed by the
        ground-truth class label.  Builds a class-by-cluster contingency table,
        prints purity, entropy, rand index and adjusted rand index, and appends
        the same figures to ``rezultate_evaluare.txt``.

        :param dataset: iterable of rows; row[0:noDims] are coordinates and
            row[noDims] is the class label
        :param clusterElements: mapping cluster label -> iterable of elements
        """
        evaluationDict = {}  # contingency table: class label -> {cluster id: count}
        element2cluster = {}
        element2class = {}

        # The column right after the coordinates is the class label.
        labelCol = self.noDims
        for element in dataset:
            element2class[tuple(element[0:labelCol])] = element[labelCol]

        for classLabel in element2class.values():
            evaluationDict[classLabel] = {}

        # Cluster ids start at 1, matching the original hand-rolled counter.
        for idx, elem in enumerate(clusterElements, start=1):
            for element in clusterElements[elem]:
                element2cluster[tuple(element[:self.noDims])] = idx
            for c in evaluationDict:
                evaluationDict[c][idx] = 0

        # Fill the contingency table.
        for element in element2cluster:
            evaluationDict[element2class[element]][
                element2cluster[element]] += 1

        print('Purity:  ', evaluation_measures.purity(evaluationDict))
        # perfect results have entropy == 0
        print('Entropy: ', evaluation_measures.entropy(evaluationDict))
        print('RI       ', evaluation_measures.rand_index(evaluationDict))
        print('ARI      ', evaluation_measures.adj_rand_index(evaluationDict))

        # "with" guarantees the handle is closed even if a write raises
        # (the original leaked it on exception).
        with open("rezultate_evaluare.txt", "a") as f:
            f.write('Purity:  ' + str(evaluation_measures.purity(evaluationDict)) +
                    "\n")
            f.write('Entropy:  ' +
                    str(evaluation_measures.entropy(evaluationDict)) + "\n")
            f.write('RI:  ' + str(evaluation_measures.rand_index(evaluationDict)) +
                    "\n")
            f.write('ARI:  ' +
                    str(evaluation_measures.adj_rand_index(evaluationDict)) + "\n")
        print(filename)
        reader = csv.DictReader(csvfile)
        for row in reader:
            c1 = int(row['Community1'])
            c2 = int(row['Community2'])
            # Register each community id the first time it is seen, keeping a
            # bidirectional id <-> dense-index mapping.
            if c1 not in Community1toID:
                Community1toID[c1] = idx_c1
                IDtoCommunity1[idx_c1] = c1
                idx_c1 += 1
            if c2 not in Community2toID:
                Community2toID[c2] = idx_c2
                IDtoCommunity2[idx_c2] = c2
                idx_c2 += 1
            csv_data.append(row)

        # Initialise the contingency table with zeros.
        for id_c1 in IDtoCommunity1:
            cluster_dic[id_c1] = {id_c2: 0 for id_c2 in IDtoCommunity2}

        # Fill the table with the node counts from the CSV.
        for row in csv_data:
            cluster_dic[Community1toID[int(
                row['Community1'])]][Community2toID[int(
                    row['Community2'])]] = int(row['Nodes'])

        # BUG FIX: the original printed entropy() under the 'Purity' label and
        # purity() under the 'Entropy' label; the measures now match the labels.
        print('Purity:  ', evaluation_measures.purity(cluster_dic))
        print('Entropy: ', evaluation_measures.entropy(cluster_dic))
        print('RI       ', evaluation_measures.rand_index(cluster_dic))
        print('ARI      ', evaluation_measures.adj_rand_index(cluster_dic))
Ejemplo n.º 5
0
        topic_model = TopicModeling(id2word=id2word,
                                    corpus=csr_tfidf,
                                    doc2class=doc2class,
                                    num_cores=30)
        start = time()
        topics = topic_model.topicsNMF(num_topics=num_topics,
                                       num_iterations=num_iter)
        # Show the top words of every NMF topic (first element of each
        # word/score entry).
        for topic in topics:
            print("Topic", topic[0], [entry[0] for entry in topic[1]])
        end = time()
        print("NMF TFIDF c-value time", (end - start))
        print('NMF TFIDF ARI c-value:',
              evaluation_measures.adj_rand_index(topic_model.doc2topicNMF))

        print('LDA TFIDF with cvalue:')
        start = time()
        topic_model = TopicModeling(id2word=id2word,
                                    corpus=csr_tfidf,
                                    doc2class=doc2class,
                                    num_cores=30)
        topics = topic_model.topicsLDA(num_topics=num_topics,
                                       num_iterations=num_iter)
        # Show the top words of every LDA topic, same format as for NMF.
        for topic in topics:
            print("Topic", topic[0], [entry[0] for entry in topic[1]])
        end = time()
Ejemplo n.º 6
0
	clustering = DBSCAN(eps=0.29, metric='cosine', min_samples=320, n_jobs=8).fit(dataset)

	# clustering = DBSCAN(eps=distanceDec[maxSlopeIdx], min_samples=3, n_jobs=8).fit(dataset)

	# Map every point index to its DBSCAN label; enumerate replaces the
	# hand-rolled k counter of the original.
	for k, label in enumerate(clustering.labels_):
		point2cluster[k] = label

	# Zero one contingency cell per (class, distinct label) pair.  The
	# original re-zeroed the same cells once per *point* — redundant
	# O(n_points * n_classes) work; zeroing once per unique label before any
	# counting happens is equivalent.
	for label in set(clustering.labels_):
		for c in evaluationDict:
			evaluationDict[c][label] = 0

	# Fill the class-by-cluster contingency table.
	for point in point2cluster:
		evaluationDict[point2class[point]][point2cluster[point]] += 1

	print(evaluation_measures.adj_rand_index(evaluationDict))

	matriceContingenta = evaluation_measures.construct_cont_table(evaluationDict)

	print(matriceContingenta)

	np.savetxt('matriceContingenta.txt', matriceContingenta, delimiter=',', fmt='%s')

	# sklearn external-validation scores against the ground-truth labels.
	print(metrics.homogeneity_score(labels, clustering.labels_))
	print(metrics.completeness_score(labels, clustering.labels_))
	print(metrics.v_measure_score(labels, clustering.labels_))
	print(metrics.adjusted_rand_score(labels, clustering.labels_))

	print("=======================================================================")