#!/usr/bin/python3.2
# -*- coding: utf-8 -*-
# Module-level set-up for clustering TF-IDF vectors: opens the vector file
# and fixes the problem size (number of points, number of clusters).

import math
import random as rnd

import numpy as np

from tfidfStorage import TFIDF_reader, tfidf_cosine_distance as cosdist
from cluster import Cluster

# Reader over the stored TF-IDF vectors.
reader = TFIDF_reader("test.vectors")

# Number of tfidf vectors (as reported by the reader).
npoint = reader.doc_nb

# Number of clusters.
ncluster = 100

# --- Disabled code, kept for reference -------------------------------------
# Initialize centroids so that they are as far from one another as possible.
#
# # Centroids indices in tfidf_list
# centroids_idx = [0]
# # Distance of each point to the closest cluster
# mindist = np.array([tfidfdct.cosdist(tfidf_list[0]) for tfidfdct in tfidf_list])
# for i in range(1, ncluster):
#     max_idx = np.argmax(mindist)
#     centroids_idx.append(max_idx)
#     dist = np.array([tfidfdct.cosdist(tfidf_list[max_idx]) for tfidfdct in tfidf_list])
#     mindist = np.amin(np.vstack((mindist, dist)), 0)
# clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx]
def threshold_cosine(initial, max, max_count):
    """Build a stateful generator of linearly increasing thresholds.

    The returned zero-argument callable yields, on its k-th call,
    ``initial + (max - initial) * k / max_count`` as a float, so the
    ``max_count``-th call returns (up to rounding) ``max``.  Values are not
    clamped: calls after the ``max_count``-th keep following the same line
    past ``max``.

    Args:
        initial: Starting point of the ramp (the first call already returns
            one step beyond it).
        max: Value reached on the ``max_count``-th call.  The name shadows
            the builtin but is kept for caller compatibility.
        max_count: Number of calls needed to go from ``initial`` to ``max``.
            Must be non-zero.

    Returns:
        A callable with a public integer attribute ``count`` holding the
        number of calls made so far.

    Raises:
        ValueError: If ``max_count`` is zero (the step size would be
            undefined and every call would fail with a division by zero).
    """
    if max_count == 0:
        # Fail at construction instead of deep inside the first call.
        raise ValueError("max_count must be non-zero")

    def next_threshold():
        next_threshold.count += 1
        # Same operation order as the historical formula so that results
        # stay bit-for-bit identical.
        step = float((max - initial) * next_threshold.count) / float(max_count)
        return float(initial) + step

    # Call counter lives on the function object so callers can inspect or
    # reset it.
    next_threshold.count = 0
    return next_threshold