# Build the vocabulary size from the IDF file (one "word<sep>idf" entry per line).
with open('./data_set/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())

num_cluster = 20
Kmean = Kmeans(num_clusters=num_cluster, num_word_vocab=vocab_size)
print(Kmean._num_clusters)
print(Kmean._num_word_vocab)

# Load data
Kmean.load_data('./data_set/train_tf_idf.txt')

max_purity = -1
max_NMI = -1
choose_seed = 0
# Run and choose the best seed.
for i in range(10):
    seed = i + 1
    Kmean.run(seed_value=seed, criterion='centroid', threshold=0)
    # Compute each metric once per run (the originals re-ran the full
    # evaluation up to three times per iteration).
    purity = Kmean.compute_purity()
    nmi = Kmean.compute_NMI()
    print(purity)
    print(nmi)
    if purity > max_purity:
        max_purity = purity
        max_NMI = nmi
        # BUG FIX: record the seed actually passed to run() (i + 1),
        # not the bare loop index i.
        choose_seed = seed
    print()
# BUG FIX: added the missing space before 'with seed' so the message
# no longer renders as e.g. "0.5with seed 3".
print(' Best compute_purity is ' + str(max_purity) + ' with seed ' + str(choose_seed))
print(' When seed is ' + str(choose_seed) + ' NMI = ' + str(max_NMI))
# Hand-built 2-cluster sanity check: three points near (0, 0) labelled 1
# and three points near (5, 0) labelled 0 — well separated, so a correct
# k-means run should recover the labelling perfectly (purity == 1).
member1 = Member(label=1, doc_id=1, r_d=[0, 0])
member2 = Member(label=1, doc_id=1, r_d=[1, 0])
member3 = Member(label=1, doc_id=1, r_d=[0, 1])
member4 = Member(label=0, doc_id=1, r_d=[5, 0])
member5 = Member(label=0, doc_id=1, r_d=[6, 0])
member6 = Member(label=0, doc_id=1, r_d=[5, 1])

Kmean = Kmeans(num_clusters=2, num_word_vocab=2)
# Inject the fixture directly instead of going through load_data().
Kmean._data.extend([member1, member2, member3, member4, member5, member6])
Kmean._label_count = {0: 3, 1: 3}

Kmean.run(seed_value=1, criterion='centroid', threshold=0)
print(Kmean.compute_purity())
print(Kmean.compute_NMI())  # expected: 1 on this separable fixture

# ---------------------------------------------------------------------------
# Test SVM: load the vectorized train/test split.
X_train, Y_train = scipy.sparse.load_npz('./data_set/X_train.npz'), np.load('./data_set/Y_train.npy')
X_test, Y_test = scipy.sparse.load_npz('./data_set/X_test.npz'), np.load('./data_set/Y_test.npy')

# PERF FIX: a sparse matrix already exposes .shape — the original
# X_train.toarray().shape densified the full TF-IDF matrix (potentially
# huge) just to print the same tuple.
print(X_train.shape)
print(Y_train.shape)