Ejemplo n.º 1
0
# Vocabulary size is taken as the number of lines in the IDF file.
with open('./data_set/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())

num_cluster = 20

Kmean = Kmeans(num_clusters=num_cluster, num_word_vocab=vocab_size)
print(Kmean._num_clusters)
print(Kmean._num_word_vocab)

# Load data
Kmean.load_data('./data_set/train_tf_idf.txt')

max_purity = -1
max_NMI = -1
choose_seed = 0

# Run K-means with seeds 1..10 and keep the seed that yields the best purity.
for i in range(10):
    seed = i + 1
    Kmean.run(seed_value=seed, criterion='centroid', threshold=0)
    # Evaluate each metric once per run and reuse the values below.
    # NOTE(review): assumes compute_purity()/compute_NMI() return the same
    # value when called repeatedly after a run -- confirm against Kmeans.
    purity = Kmean.compute_purity()
    nmi = Kmean.compute_NMI()
    print(purity)
    print(nmi)
    if purity > max_purity:
        max_purity = purity
        max_NMI = nmi
        # Record the seed actually passed to run(), not the loop index.
        choose_seed = seed
print()
print(' Best compute_purity is ' + str(max_purity) + ' with seed ' +
      str(choose_seed))
print(' When seed is ' + str(choose_seed) + ' NMI = ' + str(max_NMI))
Ejemplo n.º 2
0
# Toy data set: three 2-D points labelled 1 and three labelled 0.
member1 = Member(label=1, doc_id=1, r_d=[0, 0])
member2 = Member(label=1, doc_id=1, r_d=[1, 0])
member3 = Member(label=1, doc_id=1, r_d=[0, 1])
member4 = Member(label=0, doc_id=1, r_d=[5, 0])
member5 = Member(label=0, doc_id=1, r_d=[6, 0])
member6 = Member(label=0, doc_id=1, r_d=[5, 1])

Kmean = Kmeans(num_clusters=2, num_word_vocab=2)

# Feed the points to the model directly instead of going through load_data().
for member in (member1, member2, member3, member4, member5, member6):
    Kmean._data.append(member)

# Per-label counts are set by hand to match the six members above.
Kmean._label_count = {0: 3, 1: 3}

Kmean.run(seed_value=1, criterion='centroid', threshold=0)
print(Kmean.compute_purity())
print(Kmean.compute_NMI())
# Original author's note on the output: 1




####################################################################################################
Test SVM
Load data

# Load the train/test feature matrices (SciPy sparse .npz) and their label
# arrays (NumPy .npy).
X_train = scipy.sparse.load_npz('./data_set/X_train.npz')
Y_train = np.load('./data_set/Y_train.npy')
X_test = scipy.sparse.load_npz('./data_set/X_test.npz')
Y_test = np.load('./data_set/Y_test.npy')

# A sparse matrix exposes .shape directly; calling toarray() first would
# allocate the full dense matrix just to read its dimensions.
print(X_train.shape)
print(Y_train.shape)