Example no. 1
def load():
	data = genfromtxt('../dataset/' + dataFile + '.csv', delimiter=',')
	label = genfromtxt('../dataset/' + labelFile + '.csv', delimiter=',')
	res = pickle.load(open(FN,'rb'))
	AE = res['autoencoder']
	encodedX = AE.encoder(AE.X)

	X = encodedX.data.numpy()
	#X = preprocessing.scale(encodedX.data.numpy())


	d_matrix = sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean')
	s = np.median(d_matrix)
	Vgamma = 1/(2*s*s)
	spAlloc = SpectralClustering(2, gamma=Vgamma).fit_predict(X)
	nmi_sp = np.around(normalized_mutual_info_score(label, spAlloc), 3)


	kmAlloc = KMeans(2).fit_predict(X)
	nmi_km = np.around(normalized_mutual_info_score(label, kmAlloc), 3)

	print X

	print nmi_sp
	print nmi_km

	print res['loss']
	#print res['autoencoder']

	txt = dataFile + ' nmiSP : ' + str(nmi_sp) + ' , nmiKM : ' + str(nmi_km) + ' , num_of_layers:' + str(num_of_layers) + ' , num_of_output:' +  str(num_of_output) + '\n'

	fin = open('auto_out.txt','a')
	fin.write(txt)
	fin.close()
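
The gamma used above follows the common median-distance heuristic for an RBF affinity. A minimal, self-contained sketch of the same idea on synthetic data (illustrative only, not from the original project):

# Median-distance heuristic for the RBF gamma of SpectralClustering,
# evaluated with NMI against the known blob labels.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.cluster import normalized_mutual_info_score

X, y = make_blobs(n_samples=200, centers=2, random_state=0)
s = np.median(pairwise_distances(X))   # median pairwise Euclidean distance
gamma = 1.0 / (2.0 * s * s)            # RBF bandwidth from the median heuristic
pred = SpectralClustering(n_clusters=2, gamma=gamma).fit_predict(X)
print(normalized_mutual_info_score(y, pred))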
Example no. 2
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    for i in np.logspace(1, 4, 4).astype(np.int):
        labels_a, labels_b = np.ones(i, dtype=np.int), np.arange(i, dtype=np.int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(np.int):
        labels_a, labels_b = (np.ones(i, dtype=np.int),
                              np.arange(i, dtype=np.int))
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                method) == 0.0
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
def evaluation(X_selected, n_clusters, y):
    """
    This function calculates ACC and NMI of clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
            input data on the selected features
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_samples,)
            true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, precompute_distances=True, verbose=0,
                     random_state=None, copy_x=True, n_jobs=1)

    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return nmi, acc
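
best_map above is assumed to be a helper from the surrounding project that permutes the predicted cluster labels to best match the true ones. A hedged sketch of that matching step using SciPy's Hungarian solver (the helper name hungarian_accuracy is mine, not from the source):

# Permutation-matched clustering accuracy via the Hungarian algorithm,
# i.e. the kind of mapping a best_map-style helper computes.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, confusion_matrix

def hungarian_accuracy(y_true, y_pred):
    labels = np.unique(np.concatenate([y_true, y_pred]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)   # rows: true, cols: predicted
    row_ind, col_ind = linear_sum_assignment(-cm)           # maximize matched counts
    remap = {labels[c]: labels[r] for r, c in zip(row_ind, col_ind)}
    return accuracy_score(y_true, [remap[p] for p in y_pred])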
Example no. 6
def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    # lda_model = gensim.models.ldamodel.LdaModel.load(model_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        topic_dist = lda_model[doc]
        # print topic_dist
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 5000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    # print len(sys_labels)
    # print len(gold_labels)

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = rand_index(gold_labels, sys_labels)

    # print 'NMI: %f' % normalized_mutual_info_score(gold_labels, sys_labels)
    # print 'Purity: %f' % purity(gold_labels, sys_labels)
    # print 'Accuracy: %f' % cluster_accuracy(gold_labels, sys_labels)

    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    return nmi_score, purity_score, ri_score
def NMI(groundTruth, predictionResult):
	oneListGT = _labelPreprocessing(groundTruth)
	oneListPR = _labelPreprocessing(predictionResult)
	return normalized_mutual_info_score(oneListGT, oneListPR)
# This shows how NMI can produce a high result when arguably it should not (see the short sketch after this commented-out block)
# I5SIM3DatasetTrueClassification = [[1, 22, 52, 67, 84, 88, 106, 124, 138, 156, 167, 172, 204, 228, 240, 245, 256, 283, 313, 322, 337, 355, 367, 375, 380, 382, 405, 421, 422, 449, 451, 452, 464, 468, 469, 519, 520, 539, 566, 596, 612, 627, 628, 642, 656, 683, 718, 780, 808, 817, 830, 831, 833, 835, 852, 853, 854, 870, 876, 878, 927, 948, 952, 958, 968, 972, 976, 1005, 1016, 1024, 1058, 1108, 1122, 1123, 1149,1152, 1190, 1217, 1236, 1243, 1244, 1257, 1260, 1325, 1331, 1346, 1348, 1375, 1382, 1390, 1393, 1416, 1433, 1445, 1451, 1491, 1521, 1557, 1580, 1588],
# [36, 71,76, 80, 85, 155, 157, 171, 182, 211, 215, 224, 237, 238, 239, 292, 298, 311, 315, 329, 342, 361, 370, 384, 403, 415, 416, 419, 425, 437, 483, 485, 488, 497, 522, 528, 545, 561, 569, 571, 572, 574, 621, 645, 653, 664, 674, 699, 712, 732, 734, 740, 755, 773, 774, 802, 828, 841, 846, 872, 896, 903, 906, 940, 991, 1006, 1013, 1042, 1074, 1085, 1100, 1125, 1148, 1170, 1179, 1180, 1203, 1237, 1239, 1252, 1256, 1274, 1279, 1289, 1297, 1307, 1353, 1358, 1383, 1415, 1417, 1431, 1438,1449, 1457, 1506, 1542, 1544, 1547, 1579],
# [2, 8, 18, 32, 45, 59, 64, 79, 86, 96, 103, 110, 140, 166, 174, 199, 201, 202, 214, 235, 242, 247, 250, 262, 273, 278, 279, 286, 290, 326, 344, 345, 350, 363, 364, 424, 426, 427, 447, 470, 493, 501, 523, 526, 548, 563, 586, 599, 601, 602, 605, 632, 643, 661, 662, 677, 694, 702, 706, 751, 768, 815, 832, 871, 892, 930, 932, 934, 973, 984, 994, 1010, 1064,1080, 1115, 1133, 1136, 1166, 1173, 1258, 1298, 1302, 1313, 1319, 1320, 1321, 1326, 1328, 1333, 1341, 1349, 1385, 1403, 1478, 1489, 1512, 1556, 1559, 1565, 1572], 
# [12, 25, 29, 97, 102, 107, 116, 119, 132, 141, 158, 178, 181, 252, 253, 309,332, 356, 359, 373, 374, 393, 402, 420, 433, 438, 448, 459, 472, 480, 482, 491,515, 517, 529, 530, 543, 581, 618, 623, 636, 650, 676, 687, 727, 728, 737, 742,787, 793, 807, 857, 858, 863, 882, 904, 907, 957, 969, 970, 971, 975, 1000, 1007, 1011, 1043, 1050, 1056, 1069, 1112, 1141, 1145, 1147, 1151, 1157, 1163, 1172,1246, 1316, 1335, 1352, 1360, 1376, 1387, 1399, 1412, 1468, 1505, 1509, 1510, 1513, 1514, 1517, 1529, 1567, 1576, 1578, 1581, 1583, 1591],
# [6, 21, 23, 34, 42, 53, 77, 90, 154, 168, 195, 264, 267, 269, 282, 284, 335, 336, 352, 354, 358, 379,400, 434, 444, 453, 463, 509, 511, 554, 556, 559, 580, 591, 594, 595, 615, 640,707, 714, 749, 756, 763, 764, 765, 792, 821, 843, 855, 860, 875, 894, 897, 920,954, 1002, 1012, 1025, 1028, 1057, 1076, 1083, 1109, 1127, 1130, 1134, 1144, 1153, 1160, 1194, 1195, 1198, 1207, 1209, 1242, 1245, 1262, 1265, 1284, 1373, 1392, 1407, 1429, 1446, 1461, 1470, 1477, 1479, 1485, 1511, 1519, 1520, 1534, 1539,1554, 1560, 1566, 1577, 1590, 1599],
# [14, 50, 55, 73, 94, 100, 109, 112, 144, 149, 183, 200, 207, 234, 249, 251, 258, 275, 297, 301, 305, 323, 346, 357, 386, 404, 409, 436, 440, 458, 496, 504, 525, 538, 570, 583, 608, 634, 635, 637, 665, 668, 673, 679, 682, 688, 689, 693, 711, 716, 752, 760, 783, 794, 819, 823, 836, 847, 881, 891, 893, 943, 946, 961, 974, 988, 995, 1001, 1003, 1040, 1075, 1155, 1171, 1228, 1259, 1264, 1269, 1291, 1304, 1305, 1308, 1332, 1343, 1356, 1357, 1398, 1401, 1427, 1428, 1435, 1482, 1483, 1484, 1496, 1533, 1535, 1555, 1574, 1586,1589],
# [5, 19, 46, 63, 68, 82, 111, 120, 129, 131, 145, 151, 152, 170, 176, 184,203, 208, 216, 220, 227, 246, 263, 302, 308, 331, 334, 351, 376, 378, 387, 396,408, 412, 428, 489, 500, 502, 555, 577, 622, 631, 651, 652, 686, 705, 745, 747,782, 798, 827, 874, 884, 885, 928, 937, 939, 956, 983, 1009, 1018, 1030, 1039,1062, 1063, 1070, 1072, 1073, 1084, 1093, 1103, 1107, 1128, 1158, 1168, 1169, 1224, 1227, 1230, 1272, 1315, 1327, 1336, 1388, 1394, 1395, 1400, 1409, 1432, 1447, 1448, 1452, 1453, 1465, 1476, 1503, 1515, 1523, 1525, 1582], 
# [9, 13, 15, 41, 44, 93, 113, 122, 134, 136, 139, 173, 185, 189, 205, 241, 259, 260, 268, 272, 299, 318, 327, 353, 389, 394, 413, 430, 479, 492, 499, 503, 516, 531, 535, 573, 579, 590, 609, 614, 638, 660, 666, 670, 691, 717, 724, 757, 762, 796, 801, 803, 809, 811, 838, 844, 873, 888, 899, 905, 908, 913, 918, 929, 953, 977, 982, 990, 992, 1020, 1021, 1051, 1094, 1116, 1124, 1164, 1183, 1184, 1199, 1208, 1219, 1226,1303, 1306, 1323, 1334, 1345, 1354, 1364, 1374, 1377, 1402, 1422, 1501, 1531, 1540, 1553, 1585, 1592, 1597],
# [3, 20, 26, 39, 40, 57, 70, 78, 87, 92, 126, 133, 143, 147, 153, 160, 190, 198, 206, 210, 254, 270, 280, 295, 314, 321, 338, 339, 347, 360, 365, 399, 401, 429, 435, 471, 475, 487, 490, 505, 506, 507, 565, 600, 624, 625, 649, 654, 659, 675, 709, 713, 719, 722, 766, 775, 784, 790, 820, 849, 887, 914, 938, 941, 960, 962, 999, 1054, 1060, 1095, 1099, 1106, 1139, 1142, 1154, 1165, 1186, 1189, 1197, 1273, 1276, 1295, 1301, 1309, 1314, 1324, 1366, 1368,1379, 1436, 1440, 1450, 1459, 1473, 1500, 1528, 1532, 1558, 1563, 1568],
# [0, 4,27, 56, 66, 91, 117, 118, 121, 142, 194, 209, 221, 236, 243, 244, 248, 277, 293,317, 333, 348, 371, 481, 510, 514, 532, 551, 568, 575, 585, 604, 620, 629, 644,681, 700, 704, 720, 726, 743, 748, 770, 779, 795, 850, 851, 859, 867, 889, 911,924, 933, 949, 955, 967, 989, 998, 1019, 1052, 1098, 1105, 1117, 1126, 1131, 1132, 1138, 1156, 1162, 1167, 1175, 1176, 1200, 1238, 1268, 1277, 1278, 1283, 1292, 1310, 1330, 1339, 1350, 1372, 1391, 1404, 1406, 1419, 1463, 1466, 1480, 1481,1490, 1498, 1526, 1538, 1549, 1562, 1570, 1598],
# [24, 51, 54, 74, 108, 130, 148,186, 196, 226, 230, 261, 281, 294, 304, 307, 349, 362, 372, 383, 417, 465, 477,537, 546, 553, 560, 578, 592, 593, 611, 613, 616, 619, 630, 658, 669, 685, 692,696, 791, 805, 806, 834, 837, 840, 845, 856, 868, 869, 898, 915, 945, 951, 964,987, 1004, 1008, 1015, 1036, 1041, 1045, 1071, 1081, 1082, 1088, 1111, 1119, 1120, 1135, 1143, 1159, 1174, 1177, 1193, 1202, 1205, 1221, 1248, 1253, 1280, 1281, 1290, 1293, 1311, 1312, 1340, 1378, 1397, 1405, 1474, 1475, 1486, 1487, 1516,1518, 1537, 1552, 1573, 1596],
# [16, 28, 37, 104, 128, 159, 164, 175, 187, 188, 212, 217, 223, 255, 312, 341, 343, 392, 397, 398, 406, 410, 418, 454, 455, 461, 462, 478, 494, 495, 512, 540, 550, 558, 597, 626, 633, 729, 735, 741, 750, 761, 781, 797, 799, 814, 822, 824, 879, 900, 910, 966, 979, 981, 993, 996, 1037, 1044,1046, 1053, 1061, 1065, 1077, 1079, 1096, 1097, 1113, 1121, 1146, 1150, 1182, 1185, 1196, 1210, 1235, 1241, 1254, 1263, 1275, 1285, 1287, 1338, 1355, 1359, 1363, 1380, 1381, 1396, 1437, 1441, 1467, 1493, 1494, 1495, 1522, 1541, 1546, 1551,1564, 1575],
# [11, 17, 31, 48, 75, 89, 95, 98, 115, 123, 125, 135, 137, 161, 179, 191, 219, 222, 257, 266, 276, 291, 300, 310, 320, 324, 411, 473, 476, 498, 518, 534, 557, 576, 582, 587, 639, 646, 648, 663, 701, 710, 723, 731, 744, 759, 772, 776, 812, 813, 816, 818, 839, 861, 864, 866, 883, 886, 916, 921, 922, 965, 980, 985, 997, 1031, 1032, 1038, 1047, 1059, 1114, 1137, 1191, 1201, 1206, 1215, 1218, 1223, 1234, 1249, 1251, 1266, 1288, 1317, 1318, 1370, 1371, 1414, 1420, 1430, 1439, 1444, 1460, 1464, 1507, 1508, 1524, 1536, 1543, 1548],
# [7, 10, 35, 58, 61, 69, 99, 146, 163, 165, 192, 213, 231, 233, 274, 287, 328, 330, 366, 377, 390,395, 445, 446, 450, 456, 460, 508, 536, 541, 547, 549, 564, 567, 598, 606, 617,657, 671, 672, 695, 703, 725, 733, 736, 754, 769, 771, 778, 785, 786, 800, 810,829, 865, 877, 880, 909, 919, 931, 935, 942, 963, 986, 1023, 1026, 1055, 1068,1086, 1089, 1101, 1102, 1104, 1110, 1140, 1181, 1212, 1229, 1267, 1270, 1286, 1322, 1337, 1347, 1361, 1362, 1369, 1384, 1411, 1413, 1423, 1426, 1456, 1469, 1472, 1488, 1499, 1530, 1561, 1595],
# [38, 43, 60, 62, 101, 225, 229, 232, 285, 288,289, 306, 316, 319, 340, 368, 381, 423, 439, 441, 457, 467, 474, 521, 533, 542,544, 552, 562, 588, 589, 603, 610, 641, 647, 698, 708, 721, 730, 746, 753, 767,788, 825, 862, 890, 895, 902, 912, 917, 925, 926, 947, 959, 1017, 1022, 1029, 1048, 1049, 1066, 1078, 1091, 1092, 1129, 1187, 1192, 1204, 1213, 1216, 1222, 1225, 1231, 1232, 1233, 1247, 1250, 1261, 1271, 1294, 1296, 1299, 1344, 1367, 1389,1410, 1421, 1424, 1425, 1434, 1442, 1443, 1454, 1455, 1458, 1471, 1492, 1550, 1584, 1587, 1593],
# [30, 33, 47, 49, 65, 72, 81, 83, 105, 114, 127, 150, 162, 169,177, 180, 193, 197, 218, 265, 271, 296, 303, 325, 369, 385, 388, 391, 407, 414,431, 432, 442, 443, 466, 484, 486, 513, 524, 527, 584, 607, 655, 667, 678, 680,684, 690, 697, 715, 738, 739, 758, 777, 789, 804, 826, 842, 848, 901, 923, 936,944, 950, 978, 1014, 1027, 1033, 1034, 1035, 1067, 1087, 1090, 1118, 1161, 1178,1188, 1211, 1214, 1220, 1240, 1255, 1282, 1300, 1329, 1342, 1351, 1365, 1386, 1408, 1418, 1462, 1497, 1502, 1504, 1527, 1545, 1569, 1571, 1594]]

# I5SIMTestDatasetTrueClassification = [[1, 22, 52, 67, 84, 88, 106, 124, 138, 156, 167, 172, 204, 228, 240, 245, 256, 283, 313, 322, 337, 355, 367, 375, 380, 382, 405, 421, 422, 449, 451, 452, 464, 468, 469, 519, 520, 539, 566, 596, 612, 627, 628, 642, 656, 683, 718, 780, 808, 817, 830, 831, 833, 835, 852, 853, 854, 870, 876, 878, 927, 948, 952, 958, 968, 972, 976, 1005, 1016, 1024, 1058, 1108, 1122, 1123, 1149,1152, 1190, 1217, 1236, 1243, 1244, 1257, 1260, 1325, 1331, 1346, 1348, 1375, 1382, 1390, 1393, 1416, 1433, 1445, 1451, 1491, 1521, 1557, 1580, 1588,36, 71,76, 80, 85, 155, 157, 171, 182, 211, 215, 224, 237, 238, 239, 292, 298, 311, 315, 329, 342, 361, 370, 384, 403, 415, 416, 419, 425, 437, 483, 485, 488, 497, 522, 528, 545, 561, 569, 571, 572, 574, 621, 645, 653, 664, 674, 699, 712, 732, 734, 740, 755, 773, 774, 802, 828, 841, 846, 872, 896, 903, 906, 940, 991, 1006, 1013, 1042, 1074, 1085, 1100, 1125, 1148, 1170, 1179, 1180, 1203, 1237, 1239, 1252, 1256, 1274, 1279, 1289, 1297, 1307, 1353, 1358, 1383, 1415, 1417, 1431, 1438,1449, 1457, 1506, 1542, 1544, 1547, 1579],
# [2, 8, 18, 32, 45, 59, 64, 79, 86, 96, 103, 110, 140, 166, 174, 199, 201, 202, 214, 235, 242, 247, 250, 262, 273, 278, 279, 286, 290, 326, 344, 345, 350, 363, 364, 424, 426, 427, 447, 470, 493, 501, 523, 526, 548, 563, 586, 599, 601, 602, 605, 632, 643, 661, 662, 677, 694, 702, 706, 751, 768, 815, 832, 871, 892, 930, 932, 934, 973, 984, 994, 1010, 1064,1080, 1115, 1133, 1136, 1166, 1173, 1258, 1298, 1302, 1313, 1319, 1320, 1321, 1326, 1328, 1333, 1341, 1349, 1385, 1403, 1478, 1489, 1512, 1556, 1559, 1565, 1572, 12, 25, 29, 97, 102, 107, 116, 119, 132, 141, 158, 178, 181, 252, 253, 309,332, 356, 359, 373, 374, 393, 402, 420, 433, 438, 448, 459, 472, 480, 482, 491,515, 517, 529, 530, 543, 581, 618, 623, 636, 650, 676, 687, 727, 728, 737, 742,787, 793, 807, 857, 858, 863, 882, 904, 907, 957, 969, 970, 971, 975, 1000, 1007, 1011, 1043, 1050, 1056, 1069, 1112, 1141, 1145, 1147, 1151, 1157, 1163, 1172,1246, 1316, 1335, 1352, 1360, 1376, 1387, 1399, 1412, 1468, 1505, 1509, 1510, 1513, 1514, 1517, 1529, 1567, 1576, 1578, 1581, 1583, 1591],
# [6, 21, 23, 34, 42, 53, 77, 90, 154, 168, 195, 264, 267, 269, 282, 284, 335, 336, 352, 354, 358, 379,400, 434, 444, 453, 463, 509, 511, 554, 556, 559, 580, 591, 594, 595, 615, 640,707, 714, 749, 756, 763, 764, 765, 792, 821, 843, 855, 860, 875, 894, 897, 920,954, 1002, 1012, 1025, 1028, 1057, 1076, 1083, 1109, 1127, 1130, 1134, 1144, 1153, 1160, 1194, 1195, 1198, 1207, 1209, 1242, 1245, 1262, 1265, 1284, 1373, 1392, 1407, 1429, 1446, 1461, 1470, 1477, 1479, 1485, 1511, 1519, 1520, 1534, 1539,1554, 1560, 1566, 1577, 1590, 1599, 14, 50, 55, 73, 94, 100, 109, 112, 144, 149, 183, 200, 207, 234, 249, 251, 258, 275, 297, 301, 305, 323, 346, 357, 386, 404, 409, 436, 440, 458, 496, 504, 525, 538, 570, 583, 608, 634, 635, 637, 665, 668, 673, 679, 682, 688, 689, 693, 711, 716, 752, 760, 783, 794, 819, 823, 836, 847, 881, 891, 893, 943, 946, 961, 974, 988, 995, 1001, 1003, 1040, 1075, 1155, 1171, 1228, 1259, 1264, 1269, 1291, 1304, 1305, 1308, 1332, 1343, 1356, 1357, 1398, 1401, 1427, 1428, 1435, 1482, 1483, 1484, 1496, 1533, 1535, 1555, 1574, 1586,1589],
# [5, 19, 46, 63, 68, 82, 111, 120, 129, 131, 145, 151, 152, 170, 176, 184,203, 208, 216, 220, 227, 246, 263, 302, 308, 331, 334, 351, 376, 378, 387, 396,408, 412, 428, 489, 500, 502, 555, 577, 622, 631, 651, 652, 686, 705, 745, 747,782, 798, 827, 874, 884, 885, 928, 937, 939, 956, 983, 1009, 1018, 1030, 1039,1062, 1063, 1070, 1072, 1073, 1084, 1093, 1103, 1107, 1128, 1158, 1168, 1169, 1224, 1227, 1230, 1272, 1315, 1327, 1336, 1388, 1394, 1395, 1400, 1409, 1432, 1447, 1448, 1452, 1453, 1465, 1476, 1503, 1515, 1523, 1525, 1582, 9, 13, 15, 41, 44, 93, 113, 122, 134, 136, 139, 173, 185, 189, 205, 241, 259, 260, 268, 272, 299, 318, 327, 353, 389, 394, 413, 430, 479, 492, 499, 503, 516, 531, 535, 573, 579, 590, 609, 614, 638, 660, 666, 670, 691, 717, 724, 757, 762, 796, 801, 803, 809, 811, 838, 844, 873, 888, 899, 905, 908, 913, 918, 929, 953, 977, 982, 990, 992, 1020, 1021, 1051, 1094, 1116, 1124, 1164, 1183, 1184, 1199, 1208, 1219, 1226,1303, 1306, 1323, 1334, 1345, 1354, 1364, 1374, 1377, 1402, 1422, 1501, 1531, 1540, 1553, 1585, 1592, 1597],
# [3, 20, 26, 39, 40, 57, 70, 78, 87, 92, 126, 133, 143, 147, 153, 160, 190, 198, 206, 210, 254, 270, 280, 295, 314, 321, 338, 339, 347, 360, 365, 399, 401, 429, 435, 471, 475, 487, 490, 505, 506, 507, 565, 600, 624, 625, 649, 654, 659, 675, 709, 713, 719, 722, 766, 775, 784, 790, 820, 849, 887, 914, 938, 941, 960, 962, 999, 1054, 1060, 1095, 1099, 1106, 1139, 1142, 1154, 1165, 1186, 1189, 1197, 1273, 1276, 1295, 1301, 1309, 1314, 1324, 1366, 1368,1379, 1436, 1440, 1450, 1459, 1473, 1500, 1528, 1532, 1558, 1563, 1568, 0, 4,27, 56, 66, 91, 117, 118, 121, 142, 194, 209, 221, 236, 243, 244, 248, 277, 293,317, 333, 348, 371, 481, 510, 514, 532, 551, 568, 575, 585, 604, 620, 629, 644,681, 700, 704, 720, 726, 743, 748, 770, 779, 795, 850, 851, 859, 867, 889, 911,924, 933, 949, 955, 967, 989, 998, 1019, 1052, 1098, 1105, 1117, 1126, 1131, 1132, 1138, 1156, 1162, 1167, 1175, 1176, 1200, 1238, 1268, 1277, 1278, 1283, 1292, 1310, 1330, 1339, 1350, 1372, 1391, 1404, 1406, 1419, 1463, 1466, 1480, 1481,1490, 1498, 1526, 1538, 1549, 1562, 1570, 1598],
# [24, 51, 54, 74, 108, 130, 148,186, 196, 226, 230, 261, 281, 294, 304, 307, 349, 362, 372, 383, 417, 465, 477,537, 546, 553, 560, 578, 592, 593, 611, 613, 616, 619, 630, 658, 669, 685, 692,696, 791, 805, 806, 834, 837, 840, 845, 856, 868, 869, 898, 915, 945, 951, 964,987, 1004, 1008, 1015, 1036, 1041, 1045, 1071, 1081, 1082, 1088, 1111, 1119, 1120, 1135, 1143, 1159, 1174, 1177, 1193, 1202, 1205, 1221, 1248, 1253, 1280, 1281, 1290, 1293, 1311, 1312, 1340, 1378, 1397, 1405, 1474, 1475, 1486, 1487, 1516,1518, 1537, 1552, 1573, 1596, 16, 28, 37, 104, 128, 159, 164, 175, 187, 188, 212, 217, 223, 255, 312, 341, 343, 392, 397, 398, 406, 410, 418, 454, 455, 461, 462, 478, 494, 495, 512, 540, 550, 558, 597, 626, 633, 729, 735, 741, 750, 761, 781, 797, 799, 814, 822, 824, 879, 900, 910, 966, 979, 981, 993, 996, 1037, 1044,1046, 1053, 1061, 1065, 1077, 1079, 1096, 1097, 1113, 1121, 1146, 1150, 1182, 1185, 1196, 1210, 1235, 1241, 1254, 1263, 1275, 1285, 1287, 1338, 1355, 1359, 1363, 1380, 1381, 1396, 1437, 1441, 1467, 1493, 1494, 1495, 1522, 1541, 1546, 1551,1564, 1575],
# [11, 17, 31, 48, 75, 89, 95, 98, 115, 123, 125, 135, 137, 161, 179, 191, 219, 222, 257, 266, 276, 291, 300, 310, 320, 324, 411, 473, 476, 498, 518, 534, 557, 576, 582, 587, 639, 646, 648, 663, 701, 710, 723, 731, 744, 759, 772, 776, 812, 813, 816, 818, 839, 861, 864, 866, 883, 886, 916, 921, 922, 965, 980, 985, 997, 1031, 1032, 1038, 1047, 1059, 1114, 1137, 1191, 1201, 1206, 1215, 1218, 1223, 1234, 1249, 1251, 1266, 1288, 1317, 1318, 1370, 1371, 1414, 1420, 1430, 1439, 1444, 1460, 1464, 1507, 1508, 1524, 1536, 1543, 1548, 7, 10, 35, 58, 61, 69, 99, 146, 163, 165, 192, 213, 231, 233, 274, 287, 328, 330, 366, 377, 390,395, 445, 446, 450, 456, 460, 508, 536, 541, 547, 549, 564, 567, 598, 606, 617,657, 671, 672, 695, 703, 725, 733, 736, 754, 769, 771, 778, 785, 786, 800, 810,829, 865, 877, 880, 909, 919, 931, 935, 942, 963, 986, 1023, 1026, 1055, 1068,1086, 1089, 1101, 1102, 1104, 1110, 1140, 1181, 1212, 1229, 1267, 1270, 1286, 1322, 1337, 1347, 1361, 1362, 1369, 1384, 1411, 1413, 1423, 1426, 1456, 1469, 1472, 1488, 1499, 1530, 1561, 1595],
# [38, 43, 60, 62, 101, 225, 229, 232, 285, 288,289, 306, 316, 319, 340, 368, 381, 423, 439, 441, 457, 467, 474, 521, 533, 542,544, 552, 562, 588, 589, 603, 610, 641, 647, 698, 708, 721, 730, 746, 753, 767,788, 825, 862, 890, 895, 902, 912, 917, 925, 926, 947, 959, 1017, 1022, 1029, 1048, 1049, 1066, 1078, 1091, 1092, 1129, 1187, 1192, 1204, 1213, 1216, 1222, 1225, 1231, 1232, 1233, 1247, 1250, 1261, 1271, 1294, 1296, 1299, 1344, 1367, 1389,1410, 1421, 1424, 1425, 1434, 1442, 1443, 1454, 1455, 1458, 1471, 1492, 1550, 1584, 1587, 1593, 30, 33, 47, 49, 65, 72, 81, 83, 105, 114, 127, 150, 162, 169,177, 180, 193, 197, 218, 265, 271, 296, 303, 325, 369, 385, 388, 391, 407, 414,431, 432, 442, 443, 466, 484, 486, 513, 524, 527, 584, 607, 655, 667, 678, 680,684, 690, 697, 715, 738, 739, 758, 777, 789, 804, 826, 842, 848, 901, 923, 936,944, 950, 978, 1014, 1027, 1033, 1034, 1035, 1067, 1087, 1090, 1118, 1161, 1178,1188, 1211, 1214, 1220, 1240, 1255, 1282, 1300, 1329, 1342, 1351, 1365, 1386, 1408, 1418, 1462, 1497, 1502, 1504, 1527, 1545, 1569, 1571, 1594],[],[],[],[],[],[],[],[]]

# print NMI(I5SIMTestDatasetTrueClassification,I5SIM3DatasetTrueClassification)
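
A minimal sketch of the effect described in the comment above (illustrative, not from the original project): comparing a fine partition with its pairwise-merged coarsening, which is essentially what the commented-out I5SIM comparison does, still yields a high NMI.

# NMI stays high even though half of the cluster structure is lost.
from sklearn.metrics.cluster import normalized_mutual_info_score

fine = [i // 100 for i in range(1600)]    # 16 clusters of 100 points each
coarse = [lbl // 2 for lbl in fine]       # pairs of clusters merged -> 8 clusters
print(normalized_mutual_info_score(fine, coarse))   # roughly 0.86 despite the merge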
def calcNMI():

	dataset = readARFF();

	subSet = dataset[['class', 'cluster']]
	#print subSet

	NMI = normalized_mutual_info_score(subSet['class'], subSet['cluster'])
	print NMI
def pairwise_MI(data):
    columns = data.columns
    MI_df = pd.DataFrame(index = columns, columns = columns)
    for c1,c2 in combinations(columns, 2):
        cleaned = data[[c1,c2]].dropna()
        MI = normalized_mutual_info_score(cleaned[c1], cleaned[c2])
        MI_df.loc[c1,c2] = MI
        MI_df.loc[c2,c1] = MI
    return MI_df.astype(float)
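
A hedged usage sketch for pairwise_MI above, with the module-level imports the function presumably relies on; the toy DataFrame is mine. Note that NMI treats column values as discrete labels, so continuous columns would need binning first.

# Toy usage of pairwise_MI (illustrative only).
from itertools import combinations
import pandas as pd
from sklearn.metrics.cluster import normalized_mutual_info_score

toy = pd.DataFrame({'a': [0, 0, 1, 1, 2],
                    'b': [1, 1, 0, 0, 2],
                    'c': [0, 1, 0, 1, 0]})
print(pairwise_MI(toy))   # symmetric DataFrame of pairwise NMI; diagonal stays NaN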
Example no. 10
def main():
	file1 = sys.argv[1]
	file2 = sys.argv[2]
	
	c_true = {}
	c_pred = {}
	#read data from file
	with open(file1) as fd1, open(file2) as fd2:
		c_true = eval(fd1.readline())
		c_pred = eval(fd2.readline())
	#order the data in dictionary data structure
	c_true_order = collections.OrderedDict(sorted(c_true.items()))
	c_pred_order = collections.OrderedDict(sorted(c_pred.items()))
	c_true_label = []
	c_pred_label = []
	print c_true_order	
	#make list with community label 
	for k, v in c_true_order.items():
		c_true_label.append(v)
	for k, v in c_pred_order.items():
		c_pred_label.append(v)
	
	
	simi =  normalized_mutual_info_score(c_true_label,c_pred_label)
	

	DATA_FILE = sys.argv[3].split("/")
	FILE_LOG_NAME = "LOG_File_"+(DATA_FILE[-1])+ ".xlsx"
	Kcore_Value = int(sys.argv[4])

	if(not os.path.exists(FILE_LOG_NAME)):
		wb = openpyxl.Workbook()	
		sheet = wb.active
		sheet.title = "Sheet1"
		sheet['A1'] = 'K/R Value'
		sheet['B1'] = 'NMI Similarity'
		sheet['A2'] = 'v=10%'
		sheet['A3'] = 'v=20%'
		sheet['A4'] = 'v=30%'
		sheet['A5'] = 'v=40%'
		sheet['A6'] = 'v=50%'
		sheet['A7'] = 'v=60%'
		sheet['A8'] = 'v=70%'
		sheet['A9'] = 'v=80%'
		sheet['A10'] = 'v=90%'
		sheet['A11'] = 'v=100%'
	else:
		wb = openpyxl.load_workbook(FILE_LOG_NAME)
        
	sheet = wb.get_sheet_by_name('Sheet1')
	sheet['B'+str(Kcore_Value + 1)] = simi
	wb.save(FILE_LOG_NAME)
Example no. 11
def get_loss(ckernel_net, data_loader):
	#	Compute final average loss
	for idx, (data, target) in enumerate(data_loader):
		data = Variable(data.type(db['dataType']))
		loss = ckernel_net.CAE_compute_loss(data)


	dataOut = ckernel_net(data)
	dataOut = dataOut.cpu().data.numpy()

	allocation = KMeans(10).fit_predict(dataOut)
	nmi = normalized_mutual_info_score(allocation, target.numpy())
	return [loss.cpu().data.numpy()[0], nmi]
Example no. 12
def __eval_lda_clustering_20ng():
    text_doc_file = 'e:/dc/20ng_bydate/twe/docs-nl.txt'
    dict_file = 'e:/dc/20ng_bydate/lda/all-docs.dict'
    mm_file = 'e:/dc/20ng_bydate/lda/all-docs.mm'
    lda_model_file = 'e:/dc/20ng_bydate/lda/lda-model'

    dataset_label_file = 'e:/dc/20ng_bydate/doc_split_labels.bin'
    test_label_file = 'e:/dc/20ng_bydate/test_labels.bin'

    __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    __train_lda_model(dict_file, mm_file, lda_model_file)

    dataset_labels = ioutils.load_labels_file(dataset_label_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        if dataset_labels[i] == 0:
            continue

        topic_dist = lda_model[doc]
        # print topic_dist
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 1000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    print len(sys_labels)
    gold_labels = ioutils.load_labels_file(test_label_file)
    print len(gold_labels)
    print normalized_mutual_info_score(gold_labels, sys_labels)
    print cluster_accuracy(gold_labels, sys_labels)
Example no. 13
def main():
	file1 = sys.argv[1]
	file2 = sys.argv[2]
	c_true = {}
	c_pred = {}
	#read data from file
	with open(file1) as fd1, open(file2) as fd2:
		c_true = eval(fd1.readline())
		c_pred = eval(fd2.readline())
	
	#order the data in dictionary data structure
	c_true_order = collections.OrderedDict(sorted(c_true.items()))
	c_pred_order = collections.OrderedDict(sorted(c_pred.items()))
	c_true_label = []
	c_pred_label = []
	
	#make list with community label 
	for k, v in c_true_order.items():
		c_true_label.append(v)
	for k, v in c_pred_order.items():
		c_pred_label.append(v)
	
	print normalized_mutual_info_score(c_true_label,c_pred_label)
Example no. 14
def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(np.int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
        avg = 'arithmetic'
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            normalized_mutual_info_score(labels_a, labels_b,
                                                         average_method=avg)
                            )
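
The relation exercised by the test above is V-measure = 2 * MI(a, b) / (H(a) + H(b)), which is exactly NMI with arithmetic averaging. A small verification sketch (assumes scikit-learn >= 0.20, where average_method exists; the label_entropy helper is mine):

# Check V = 2*MI/(H(a)+H(b)) and its equality with arithmetic-mean NMI.
import numpy as np
from sklearn.metrics.cluster import (mutual_info_score, v_measure_score,
                                     normalized_mutual_info_score)

def label_entropy(labels):
    # Shannon entropy of the empirical label distribution, in nats
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

rng = np.random.RandomState(36)
a, b = rng.randint(0, 10, 1000), rng.randint(0, 10, 1000)
identity = 2.0 * mutual_info_score(a, b) / (label_entropy(a) + label_entropy(b))
print(np.allclose([v_measure_score(a, b),
                   normalized_mutual_info_score(a, b, average_method='arithmetic')],
                  identity))   # True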
Example no. 15
	def evaluate( self, partition, clustered_ids ):
		# no class info?
		if not self.has_class_info():
			return {}
		# get two clusterings that we can compare
		n = len(clustered_ids)
		classes_subset = np.zeros( n )
		for row in range(n):
			classes_subset[row] = self.class_map[clustered_ids[row]]		
		scores = {}
		scores["external-nmi"] = normalized_mutual_info_score( classes_subset, partition )
		scores["external-ami"] = adjusted_mutual_info_score( classes_subset, partition )
		scores["external-ari"] = adjusted_rand_score( classes_subset, partition )
		return scores
def sklearn_measures(U, V):
    #     http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    print U_labels, V_labels
#     V2_labels = np.nonzero(V2)[1]
    print 'entro(U)=',sym.entropy(U_labels),'entro(V)=',sym.entropy(V_labels), 'entro(U,V)=',sym.mutual_info_score(U_labels, V_labels)
    res = [ ['ari', 'nmi', 'ami', 'vm' ], \
            [ sym.adjusted_rand_score(U_labels, V_labels),\
              sym.normalized_mutual_info_score(U_labels, V_labels),\
              sym.adjusted_mutual_info_score(U_labels, V_labels),\
              sym.v_measure_score(U_labels, V_labels)]]
    print res
    return res
Example no. 17
def checkout_CAE():
	X = pickle.load( open( 'mnist_60000_validation.pk', "rb" ) )
	Y = pickle.load( open( 'mnist_60000_label_validation.pk', "rb" ) )
	Y = torch.from_numpy(Y)

	kinfo = pickle.load( open( 'kernel_mnist.p', "rb" ) )
	cnn = kinfo['kernel_net']
	X_var = Variable(X.type(db['dataType']))

	xout = cnn(X_var)
	xout = xout.cpu().data.numpy()

	allocation = KMeans(10).fit_predict(xout)
	nmi = normalized_mutual_info_score(allocation, Y.numpy())
	print('nmi : %.3f' % nmi)
Example no. 18
def bow_kmeans(bow_vecs, gold_labels, num_clusters):
    print 'performing kmeans ...'
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)

    # print len(gold_labels), 'samples'

    nmi_score = normalized_mutual_info_score(gold_labels, model.labels_)
    purity_score = purity(gold_labels, model.labels_)
    ri_score = rand_index(gold_labels, model.labels_)

    # print 'NMI: %f' % normalized_mutual_info_score(gold_labels, model.labels_)
    # print 'Purity: %f' % purity(gold_labels, model.labels_)
    # print 'Accuracy: %f' % cluster_accuracy(gold_labels, model.labels_)
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    return nmi_score, purity_score, ri_score
Example no. 19
def nimSimilarity(c_true, c_pred):
	'''This will return the Normalized Mutual Information between two clusterings
	 Parameters:     c_true, communities detected without kcore, a dictionary with community node as the key and community label as the value
			 c_pred, communities detected with kcore, a dictionary with community node as the key and community label as the value
	Return nmi

	Example:
	x = {1:1,2:1,3:0,4:0}
	y = {1:0,2:0,3:1,4:1}

	print nimSimilarity(x,y)'''

	#put community labels (labels may be duplicated) into a list
	c_true = list(c_true.values())
	#print sorted(c_true)
	c_pred = list(c_pred.values())
	#print sorted(c_pred)
	return normalized_mutual_info_score(c_true,c_pred)
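
For reference, the docstring example above evaluates to 1.0: NMI is invariant to relabelling the clusters, so swapping the two labels is still a perfect match. A quick check, relying on nimSimilarity defined above:

x = {1: 1, 2: 1, 3: 0, 4: 0}
y = {1: 0, 2: 0, 3: 1, 4: 1}
print(nimSimilarity(x, y))   # 1.0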
Example no. 20
def mutualinfo(df):
	dfin=df
	Label=dfin['L']
	VALUES=['sentiment_polarity','sentiment_subjectivity','absPolarity','Clean tweet', 'L']
	
	
	headers_names=list(dfin.columns.values)
	headers_names = [x for x in headers_names if x not in VALUES]
	
	mutualinfowords=[]
	for header in headers_names:
		
		mutualcolumn= dfin[header]
		mutualvalue= normalized_mutual_info_score(mutualcolumn,Label)
		if mutualvalue>0.02:
			#print'mutual info',header, mutualvalue 
			mutualinfowords.append(header)
	return mutualinfowords
#mutualinfo(test)
Example no. 21
def crossvalidate(profiles, true_group_name, holdout_group_name=None, 
                  train=NNClassifier, distance='cosine'):
    profiles.assert_not_isnan()
    keys = profiles.keys()
    true_labels = profiles.regroup(true_group_name)
    profiles.data = np.array([d for k, d in zip(keys, profiles.data) if tuple(k) in true_labels])
    profiles._keys = [k for k in keys if tuple(k) in true_labels]
    keys = profiles.keys()
    labels = list(set(true_labels.values()))

    if holdout_group_name:
        holdouts = profiles.regroup(holdout_group_name)
    else:
        holdouts = dict((k, k) for k in keys)
    
    true_indices = []
    pred_indices = []
    for ho in set(holdouts.values()):
        test_set_mask = np.array([tuple(holdouts[k]) == ho for k in keys], 
                                 dtype=bool)
        training_features = profiles.data[~test_set_mask, :]
        training_labels = [labels.index(true_labels[tuple(k)]) 
                           for k, m in zip(keys, ~test_set_mask) if m]

        model = train(training_features, training_labels, distance=distance)
        for k, f, m in zip(keys, profiles.data, test_set_mask):
            if not m:
                continue
            true = true_labels[k]
            predicted = labels[model.classify(f)]
            
            true_indices.append(labels.index(true))
            pred_indices.append(labels.index(predicted))
    
    true_indices = np.array(true_indices)
    pred_indices = np.array(pred_indices)

    nmi_score = normalized_mutual_info_score(true_indices, pred_indices)
    ami_score = adjusted_mutual_info_score(true_indices, pred_indices)
 
    return nmi_score, ami_score 
Example no. 22
def process_evaluation(args, model):
    if args['true_row_labels']:
        try:
            with open(args['true_row_labels'], 'r') as f:
                labels = f.read().split()

            from sklearn.metrics.cluster import normalized_mutual_info_score
            from sklearn.metrics.cluster import adjusted_rand_score
            from sklearn.metrics import confusion_matrix

            n = normalized_mutual_info_score(labels, model.row_labels_)
            ari = adjusted_rand_score(labels, model.row_labels_)
            cm = confusion_matrix(labels, model.row_labels_)

            print("nmi ==>" + str(n))
            print("adjusted rand index ==>" + str(ari))
            print()
            print(cm)
        except Exception as e:
            logging.error("--true_row_labels option (evaluation) exception:\
                          %s" % e)
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X,
                                                   connectivity,
                                                   'euclidean')

    for linkage in ('single', 'average', 'average', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1)
Example no. 24
def feature_selection():
    num_leading_progressions = 900
    num_selected_features = 50
    songs_list = Song.objects.filter(progressions__isnull=False).filter(tags__isnull=False).distinct()
    
    leading_progressions = get_leading_progressions(num_leading_progressions)
    
    feature_matrix = get_features_data(leading_progressions,songs_list)
    all_labels_lists = get_label_data(songs_list)
    num_labels = len(all_labels_lists)
    
    selected_features = [np.empty((num_selected_features), dtype=object) for _i in xrange(num_labels)]
    for l in range(num_labels):
        mi_results = np.zeros(num_leading_progressions)
            
        for p in range(num_leading_progressions):
            mi = normalized_mutual_info_score(feature_matrix[:,p], all_labels_lists[l])
            mi_results[p] = mi
            
        highest_mi_feature_indices = np.argsort(mi_results)[num_leading_progressions-num_selected_features:]
        selected_features[l] = [leading_progressions[i] for i in highest_mi_feature_indices]
        
    pickle.dump(selected_features,open(path_to_selected_progressions,'w'))
Example no. 25
def cluster_and_eval(vec_list, labels, num_clusters):
    if len(labels) < len(vec_list):
        vec_list = vec_list[-len(labels):]

    cl_data = np.asarray(vec_list)
    # print cl_data

    # model = sklearn.cluster.AgglomerativeClustering(n_clusters=5,
    #                                                 linkage="average", affinity="cosine")
    model = sklearn.cluster.KMeans(n_clusters=num_clusters, n_jobs=4, n_init=50)
    model.fit(cl_data)
    # print estimator.labels_
    # print labels[0:100]
    # print model.labels_

    nmi_score = normalized_mutual_info_score(labels, model.labels_)
    purity_score = purity(labels, model.labels_)
    ri_score = rand_index(labels, model.labels_)

    # print len(labels), 'samples'
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_)

    return nmi_score, purity_score, ri_score
Example no. 26
### for example on digits
import read_tree
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn import datasets

iris = datasets.load_iris()
data = scale(iris.data)
labels = iris.target

tree = read_file('output.in')
result_of_alg = clusters(tree, k)


def convert(clusters, n):
    clustering_vect = [0] * n
    for i in range(len(clusters)):
        for p in clusters[i]:
            clustering_vect[p] = i
    return clustering_vect


#test = [[3,4,5], [1,8,0], [2,6,7]]
#print(convert(test, 9))

normalized_mutual_info_score(convert(result_of_alg, len(labels)), labels)
Example no. 27
def mutual_info_plot(var_names_dict, df, name, nname, m_path):

    # setup
    cols = [col for col in df]
    col_names = [var_names_dict[col] for col in cols]
    ncols = len(cols)

    # compute mi's
    norm_mi = np.zeros((len(cols), len(cols)))

    for i, col1 in enumerate(cols):
        for j, col2 in enumerate(cols[:i]):
            raw_matrix = df.as_matrix([col1, col2])
            norm_mi[i][j] = normalized_mutual_info_score(
                raw_matrix[:, 0], raw_matrix[:, 1])

    # mask upper right duplicates
    mask = np.triu(np.ones(norm_mi.shape, dtype=int))
    norm_mi_masked = np.ma.masked_array(norm_mi, mask=mask)

    # now plot
    figsize = 10
    digit_size = 12.5
    if ncols > 15:
        figsize = 15
        digit_size = 11

    fig = plt.figure(figsize=(figsize, figsize))
    ax = fig.add_subplot(111)

    norm = mpl.colors.Normalize(vmin=0., vmax=1.)
    cmap = 'viridis'

    # create an axes on the right side of ax. The width of cax will be 5%
    # of ax and the padding between cax and ax will be fixed at 0.1 inch.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.1)

    img = ax.imshow(norm_mi_masked, cmap=cmap, norm=norm)
    cb = plt.colorbar(img, cmap=cmap, norm=norm, cax=cax)

    # annotate
    for (j, i), value in np.ndenumerate(norm_mi):
        if (i < j):
            # https://stackoverflow.com/questions/11010683/how-to-have-negative-zero-always-formatted-as-positive-zero-in-a-python-string/36604981#36604981
            value_str = re.sub(r"^-(0\.?00*)$", r"\1", "%.2f" % value)
            ax.text(i,
                    j,
                    value_str,
                    ha='center',
                    va='center',
                    color='fuchsia',
                    size=digit_size)

    ax.set_xticks(np.arange(ncols))
    ax.set_yticks(np.arange(ncols))

    ax.set_xticklabels(col_names, rotation='vertical')
    ax.set_yticklabels(col_names)

    plt.figtext(0.5, 0.89, name, ha='center', va='center', size=18)

    plt.figtext(0.96,
                0.8,
                "(Dependent)",
                rotation='vertical',
                ha='center',
                va='center',
                size=16)
    plt.figtext(0.96,
                0.22,
                "(Independent)",
                rotation='vertical',
                ha='center',
                va='center',
                size=16)
    plt.figtext(0.96,
                0.5,
                "NMI",
                rotation='vertical',
                ha='center',
                va='center',
                size=18)

    make_path(m_path)
    fig.savefig(m_path + '/mutual_information_' + nname + '.pdf')
    def run(self):
        path = self.path
        #### Step 1: reading and sampling graphs
        '''
        m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path)
        print("%d total nodes"%len(m_graph.nodes()))
        r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(path, self.s_p)
        print("%d edges before sampling, %d edges after sampling. sampled %d "%(len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list)))

        r_set = set([node for edge in r_list for node in edge])
        '''
        nx_graphs_sampled, _ = Reader.multi_readG(self.path)
        cluster_true = []
        for i in range(29):
            if i < 12:
                cluster_true.append(0)
            else:
                cluster_true.append(1)

        for r in range(11):

            r_t = r / 10.0

            if r_t == 0:
                w_dict = Reader.weight(self.path)
                #print(w_dict)

                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                                  self.q, 0.1)
                MK_G.preprocess_transition_probs(w_dict, 1)
                MK_walks = MK_G.simulate_walks(self.num_walks,
                                               self.walk_length)

                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])

                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                '''
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Biased: precision %f, accuracy %f, F %f"%(precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Biased AUC:", M_auc)
                '''

            else:
                w_dict = Reader.weight(self.path)
                #print(w_dict)

                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                                  self.q, r_t)
                MK_G.preprocess_transition_probs(w_dict, 3)
                MK_walks = MK_G.simulate_walks(self.num_walks,
                                               self.walk_length)

                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])

                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                '''
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f"%(r_t, precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Biased_ii AUC:", M_auc)
                '''

            cluster_trained = KMeans(
                n_clusters=2, random_state=0).fit_predict(M_matrix).tolist()

            length = min(len(cluster_true), len(cluster_trained))

            r = normalized_mutual_info_score(cluster_true[0:length],
                                             cluster_trained[0:length])
            mi_f = f1_score(cluster_true[0:length],
                            cluster_trained[0:length],
                            average='micro')
            ma_f = f1_score(cluster_true[0:length],
                            cluster_trained[0:length],
                            average='macro')
            print("r is %f: nmi %f, micro_f %f, macro_f %f" %
                  (r_t, r, mi_f, ma_f))
            print(
                "-----------------------DONE--------------------------------")
Example no. 29
def main():
    global args
    args = parser.parse_args()

    # fix random seeds
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # CNN
    if args.verbose:
        print('Architecture: {}'.format(args.arch))
    model = models.__dict__[args.arch](sobel=args.sobel)
    fd = int(model.top_layer.weight.size()[1])
    model.top_layer = None
    model.features = torch.nn.DataParallel(model.features)
    model.cuda()
    cudnn.benchmark = True

    # create optimizer
    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=10**args.wd,
    )

    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # remove top_layer parameters from checkpoint
            for key in checkpoint['state_dict']:
                if 'top_layer' in key:
                    del checkpoint['state_dict'][key]
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # creating checkpoint repo
    exp_check = os.path.join(args.exp, 'checkpoints')
    if not os.path.isdir(exp_check):
        os.makedirs(exp_check)

    # creating cluster assignments log
    cluster_log = Logger(os.path.join(args.exp, 'clusters'))

    # preprocessing of data
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    tra = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(), normalize
    ]

    # load the data
    end = time.time()
    dataset = datasets.ImageFolder(args.data,
                                   transform=transforms.Compose(tra))
    if args.verbose: print('Load dataset: {0:.2f} s'.format(time.time() - end))
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # clustering algorithm to use
    deepcluster = clustering.__dict__[args.clustering](args.nmb_cluster)

    # training convnet with DeepCluster
    for epoch in range(args.start_epoch, args.epochs):
        end = time.time()

        # remove head
        model.top_layer = None
        model.classifier = nn.Sequential(
            *list(model.classifier.children())[:-1])

        # get the features for the whole dataset
        features = compute_features(dataloader, model, len(dataset))

        # cluster the features
        clustering_loss = deepcluster.cluster(features, verbose=args.verbose)

        # assign pseudo-labels
        train_dataset = clustering.cluster_assign(deepcluster.images_lists,
                                                  dataset.imgs)

        # uniformely sample per target
        sampler = UnifLabelSampler(int(args.reassign * len(train_dataset)),
                                   deepcluster.images_lists)

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch,
            num_workers=args.workers,
            sampler=sampler,
            pin_memory=True,
        )

        # set last fully connected layer
        mlp = list(model.classifier.children())
        mlp.append(nn.ReLU(inplace=True).cuda())
        model.classifier = nn.Sequential(*mlp)
        model.top_layer = nn.Linear(fd, len(deepcluster.images_lists))
        model.top_layer.weight.data.normal_(0, 0.01)
        model.top_layer.bias.data.zero_()
        model.top_layer.cuda()

        # train network with clusters as pseudo-labels
        end = time.time()
        loss = train(train_dataloader, model, criterion, optimizer, epoch)

        # print log
        if args.verbose:
            print('###### Epoch [{0}] ###### \n'
                  'Time: {1:.3f} s\n'
                  'Clustering loss: {2:.3f} \n'
                  'ConvNet loss: {3:.3f}'.format(epoch,
                                                 time.time() - end,
                                                 clustering_loss, loss))
            try:
                nmi = normalized_mutual_info_score(
                    clustering.arrange_clustering(deepcluster.images_lists),
                    clustering.arrange_clustering(cluster_log.data[-1]))
                print('NMI against previous assignment: {0:.3f}'.format(nmi))
            except IndexError:
                pass
            print('####################### \n')
        # save running checkpoint
        torch.save(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, os.path.join(args.exp, 'checkpoint.pth.tar'))

        # save cluster assignments
        cluster_log.log(deepcluster.images_lists)
Example no. 30
    def fit(self,
            trainloader,
            validloader,
            lr=0.001,
            batch_size=128,
            num_epochs=10,
            visualize=False,
            anneal=False,
            optimizer="adam"):
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        if optimizer == "adam":
            optimizer = optim.Adam(self.parameters(), lr=lr)
        elif optimizer == "sgd":
            optimizer = optim.SGD(self.parameters(), lr=lr, momentum=0.9)

        # validate
        self.eval()
        valid_loss = 0.0
        for batch_idx, (inputs, _) in enumerate(validloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            z, outputs = self.forward(inputs)

            loss = self.loss_function(outputs, inputs)
            valid_loss += loss.data * len(inputs)
            # total_loss += valid_recon_loss.data[0] * inputs.size()[0]
            # total_num += inputs.size()[0]

        # valid_loss = total_loss / total_num
        print("#Epoch -1: Valid Loss: %.5f" %
              (valid_loss / len(validloader.dataset)))

        for epoch in range(num_epochs):
            # train 1 epoch
            self.train()
            if anneal:
                adjust_learning_rate(lr, optimizer, epoch)
            train_loss = 0
            for batch_idx, (inputs, labels) in enumerate(trainloader):
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                optimizer.zero_grad()
                inputs = Variable(inputs)

                z, outputs = self.forward(inputs)
                loss = self.loss_function(outputs, inputs)
                train_loss += loss.data * len(inputs)
                loss.backward()
                optimizer.step()
                # print("    #Iter %3d: Reconstruct Loss: %.3f" % (
                #     batch_idx, recon_loss.data[0]))

            # validate
            self.eval()
            valid_loss = 0.0
            for batch_idx, (inputs, labels) in enumerate(validloader):
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                z, outputs = self.forward(inputs)

                loss = self.loss_function(outputs, inputs)
                valid_loss += loss.data * len(inputs)

            print("#Epoch %3d: Train Loss: %.5f, Valid Loss: %.5f" %
                  (epoch, train_loss / len(trainloader.dataset),
                   valid_loss / len(validloader.dataset)))

            if epoch % int(num_epochs / 10) == 0 or epoch == num_epochs - 1:
                trainX, trainY = self.encodeBatch(trainloader, True)
                testX, testY = self.encodeBatch(validloader, True)
                trainX = trainX.cpu().numpy()
                trainY = trainY.cpu().numpy()
                testX = testX.cpu().numpy()
                testY = testY.cpu().numpy()
                n_components = len(np.unique(trainY))
                km = KMeans(n_clusters=n_components, n_init=20).fit(trainX)
                y_pred = km.predict(testX)
                print("acc: %.5f, nmi: %.5f" %
                      (acc(testY, y_pred),
                       normalized_mutual_info_score(testY, y_pred)))
                gmm = GaussianMixture(
                    n_components=n_components,
                    covariance_type='diag',
                    means_init=km.cluster_centers_).fit(trainX)
                y_pred = gmm.predict(testX)
                print("acc: %.5f, nmi: %.5f" %
                      (acc(testY, y_pred),
                       normalized_mutual_info_score(testY, y_pred)))
Example no. 31
aaa = 1 / aaa
# aaa=(aaa-aaa.min())/(aaa.max()-aaa.min())
aaa = np.concatenate([aaa] * length, axis=0)
aaa = aaa.reshape(length, length)
aaa = np.transpose(aaa)
# aaa=np.log(aaa+1)
# aaa=(aaa-aaa.min())/(aaa.max()-aaa.min())

p = count_percent(D3, D2)
p = p * aaa
D = getD(p)
L = getL(D, p)
eigvec = getEigen(L, n)
eigvec = np.real(eigvec)
clf = KMeans(n_clusters=n)
s = clf.fit(eigvec)
C = s.labels_
print('processed data using sc ARI:', metrics.adjusted_rand_score(y, C))
print('NMI:', normalized_mutual_info_score(y, C))
print('ACC:', acc(y, C))

from sklearn.cluster import SpectralClustering
sc1 = SpectralClustering(n_clusters=n, affinity='nearest_neighbors')
print('SC KNN ARI:', metrics.adjusted_rand_score(y, sc1.fit_predict(x1)))
c = 'ARI:' + str(metrics.adjusted_rand_score(y, C)) + '\n' + 'NMI:' + str(
    normalized_mutual_info_score(y, C)) + '\n'
c = c + 'ACC:' + str(acc(y, C)) + '\n' + 'SKARI' + str(
    metrics.adjusted_rand_score(y, sc1.fit_predict(x1)))
fh = open('performancegoolamimproved.txt', 'w', encoding='utf-8')
fh.write(c)
fh.close()
Example no. 32
noOfDSPoints = 0
for key, value in discardStats.items():
    noOfDSPoints += value[0]

noOfCSPoints = 0
for key, value in compressStats.items():
    noOfCSPoints += value[0]

noOfCSCluster = len(compressStats)
noOfRSPoints = len(retainedSet)

if iteration == 4:
    interResult[iteration] = (noOfDSPoints, len(compressStats), noOfCSPoints,
                              len(retainedSet))

accuracy = normalized_mutual_info_score(totalIndex, originalIndex)
print("Accuracy", accuracy)

################################################# File Output ##########################################################
f = open('output.txt', 'w')
f.write("The intermediate results:")
f.write("\n")
for i in interResult:
    current = str(i + 1)
    s = str("Round " + current + ": " + str(interResult[i][0]) + "," +
            str(interResult[i][1]) + "," + str(interResult[i][2]) + "," +
            str(interResult[i][3]))
    f.write(str(s))
    f.write("\n")

f.write("\n")
Example no. 33
def test_agglomerative_clustering():
    """
    Check that we obtain the correct number of clusters with
    agglomerative clustering.
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(
                n_clusters=10, connectivity=connectivity,
                memory=tempdir,
                linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that we raise a TypeError on dense matrices
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(
                connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10,
            connectivity=None,
            affinity=affinity,
            linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
                                                         clustering.labels_),
                            1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
Esempio n. 34
0
#!/usr/bin/python

import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score

x = np.array([1 ,2 ,4 ,1 ,1 ,1 ,4 ,4 ,4 ,4 ,4 ,4 ,3 ,3 ,3 ,3 ,3 ,3 ,4 ,4 ,4 ,4 ,4 ,2 ,3 ,2 ,3 ,3 ,2])
y = np.array([1, 1, 3, 2, 3, 4, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 3])

# np.savetxt('alternative_kernel.txt', X, fmt='%.18e', delimiter=',', newline='\n', header='', footer='')  # disabled: X is not defined in this snippet

print(normalized_mutual_info_score(x, y))
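
For reference, NMI can also be computed by hand from the contingency table of the two labelings. A sketch using the arithmetic-mean normalization (one common choice; the exact normalization used by a given scikit-learn version may differ):

import numpy as np

def nmi_by_hand(a, b):
    a, b = np.asarray(a), np.asarray(b)
    n = a.size
    _, ai = np.unique(a, return_inverse=True)
    _, bi = np.unique(b, return_inverse=True)
    c = np.zeros((ai.max() + 1, bi.max() + 1))
    np.add.at(c, (ai, bi), 1)                      # contingency counts
    pxy = c / n
    px = pxy.sum(axis=1, keepdims=True)
    py = pxy.sum(axis=0, keepdims=True)
    nz = pxy > 0
    mi = (pxy[nz] * np.log(pxy[nz] / (px @ py)[nz])).sum()
    hx = -(px[px > 0] * np.log(px[px > 0])).sum()
    hy = -(py[py > 0] * np.log(py[py > 0])).sum()
    return mi / ((hx + hy) / 2)

print(nmi_by_hand(x, y))
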







Esempio n. 35
0
### Comparison model (n=2) ### For downgrading
print("\nComparison model: ")
print("-------------------")

# Fit the comparison model (n=2) and use 4 cores to run it faster
km = KMeans(n_clusters=2, n_jobs=4)
km.fit(KMEANSdf)
# Predict the clusters of each datum
Kms2 = km.predict(KMEANSdf)

#Evaluate the model
print("Inertia: %0.3f" % km.inertia_)
silhouette_values = metrics.silhouette_score(KMEANSdf, Kms2)
print("Silhouette: %0.3f" % silhouette_values)
print("NMI for downgrading from 5 to 2 cluster is: %0.3f" %
      normalized_mutual_info_score(Kms1, Kms2))
n_clusters_cKMEANS = len(set(Kms2))
print("\nEstimated number of clusters: %d" % n_clusters_cKMEANS)

#Adding the columns to the Dataset
KMEANSdf["Cluster_KMEANS"] = pd.DataFrame(Kms1)
KMEANSdf["Cluster(n=2)_KMEANS"] = pd.DataFrame(Kms2)

print("NMI: %0.3f" %
      normalized_mutual_info_score(Kms2, DBSCANdf["Cluster_DBSCAN"]))

####################################
#####  KMEANS clustering plots #####
####################################

### Calculate the centroids for plotting
Esempio n. 36
0
hdbscan_pca = clusterer_hdbscan.fit_predict(embedding_pca)

#clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis', edge_alpha=0.6, node_size=80, edge_linewidth=2)
#clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
#clusterer.condensed_tree_.plot()

cluster_result = [
    kmeans_umap, kmeans_tsne, kmeans_mds, kmeans_pca, hc_umap, hc_tsne, hc_mds,
    hc_pca, spc_umap, spc_tsne, spc_mds, spc_pca, gmm_umap, gmm_tsne, gmm_mds,
    gmm_pca, hdbscan_umap, hdbscan_tsne, hdbscan_mds, hdbscan_pca
]

file = open("clustering_accuracy_NMI.txt", "a")  #append mode
file.write(filename + "\n")
for cluster in cluster_result:
    nmi = normalized_mutual_info_score(label_group, cluster)
    file.write(str(nmi) + "\n")
file.close()

file = open("clustering_accuracy_ARI.txt", "a")  #append mode
file.write(filename + "\n")
for cluster in cluster_result:
    ari = adjusted_rand_score(label_group, cluster)
    file.write(str(ari) + "\n")
file.close()

#######################################################################
# Contour plot showing the visiting timestamp of each sample point
#######################################################################

label_days_cummulated = data[(data.shape[0] - 1):(data.shape[0])]

kValues = list()
scores = list()

for k in range(3, 22, 2):
    labels = bisecting_kmeans(denseMatrix, k, 10)
    
    # if (k == 7):
    #     # write result to output file
    #     outputFile = open("output.dat", "w")
    #     for index in labels:
    #         outputFile.write(str(index) + '\n')
    #     outputFile.close()

    score = normalized_mutual_info_score(denseMatrix, labels)
    kValues.append(k)
    scores.append(score)

    print("For K= %d NMI is %f" % (k, score))


# In[25]:


labels = bisecting_kmeans(denseMatrix, 7, 10)
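
bisecting_kmeans is not defined in this notebook excerpt. A hedged sketch of one common variant, which repeatedly splits the largest remaining cluster with 2-means until k clusters exist (the third argument is assumed here to be the number of k-means restarts; the original may instead split the cluster with the largest SSE):

import numpy as np
from sklearn.cluster import KMeans

def bisecting_kmeans_sketch(X, k, n_trials=10):
    labels = np.zeros(X.shape[0], dtype=int)
    for new_label in range(1, k):
        sizes = np.bincount(labels)                # pick the largest cluster
        idx = np.where(labels == np.argmax(sizes))[0]
        km = KMeans(n_clusters=2, n_init=n_trials).fit(X[idx])
        labels[idx[km.labels_ == 1]] = new_label   # relabel one half of the split
    return labels
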


# In[22]:

Esempio n. 38
0
def kmeans(encoder_val_clean,
           y,
           nClusters,
           y_pred_prev=None,
           weight_initilization='k-means++',
           seed=42,
           n_init=40,
           max_iter=300):
    # weight_initilization is one of {'kmeans-pca', 'k-means++', 'random', None}

    if weight_initilization == 'kmeans-pca':

        start_time = timeit.default_timer()
        pca = PCA(n_components=nClusters).fit(encoder_val_clean)
        kmeans_model = KMeans(init=pca.components_,
                              n_clusters=nClusters,
                              n_init=1,
                              max_iter=300,
                              random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(
            np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    elif weight_initilization == 'k-means++':

        start_time = timeit.default_timer()
        kmeans_model = KMeans(init='k-means++',
                              n_clusters=nClusters,
                              n_init=n_init,
                              max_iter=max_iter,
                              n_jobs=15,
                              random_state=seed)
        y_pred = kmeans_model.fit_predict(encoder_val_clean)

        D = 1.0 / euclidean_distances(
            encoder_val_clean, kmeans_model.cluster_centers_, squared=True)
        D **= 2.0 / (2 - 1)
        D /= np.sum(D, axis=1)[:, np.newaxis]

        centroids = kmeans_model.cluster_centers_.T
        centroids = centroids / np.sqrt(
            np.diag(np.matmul(centroids.T, centroids)))

        end_time = timeit.default_timer()

    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred),
          '\t arc =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_),
          '\t runtime =', end_time - start_time)

    if y_pred_prev is not None:
        print(
            'Different Assignments: ',
            sum(y_pred == y_pred_prev), '\tbestMap: ',
            bestMap(y_pred, y_pred_prev), '\tdatapoints-bestMap*datapoints: ',
            encoder_val_clean.shape[0] -
            bestMap(y_pred, y_pred_prev) * encoder_val_clean.shape[0])

    return centroids, kmeans_model.inertia_, y_pred
Esempio n. 39
0
def test_check_clusterings():
    noise = np.random.rand(500)
    wavelength = np.linspace(0.01, 1, 500) * 1e-6

    with pytest.raises(ValueError):
        normalized_mutual_info_score(wavelength, noise)
Esempio n. 40
0
def clustering(dataset,
               X,
               y,
               input_var,
               encoder,
               num_clusters,
               output_path,
               test_batch_size=100,
               seed=42,
               continue_training=False):
    encoder_clean = lasagne.layers.get_output(encoder, deterministic=True)
    encoder_clean_function = theano.function([input_var], encoder_clean)

    # Extract MdA features
    minibatch_flag = 1
    for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False):
        inputs, targets, idx = batch
        minibatch_x = encoder_clean_function(inputs)
        if minibatch_flag:
            encoder_val_clean = minibatch_x
            minibatch_flag = 0
        else:
            encoder_val_clean = np.concatenate(
                (encoder_val_clean, minibatch_x), axis=0)

    # Check kmeans results
    kmeans(encoder_val_clean, y, num_clusters, seed=seed)
    initial_time = timeit.default_timer()
    if (dataset == 'MNIST-full') | (dataset == 'FRGC') | (dataset == 'YTF') | (
            dataset == 'CMU-PIE'):
        # K-means on MdA Features
        centroids, inertia, y_pred = kmeans(encoder_val_clean,
                                            y,
                                            num_clusters,
                                            seed=seed)
        y_pred = (np.array(y_pred)).reshape(np.array(y_pred).shape[0], )
        y_pred = y_pred - 1
    else:
        # AC-PIC on MdA Features
        if os.path.isfile(
                os.path.join(output_path, '../params/pred' + dataset +
                             '.pickle')) & continue_training:
            with open(
                    os.path.join(output_path,
                                 '../params/pred' + dataset + '.pickle'),
                    "rb") as input_file:
                y_pred = pickle.load(input_file, encoding='latin1')
        else:
            try:
                import matlab.engine
                eng = matlab.engine.start_matlab()
                eng.addpath(eng.genpath('matlab'))
                targets_init = eng.predict_ac_mpi(
                    matlab.double(
                        encoder_val_clean.reshape(
                            encoder_val_clean.shape[0] *
                            encoder_val_clean.shape[1]).tolist()),
                    num_clusters, encoder_val_clean.shape[0],
                    encoder_val_clean.shape[1])
                y_pred = (np.array(targets_init)).reshape(
                    np.array(targets_init).shape[0], )
                eng.quit()
                y_pred = y_pred - 1
            except Exception:  # fall back to the Python implementation if the MATLAB engine is unavailable
                y_pred = predict_ac_mpi(encoder_val_clean, num_clusters,
                                        encoder_val_clean.shape[0],
                                        encoder_val_clean.shape[1])
            with open(
                    os.path.join(output_path,
                                 '../params/pred' + dataset + '.pickle'),
                    "wb") as output_file:
                pickle.dump(y_pred, output_file)

        final_time = timeit.default_timer()
        print('AC-PIC: \t nmi =  ', normalized_mutual_info_score(y, y_pred),
              '\t arc = ', adjusted_rand_score(y, y_pred),
              '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
              '\t time taken = {:.4f}'.format(final_time - initial_time))
        centroids_acpic = np.zeros(shape=(num_clusters,
                                          encoder_val_clean.shape[1]))
        for i in range(num_clusters):
            centroids_acpic[i] = encoder_val_clean[y_pred == i].mean(axis=0)

        centroids = centroids_acpic.T
        # normalize each centroid column to unit norm, as in the k-means branch above
        centroids = centroids / np.sqrt(
            np.diag(np.matmul(centroids.T, centroids)))

    return np.int32(y_pred), np.float32(centroids)
Esempio n. 41
0
    
    # Read command-line arguments: edge-list path, number of clusters, eigenvector file.
    FILE_PATH = sys.argv[1]
    pre_clus = sys.argv[2]
    eigv_path = sys.argv[3]
        
    G = nx.read_edgelist(FILE_PATH)
        
    Lap = gen_laplacian(G, 2)
    w, v = LA.eig(Lap.todense())
    save_eigen(w, v, FILE_PATH)
        
    feature_matrix = np.transpose(v)
    partition_orig = KMeans(n_clusters=int(pre_clus)).fit(feature_matrix)
        
    ext_matrix = np.genfromtxt (eigv_path, delimiter=",")
    feature_ext_matrix = np.transpose(ext_matrix)
    partition_fast = KMeans(n_clusters = int(pre_clus)).fit(feature_ext_matrix)
    print(partition_orig.inertia_)
    print(jaccard_similarity_score(partition_orig.labels_, partition_fast.labels_))
    print(normalized_mutual_info_score(partition_orig.labels_, partition_fast.labels_))

    '''
    #spec_matrix1 = SpectralClustering(n_clusters=int(pre_clus)).fit(nx.to_numpy_matrix(G))
    spec_matrix = SpectralClustering(n_clusters = int(pre_clus)).fit(nx.to_numpy_matrix(G))
    kmeans_matrix = KMeans(n_clusters = int (pre_clus)).fit(nx.to_numpy_matrix(G))
    print kmeans_matrix.inertia_
    print normalized_mutual_info_score(spec_matrix1.labels_, kmeans_matrix.labels_)
    '''
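
gen_laplacian and save_eigen are project-specific helpers that are not shown here. A self-contained sketch of the same kind of comparison (k-means on Laplacian eigenvectors versus sklearn's SpectralClustering, scored with NMI) on an illustrative toy graph:

import networkx as nx
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import normalized_mutual_info_score

G = nx.karate_club_graph()
A = nx.to_numpy_array(G)
L = np.diag(A.sum(axis=1)) - A                     # unnormalized Laplacian
w, v = np.linalg.eigh(L)
features = v[:, np.argsort(w)[:2]]                 # two smallest eigenvectors
km_labels = KMeans(n_clusters=2, n_init=10).fit_predict(features)
sc_labels = SpectralClustering(n_clusters=2, affinity='precomputed').fit_predict(A)
print(normalized_mutual_info_score(km_labels, sc_labels))
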

Esempio n. 42
0
def train_depict(dataset,
                 X,
                 y,
                 input_var,
                 decoder,
                 encoder,
                 loss_recons,
                 num_clusters,
                 y_pred,
                 output_path,
                 batch_size=100,
                 test_batch_size=100,
                 num_epochs=1000,
                 learning_rate=1e-4,
                 prediction_status='soft',
                 rec_mult=1,
                 clus_mult=1,
                 centroids=None,
                 init_flag=1,
                 continue_training=False):
    ######################
    #   ADD RLC TO MdA   #
    ######################

    initial_time = timeit.default_timer()
    rec_lambda = theano.shared(lasagne.utils.floatX(rec_mult))
    clus_lambda = theano.shared(lasagne.utils.floatX(clus_mult))
    pred_normalizition_flag = 1
    num_batches = X.shape[0] // batch_size

    if prediction_status == 'soft':
        target_var = T.matrix('minibatch_out')
        target_init = T.ivector('kmeans_out')
    elif prediction_status == 'hard':
        target_var = T.ivector('minibatch_out')
        target_val = T.vector()

    network2 = build_eml(encoder, n_out=num_clusters, W_initial=centroids)
    network_prediction_noisy = lasagne.layers.get_output(network2,
                                                         input_var,
                                                         deterministic=False)
    network_prediction_clean = lasagne.layers.get_output(network2,
                                                         input_var,
                                                         deterministic=True)

    loss_clus_init = lasagne.objectives.categorical_crossentropy(
        network_prediction_noisy, target_init).mean()
    params_init = lasagne.layers.get_all_params([decoder, network2],
                                                trainable=True)

    if prediction_status == 'soft':
        loss_clus = lasagne.objectives.categorical_crossentropy(
            network_prediction_noisy, target_var)
    elif prediction_status == 'hard':
        loss_clus = target_val * lasagne.objectives.categorical_crossentropy(
            network_prediction_noisy, target_var)

    loss_clus = clus_lambda * loss_clus.mean()
    loss_recons = rec_lambda * loss_recons
    loss = loss_recons + loss_clus
    params2 = lasagne.layers.get_all_params([decoder, network2],
                                            trainable=True)
    updates = lasagne.updates.adam(loss, params2, learning_rate=learning_rate)
    train_fn = theano.function([input_var, target_var],
                               [loss, loss_recons, loss_clus],
                               updates=updates)

    loss_clus_init = clus_lambda * loss_clus_init
    loss_init = loss_clus_init + loss_recons
    updates_init = lasagne.updates.adam(loss_init,
                                        params_init,
                                        learning_rate=learning_rate)
    train_fn_init = theano.function([input_var, target_init],
                                    [loss_init, loss_recons, loss_clus_init],
                                    updates=updates_init)

    test_fn = theano.function([input_var], network_prediction_clean)
    final_time = timeit.default_timer()

    print("\n...Start DEPICT initialization")
    if init_flag:
        if os.path.isfile(
                os.path.join(output_path, '../params/weights' + dataset +
                             '.pickle')) & continue_training:
            with open(
                    os.path.join(output_path,
                                 '../params/weights' + dataset + '.pickle'),
                    "rb") as input_file:
                weights = pickle.load(input_file, encoding='latin1')
                lasagne.layers.set_all_param_values([decoder, network2],
                                                    weights)
        else:
            X_train, X_val, y_train, y_val, y_pred_train, y_pred_val = train_test_split(
                X, y, y_pred, stratify=y, test_size=0.10, random_state=42)
            last_update = 0
            # Initilization
            y_targ_train = np.copy(y_pred_train)
            y_targ_val = np.copy(y_pred_val)
            y_val_prob = test_fn(X_val)
            y_val_pred = np.argmax(y_val_prob, axis=1)
            val_nmi = normalized_mutual_info_score(y_targ_val, y_val_pred)
            best_val = val_nmi
            print('initial val nmi: ', val_nmi)
            best_params_values = lasagne.layers.get_all_param_values(
                [decoder, network2])
            for epoch in range(1000):
                train_err, val_err = 0, 0
                lossre_train, lossre_val = 0, 0
                losspre_train, losspre_val = 0, 0
                num_batches_train = 0
                for batch in iterate_minibatches(X_train,
                                                 y_train,
                                                 batch_size,
                                                 shuffle=True):
                    minibatch_inputs, targets, idx = batch
                    minibatch_error, lossrec, losspred = train_fn_init(
                        minibatch_inputs, np.int32(y_targ_train[idx]))
                    train_err += minibatch_error
                    lossre_train += lossrec
                    losspre_train += losspred
                    num_batches_train += 1

                y_val_prob = test_fn(X_val)
                y_val_pred = np.argmax(y_val_prob, axis=1)

                y_pred = np.zeros(X.shape[0])
                for batch in iterate_minibatches(X,
                                                 y,
                                                 test_batch_size,
                                                 shuffle=False):
                    minibatch_inputs, targets, idx = batch
                    minibatch_prob = test_fn(minibatch_inputs)
                    minibatch_pred = np.argmax(minibatch_prob, axis=1)
                    y_pred[idx] = minibatch_pred

                val_nmi = normalized_mutual_info_score(y_targ_val, y_val_pred)

                print(
                    'epoch:', epoch + 1, '\t nmi = {:.4f}  '.format(
                        normalized_mutual_info_score(y, y_pred)),
                    '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)),
                    '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
                    '\t loss= {:.10f}'.format(train_err / num_batches_train),
                    '\t loss_reconstruction= {:.10f}'.format(
                        lossre_train / num_batches_train),
                    '\t loss_prediction= {:.10f}'.format(losspre_train /
                                                         num_batches_train),
                    '\t val nmi = {:.4f}  '.format(val_nmi))
                last_update += 1
                if val_nmi > best_val:
                    last_update = 0
                    print("new best val nmi: ", val_nmi)
                    best_val = val_nmi
                    best_params_values = lasagne.layers.get_all_param_values(
                        [decoder, network2])
                    # if (losspre_val / num_batches_val) < 0.2:
                    #     break

                if last_update > 5:
                    break

            lasagne.layers.set_all_param_values([decoder, network2],
                                                best_params_values)
            with open(
                    os.path.join(output_path,
                                 '../params/weights' + dataset + '.pickle'),
                    "wb") as output_file:
                pickle.dump(
                    lasagne.layers.get_all_param_values([decoder, network2]),
                    output_file)

    # Epoch 0
    print("\n...Start DEPICT training")
    y_prob = np.zeros((X.shape[0], num_clusters))
    y_prob_prev = np.zeros((X.shape[0], num_clusters))
    for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False):
        minibatch_inputs, targets, idx = batch
        minibatch_prob = test_fn(minibatch_inputs)
        y_prob[idx] = minibatch_prob

    y_prob_max = np.max(y_prob, axis=1)
    if pred_normalizition_flag:
        cluster_frequency = np.sum(y_prob, axis=0)
        y_prob = y_prob**2 / cluster_frequency
        y_prob = np.transpose(y_prob.T / np.sum(y_prob, axis=1))
    y_pred = np.argmax(y_prob, axis=1)

    print('epoch: 0',
          '\t nmi = {:.4f}  '.format(normalized_mutual_info_score(y, y_pred)),
          '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)))
    if os.path.isfile(
            os.path.join(output_path, '../params/rlc' + dataset +
                         '.pickle')) & continue_training:
        with open(
                os.path.join(output_path,
                             '../params/rlc' + dataset + '.pickle'),
                "rb") as input_file:
            weights = pickle.load(input_file, encoding='latin1')
            lasagne.layers.set_all_param_values([decoder, network2], weights)
    else:
        for epoch in range(num_epochs):

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            lossre = 0
            losspre = 0

            for batch in iterate_minibatches(X, y, batch_size, shuffle=True):
                minibatch_inputs, targets, idx = batch

                # M_step
                if prediction_status == 'hard':
                    minibatch_err, lossrec, losspred = train_fn(
                        minibatch_inputs,
                        np.ndarray.astype(y_pred[idx], 'int32'),
                        np.ndarray.astype(y_prob_max[idx], 'float32'))
                elif prediction_status == 'soft':
                    minibatch_err, lossrec, losspred = train_fn(
                        minibatch_inputs,
                        np.ndarray.astype(y_prob[idx], 'float32'))

                minibatch_prob = test_fn(minibatch_inputs)
                y_prob[idx] = minibatch_prob
                train_err += minibatch_err
                lossre += lossrec
                losspre += losspred

            y_prob_max = np.max(y_prob, axis=1)
            if pred_normalizition_flag:
                cluster_frequency = np.sum(
                    y_prob, axis=0)  # avoid unbalanced assignment
                y_prob = y_prob**2 / cluster_frequency
                # y_prob = y_prob / np.sqrt(cluster_frequency)
                y_prob = np.transpose(y_prob.T / np.sum(y_prob, axis=1))
            y_pred = np.argmax(y_prob, axis=1)

            # print('mse: ', mean_squared_error(y_prob, y_prob_prev))

            if mean_squared_error(y_prob, y_prob_prev) < 1e-7:
                with open(
                        os.path.join(output_path,
                                     '../params/rlc' + dataset + '.pickle'),
                        "wb") as output_file:
                    pickle.dump(
                        lasagne.layers.get_all_param_values(
                            [decoder, network2]), output_file)
                break
            y_prob_prev = np.copy(y_prob)

            print(
                'epoch:', epoch + 1, '\t nmi = {:.4f}  '.format(
                    normalized_mutual_info_score(y, y_pred)),
                '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)),
                '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
                '\t loss= {:.10f}'.format(train_err / num_batches),
                '\t loss_recons= {:.10f}'.format(lossre / num_batches),
                '\t loss_pred= {:.10f}'.format(losspre / num_batches))

    # test
    y_pred = np.zeros(X.shape[0])
    for batch in iterate_minibatches(X, y, test_batch_size, shuffle=False):
        minibatch_inputs, targets, idx = batch
        minibatch_prob = test_fn(minibatch_inputs)
        minibatch_pred = np.argmax(minibatch_prob, axis=1)
        y_pred[idx] = minibatch_pred

    print('final: ',
          '\t nmi = {:.4f}  '.format(normalized_mutual_info_score(y, y_pred)),
          '\t arc = {:.4f} '.format(adjusted_rand_score(y, y_pred)),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)))
Esempio n. 43
0
def report_clustering(distance_file,
                      biom_file,
                      metadata_file,
                      num_clusters,
                      verbose,
                      L=2,
                      output_file=None):
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file

    if output_file is not None:
        f = open(output_file, 'w')

    output_matrix = []

    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed',
        linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters,
                               metric='precomputed',
                               method='pam',
                               init='heuristic').fit_predict(distance_matrix)

    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)
    region_names = []
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        PCoA_Samples[i] = region_names.index(
            metadata[PCoA_Samples[i]]['body_site'])

    if verbose and L == 1:
        print('Printing results for L1-UniFrac:')
    elif verbose and L == 2:
        print('Printing results for L2-UniFrac:')
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')

    if output_file is not None:
        if L == 1:
            f.write('Printing results for L1-UniFrac:\n')
        elif L == 2:
            f.write('Printing results for L2-UniFrac:\n')
        f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')

    if L == 1:
        output_matrix.append(['Printing results for L1-UniFrac:'])
    if L == 2:
        output_matrix.append(['Printing results for L2-UniFrac:'])
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])

    RI1 = rand_score(PCoA_Samples, AgglomerativeCluster)
    RI2 = rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Rand Index Score:               {RI1}\t\t\t{RI2}')
    ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster)
    ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}')
    NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}')
    AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}')
    FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster)
    FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}')

    if output_file is not None:
        f.write(f'Rand Index Score:               {RI1}\t\t\t{RI2}\n')
        f.write(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}\n')
        f.write(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}\n')
        f.write(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}\n')
        f.write(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}\n')

    output_matrix.append(['Rand Index Score:', RI1, RI2])
    output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2])
    output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2])
    output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2])
    output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2])

    return output_matrix
Esempio n. 44
0
input = open('C:\\Users\\Administrator\\Desktop\\Tweets.txt', 'r')
for line in input.readlines():
    tweets = json.loads(line)
    texts.append(tweets['text'])
    labels.append(tweets['cluster'])

vectorizer = TfidfVectorizer()
vec = vectorizer.fit_transform(texts)
vectorizer_2 = CountVectorizer()
vec_w2v = vectorizer_2.fit_transform(texts)

# KMeans
clf = KMeans(n_clusters=100)
a = clf.fit(vec)
labels_predict = clf.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of KMeans:', nmi)

# Affinity Propagation
afp = AffinityPropagation().fit(vec)
cluster_centers_indices = afp.cluster_centers_indices_
labels_predict = afp.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of Affinity Propagation:', nmi)

# MeanShift
vec_w2v_a = preprocessing.scale(vec_w2v.toarray())
clustering = MeanShift(bandwidth=5).fit(vec_w2v_a)
labels_predict = clustering.labels_
nmi = normalized_mutual_info_score(labels, labels_predict)
print('the NMI of MeanShift:', nmi)
X = X[index]
X = X.toarray()
labels = labels[index]

# train_x=X[:train_len]
# train_y=labels[:train_len]
#
# test_x=X[train_len:]
# test_y=labels[train_len:]

# KMeans
km = KMeans(n_clusters=class_num)
km.fit(X)
pred_y = km.labels_

nmi = normalized_mutual_info_score(labels, pred_y)
print('KMeans NMI:{:.4f}'.format(nmi))

# AffinityPropagation
affinity_propagation = AffinityPropagation(damping=0.9, preference=-1)
affinity_propagation.fit(X)
pred_y = affinity_propagation.labels_

nmi = normalized_mutual_info_score(labels, pred_y)
print('AffinityPropagation NMI:{:.4f}'.format(nmi))

# Mean-shift
bandwidth = estimate_bandwidth(X, quantile=0.2)
mean_shift = MeanShift(bandwidth=0.8, bin_seeding=True)
mean_shift.fit(X)
pred_y = mean_shift.labels_
Esempio n. 46
0
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(n_clusters=10,
                                                 connectivity=connectivity,
                                                 memory=tempdir,
                                                 linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that we raise a TypeError on dense matrices
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity.toarray(),
                                         affinity="manhattan",
                                         linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=np.ones(
                                                 (n_samples, n_samples)),
                                             affinity=affinity,
                                             linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
Esempio n. 47
0
def multirun(datasetName):
    # init_population,init_ari,datamat,datalabels = ini_Cluster(kNumber=6)  # generate the initial population with several clustering algorithms
    # datamat,datalabels = loadDataset("../dataset/glass.data")
    path = '../dataset/'+datasetName
    datamat,datalabels = loadDataset(path)
    print('data ready')
    # datalabels_to_float = list(map(lambda x: float(x), datalabels))

    sampledData, remainedData, sampledIndex, remainedIndex= data_sample(datamat,1,10)
    print('sampledData ready')
    #
    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'kmeans')
    print('kmeans end')
    max_nmi1 = -inf
    for ind1 in pop_kmeans:
        nmi1 = normalized_mutual_info_score(datalabels, ind1)
        if nmi1 > max_nmi1:
            max_nmi1 = nmi1
    print('initial kmeans best NMI: %s' % max_nmi1)
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'ward')
    print('ward end')
    max_nmi2 = -inf
    for ind2 in pop_ward:
        nmi2 = normalized_mutual_info_score(datalabels, ind2)
        if nmi2 > max_nmi2:
            max_nmi2 = nmi2
    print('initial ward best NMI: %s' % max_nmi2)
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'complete')
    print('complete end')
    max_nmi3 = -inf
    for ind3 in pop_complete:
        nmi3 = normalized_mutual_info_score(datalabels, ind3)
        if nmi3 > max_nmi3:
            max_nmi3 = nmi3
    print('initial complete best NMI: %s' % max_nmi3)
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'average')
    print('average end')
    max_nmi4 = -inf
    for ind4 in pop_average:
        nmi4 = normalized_mutual_info_score(datalabels, ind4)
        if nmi4 > max_nmi4:
            max_nmi4 = nmi4
    print('initial average best NMI: %s' % max_nmi4)
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)


    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = list(filter(lambda x: len(x) > 0, init_population))  # drop individuals whose initial clustering failed
    population = filter_pop  # population is the full population; offspring from the crossover step are added to it later


    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(population,(len(invalid_ind),1,1)),invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print('generation %s' % i)
        popElite = toolbox.select(population, popeliteLen) #top half from population

        # Vary the population
        parentSpring = tools.selTournamentDCD(population, len(population))
        parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover
        subpopArr=[]
        for subtimes in range(5):  # getSubPop returns 4 subpopulations per call; the groups feed the DSCE ensemble step below, treated as crossover
            subpopOneArr = getSubPop(parentSpring)
            subpopArr.extend(subpopOneArr)
        for subpop in subpopArr:
            transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr = transformation(datamat, subpop)
            similiarMatrix, unionClusterArr_2 = measureSimilarity(transMatrix, popClusterArr_3, popClusterArr_2,
                                                              clusterNumArr, datamat, a1=0.8)
            dictCownP = assign(similiarMatrix, 0.7)
            resultList = resultTransform(dictCownP, datamat)
            ind_ensemble = creator.Individual(resultList)
            newoffspring.append(ind_ensemble)


        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)), tile(newoffspring, (len(invalid_ind), 1, 1)), invalid_ind)  # only the raw data is used here; the true labels are not
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Chossing a population for the next generation
        population = toolbox.select(popElite + newoffspring, popeliteLen)
    result1 = toolbox.nondominated(population,len(population))
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    nmi_arr = []
    max_nmi = -inf
    print('NMI values:')
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print('best NMI: %s' % max_nmi)
    return max_nmi,max_ari
Esempio n. 48
0
    def clusterscores(self):
        target, pred = self.conf2label()
        NMI = normalized_mutual_info_score(target, pred)
        ARI = adjusted_rand_score(target, pred)
        AMI = adjusted_mutual_info_score(target, pred)
        return {'NMI': NMI, 'ARI': ARI, 'AMI': AMI}
Esempio n. 49
0
            X=X, K=self.n_clusters, max_iter=self.max_iter)
        return self

    def fit_predict(self, X, y=None):
        if self.fit(X).isConverge:
            return self.best_labels
        else:
            return 'Did not converge with the current parameters ' \
                   'or centroids, please try again'

    def get_params(self):
        return self.isConverge, self.n_clusters, 'KMEANS'

    def get_cost(self):
        return self.cost


def load_data():
    data = load_iris()
    x, y = data.data, data.target
    return x, y


if __name__ == '__main__':
    x, y = load_data()
    K = len(np.unique(y))
    model = KMeans(n_clusters=K)
    y_pred = model.fit_predict(x)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI: ", nmi)
Esempio n. 50
0
    def _calculate(self, input):
        input = input[~np.isnan(input).any(axis=1)]

        return normalized_mutual_info_score(input[:, 0], input[:, 1])
Esempio n. 51
0
def my_kmeans():
    X, cluster = loaddata()
    kmeans = KMeans(n_clusters=len(set(cluster)), random_state=0).fit(X)
    print('kmeans result:')
    print('NMI score:%f\n' %
          normalized_mutual_info_score(cluster, kmeans.labels_))
Esempio n. 52
0
def evaluation(file_name):

    #[V, miu] = [[], []]
    #V_target = [[]]
    V_target = []
    #with gzip.open(file_name, 'rb') as f:
    #[V, miu] = pickle.load(f)
    #V_target = pickle.load(f)

    #with open(file_name, 'r') as fin:
    with open(file_name + '.tsv', 'r') as fin:
        for line in fin:
            l = line[0:-1].split('\t')
            l = [float(x) for x in l]
            V_target.append(l)

    #V_star = get_V_star(V, miu)

    #clusters = np.argmax(V_star[target], axis = 1)
    clusters_prob = V_target
    clusters = np.argmax(clusters_prob, axis=1)

    #Xinwei edited
    #clusters_prob = V_star[target]
    if args.debug:
        print("clusters_prob:")
        print(clusters_prob)
        print("clusters:")
        print(clusters)

    clusters_prob_parse = []

    total = 0
    correct = 0
    aal = []
    ppl = []

    #new data only
    results = {}

    for i in range(len(target_list)):
        a = target_list[i]
        predict = clusters[i] + 1
        if a in labels_dict_test:
            total += 1
            actual = labels_dict_test[a]
            if actual == predict:
                correct += 1
            aal.append(actual)
            ppl.append(predict)
            results[a] = (actual, predict)
            clusters_prob_parse.append(clusters_prob[i])

    if args.debug:
        print('results is: {}'.format(results))
        print("Total test size: {}".format(total))
    try:
        print("Accuracy: {}".format(correct / total))
    except ZeroDivisionError:
        print("0 total labels")

    precision = precision_score(np.array(aal), np.array(ppl), average=None)
    micro_precision = precision_score(np.array(aal),
                                      np.array(ppl),
                                      average='micro')
    macro_precision = precision_score(np.array(aal),
                                      np.array(ppl),
                                      average='macro')
    recall = recall_score(np.array(aal), np.array(ppl), average=None)
    micro_recall = recall_score(np.array(aal), np.array(ppl), average='micro')
    macro_recall = recall_score(np.array(aal), np.array(ppl), average='macro')
    f1 = f1_score(np.array(aal), np.array(ppl), average=None)
    micro_f1 = f1_score(np.array(aal), np.array(ppl), average='micro')
    macro_f1 = f1_score(np.array(aal), np.array(ppl), average='macro')
    nmi = normalized_mutual_info_score(np.array(aal), np.array(ppl))
    ll = log_loss(np.array(aal), clusters_prob_parse)

    print('len(np.array(aal)) is: {}'.format(len(np.array(aal))))
    print('len(clusters_prob) is: {}'.format(len(clusters_prob)))
    print('len(clusters_prob_parse) is: {}'.format(len(clusters_prob_parse)))
    print("precision: {}".format(precision))
    print('micro precision: {}'.format(micro_precision))
    print('macro precision: {}'.format(macro_precision))
    print("recall: {}".format(recall))
    print('micro recall: {}'.format(micro_recall))
    print('macro recall: {}'.format(macro_recall))
    print("f1: {}".format(f1))
    print('micro f1: {}'.format(micro_f1))
    print('macro f1: {}'.format(macro_f1))
    print("nmi: {}".format(nmi))
    print("ll: {}".format(ll))
print(ccr(crossDatasetTrueClassification, clusteringResults))
# print labelAssignment	
fileContainer.write('\n')
fileContainer.write('The Correct Clustering Rate is : '+ str(ccr(crossDatasetTrueClassification, clusteringResults)))
fileContainer.write('\n')
fileContainer.write('The clustering Jaccard Similarity is : '+ str(jaccardSim(crossDatasetTrueClassification, clusteringResults)))


wholeTrueClasses = np.zeros(1900)
for i, trueClass in enumerate(crossDatasetTrueClassification):
    for trajectory in trueClass:
        wholeTrueClasses[trajectory] = i  # index each trajectory, not the whole class list
print(list(wholeTrueClasses[:20]))

wholePredClasses = np.zeros(1900)
for i, predClass in enumerate(clusteringResults):
    for trajectory in predClass:
        wholePredClasses[trajectory] = i

print(list(wholePredClasses[:20]))

print(normalized_mutual_info_score(wholeTrueClasses, wholePredClasses))
fileContainer.write('\nThe NMI is : '+ str(normalized_mutual_info_score(wholeTrueClasses, wholePredClasses)))
fileContainer.write('\n--------------------------------------------------------------------')

print(measurements.ccr(crossDatasetTrueClassification, clusteringResults))
print(measurements.jaccardSim(crossDatasetTrueClassification, clusteringResults))
print(measurements.NMI(crossDatasetTrueClassification, clusteringResults))


Esempio n. 54
0
def NMI(GT, pred):
    return normalized_mutual_info_score(GT, pred)
Esempio n. 55
0
#separate data into training and test sets and choose state so that results are repeatable
X_train, X_test, y_train, y_test = train_test_split(df[cols],
                                                    df["quality"],
                                                    test_size=0.2,
                                                    random_state=4)

#declare and fit svm with rbf kernel to training set
clf = svm.SVC(kernel="rbf", gamma=1, C=1, decision_function_shape="ovo")
clf.fit(X_train, y_train)

ypred = clf.predict(X_test)

#get scores for how well the svm performs on test set
print("Accuracy rbf kernel: %.2f" % clf.score(X_test, y_test))
print("nmi rbf kernel: %.2f" % normalized_mutual_info_score(y_test, ypred))
print(classification_report(y_test, ypred))

X = StandardScaler().fit_transform(df[cols])

#finding ideal eps for DBSCAN by calculating the distance to the nearest n points for each point,
#sorting and plotting the results. Then we look to see where the change is most pronounced
#and select that as epsilon.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)

distances = np.sort(distances, axis=0)
distances = distances[:, 1]
plt.plot(distances)
plt.title("finding optimal eps value for DBSCAN using elbow method")
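
A hedged continuation of the elbow procedure above, reusing distances, X and df from this snippet: pick an eps near the bend of the sorted k-distance curve, fit DBSCAN, and score it against the quality labels with NMI. The 95th-percentile heuristic and the min_samples value are illustrative assumptions, not the author's choices:

from sklearn.cluster import DBSCAN

eps_guess = distances[int(0.95 * len(distances))]   # rough location of the "knee"
db = DBSCAN(eps=eps_guess, min_samples=5).fit(X)
n_clusters_db = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
print("DBSCAN clusters: %d" % n_clusters_db)
print("nmi DBSCAN: %.2f" % normalized_mutual_info_score(df["quality"], db.labels_))
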
Esempio n. 56
0
ASC = alt_spectral_clust(data)
omg = objective_magnitude
db = ASC.db

ASC.set_values('q', 2)
ASC.set_values('C_num', 2)
ASC.set_values('lambda', 1)
ASC.set_values('kernel_type', 'Gaussian Kernel')
ASC.set_values('sigma', 15)
ASC.run()
a = db['allocation']
print(a)
import pdb
pdb.set_trace()

#print db['Y_matrix']
start_time = time.time()
ASC.run()
print("--- %s seconds ---" % (time.time() - start_time))
b = db['allocation']

print "NMI : ", normalized_mutual_info_score(a, b)

#sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean', n_jobs=1, **kwds)

#new_d = db['data'].dot(db['W_matrix'])
#dm = sklearn.metrics.pairwise.pairwise_distances(new_d)
#np.savetxt('original_similarity.txt', db['Kernel_matrix'], fmt='%5.3f', delimiter=',', newline='\n', header='', footer='', comments='# ')
import pdb
pdb.set_trace()
Esempio n. 57
0
                if w in model.vocab:
                    tw[w] = prob[model.vocab[w].index][tp]
                    tmp += prob[model.vocab[w].index][tp]
            lista.append(tw)
            tw_topics.append(tmp)
        dist_topics.append(tw_topics)
        tw_l.append(lista)
        assign_topics.append(tw_topics.index(max(tw_topics)))

    clf = svm.SVC(kernel = 'linear', C = 1)
    scores = cross_val_score(clf, dist_topics, labels, cv = k - 1)

    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    scores = cross_val_score(clf, dist_topics, labels, cv = k - 1, scoring = 'f1_macro')
    print("F1_macro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print("Topic Coherence")
    print(topic_coherence(word_topic, texts, 15))

    print("Topic NMI")
    print(normalized_mutual_info_score(assign_topics, labels))

    print("Topic Purity")
    print(purity_score(assign_topics,labels))
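
purity_score is not defined in this snippet. A minimal sketch with the same argument order (predicted assignments first), assuming the standard definition of cluster purity:

from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_pred, y_true):
    # Fraction of points that land in the majority true class of their cluster.
    c = contingency_matrix(y_true, y_pred)
    return c.max(axis=0).sum() / c.sum()
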





Esempio n. 58
0
def main():

    shutil.rmtree(save_path)
    os.mkdir(save_path)

    if data_path == '20newsgroups':
        newsgroups_data = fetch_20newsgroups_vectorized(subset='all')
        x = newsgroups_data.data.toarray()
        labels = newsgroups_data.target
        n_clusters = 20
    elif data_path == 'r8':
        df = pd.read_csv('data/r8-all-stemmed.txt')
        labels_idx = [
            'acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship',
            'trade'
        ]
        labels = df['class'].values
        labels = [labels_idx.index(ele) for ele in labels]
        labels = np.asarray(labels, dtype=np.int64)
        x_df = df.drop(['class'], axis=1)
        corpus = np.squeeze(x_df.values)

        is_TfidfVectorizer = True
        if is_TfidfVectorizer:
            vectorizer = TfidfVectorizer()
            x = vectorizer.fit_transform(corpus).toarray()
        else:
            vectorizer = CountVectorizer()
            x = vectorizer.fit_transform(corpus).toarray()
        n_clusters = 8
    elif data_path == 'olivetti_faces':
        data = fetch_olivetti_faces()
        x = data.data
        labels = data.target
        n_clusters = 40
    elif data_path == 'rcv1':
        # data = fetch_rcv1()
        # x = data.data.toarray()
        # labels = data.target.toarray()
        # n_clusters = 103

        x, labels = get_data_from_svmlight_file('data/rcv1_train.binary')
        x = x.toarray()
        n_clusters = 2
    elif data_path == 'sector':
        x, labels = get_data_from_svmlight_file('data/sector.scale.all')
        x = x.toarray()
        n_clusters = 105
    else:
        raise Exception("Invalid data path!")
    print("Data shape: (%d, %d)" % x.shape)
    data_size = labels.size

    # build model
    model = RDP_Model(in_c=x.shape[1],
                      out_c=out_c,
                      USE_GPU=USE_GPU,
                      LR=LR,
                      logfile=logfile,
                      dropout_r=dropout_r)

    best_nmi = best_epoch = 0
    loss = 0

    for epoch in range(0, total_epoch):

        # random sampling with replacement
        for batch_i in range(epoch_batch):
            random_pos = random_list(0, data_size - 1, batch_size)
            batch_data = x[random_pos]
            loss = model.train_model(batch_data, epoch)

        if epoch % eval_interval == 0:
            print("epoch ", epoch, "loss:", loss)
            if logfile:
                logfile.write("epoch " + str(epoch) + " loss: " + str(loss) +
                              '\n')

            model.save_model(save_path + 'model_latest.h5')

            # eval
            if is_eval:
                gap_dims = model.eval_model(x)

                kmeans_results = KMeans(n_clusters=n_clusters,
                                        random_state=0).fit(gap_dims)
                # Match each learned cluster with the majority true label of its members
                y_pred = kmeans_results.labels_
                labels_pred = np.zeros_like(y_pred)
                for i in range(n_clusters):
                    mask = (y_pred == i)
                    labels_pred[mask] = mode(labels[mask])[0]

                # evaluations
                nmi_scores = normalized_mutual_info_score(labels, labels_pred)
                print("nmi_scores:", nmi_scores)
                if logfile:
                    logfile.write("nmi_scores: %.4f\n" % nmi_scores)

                fscores = f1_score(labels, labels_pred, average='macro')
                print("fscores_macro:", fscores)
                if logfile:
                    logfile.write("fscores_macro: %.4f\n" % fscores)

                fscores = f1_score(labels, labels_pred, average='micro')
                print("fscores_micro:", fscores)
                if logfile:
                    logfile.write("fscores_micro: %.4f\n" % fscores)

                fscores = f1_score(labels, labels_pred, average='weighted')
                print("fscores_weighted:", fscores)
                if logfile:
                    logfile.write("fscores_weighted: %.4f\n" % fscores)

                RI_scores = adjusted_rand_score(labels, labels_pred)
                print("RI_scores:", RI_scores)
                if logfile:
                    logfile.write("RI_scores: %.4f\n" % RI_scores)

                if best_nmi < nmi_scores:
                    best_nmi = nmi_scores
                    best_epoch = epoch

                print("Best NMI: %.4f" % best_nmi)
                print("Best Epoch %d\n" % best_epoch)
                if logfile:
                    logfile.write("Best NMI: %.4f\n" % best_nmi)
                    logfile.write("Best Epoch %d\n\n" % best_epoch)
                    logfile.flush()
Esempio n. 59
0
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GMM

 
k = 4
X = genfromtxt('dataset/min_words.csv', delimiter=',')

univ_label = genfromtxt('dataset/webkbRaw_label_univ.csv', delimiter=',') 
topic_label = genfromtxt('dataset/webkbRaw_label_topic.csv', delimiter=',') 


clf = KMeans(n_clusters=k)
allocation = clf.fit_predict(X)
kmeans_nmi = normalized_mutual_info_score(allocation, univ_label)
print "K means : " , kmeans_nmi



d_matrix = sklearn.metrics.pairwise.pairwise_distances(X, Y=None, metric='euclidean')
sigma = np.median(d_matrix)
Gamma = 1/(2*np.power(sigma,2))


clf = SpectralClustering(n_clusters=k, gamma=Gamma)
allocation = clf.fit_predict(X)
spectral_nmi = normalized_mutual_info_score(allocation, univ_label)
print('Spectral Clustering : ', spectral_nmi)

def main(args):
    # fix random seeds
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
    print(device)
    criterion = nn.CrossEntropyLoss()
    cluster_log = Logger(os.path.join(args.exp, 'clusters.pickle'))

    # CNN
    if args.verbose:
        print('Architecture: {}'.format(args.arch))
    '''
    ##########################################
    ##########################################
    # Model definition
    ##########################################
    ##########################################'''
    model = models.__dict__[args.arch](bn=True,
                                       num_cluster=args.nmb_cluster,
                                       num_category=args.nmb_category)
    fd = int(model.cluster_layer[0].weight.size()
             [1])  # due to transpose, fd is input dim of W (in dim, out dim)
    model.cluster_layer = None
    model.category_layer = None
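    # fd (the input width of the heads) was read above, so the cluster/category
    # heads can be detached here; only the convolutional trunk (model.features) is
    # wrapped in DataParallel, and the heads are rebuilt further down.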
    model.features = torch.nn.DataParallel(model.features)
    model = model.double()
    model.to(device)
    cudnn.benchmark = True

    if args.optimizer == 'Adam':
        print('Adam optimizer: conv')
        optimizer_body = torch.optim.Adam(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=args.lr_Adam,
            betas=(0.9, 0.999),
            weight_decay=10**args.wd,
        )
    else:
        print('SGD optimizer: conv')
        optimizer_body = torch.optim.SGD(
            filter(lambda x: x.requires_grad, model.parameters()),
            lr=args.lr_SGD,
            momentum=args.momentum,
            weight_decay=10**args.wd,
        )
    '''
    ###############
    ###############
    category_layer
    ###############
    ###############
    '''
    model.category_layer = nn.Sequential(
        nn.Linear(fd, args.nmb_category),
        nn.Softmax(dim=1),
    )
    model.category_layer[0].weight.data.normal_(0, 0.01)
    model.category_layer[0].bias.data.zero_()
    model.category_layer = model.category_layer.double()
    model.category_layer.to(device)

    if args.optimizer == 'Adam':
        print('Adam optimizer: category_layer')
        optimizer_category = torch.optim.Adam(
            filter(lambda x: x.requires_grad,
                   model.category_layer.parameters()),
            lr=args.lr_Adam,
            betas=(0.9, 0.999),
            weight_decay=10**args.wd,
        )
    else:
        print('SGD optimizer: category_layer')
        optimizer_category = torch.optim.SGD(
            filter(lambda x: x.requires_grad,
                   model.category_layer.parameters()),
            lr=args.lr_SGD,
            momentum=args.momentum,
            weight_decay=10**args.wd,
        )
    '''
    ########################################
    ########################################
    Create echogram sampling index
    ########################################
    ########################################'''

    print('Sample echograms.')
    dataset_cp, dataset_semi = sampling_echograms_full(args)
    dataloader_cp = torch.utils.data.DataLoader(dataset_cp,
                                                shuffle=False,
                                                batch_size=args.batch,
                                                num_workers=args.workers,
                                                drop_last=False,
                                                pin_memory=True)

    dataloader_semi = torch.utils.data.DataLoader(dataset_semi,
                                                  shuffle=False,
                                                  batch_size=args.batch,
                                                  num_workers=args.workers,
                                                  drop_last=False,
                                                  pin_memory=True)

    dataset_test_bal, dataset_test_unbal = sampling_echograms_test(args)
    dataloader_test_bal = torch.utils.data.DataLoader(dataset_test_bal,
                                                      shuffle=False,
                                                      batch_size=args.batch,
                                                      num_workers=args.workers,
                                                      drop_last=False,
                                                      pin_memory=True)

    dataloader_test_unbal = torch.utils.data.DataLoader(
        dataset_test_unbal,
        shuffle=False,
        batch_size=args.batch,
        num_workers=args.workers,
        drop_last=False,
        pin_memory=True)

    # clustering algorithm to use
    deepcluster = clustering.__dict__[args.clustering](args.nmb_cluster,
                                                       args.pca)
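    # args.clustering names the clustering backend in the clustering module
    # (e.g. 'Kmeans'); it receives the number of pseudo-classes and, in this
    # variant, the PCA dimensionality applied to the features before clustering.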

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # remove top located layer parameters from checkpoint
            copy_checkpoint_state_dict = checkpoint['state_dict'].copy()
            for key in list(copy_checkpoint_state_dict):
                if 'cluster_layer' in key:
                    del copy_checkpoint_state_dict[key]
                # if 'category_layer' in key:
                #     del copy_checkpoint_state_dict[key]
            checkpoint['state_dict'] = copy_checkpoint_state_dict
            model.load_state_dict(checkpoint['state_dict'])
            optimizer_body.load_state_dict(checkpoint['optimizer_body'])
            optimizer_category.load_state_dict(
                checkpoint['optimizer_category'])
            category_save = os.path.join(args.exp, 'category_layer.pth.tar')
            if os.path.isfile(category_save):
                category_layer_param = torch.load(category_save)
                model.category_layer.load_state_dict(category_layer_param)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # creating checkpoint repo
    exp_check = os.path.join(args.exp, 'checkpoints')
    if not os.path.isdir(exp_check):
        os.makedirs(exp_check)

    exp_bal = os.path.join(args.exp, 'bal')
    exp_unbal = os.path.join(args.exp, 'unbal')
    for dir_bal in [exp_bal, exp_unbal]:
        for dir_2 in ['features', 'pca_features', 'pred']:
            dir_to_make = os.path.join(dir_bal, dir_2)
            if not os.path.isdir(dir_to_make):
                os.makedirs(dir_to_make)

    if os.path.isfile(os.path.join(args.exp, 'loss_collect.pickle')):
        with open(os.path.join(args.exp, 'loss_collect.pickle'), "rb") as f:
            loss_collect = pickle.load(f)
    else:
        loss_collect = [[], [], [], [], [], [], [], [], []]

    if os.path.isfile(os.path.join(args.exp, 'nmi_collect.pickle')):
        with open(os.path.join(args.exp, 'nmi_collect.pickle'), "rb") as ff:
            nmi_save = pickle.load(ff)
    else:
        nmi_save = []
    '''
    #######################
    #######################
    MAIN TRAINING
    #######################
    #######################'''
    for epoch in range(args.start_epoch, args.epochs):
        end = time.time()
        print(
            '#####################  Start training at Epoch %d ################'
            % epoch)
        model.classifier = nn.Sequential(
            *list(model.classifier.children())
            [:-1])  # remove ReLU at classifier [:-1]
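        # Clustering below therefore runs on pre-activation features; the trailing
        # ReLU is re-appended before the supervised passes further down.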
        model.cluster_layer = None
        model.category_layer = None
        '''
        #######################
        #######################
        PSEUDO-LABEL GENERATION
        #######################
        #######################
        '''
        print('Cluster the features')
        features_train, input_tensors_train, labels_train = compute_features(
            dataloader_cp, model, len(dataset_cp), device=device, args=args)
        clustering_loss, pca_features = deepcluster.cluster(
            features_train, verbose=args.verbose)
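        # deepcluster.cluster() PCA-reduces the features, runs the clustering
        # backend, and fills deepcluster.images_lists with per-cluster sample
        # indices, which serve as the pseudo-labels below.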

        nan_location = np.isnan(pca_features)
        inf_location = np.isinf(pca_features)
        if (not np.allclose(nan_location, 0)) or (not np.allclose(
                inf_location, 0)):
            print('PCA: Feature NaN or Inf found. Nan count: ',
                  np.sum(nan_location), ' Inf count: ', np.sum(inf_location))
            print('Skip epoch ', epoch)
            torch.save(pca_features, 'tr_pca_NaN_%d.pth.tar' % epoch)
            torch.save(features_train, 'tr_feature_NaN_%d.pth.tar' % epoch)
            continue

        print('Assign pseudo labels')
        size_cluster = np.zeros(len(deepcluster.images_lists))
        for i, _list in enumerate(deepcluster.images_lists):
            size_cluster[i] = len(_list)
        print('size in clusters: ', size_cluster)
        img_label_pair_train = zip_img_label(input_tensors_train, labels_train)
        train_dataset = clustering.cluster_assign(
            deepcluster.images_lists,
            img_label_pair_train)  # Reassigned pseudolabel

        # uniformly sample per target
        sampler_train = UnifLabelSampler(int(len(train_dataset)),
                                         deepcluster.images_lists)
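        # Sampling uniformly over pseudo-classes keeps the largest clusters from
        # dominating each epoch of pseudo-label training.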

        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch,
            shuffle=False,
            num_workers=args.workers,
            sampler=sampler_train,
            pin_memory=True,
        )
        '''
        ####################################################################
        ####################################################################
        TRANSFORM MODEL FOR SELF-SUPERVISION // SEMI-SUPERVISION
        ####################################################################
        ####################################################################
        '''
        # Recover classifier with ReLU (that is not used in clustering)
        mlp = list(model.classifier.children(
        ))  # classifier that ends with linear(512 * 128). No ReLU at the end
        mlp.append(nn.ReLU(inplace=True).to(device))
        model.classifier = nn.Sequential(*mlp)
        model.classifier.to(device)
        '''SELF-SUPERVISION (PSEUDO-LABELS)'''
        model.category_layer = None
        model.cluster_layer = nn.Sequential(
            nn.Linear(fd, args.nmb_cluster),  # nn.Linear(4096, num_cluster),
            nn.Softmax(
                dim=1
            ),  # should be removed and replaced by ReLU for category_layer
        )
        model.cluster_layer[0].weight.data.normal_(0, 0.01)
        model.cluster_layer[0].bias.data.zero_()
        model.cluster_layer = model.cluster_layer.double()
        model.cluster_layer.to(device)
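        # The softmax cluster head is re-initialized every epoch: cluster identities
        # change after each re-clustering, so the previous head weights carry no
        # meaning.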
        ''' train network with clusters as pseudo-labels '''
        with torch.autograd.set_detect_anomaly(True):
            pseudo_loss, semi_loss, semi_accuracy = semi_train(
                train_dataloader,
                dataloader_semi,
                model,
                fd,
                criterion,
                optimizer_body,
                optimizer_category,
                epoch,
                device=device,
                args=args)

        # save checkpoint
        if (epoch + 1) % args.checkpoints == 0:
            path = os.path.join(
                args.exp,
                'checkpoints',
                'checkpoint_' + str(epoch) + '.pth.tar',
            )
            if args.verbose:
                print('Save checkpoint at: {0}'.format(path))
            torch.save(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer_body': optimizer_body.state_dict(),
                    'optimizer_category': optimizer_category.state_dict(),
                }, path)
        '''
        ##############
        ##############
        # TEST phase
        ##############
        ##############
        '''
        test_loss_bal, test_accuracy_bal, test_pred_bal, test_label_bal = test(
            dataloader_test_bal, model, criterion, device, args)
        test_loss_unbal, test_accuracy_unbal, test_pred_unbal, test_label_unbal = test(
            dataloader_test_unbal, model, criterion, device, args)
        '''Save prediction of the test set'''
        if (epoch % args.save_epoch == 0):
            with open(
                    os.path.join(args.exp, 'bal', 'pred',
                                 'sup_epoch_%d_te_bal.pickle' % epoch),
                    "wb") as f:
                pickle.dump([test_pred_bal, test_label_bal], f)
            with open(
                    os.path.join(args.exp, 'unbal', 'pred',
                                 'sup_epoch_%d_te_unbal.pickle' % epoch),
                    "wb") as f:
                pickle.dump([test_pred_unbal, test_label_unbal], f)

        if args.verbose:
            print('###### Epoch [{0}] ###### \n'
                  'Time: {1:.3f} s\n'
                  'Pseudo tr_loss: {2:.3f} \n'
                  'SEMI tr_loss: {3:.3f} \n'
                  'TEST_bal loss: {4:.3f} \n'
                  'TEST_unbal loss: {5:.3f} \n'
                  'Clustering loss: {6:.3f} \n\n'
                  'SEMI accu: {7:.3f} \n'
                  'TEST_bal accu: {8:.3f} \n'
                  'TEST_unbal accu: {9:.3f} \n'.format(
                      epoch,
                      time.time() - end, pseudo_loss, semi_loss, test_loss_bal,
                      test_loss_unbal, clustering_loss, semi_accuracy,
                      test_accuracy_bal, test_accuracy_unbal))
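            # NMI between the current and the previous epoch's cluster assignments
            # tracks how much the pseudo-labels are still changing; the IndexError
            # guard covers the first epoch, when cluster_log has no history yet.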
            try:
                nmi = normalized_mutual_info_score(
                    clustering.arrange_clustering(deepcluster.images_lists),
                    clustering.arrange_clustering(cluster_log.data[-1]))
                nmi_save.append(nmi)
                print('NMI against previous assignment: {0:.3f}'.format(nmi))
                with open(os.path.join(args.exp, 'nmi_collect.pickle'),
                          "wb") as ff:
                    pickle.dump(nmi_save, ff)
            except IndexError:
                pass
            print('####################### \n')

        # save cluster assignments
        cluster_log.log(deepcluster.images_lists)

        # save running checkpoint
        torch.save(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer_body': optimizer_body.state_dict(),
                'optimizer_category': optimizer_category.state_dict(),
            }, os.path.join(args.exp, 'checkpoint.pth.tar'))
        torch.save(model.category_layer.state_dict(),
                   os.path.join(args.exp, 'category_layer.pth.tar'))

        loss_collect[0].append(epoch)
        loss_collect[1].append(pseudo_loss)
        loss_collect[2].append(semi_loss)
        loss_collect[3].append(clustering_loss)
        loss_collect[4].append(test_loss_bal)
        loss_collect[5].append(test_loss_unbal)
        loss_collect[6].append(semi_accuracy)
        loss_collect[7].append(test_accuracy_bal)
        loss_collect[8].append(test_accuracy_unbal)
        with open(os.path.join(args.exp, 'loss_collect.pickle'), "wb") as f:
            pickle.dump(loss_collect, f)
        '''
        ############################
        ############################
        # PSEUDO-LABEL GEN: Test set (balanced UA)
        ############################
        ############################
        '''
        model.classifier = nn.Sequential(
            *list(model.classifier.children())
            [:-1])  # remove ReLU at classifier [:-1]
        model.cluster_layer = None
        model.category_layer = None

        print('TEST set: Cluster the features')
        features_te_bal, input_tensors_te_bal, labels_te_bal = compute_features(
            dataloader_test_bal,
            model,
            len(dataset_test_bal),
            device=device,
            args=args)
        clustering_loss_te_bal, pca_features_te_bal = deepcluster.cluster(
            features_te_bal, verbose=args.verbose)

        mlp = list(model.classifier.children(
        ))  # classifier that ends with linear(512 * 128). No ReLU at the end
        mlp.append(nn.ReLU(inplace=True).to(device))
        model.classifier = nn.Sequential(*mlp)
        model.classifier.to(device)

        nan_location_bal = np.isnan(pca_features_te_bal)
        inf_location_bal = np.isinf(pca_features_te_bal)
        if (not np.allclose(nan_location_bal, 0)) or (not np.allclose(
                inf_location_bal, 0)):
            print('PCA: Feature NaN or Inf found. Nan count: ',
                  np.sum(nan_location_bal), ' Inf count: ',
                  np.sum(inf_location_bal))
            print('Skip epoch ', epoch)
            torch.save(pca_features_te_bal,
                       'te_pca_NaN_%d_bal.pth.tar' % epoch)
            torch.save(features_te_bal,
                       'te_feature_NaN_%d_bal.pth.tar' % epoch)
            continue

        # save patches per epochs
        cp_epoch_out_bal = [
            features_te_bal, deepcluster.images_lists,
            deepcluster.images_dist_lists, input_tensors_te_bal, labels_te_bal
        ]

        if (epoch % args.save_epoch == 0):
            with open(
                    os.path.join(args.exp, 'bal', 'features',
                                 'cp_epoch_%d_te_bal.pickle' % epoch),
                    "wb") as f:
                pickle.dump(cp_epoch_out_bal, f)
            with open(
                    os.path.join(args.exp, 'bal', 'pca_features',
                                 'pca_epoch_%d_te_bal.pickle' % epoch),
                    "wb") as f:
                pickle.dump(pca_features_te_bal, f)
        '''
        ############################
        ############################
        # PSEUDO-LABEL GEN: Test set (Unbalanced UA)
        ############################
        ############################
        '''
        model.classifier = nn.Sequential(
            *list(model.classifier.children())
            [:-1])  # remove ReLU at classifier [:-1]
        model.cluster_layer = None
        model.category_layer = None

        print('TEST set: Cluster the features')
        features_te_unbal, input_tensors_te_unbal, labels_te_unbal = compute_features(
            dataloader_test_unbal,
            model,
            len(dataset_test_unbal),
            device=device,
            args=args)
        clustering_loss_te_unbal, pca_features_te_unbal = deepcluster.cluster(
            features_te_unbal, verbose=args.verbose)

        mlp = list(model.classifier.children(
        ))  # classifier that ends with linear(512 * 128). No ReLU at the end
        mlp.append(nn.ReLU(inplace=True).to(device))
        model.classifier = nn.Sequential(*mlp)
        model.classifier.to(device)

        nan_location_unbal = np.isnan(pca_features_te_unbal)
        inf_location_unbal = np.isinf(pca_features_te_unbal)
        if (not np.allclose(nan_location_unbal, 0)) or (not np.allclose(
                inf_location_unbal, 0)):
            print('PCA: Feature NaN or Inf found. Nan count: ',
                  np.sum(nan_location_unbal), ' Inf count: ',
                  np.sum(inf_location_unbal))
            print('Skip epoch ', epoch)
            torch.save(pca_features_te_unbal,
                       'te_pca_NaN_%d_unbal.pth.tar' % epoch)
            torch.save(features_te_unbal,
                       'te_feature_NaN_%d_unbal.pth.tar' % epoch)
            continue

        # save patches per epochs
        cp_epoch_out_unbal = [
            features_te_unbal, deepcluster.images_lists,
            deepcluster.images_dist_lists, input_tensors_te_unbal,
            labels_te_unbal
        ]

        if (epoch % args.save_epoch == 0):
            with open(
                    os.path.join(args.exp, 'unbal', 'features',
                                 'cp_epoch_%d_te_unbal.pickle' % epoch),
                    "wb") as f:
                pickle.dump(cp_epoch_out_unbal, f)
            with open(
                    os.path.join(args.exp, 'unbal', 'pca_features',
                                 'pca_epoch_%d_te_unbal.pickle' % epoch),
                    "wb") as f:
                pickle.dump(pca_features_te_unbal, f)