def giveThreshold_clusterCenter_listAnswer():
    """Collect the distance from every test sample to its own class centre.

    For each of the 32 folds (sub-directories ``01`` .. ``32``) the pickled
    matrix ``bigMat.pkl`` and label list ``listLabel.pkl`` are loaded from
    both the training and the test base folder.  Every test sample is
    normalised together with the fold's training matrix, cluster centres are
    generated from the normalised training data, and the distance between
    the sample and the centre that carries the sample's own label is
    appended to the result.

    After every fold the fold index, the elapsed time in minutes and the
    largest distance collected so far are printed.

    Returns:
        list: one distance per test sample, in fold order, then sample order.

    Raises:
        OSError: if one of the pickle files cannot be opened.
    """
    folderTrainBase = 'D:/dataSet_crossValidation/'
    folderTestBase = 'D:/testSet/'

    list_threshold = []

    t0 = time.time()
    for i in range(32):
        # Fold directories are named with two digits: '01' .. '32'.
        strNum = str(i + 1).zfill(2)
        # The base folders already end in '/'; the extra '/' is kept so the
        # paths handed to open() are exactly the ones used before.
        trainDir = folderTrainBase + '/' + strNum + '/'
        testDir = folderTestBase + '/' + strNum + '/'

        # Context managers close the files even when unpickling fails.
        # NOTE: pickle.load can execute code embedded in the file; only use
        # it on dataset files from a trusted source.
        with open(trainDir + 'bigMat.pkl', 'rb') as bigMat_file:
            bigMat_train = pickle.load(bigMat_file)
        with open(trainDir + 'listLabel.pkl', 'rb') as listName_file:
            listName_train = pickle.load(listName_file)
        with open(testDir + 'bigMat.pkl', 'rb') as bigMat_file_:
            bigMat_test = pickle.load(bigMat_file_)
        with open(testDir + 'listLabel.pkl', 'rb') as listName_file_:
            listName_test = pickle.load(listName_file_)

        for j, label in enumerate(listName_test):
            testData, trainData = normalization.normalizator(
                bigMat_test[j], bigMat_train)
            # The centres are rebuilt for every sample because they are
            # derived from the training data returned by this sample's
            # normalisation call.
            centerMat, centerNamelist = pre.clusterCenterGenerator(
                trainData, listName_train)
            # Fails here if no centre carries the test label (ValueError,
            # presuming centerNamelist is a plain list).
            index = centerNamelist.index(label)
            list_threshold.append(
                distaneceCalculate(centerMat[index], testData))

            # Drop the per-sample data before the next iteration allocates.
            del testData, trainData, centerMat, centerNamelist
            gc.collect()

        # Drop the per-fold data before the next fold is loaded.
        del bigMat_train, bigMat_test, listName_train, listName_test
        gc.collect()

        t1 = time.time()
        print(i)
        print('time: ' + str(((t1 - t0) / 60)))
        # max() of an empty list raises; nothing to report until the first
        # distance has been collected.
        if list_threshold:
            print(max(list_threshold))
        print('----------------------')

    return list_threshold
def giveThreshold_knn_listAnswer():
    """Collect, per test sample, the distance to its nearest same-label
    training sample.

    For each of the 32 folds (sub-directories ``01`` .. ``32``) the pickled
    matrix ``bigMat.pkl`` and label list ``listLabel.pkl`` are loaded from
    both the training and the test base folder.  Every test sample is
    normalised together with the fold's training matrix; the distance to
    each training sample whose label equals the test label is computed and
    the smallest of those distances is appended to the result.

    After every fold the fold index, the elapsed time in minutes and the
    largest value collected so far are printed.

    Returns:
        list: one minimum distance per test sample, in fold order, then
        sample order.

    Raises:
        OSError: if one of the pickle files cannot be opened.
        ValueError: if a test label does not occur among the fold's
            training labels.
    """
    folderTrainBase = 'D:/dataSet_crossValidation/'
    folderTestBase = 'D:/testSet/'

    list_threshold = []

    t0 = time.time()
    for i in range(32):
        # Fold directories are named with two digits: '01' .. '32'.
        strNum = str(i + 1).zfill(2)
        # The base folders already end in '/'; the extra '/' is kept so the
        # paths handed to open() are exactly the ones used before.
        trainDir = folderTrainBase + '/' + strNum + '/'
        testDir = folderTestBase + '/' + strNum + '/'

        # Context managers close the files even when unpickling fails.
        # NOTE: pickle.load can execute code embedded in the file; only use
        # it on dataset files from a trusted source.
        with open(trainDir + 'bigMat.pkl', 'rb') as bigMat_file:
            bigMat_train = pickle.load(bigMat_file)
        with open(trainDir + 'listLabel.pkl', 'rb') as listName_file:
            listName_train = pickle.load(listName_file)
        with open(testDir + 'bigMat.pkl', 'rb') as bigMat_file_:
            bigMat_test = pickle.load(bigMat_file_)
        with open(testDir + 'listLabel.pkl', 'rb') as listName_file_:
            listName_test = pickle.load(listName_file_)

        # Array form of the labels allows an element-wise comparison below.
        arrayName_train = np.array(listName_train)

        for j, label in enumerate(listName_test):
            testData, trainData = normalization.normalizator(
                bigMat_test[j], bigMat_train)
            # argwhere returns one index row per matching training label;
            # the first entry of a row is used as the training-sample index.
            matches = np.argwhere(arrayName_train == label)
            disTestData = [
                distaneceCalculate(testData, trainData[k[0]])
                for k in matches
            ]
            if not disTestData:
                # min() would fail with an uninformative message; name the
                # offending label and fold instead (same exception type).
                raise ValueError(
                    'no training sample with label %r in fold %s'
                    % (label, strNum))

            list_threshold.append(min(disTestData))

            # Drop the per-sample data before the next iteration allocates.
            del testData, trainData
            gc.collect()

        # Drop the per-fold data before the next fold is loaded.
        del arrayName_train
        del bigMat_train, bigMat_test, listName_train, listName_test
        gc.collect()

        t1 = time.time()
        print(i)
        print('time: ' + str(((t1 - t0) / 60)))
        # max() of an empty list raises; nothing to report until the first
        # distance has been collected.
        if list_threshold:
            print(max(list_threshold))
        print('----------------------')

    return list_threshold
def hurdlingDict2_knn_without():
    """
    Evaluate a two-stage prediction over the 32 folds and print the results.

    1 hurdlingDict
    2 predict_knn_nearest

    For every test sample the first stage
    (``hDM.MatchList_threshold_without``) returns a list of candidate names,
    the training data is filtered with that list
    (``filterForTrainData``), and the second stage
    (``pre.predict_knn_BestMatch_nearest``) picks the final name from the
    filtered, normalised training data.  A prediction is scored 1 when it
    equals the test label or when ``sC.discriminateSimilarWord`` accepts the
    pair, otherwise 0.

    Printed per fold: the fold's right rate and an estimate of the remaining
    time.  Printed at the end: total time, overall right rate, best and
    worst per-fold right rate.

    Reads ``folderTrainBase``, ``folderTestBase`` and ``th_hurdling`` from
    the enclosing scope; they are not defined inside this function.

    Returns:
        None

    Raises:
        OSError: if one of the pickle files cannot be opened.
        ZeroDivisionError: if a fold contains no test samples.
    """
    list_hD_knn = []
    rightRate = []

    t0 = time.time()
    for i in range(32):
        list_hD_knn_tmp = []

        # Fold directories are named with two digits: '01' .. '32'.
        strNum = str(i + 1).zfill(2)
        trainDir = folderTrainBase + '/' + strNum + '/'
        testDir = folderTestBase + '/' + strNum + '/'

        # Context managers close the files even when unpickling fails.
        # NOTE: pickle.load can execute code embedded in the file; only use
        # it on dataset files from a trusted source.

        # take big mat and name list of train set
        with open(trainDir + 'bigMat.pkl', 'rb') as bigMat_file:
            bigMat_train = pickle.load(bigMat_file)
        with open(trainDir + 'listLabel.pkl', 'rb') as listName_file:
            listName_train = pickle.load(listName_file)

        # take big mat and name list of test set
        with open(testDir + 'bigMat.pkl', 'rb') as bigMat_file_:
            bigMat_test = pickle.load(bigMat_file_)
        with open(testDir + 'listLabel.pkl', 'rb') as listName_file_:
            listName_test = pickle.load(listName_file_)

        # predict each data in test set
        for j, label in enumerate(listName_test):
            # first predict method: hurdlingDictMatch => return a list of names
            img = getImageMatFromName(label + strNum + '.png')
            preName_hD_list = hDM.MatchList_threshold_without(
                img, th_hurdling, strNum + '/')
            # second predict method: knn_bestMatch_nearest => return prediction result
            trainData_hD, listName_train_hD = filterForTrainData(
                preName_hD_list, listName_train, bigMat_train)
            testData, trainData = normalization.normalizator(
                bigMat_test[j], trainData_hD)
            preName_nearst = pre.predict_knn_BestMatch_nearest(
                testData, trainData, listName_train_hD)

            # Exact match first; the similar-word check is only consulted
            # when the names differ (short-circuit `or`).
            if (preName_nearst == label
                    or sC.discriminateSimilarWord(preName_nearst, label)):
                score = 1
            else:
                score = 0
            list_hD_knn.append(score)
            list_hD_knn_tmp.append(score)

            # Drop the per-sample data before the next iteration allocates.
            del img, preName_hD_list, trainData_hD, listName_train_hD
            del testData, trainData
            gc.collect()

        foldRate = list_hD_knn_tmp.count(1) / len(list_hD_knn_tmp)
        rightRate.append(foldRate)
        print('right rate for ' + str(i) + ' >>>' + str(foldRate))

        # Drop the per-fold data before the next fold is loaded.
        del list_hD_knn_tmp
        del bigMat_train, bigMat_test, listName_train, listName_test
        gc.collect()

        # Remaining time = average time per finished fold * folds left.
        t11 = time.time()
        t_res = ((t11 - t0) / (i + 1)) * (32 - (i + 1))
        print('rest time: ' + str(t_res / 60) + ' min')
        print('---------------')

    t1 = time.time()
    print('-------------------')
    print('time: ' + str((t1 - t0) / 60) + ' min')
    print('right rate:')
    print(list_hD_knn.count(1) / len(list_hD_knn))
    print('best right rate:')
    print(max(rightRate))
    print('worst right rate:')
    print(min(rightRate))
def knn_hurdlingDict_without():
    """
    1 predict_knn_listAnswer
    2 hurdlingDict
    """
    list_knn_hD = []
    rightRate = []

    t0 = time.time()
    for i in range(32):
        list_knn_hD_tmp = []

        if i < 9:
            strNum = '0' + str(i + 1)
        else:
            strNum = str(i + 1)

        # take big mat and name list of train set
        bigMat_file = open(folderTrainBase + '/' + strNum + '/' + 'bigMat.pkl',
                           'rb')
        bigMat_train = pickle.load(bigMat_file)
        bigMat_file.close()

        listName_file = open(
            folderTrainBase + '/' + strNum + '/' + 'listLabel.pkl', 'rb')
        listName_train = pickle.load(listName_file)
        listName_file.close()

        # take big mat and name list of test set
        bigMat_file_ = open(folderTestBase + '/' + strNum + '/' + 'bigMat.pkl',
                            'rb')
        bigMat_test = pickle.load(bigMat_file_)
        bigMat_file_.close()

        listName_file_ = open(
            folderTestBase + '/' + strNum + '/' + 'listLabel.pkl', 'rb')
        listName_test = pickle.load(listName_file_)
        listName_file_.close()

        # predict each data in test set
        for j in range(len(listName_test)):
            # normalization
            testData, trainData = normalization.normalizator(
                bigMat_test[j], bigMat_train)
            # first predict method: knn_nearest => return a list of names
            preName_knn_list = pre.predict_knn_listAnswer(
                th_knn, testData, trainData, listName_train)
            # second predict method: hurdlingDictMatch => return prediction result
            img = getImageMatFromName(listName_test[j] + strNum + '.png')
            answer = hDM.matchBest_without(img, strNum + '/', preName_knn_list)

            if listName_test[j] == answer:
                list_knn_hD.append(1)
                list_knn_hD_tmp.append(1)
            elif sC.discriminateSimilarWord(listName_test[j], answer):
                list_knn_hD.append(1)
                list_knn_hD_tmp.append(1)
            else:
                list_knn_hD.append(0)
                list_knn_hD_tmp.append(0)

            del testData
            del trainData
            del img
            gc.collect()
        rightRate.append(list_knn_hD_tmp.count(1) / len(list_knn_hD_tmp))
        print('right rate for ' + str(i) + ' >>>' +
              str(list_knn_hD_tmp.count(1) / len(list_knn_hD_tmp)))

        del list_knn_hD_tmp
        del bigMat_train
        del bigMat_test
        del listName_train
        del listName_test
        gc.collect()

        t11 = time.time()
        t_res = ((t11 - t0) / (i + 1)) * (32 - (i + 1))
        print('rest time: ' + str(t_res / 60) + ' min')
        print('---------------')

    t1 = time.time()
    print('time: ' + str((t1 - t0) / 60) + ' min')
    print('right rate:')
    print(list_knn_hD.count(1) / len(list_knn_hD))
    print('best right rate:')
    print(max(rightRate))
    print('worst right rate:')
    print(min(rightRate))
    listName_train = pickle.load(listName_file)
    listName_file.close()

    bigMat_file_ = open(folderTestBase + '/' + strNum + '/' + 'bigMat.pkl', 'rb')
    bigMat_test = pickle.load(bigMat_file_)
    bigMat_file_.close()

    listName_file_ = open(folderTestBase + '/' + strNum + '/' + 'listLabel.pkl', 'rb')
    listName_test = pickle.load(listName_file_)
    listName_file_.close()

    listAnswer_nearst_tmp = []
    listAnswer_cluster_tmp = []

    for j in range(len(listName_test)):
        testData, trainData = normalization.normalizator(bigMat_test[j], bigMat_train)

        preName_nearst = pre.predict_knn_BestMatch_nearest(testData, trainData, listName_train)
        if listName_test[j] == preName_nearst:
            listAnswer_nearst.append(1)
            listAnswer_nearst_tmp.append(1)
        elif sC.discriminateSimilarWord(listName_test[j], preName_nearst):
            listAnswer_nearst.append(1)
            listAnswer_nearst_tmp.append(1)
        else:
            listAnswer_nearst.append(0)
            listAnswer_nearst_tmp.append(0)

        centerMat, centerNamelist = pre.clusterCenterGenerator(trainData, listName_train)
        preName_cluster = pre.predict_clusterCenter_BestMatch(testData, centerMat, centerNamelist)
        if listName_test[j] == preName_cluster: