def LDA_test(k=2, i_of_e=0, fun_type=None, data_name='orl', n_used_class=None, dimension=None):
    """Rotation test of LDA projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=2 -> 20%)
    :param i_of_e: eigen-solution selector passed to LDATrain (0 or -1 means none)
    :param fun_type: variant selector passed to LDATrain
    :param data_name: dataset key passed to get_data
    :param n_used_class: optionally restrict to the first n classes
    :param dimension: cap on projection dimension; must satisfy 0 < dimension < n_class
    :return: mean recognition accuracy over all n_one_class rotations
    """
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name)
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    # LDA yields at most n_class - 1 useful directions; honor a smaller cap if given
    if dimension is not None and 0 < dimension < n_class:
        best_d = dimension
    else:
        best_d = n_class - 1
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    accuracy = 0
    total_time = 0
    for fold in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, fold, n_class)
        train_norm = get_normalize(train_data)
        test_norm = get_normalize(test_data)
        t_start = time.time()
        W = lda.LDATrain(train_norm, n_class, n_train, i_of_e, fun_type)
        proj = np.array(W[:best_d, :])  # leading best_d projection directions
        train_mat, test_mat = get_mat(proj, train_norm, test_norm)
        fold_acc = np.mean(
            test_data_index == KNN(train_mat, test_mat, train_data_index, n_train))
        print("res %d" % (fold + 1))
        print(fold_acc)
        accuracy += fold_acc
        total_time += time.time() - t_start
    accuracy /= n_one_class
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    print('accuracy:')
    print(accuracy)
    return accuracy
def getORL(n_train=6, valid_per_class=1, size=46):
    """Load the ORL faces and split them into train/validation/test tensors.

    Generalized from hard-coded constants (defaults preserve the original
    behavior): n_train images per class go to the training set,
    valid_per_class of each class's remaining images go to validation,
    and the rest go to the test set.

    :param n_train: training images taken from each class (was hard-coded 6)
    :param valid_per_class: validation images taken from each class (was 1)
    :param size: image side length in pixels — assumes get_data returns
        flattened size*size images (46 by default; TODO confirm)
    :return: (train_data, train_labels, validX, validY, testX, testY),
        images normalized and reshaped to [-1, size, size, 1],
        labels one-hot encoded to n_class columns
    """
    data, n_class, n_one_class, _ = get_data()
    train_data, train_data_index, test_data, test_data_index = separateData(
        data, n_train, 0, n_class)
    # one-hot encode the labels
    train_data_index = keras.utils.to_categorical(train_data_index, n_class)
    test_data_index = keras.utils.to_categorical(test_data_index, n_class)
    n_rest = n_one_class - n_train           # leftover (non-training) images per class
    testNum = n_rest - valid_per_class       # test images per class
    validX = np.zeros((n_class * valid_per_class, size * size))
    validY = np.zeros((n_class * valid_per_class, n_class))
    testX = np.zeros((n_class * testNum, size * size))
    testY = np.zeros((n_class * testNum, n_class))
    valid_num = 0  # validation rows filled so far
    test_num = 0   # test rows filled so far
    # the first valid_per_class leftover images of each class go to validation
    for j in range(test_data.shape[0]):
        if j % n_rest < valid_per_class:
            validX[valid_num] = test_data[j]
            validY[valid_num] = test_data_index[j]
            valid_num += 1
        else:
            testX[test_num] = test_data[j]
            testY[test_num] = test_data_index[j]
            test_num += 1
    print(train_data.shape)
    print(validX.shape)
    print(testX.shape)
    train_data = normalize_image_array(train_data)
    validX = normalize_image_array(validX)
    testX = normalize_image_array(testX)
    train_data = np.reshape(train_data, [-1, size, size, 1])
    validX = np.reshape(validX, [-1, size, size, 1])
    testX = np.reshape(testX, [-1, size, size, 1])
    print(train_data.shape)
    print(validX.shape)
    print(testX.shape)
    print(train_data_index.shape)
    print(validY.shape)
    print(testY.shape)
    return train_data, train_data_index, validX, validY, testX, testY
#starter.py import dataprocess import measurement import models import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss, zero_one_loss from sklearn import preprocessing if __name__ == '__main__': #-------GET DATA FROM CSV FILE------------- X_train, Y_train = dataprocess.get_data(path='../data/trainingset.csv') X_cross, Y_cross = dataprocess.get_data(path='../data/crossset.csv') X_test, Y_test = dataprocess.get_data(path='../data/testset.csv') X_train_cross, Y_train_cross = dataprocess.get_data( path='../data/train_and_cross.csv') #Check size of data # print(len(Y_train)) # print(len(Y_cross)) # print(len(Y_test)) # print(len(Y_train_cross)) #-------Preprocess DATA------------- # Add more features for data # X_train = dataprocess.addDimensional(X_train) # X_cross = dataprocess.addDimensional(X_cross) # X_test = dataprocess.addDimensional(X_test) # X_train = dataprocess.addDimensionalCube(X_train) # X_cross = dataprocess.addDimensionalCube(X_cross) # X_test = dataprocess.addDimensionalCube(X_test) #normalize:
def pca_test(k=8, data_name='orl', n_used_class=None, dimension=120):
    """Rotation test of PCA (eigenface) projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=8 -> 80%)
    :param data_name: dataset key passed to get_data
    :param n_used_class: optionally restrict to the first n classes
    :param dimension: number of leading principal components kept
    :return: mean recognition accuracy over all n_one_class rotations
    """
    # (Large commented-out eigenface/fisherface visualization code removed.)
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name)
    data = data / np.max(data)  # scale pixel values to [0, 1]
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    accuracy = 0
    total_time = 0
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, i, n_class)
        time1 = time.time()
        pca_W = pca(train_data)  # eigenvector matrix
        res_W = np.array(pca_W[:dimension])  # keep the leading `dimension` components
        train_mat, test_mat = get_mat(res_W, train_data, test_data, dimension)
        res = np.mean(
            test_data_index == pcaKNN(train_mat, test_mat, train_data_index))
        print("res %d " % (i + 1))
        print(res)
        accuracy += res
        time2 = time.time()
        # BUG FIX: accumulate per-rotation time (was `total_time = time2 - time1`,
        # which kept only the last rotation yet was still divided by n_one_class)
        total_time += time2 - time1
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    accuracy /= n_one_class
    print('accuracy:')
    print(accuracy)
    return accuracy
def B2DPCA_test(k=8, data_name='orl', dimension=5, fun_name='B2DPCA'):
    """Rotation test of (bilateral) 2D-PCA projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=8 -> 80%)
    :param data_name: dataset key passed to get_data (images kept 2-D via keepDim)
    :param dimension: number of projection columns kept
    :param fun_name: B2DPCA (bilateral) or 2DPCA (right-side projection only)
    :return: mean recognition accuracy over all n_one_class rotations
    :raises ValueError: if fun_name is not a supported method
    """
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name, keepDim=True)
    method = fun_name.lower()
    if method not in ('b2dpca', '2dpca'):
        # BUG FIX: fail fast — an unknown name previously left pca_W1 unbound
        # and crashed later with a confusing NameError
        raise ValueError('unknown fun_name: %s' % fun_name)
    accuracy = 0
    total_time = 0
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, i, n_class)
        time1 = time.time()
        if method == 'b2dpca':
            pca_W1, pca_W2 = BTwoDPCA(train_data, dimension)
            # bilateral: project rows then columns; stored transposed as in original
            project = lambda img: ((img @ pca_W1).T @ pca_W2).T
        else:
            pca_W1 = twoDPCA(train_data, dimension)
            # one-sided: (b, c) @ (c, k) = (b, k), stored transposed as (k, b)
            project = lambda img: (img @ pca_W1).T
        train_mat = np.array([project(train_data[j, :, :])
                              for j in range(train_data.shape[0])])
        test_mat = np.array([project(test_data[j, :, :])
                             for j in range(test_data.shape[0])])
        res = np.mean(
            test_data_index == pcaKNN(train_mat, test_mat, train_data_index))
        print("res %d " % (i + 1))
        print(res)
        accuracy += res
        time2 = time.time()
        # BUG FIX: accumulate per-rotation time (was `=`, keeping only the last
        # rotation while still dividing by n_one_class below)
        total_time += time2 - time1
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    accuracy /= n_one_class
    print('accuracy:')
    print(accuracy)
    return accuracy
def clusterTest(fun_name=KMeans, n_cluster=2, data_name='orl', k=5, kernel_type=None):
    """
    Run one clustering experiment, report per-class purity, and plot the result.

    :param n_cluster: number of clusters
    :param fun_name: clustering function object (e.g. KMeans)
    :param data_name: dataset key passed to get_data
    :param k: training sample percent
    :param kernel_type: none
    :return:
    """
    print('In cluster:\n' + 'cluster type: ' + fun_name.__name__)
    print('Kernel type: ' + str(kernel_type))
    print('In ' + data_name)
    print('K = %d' % k)
    # load the dataset
    oldData, n_class, n_one_class, oldLabel = get_data(data_name)
    # note: optional PCA preprocessing
    # data = pcaProject(oldData, dimension=160)
    # ---------------use only the first 3 classes-------------
    # n_class = 3
    # n = None
    # # PCA preprocessing
    # data = pcaProject(oldData[0:n_class * n_one_class], dimension=160)
    # # raw data (no PCA)
    # # data = oldData[:n_class * n_one_class]
    # if oldLabel is not None:
    #     label = oldLabel[0:n_class * n_one_class]
    # ---------------------------------------
    # ---------------or use only the first n images-------------
    n = 100
    """ 修改张数 """  # i.e. "adjust the number of images used here"
    # with PCA: 0.791 0.797 0.626 0.99 / 0.221 1.27 5.76
    # (presumably recorded accuracies / timings — TODO confirm)
    data = pcaProject(oldData[0:n], dimension=120)
    # raw data (no PCA)
    # data = oldData[:n]
    if oldLabel is not None:
        label = oldLabel[:n]
    else:
        label = None
    # ---------------------------------------
    # synthetic point-set alternative
    # n_class = 3
    # n_one_class = 10
    # data = getPointSet(n_class=n_class, n_one_class=n_one_class, dimension=2)
    print(data.shape)
    accuracy = 0
    if n is None and n_one_class is not None:
        n_train = int(n_one_class * k / 10)
    elif n is not None:
        n_train = n
    else:
        print('something wrong...')
        return
    # original author's note: "forgot what this is for" — guards a 1-sample train set
    if n_train == 1:
        n_train = 2
    print('n of train set: %d' % n_train)
    # clustering produces no projection matrix for held-out accuracy,
    # so the test split is effectively unused
    train_data, train_data_index, test_data, test_data_index = separateData(data, 1, 0, n_class, k_is_rate=True)
    if oldLabel is not None:
        train_data_index, _, test_data_index, _ = separateData(label, 1, 0, n_class, k_is_rate=True)
    print(train_data.shape)
    t1 = time()
    centroids, clusterAssignment = fun_name(train_data, n_cluster, distFun=distEuclidean, createCent=randCent)
    t2 = time()
    print("运行时间:", t2 - t1)
    # variant for samples stored class-by-class in sequential order:
    # res = 0
    # for i in range(n_class):  # n_class: number of classes; n_train: samples per class
    #     a = clusterAssignment[i * n_train:(i + 1) * n_train, 0]  # label
    #     temp = max([np.sum(a == j) for j in set(a.flat)])  # if res == label
    #     print('temp' + str(temp))
    #     res += temp / n_train
    # res /= n_class
    # samples not in sequential order: group by ground-truth label instead
    res = 0
    for i in range(n_class):  # n_class: number of classes; n_train: samples per class
        a = np.array(
            [clusterAssignment[j, 0] for j in range(clusterAssignment.shape[0]) if train_data_index[j] == i])  # label
        # size of the majority cluster within class i
        temp = max([np.sum(a == j) for j in set(a.flat)])  # if res == label
        print('class ' + str(i) + ' num ' + str(temp) + ' of ' + str(a.size))
        res += temp / a.size
    res /= n_class
    print(res)
    # 2-D scatter of the clustering for visual inspection
    # plotData = train_data
    plotData = pcaProject(oldData[:train_data.shape[0]], dimension=2)
    print('plotData.shape: ', plotData.shape)
    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    for i in range(plotData.shape[0]):
        t = int(clusterAssignment[i, 0])
        # color encodes the cluster assignment, marker the ground-truth class
        ax.scatter(plotData[i, 0], plotData[i, 1], s=30, c=colors[t], marker=markers[int(train_data_index[i])])
    plt.savefig('E:\\最优化方法\\实验\\test'+str(n)+'.png', dpi=1000)
    plt.show()
def linear_test(k=5, fun_name='linear', data_name='orl', kernel_type='linear', n_used_class=None):
    """
    Rotation test of several regression-based classifiers.

    :param k: training samples per class passed to separateData
    :param fun_name: linear, Softmax, USSL, Locality, KRRC
    :param data_name: dataset key passed to get_data
    :param kernel_type: rbf, poly, linear (used by KRRC only)
    :param n_used_class: optionally restrict to the first n classes
    :return: mean recognition accuracy over all n_one_class rotations
    """
    print('In linear regression:\n' + 'Regression type: ' + fun_name)
    print('In ' + data_name)
    print('K = %d' % k)
    # BUG FIX: get_data returns 4 values in this module
    # (data, n_class, n_one_class, label); the original 3-name unpack
    # would raise ValueError at runtime.
    data, n_class, n_one_class, _ = get_data(data_name)
    method = fun_name.lower()
    if method != 'ussl':
        # scale pixel values to [0, 1]; USSL operates on the raw data
        data = data / np.max(data)
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    accuracy = 0
    n_iters = 2000  # gradient-descent iterations (softmax only)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, k, i, n_class)
        res = 0
        if method == 'linear':
            # ridge-regularized normal-equation linear regression
            re1 = Regression()
            re1.train_normal_equation(train_data, train_data_index, k, n_class,
                                      type='ridge', lam=0.01)
            predict1 = re1.normal_equation_predict(test_data)
            res = np.mean(test_data_index == predict1)
        elif method == 'softmax':
            # softmax regression trained by gradient descent
            re1 = Regression()
            W, costs = re1.train_gradient_descent(train_data, train_data_index,
                                                  learning_rate=0.1,
                                                  n_iters=n_iters,
                                                  type='softmax',
                                                  n_class=n_class,
                                                  n_batch=None)
            res = np.mean(test_data_index == re1.softmax_predict(test_data))
        elif method == 'ussl':
            # USSL spectral regression
            re1 = USSL(train_data, train_data_index)
            re1.ussl_fit(type='ussl')
            res = np.mean(test_data_index == re1.classify(test_data))
        elif method == 'locality':
            # locality-weighted regression
            re1 = LocalityRegression(train_data, train_data_index, n_class,
                                     lam=0.005, sigma=1)
            res = np.mean(
                test_data_index == re1.classify(test_data, type='hat'))
        elif method == 'krrc':
            # kernel ridge regression classification
            print('kernel type: ' + kernel_type)
            re1 = KRRC(train_data, train_data_index, n_class, lam=0.005,
                       sigma=3, kernel=kernel_type)
            res = np.mean(test_data_index == re1.classify(test_data))
        print("res %d" % (i + 1))
        print(res)
        accuracy += res
    accuracy /= n_one_class
    print('In ' + fun_name + ' accuracy: ')
    print(accuracy)
    return accuracy