def getStandardedMatData(rawData):
    """Split raw data into features and labels and return both as matrices.

    The raw data is first separated by ``LoadFile.splitData``; the feature
    part is wrapped in ``np.mat`` and then passed through ``standardize``,
    while the label part is wrapped in ``np.mat`` and transposed.

    Args:
        rawData: Input accepted by ``LoadFile.splitData``.

    Returns:
        A ``(dataSet, labelSet)`` tuple: the result of ``standardize`` applied
        to the feature matrix, and the transposed label matrix.
    """
    # NOTE(review): ``standardize`` is defined outside this block; presumably
    # it rescales each feature column -- confirm against its definition.
    features, labels = LoadFile.splitData(rawData)
    featureMat = np.mat(features)
    # Assumes the labels come back as a flat sequence, so the transpose turns
    # the resulting row matrix into a column -- TODO confirm.
    labelMat = np.mat(labels).T
    return standardize(featureMat), labelMat
def kFoldCrossValidation(rawData, processFunc, k):
    """Return the model with the lowest test error over k contiguous folds.

    The rows of ``rawData`` are cut, in their existing order and without
    shuffling, into ``k`` folds of ``len(rawData) // k`` rows each; the last
    fold additionally takes the leftover rows. Every fold serves once as the
    test set while all remaining rows form the training set.

    Args:
        rawData: Table-like data accepted by ``pd.DataFrame``. Each fold is
            handed to ``LoadFile.splitData`` to separate features and labels.
        processFunc: Currently unused; kept so existing callers keep working.
        k: Number of folds; must be at least 1.

    Returns:
        The model returned by ``getModel`` whose ``testModel`` error was the
        lowest across all folds. The earliest fold wins ties.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    rawData = pd.DataFrame(rawData)
    n = np.shape(rawData)[0]
    step = n // k
    bestModel = None
    # Start from +inf so the first evaluated fold always becomes the incumbent
    # (starting from 0 meant no non-negative error could ever be selected).
    leastErr = float("inf")
    for i in range(k):
        start = step * i
        if i == (k - 1):
            # The last fold absorbs the n % k rows left over by the division.
            testData = rawData.iloc[start:n, :]
            trainData = rawData.iloc[:start, :]
        else:
            stop = step * (i + 1)
            testData = rawData.iloc[start:stop, :]
            # DataFrame.append was removed in pandas 2.0; pd.concat keeps the
            # same row order and index labels.
            trainData = pd.concat(
                [rawData.iloc[:start, :], rawData.iloc[stop:]]
            )
        testData, testLabel = LoadFile.splitData(testData)
        trainData, trainLabel = LoadFile.splitData(trainData)
        model = getModel(trainData, trainLabel)
        err = testModel(model, testData, testLabel)
        # ``bestModel is None`` guarantees a model is returned even when the
        # error is not comparable (e.g. NaN) on every fold.
        if bestModel is None or err < leastErr:
            leastErr = err
            bestModel = model
    return bestModel
Beispiel #3
0
    return math.sqrt(z)


def minmaxNormalize(dataSet):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A constant column (max equals min) has no range to scale by; it is mapped
    to 0 instead of the NaN that a 0/0 division would produce.

    Args:
        dataSet: Table-like data accepted by ``pd.DataFrame``.

    Returns:
        A ``pd.DataFrame`` of the same shape holding the rescaled values.
    """
    dataSet = pd.DataFrame(dataSet)
    minDf = dataSet.min()
    rangeDf = dataSet.max() - minDf
    # Dividing by a zero range would turn the whole column into NaN; use 1 so
    # the (all-zero) numerator passes through unchanged.
    rangeDf = rangeDf.mask(rangeDf == 0, 1)
    normalizedSet = (dataSet - minDf) / rangeDf
    return normalizedSet


def zscoreStanderize(dataSet):
    """Standardize every column of ``dataSet`` with the z-score method.

    Each value becomes ``(value - column_mean) / column_std`` where the
    standard deviation is the pandas default (sample standard deviation).
    A constant column has a standard deviation of 0; it is mapped to 0
    instead of the NaN that a 0/0 division would produce.

    Args:
        dataSet: Table-like data accepted by ``pd.DataFrame``.

    Returns:
        A ``pd.DataFrame`` of the same shape holding the standardized values.
    """
    dataSet = pd.DataFrame(dataSet)
    meanDf = dataSet.mean()
    stdDf = dataSet.std()
    # Dividing by a zero deviation would turn the whole column into NaN; use 1
    # so the (all-zero) numerator passes through unchanged.
    stdDf = stdDf.mask(stdDf == 0, 1)
    standerizedSet = (dataSet - meanDf) / stdDf
    return standerizedSet


if __name__ == "__main__":
    # Demo run: load one CSV data set, separate features from labels, and
    # print the z-score standardized feature table.
    filePath = "F:/2020AI_SummerCamp/dataSet/"
    csvName = "diabetesN.csv"  # the other available data set is "Pima.csv"
    rawData = LoadFile.loadCSV(filePath + csvName)
    dataSet, labelSet = LoadFile.splitData(rawData)
    # Alternative preprocessing steps, currently disabled:
    #   dataSet = solveMissingData(dataSet)
    #   dataSet = minmaxNormalize(dataSet)
    dataSet = zscoreStanderize(dataSet)
    print(dataSet)