def getStandardedMatData(rawData):
    """Split raw data into features and labels, convert both to numpy
    matrices, and standardize the feature matrix.

    Returns (featureMat, labelMat) where labelMat is a column matrix.
    """
    features, labels = LoadFile.splitData(rawData)
    featureMat = np.mat(features)
    labelMat = np.mat(labels).T
    # Only the features are standardized; labels are passed through as-is.
    featureMat = standardize(featureMat)
    return featureMat, labelMat
def kFoldCrossValidation(rawData, processFunc, k):
    """Perform k-fold cross validation and return the model with the
    lowest test error across the k folds.

    rawData     -- tabular data; converted to a DataFrame internally
    processFunc -- accepted but currently unused; kept for interface
                   compatibility. NOTE(review): presumably meant to
                   preprocess each fold — confirm with callers.
    k           -- number of folds; the last fold absorbs remainder rows

    Relies on module-level helpers LoadFile.splitData, getModel and
    testModel (defined elsewhere in the project).
    """
    rawData = pd.DataFrame(rawData)
    n = np.shape(rawData)[0]
    step = n // k
    bestModel = None
    # Bug fix: was initialized to 0, so `err < leastErr` never fired for
    # non-negative errors and no model was ever selected.
    leastErr = float("inf")
    for i in range(k):
        if i == (k - 1):
            # Last fold takes rows step*i..n so remainder rows are not lost.
            testData = rawData.iloc[step * i: n, :]
            trainData = rawData.iloc[: step * i, :]
        else:
            testData = rawData.iloc[step * i: step * (i + 1), :]
            # pd.concat replaces DataFrame.append, removed in pandas 2.0.
            trainData = pd.concat([rawData.iloc[: step * i, :],
                                   rawData.iloc[step * (i + 1):]])
        testData, testLabel = LoadFile.splitData(testData)
        trainData, trainLabel = LoadFile.splitData(trainData)
        model = getModel(trainData, trainLabel)
        err = testModel(model, testData, testLabel)
        if err < leastErr:
            leastErr = err
            bestModel = model
    # Bug fix: was `return model` (the last fold's model, regardless of
    # error); return the best-scoring model instead.
    return bestModel
return math.sqrt(z) def minmaxNormalize(dataSet): # 数据映射到[0, 1] dataSet = pd.DataFrame(dataSet) minDf = dataSet.min() maxDf = dataSet.max() normalizedSet = (dataSet - minDf) / (maxDf - minDf) return normalizedSet def zscoreStanderize(dataSet): # 用z-score方法进行数据标准化 dataSet = pd.DataFrame(dataSet) meanDf = dataSet.mean() stdDf = dataSet.std() standerizedSet = (dataSet - meanDf) / stdDf return standerizedSet if __name__ == "__main__": filePath = "F:/2020AI_SummerCamp/dataSet/" # rawData = LoadFile.loadCSV(filePath + "Pima.csv") rawData = LoadFile.loadCSV(filePath + "diabetesN.csv") dataSet, labelSet = LoadFile.splitData(rawData) # dataSet = solveMissingData(dataSet) # dataSet = minmaxNormalize(dataSet) dataSet = zscoreStanderize(dataSet) print(dataSet)