return math.sqrt(z) def minmaxNormalize(dataSet): # 数据映射到[0, 1] dataSet = pd.DataFrame(dataSet) minDf = dataSet.min() maxDf = dataSet.max() normalizedSet = (dataSet - minDf) / (maxDf - minDf) return normalizedSet def zscoreStanderize(dataSet): # 用z-score方法进行数据标准化 dataSet = pd.DataFrame(dataSet) meanDf = dataSet.mean() stdDf = dataSet.std() standerizedSet = (dataSet - meanDf) / stdDf return standerizedSet if __name__ == "__main__": filePath = "F:/2020AI_SummerCamp/dataSet/" # rawData = LoadFile.loadCSV(filePath + "Pima.csv") rawData = LoadFile.loadCSV(filePath + "diabetesN.csv") dataSet, labelSet = LoadFile.splitData(rawData) # dataSet = solveMissingData(dataSet) # dataSet = minmaxNormalize(dataSet) dataSet = zscoreStanderize(dataSet) print(dataSet)
sortResult = sorted(labelsCnt.items(), key=operator.itemgetter(1), reverse=True) return sortResult[0][0] # 根据列表生成式生成切分后的数据集 def splitToFeat(dataSet, labelSet, feat, val): dataSet = np.array(dataSet) labelSet = np.array(labelSet) leftData = dataSet[np.nonzero(dataSet[:, feat] < val)[0]] leftLabel = labelSet[np.nonzero(dataSet[:, feat] < val)[0]] rightData = dataSet[np.nonzero(dataSet[:, feat] >= val)[0]] rightLabel = labelSet[np.nonzero(dataSet[:, feat] >= val)[0]] return leftData, leftLabel, rightData, rightLabel if __name__ == '__main__': # 读入文件 # 使用的是网上找到的一个数据集 filePath = "F:/2020AI_SummerCamp/dataSet/" rawData = LoadFile.loadCSV(filePath + "cartDS.csv") # 预处理 dataSet, labelSet = LoadFile.splitData(rawData) tree = buildTree(dataSet, labelSet) # 测试数据 testVec = np.array([7, 3.2, 4.7, 1.4]) print(classify(tree, testVec))