예제 #1
0
    return math.sqrt(z)


def minmaxNormalize(dataSet):
    """Rescale every column of ``dataSet`` linearly onto the range [0, 1].

    Args:
        dataSet: Anything ``pd.DataFrame`` accepts (list of rows, ndarray,
            DataFrame, ...). The input is wrapped in a DataFrame first.

    Returns:
        A ``pd.DataFrame`` of the same shape where each column has been
        transformed as ``(x - min) / (max - min)``. A column whose values
        are all equal (zero range) is mapped to 0 instead of NaN.
    """
    dataSet = pd.DataFrame(dataSet)
    minDf = dataSet.min()
    rangeDf = dataSet.max() - minDf
    # A constant column has max == min; dividing by that zero range would
    # produce 0/0 = NaN for the whole column. Divide by 1 instead so the
    # (already zero) numerator yields 0.
    rangeDf = rangeDf.mask(rangeDf == 0, 1)
    normalizedSet = (dataSet - minDf) / rangeDf
    return normalizedSet


def zscoreStanderize(dataSet):
    """Standardize every column of ``dataSet`` with the z-score method.

    Args:
        dataSet: Anything ``pd.DataFrame`` accepts (list of rows, ndarray,
            DataFrame, ...). The input is wrapped in a DataFrame first.

    Returns:
        A ``pd.DataFrame`` of the same shape where each column has been
        transformed as ``(x - mean) / std``, using the column mean and the
        column standard deviation as computed by ``DataFrame.std()``. A
        column whose standard deviation is exactly 0 is mapped to 0
        instead of NaN.
    """
    dataSet = pd.DataFrame(dataSet)
    meanDf = dataSet.mean()
    stdDf = dataSet.std()
    # A constant column has std == 0; dividing by it would produce
    # 0/0 = NaN for the whole column. Divide by 1 instead so the (already
    # zero) centered values stay 0.
    stdDf = stdDf.mask(stdDf == 0, 1)
    standerizedSet = (dataSet - meanDf) / stdDf
    return standerizedSet


if __name__ == "__main__":
    # Script entry: load a CSV through LoadFile, split it with
    # LoadFile.splitData, then print the z-score standardized data set.
    filePath = "F:/2020AI_SummerCamp/dataSet/"
    # Alternative input file used during experiments: filePath + "Pima.csv"
    dataSet, labelSet = LoadFile.splitData(
        LoadFile.loadCSV(filePath + "diabetesN.csv")
    )
    # Other preprocessing steps that can be swapped in at this point:
    # solveMissingData(dataSet) or minmaxNormalize(dataSet).
    print(zscoreStanderize(dataSet))
예제 #2
0
    sortResult = sorted(labelsCnt.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
    return sortResult[0][0]


# Split the data set and its labels in two by thresholding one feature column (NumPy index selection)
def splitToFeat(dataSet, labelSet, feat, val):
    """Partition samples and labels by comparing one feature to a threshold.

    Args:
        dataSet: 2-D collection of samples; converted with ``np.array``.
        labelSet: Labels aligned with the rows of ``dataSet``; converted
            with ``np.array``.
        feat: Column index of the feature to compare.
        val: Threshold value.

    Returns:
        A 4-tuple ``(leftData, leftLabel, rightData, rightLabel)`` where the
        "left" arrays hold the rows with ``dataSet[:, feat] < val`` and the
        "right" arrays hold the rows with ``dataSet[:, feat] >= val``.
    """
    samples = np.array(dataSet)
    targets = np.array(labelSet)

    # Evaluate the feature column once and derive the row indices per side.
    column = samples[:, feat]
    belowRows = np.nonzero(column < val)[0]
    restRows = np.nonzero(column >= val)[0]

    return (
        samples[belowRows],
        targets[belowRows],
        samples[restRows],
        targets[restRows],
    )


if __name__ == '__main__':
    # Load the CSV file (per the original author's note, a data set found
    # online).
    dataDir = "F:/2020AI_SummerCamp/dataSet/"
    rows = LoadFile.loadCSV(dataDir + "cartDS.csv")

    # Preprocess: split the loaded rows, then build the tree from both parts.
    features, labels = LoadFile.splitData(rows)
    tree = buildTree(features, labels)

    # Classify one hand-written test vector and print the result.
    sample = np.array([7, 3.2, 4.7, 1.4])
    prediction = classify(tree, sample)
    print(prediction)