예제 #1
0
def main(filename, scaler, MAXFEATURENUM, **svmParameter):
    # 获取基本参数
    kernel = svmParameter["kernel"]
    C = svmParameter["C"]
    gamma = svmParameter["gamma"]

    # 读取数据,Data1一定是多数类
    dataSet1, labels1, dataSet2, labels2 = basef.readData(filename, scaler)

    # 鲁棒测试开关
    flag = 0
    if flag == 1:
        pickoutArray = range(len(labels1))
        pickoutNum = round(len(labels1) * 0.1)
        if len(labels2) < pickoutNum:
            dataSet2 = dataSet2[:]
            labels2 = labels2[:]
        else:
            PICKdata = []
            PICKlabels = []
            x = 0
            for i in pickoutArray:
                if random.randrange(0, 2) == 1 and x <= pickoutNum:
                    PICKdata.append(dataSet2[i])
                    PICKlabels.append(labels2[i])
                x += 1
            dataSet2 = np.array(PICKdata[:])
            labels2 = np.array(PICKlabels[:])

    # print "step 2: 预处理数据..."
    # print "\tstep2.1: 计算指标信息增益"
    initArray = en.featureSample(dataSet1, labels1, dataSet2, labels2, MAXFEATURENUM)
    print "初始化指标:", initArray

    preTestX = np.vstack([dataSet1, dataSet2])
    preLabels = np.hstack([labels1, labels2])

    # print "\tstep2.2: 平衡数据"
    # Todo:此处可以直接调用遗传算法模块了 -- By nevin47
    data = (dataSet1, labels1, dataSet2, labels2, initArray)
    arrayValue = [0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 1]
    train_X, train_label = Hyper.balanceDataforGA(arrayValue, *data)

    # print "step 3: 训练..."
    clf = HSVM.trainSVM(train_X, train_label, kernel=kernel, C=C, gamma=gamma)

    # print "step 4: 测试..."
    pre = HSVM.testSVM(preTestX, clf)
    proba = HSVM.testSVMwithProb(preTestX, clf)
    tt = np.array(preLabels, dtype="float64")

    print "Final G-means:", basef.testSample(pre, tt)
    print "\nPRO:", proba, "\n"
    print "\nPRE:", pre, "\n"
    print basef.testSampleShow(pre, tt)
    return basef.testSampleShow(pre, tt)
예제 #2
0
def main(filename, testfilename, scaler, MAXFEATURENUM, **svmParameter):
    """Train on `filename` and predict the samples stored in `testfilename`.

    Unlike an evaluation on the training data, the samples to classify are
    read separately through `basef.readTestData`, and no score is computed.

    Args:
        filename: Training data path handed to `basef.readData`.
        testfilename: Path handed to `basef.readTestData`.
        scaler: Forwarded unchanged to both reader functions.
        MAXFEATURENUM: Forwarded unchanged to `en.featureSample`.
        **svmParameter: Must contain 'kernel', 'C' and 'gamma'; they are
            forwarded to `HSVM.trainSVM`.

    Returns:
        A tuple `(proba, pre, initArray)`: the result of
        `HSVM.testSVMwithProb`, the result of `HSVM.testSVM`, and the value
        returned by `en.featureSample`.

    Raises:
        KeyError: If 'kernel', 'C' or 'gamma' is missing from `svmParameter`.
    """
    # Required SVM hyper-parameters, looked up in the original order.
    kernel, C, gamma = [svmParameter[key] for key in ('kernel', 'C', 'gamma')]

    # Training data arrives as two (samples, labels) subsets.
    firstX, firstY, secondX, secondY = basef.readData(filename, scaler)
    initArray = en.featureSample(firstX, firstY, secondX, secondY, MAXFEATURENUM)

    # Samples to classify come from their own file, not from the training set.
    preTestX = basef.readTestData(testfilename, scaler)

    # TODO(nevin47): the genetic-algorithm module can be called directly here.
    arrayValue = [0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 1]
    train_X, train_label = Hyper.balanceDataforGA(
        arrayValue, firstX, firstY, secondX, secondY, initArray)

    clf = HSVM.trainSVM(train_X, train_label, kernel=kernel, C=C, gamma=gamma)

    pre = HSVM.testSVM(preTestX, clf)
    proba = HSVM.testSVMwithProb(preTestX, clf)
    return proba, pre, initArray

# if __name__ == "__main__":
#     # demo
#     filename1 = '/Users/nevin47/Desktop/Project/Academic/Code/Python/SVM/UnbalancedDataSVM/DataSet/test/wpbc.csv' # 设置读取文件
#     filename = '/Users/nevin47/Desktop/Project/Academic/Code/Python/SVM/UnbalancedDataSVM/DataSet/CreditOriginData2.csv'
#     # filename = '/Users/nevin47/Desktop/Project/Academic/Code/Python/SVM/UnbalancedDataSVM/DataSet/Heart2.csv'
#     scaler = 1 # 决定是否归一化数据
#     MAXFEATURENUM = 5 # 设置指标离散最大值
#     SUMG = []
#     SUMF = []
#     for i in range(30):
#         tempG,tempF = main(filename, scaler, MAXFEATURENUM, kernel='rbf', C=15.0, gamma= 1)
#         SUMG.append(tempG)
#         SUMF.append(tempF)
#     print "AVG-G: %f,AVG-F %f",sum(SUMG)/30.0,sum(SUMF)/30.0
#
# # test
#
# # print dataSet1
예제 #3
0
def main(filename, scaler, MAXFEATURENUM, **svmParameter):
    # 获取基本参数
    kernel = svmParameter['kernel']
    C = svmParameter['C']
    gamma = svmParameter['gamma']

    print "step 1: 读取数据..."


    cloudReader = csv.reader(file("/Users/nevin47/Desktop/Project/Academic/Code/Python/SVM/cloudSVM/Dataset/cloud.csv",'rb'))
    Cloud = []
    Origin = []

    for i in cloudReader:
        Cloud.append(i)

    originReader =  csv.reader(file("/Users/nevin47/Desktop/Project/Academic/Code/Python/SVM/cloudSVM/Dataset/Origin.csv",'rb'))

    for j in originReader:
        Origin.append(j)
    Cloud = np.array(Cloud)
    Origin = np.array(Origin,dtype='float64')

    for i in range(10):
        testSample = Origin[:,i]

        # print testSample
        lowIndex = []
        highIndex = []
        x = 0
        for index,score in enumerate(testSample):
            if score > 3:
                highIndex.append(index)
                x += 1
            elif score < 3 and score != 0:
                lowIndex.append(index)

        highCloud = []
        lowCloud = []
        # 导入云数据
        for m in lowIndex:
            lowCloud.append(Cloud[m])
        for n in highIndex:
            highCloud.append(Cloud[n])

        highCloud = np.array(highCloud)
        lowCloud = np.array(lowCloud)
        if len(highCloud) > len(lowCloud):
            highlabels = [1 for i in range(len(highCloud))]
            lowlabels = [-1 for i in range(len(lowCloud))]
            dataSet1, labels1, dataSet2, labels2 = highCloud,highlabels,lowCloud,lowlabels
        else:
            highlabels = [-1 for i in range(len(highCloud))]
            lowlabels = [1 for i in range(len(lowCloud))]
            dataSet1, labels1, dataSet2, labels2 = lowCloud,lowlabels,highCloud,highlabels

        initArray = en.featureSample(dataSet1, labels1, dataSet2, labels2, MAXFEATURENUM)


        preTestX = np.vstack([dataSet1,dataSet2])
        preLabels = np.hstack([labels1,labels2])
        # GMM = []
        # for i in range(500):
        #     ii = i + 1
        #     tempi = ii/500.0
        #     arrayValue = [0.1,0.1,1,0.1]
        #     arrayValue[1] = tempi
        #     print "step2.2: 平衡数据"
        #     data = (dataSet1, labels1, dataSet2, labels2, initArray)
        #
        #     train_X, train_label = Hyper.balanceDataforGA(arrayValue,*data)
        #
        #     print "step 3: 训练..."
        #     clf = HSVM.trainSVM(train_X, train_label, kernel=kernel, C=C, gamma= gamma)
        #
        #     print "step 4: 测试..."
        #     pre = HSVM.testSVM(preTestX , clf)
        #     tt = np.array(preLabels, dtype="float64")
        #     Gm = basef.testSample(pre, tt)
        #     GMM.append(Gm)
        # pl.plot(range(500),GMM)
        # pl.show()


        print "\tstep2.2: 平衡数据"
        # Todo:此处可以直接调用遗传算法模块了 -- By nevin47
        data = (dataSet1, labels1, dataSet2, labels2, initArray)
        arrayValue = [1,1,1,1,1,1,1]
        train_X, train_label = Hyper.balanceDataforGA(arrayValue,*data)
        # train_X, train_label = Hyper.balanceData(dataSet1, labels1, dataSet2, labels2, initArray)

        print "step 3: 训练..."
        clf = HSVM.trainSVM(train_X, train_label, kernel=kernel, C=C, gamma= gamma)

        print "step 4: 测试..."
        pre = HSVM.testSVM(preTestX , clf)
        trainLabel = np.array(preLabels, dtype="float64")

        #######
        testCount = co.Counter(trainLabel)
        allN = testCount[1]
        allP = testCount[-1]
        testResult = trainLabel - pre
        FN = 0
        FP = 0
        for index, num in enumerate(testResult):
            if (num == 2):
                FN += 1
            elif (num == -2):
                FP += 1
        TP = allP - FP
        TN = allN - FN
        if (TP+FN) == 0:
            TPR = 0
        else:
            TPR = float(TP) / (TP + FN)
        TNR = float(TN) / (TN + FP)
        precision = float(TP) / (TP + FP)
        Gmeans = math.sqrt(TPR * TNR)
        if (precision+TPR) == 0:
            Fmeasure = 0
        else:
            Fmeasure = 2 * TPR * precision / (TPR + precision)
        print Gmeans,Fmeasure