# Example no. 1
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          n_trails,
          numOfDimensions,
          logPath,
          isLinearSVM=True):
    """Run a differentially-private PCA + SVM experiment over several
    stratified train/test splits, sweeping the number of retained dimensions.

    datasetPath     -- path to a .npy file or a comma-separated text file;
                       column 0 is the label, the remaining columns are features.
    epsilon         -- privacy budget, forwarded unchanged to singleExp.
    varianceRatio   -- fraction of variance the retained PCs must capture.
    n_trails        -- number of stratified shuffle-split rounds.
    numOfDimensions -- requested number of x-axis points (dimension counts).
    logPath         -- per-round result rows are appended to this CSV file.
    isLinearSVM     -- True for a linear SVM, False for an RBF SVM (per singleExp).

    Returns the row-stacked result matrices produced by singleExp.
    """
    # Load either a NumPy binary or a header-less CSV.
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        #data = np.loadtxt(datasetPath, delimiter=",");
        data = pd.read_csv(datasetPath, delimiter=",", header=None).values
    # Standardize the feature columns only (column 0 holds the labels).
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)

    numOfFeature = data.shape[1] - 1
    # Smallest number of PCs capturing varianceRatio of the total variance.
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    xDimensions = None
    if numOfDimensions > numOfFeature:
        # More x-axis points requested than features exist: test every count.
        # NOTE(review): np.arange(1, numOfFeature) stops at numOfFeature - 1,
        # so the full-dimensional case is never tested -- confirm intent.
        xDimensions = np.arange(1, numOfFeature)
        largestReducedFeature = numOfFeature
    else:
        # Evenly spaced dimension counts up to largestReducedFeature;
        # Python 2 integer division yields the (>=1) step size.
        xDimensions = np.arange(
            1, largestReducedFeature,
            max(largestReducedFeature / numOfDimensions, 1))

    cprResult = []
    rs = StratifiedShuffleSplit(n_splits=n_trails,
                                test_size=.2,
                                random_state=0)
    rs.get_n_splits(data[:, 1:], data[:, 0])

    # One singleExp run per stratified split; raw rows are appended to the
    # log file immediately so partial results survive an interrupted run.
    for train_index, test_index in rs.split(data[:, 1:], data[:, 0]):
        trainingData = data[train_index]
        testingData = data[test_index]

        tmpResult = singleExp(xDimensions, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        with open(logPath, "a") as f:
            np.savetxt(f, tmpResult, delimiter=",", fmt='%1.3f')
        cprResult.append(tmpResult)

    cprResult = np.vstack(cprResult)
    for result in cprResult:
        print ','.join(['%.3f' % num for num in result])

    return cprResult
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          numOfRounds,
          numOfPointsinXAxis,
          isLinearSVM=True):
    """Run the samples-per-data-owner sweep: for each shuffle-split round,
    build a grid of sample counts and forward each setting to singleExp
    with a fixed privacy budget.

    datasetPath        -- .npy file or comma-separated text; column 0 is the
                          label, the remaining columns are features.
    epsilon            -- privacy budget forwarded to singleExp.
    varianceRatio      -- fraction of variance the retained PCs must capture.
    numOfRounds        -- number of shuffle-split rounds.
    numOfPointsinXAxis -- requested number of sample-count settings per round.
    isLinearSVM        -- True for a linear SVM, False for an RBF SVM.

    Returns the row-stacked results of all rounds.
    """
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")
    numOfFeature = data.shape[1] - 1
    # Standardize the feature columns only (column 0 holds the labels).
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)

    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    cprResult = None

    #rs = StratifiedShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0);
    #rs.get_n_splits(data[:,1:],data[:,0]);
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)
    rs.get_n_splits(data)
    for train_index, test_index in rs.split(data):
        #for train_index, test_index in rs.split(data[:,1:],data[:,0]):

        trainingData = data[train_index]
        testingData = data[test_index]
        print "number of training samples %d" % trainingData.shape[0]
        #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM));
        #cprResult += tmpResult.get();
        # Sample counts from 2 up to half the training set, in roughly
        # numOfPointsinXAxis evenly spaced steps (Python 2 integer division).
        mostSamplesPerDataOwner = trainingData.shape[0] / 2
        xSamples = np.arange(
            2, mostSamplesPerDataOwner,
            max(mostSamplesPerDataOwner / numOfPointsinXAxis, 1))
        print "number of samples be tested: %s" % xSamples
        tmpResult = singleExp(xSamples, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        # Stack each round's rows beneath the rows of previous rounds.
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)

    for result in cprResult:
        print ','.join(['%.3f' % num for num in result])

    return cprResult
# Example no. 3
def doExp(datasetPath,varianceRatio,numOfRounds):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath);
    else:
        data = np.loadtxt(datasetPath, delimiter=",");

    rs = ShuffleSplit(n_splits=numOfRounds, test_size=2, random_state=0);
    rs.get_n_splits(data);
    globalPCA = PCAImpl(data[:, 1:]);
    numOfFeature = data.shape[1] - 1;
    matrixRank = LA.matrix_rank(data[:, 1:]);

    print "Matrix rank of the data is %d." % matrixRank;
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(varianceRatio);
    print "%d/%d dimensions captures %.2f variance." % (largestReducedFeature, numOfFeature, varianceRatio);

    xEpsilons = np.arange(0.1, 1.1, 0.1);
    # print xDimensions;
    # p = Pool(numOfRounds);
    # allResults = [];
    cprResult = [];
    m = 0;
    for train_index, test_index in rs.split(data):
        print "Trail %d" % m;
        trainingData = data[train_index];
        pureTrainingData = trainingData[:, 1:];
        tmpResult = singleExp(xEpsilons, pureTrainingData, largestReducedFeature);
        cprResult.extend(tmpResult);
        m += 1;
        # print tmpResult.shape;
        # print tmpResult;
        # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature));
        # cprResult += tmpResult.get();
    """
        for i in range(0,len(cprResult)):
            print "%.4f,%.4f,%.4f" % (cprResult[i][0],cprResult[i][1],cprResult[i][2]);
        print "******************************";
    """
    # Compute the average value after numOfRounds experiments.
    # avgCprResult = cprResult/numOfRounds;
    # p.close();
    # p.join();
    for result in cprResult:
        print ','.join(['%.3f' % num for num in result]);

    return np.asarray(cprResult, dtype=float);
# Example no. 4
def singleExp(xEpsilons, pureTrainingData, largestReducedFeature):
    """Collect eigenvalue-energy ratios for plain PCA and for Gaussian- and
    Wishart-noise differentially-private PCA at each privacy budget.

    xEpsilons             -- iterable of epsilon values to sweep.
    pureTrainingData      -- feature matrix (standardized in place here).
    largestReducedFeature -- number of leading components to keep per row.

    Returns an array stacking one plain-PCA row, then one Gaussian row per
    epsilon, then one Wishart row per epsilon.
    """
    sampleCount = pureTrainingData.shape[0]
    # copy=False: standardization mutates the caller's array in place.
    normalizer = StandardScaler(copy=False)
    normalizer.fit(pureTrainingData)
    normalizer.transform(pureTrainingData)
    rank = LA.matrix_rank(pureTrainingData)

    plainPCA = PCAImpl(pureTrainingData)
    gaussianPCA = DiffPrivPCAImpl(pureTrainingData)
    wishartPCA = DiffPrivPCAImpl(pureTrainingData)

    energies = plainPCA.getEigValueEnergies()
    # First row: the noise-free baseline ratios.
    rows = [calcEigRatios(plainPCA.eigValues)[:largestReducedFeature]]
    delta = np.divide(1.0, sampleCount)
    gaussianRows = []
    wishartRows = []
    for targetEpsilon in xEpsilons:
        # Gaussian-noise variant; only the noisy eigenvalues are needed.
        gaussianPCA.setEpsilonAndGamma(targetEpsilon, delta)
        gaussianPCA.getDiffPrivPCs(True, rank, onlyEigvalues=True)
        gaussianRows.append(
            calcEigRatios(gaussianPCA.eigValues)[:largestReducedFeature])
        # Wishart-noise variant.
        wishartPCA.setEpsilonAndGamma(targetEpsilon, delta)
        wishartPCA.getDiffPrivPCs(False, rank, onlyEigvalues=True)
        wishartRows.append(
            calcEigRatios(wishartPCA.eigValues)[:largestReducedFeature])
    rows.extend(gaussianRows)
    rows.extend(wishartRows)
    return np.asarray(rows)
def singleExp(xEpsilons, trainingData, testingData, largestReducedFeature,
              isLinearSVM):
    """Sweep the privacy budget epsilon and compare SVM performance on four
    projections of the data, each reduced to largestReducedFeature
    dimensions: noise-free PCA, Gaussian-DP PCA, Wishart-DP PCA and DPPro
    random projection.

    xEpsilons             -- iterable of epsilon values to sweep.
    trainingData          -- array whose column 0 is the label.
    testingData           -- array whose column 0 is the label.
    largestReducedFeature -- target dimensionality for every projection.
    isLinearSVM           -- True for a linear SVM, False for an RBF SVM.

    Returns an array with one row per epsilon:
    [epsilon, plain-PCA results, Gaussian results, Wishart results, DPPro results].
    """
    pureTrainingData = trainingData[:, 1:]
    trainingLabel = trainingData[:, 0]

    numOfTrainingSamples = trainingData.shape[0]

    pureTestingData = testingData[:, 1:]
    testingLabel = testingData[:, 0]

    # Standardize using statistics from the training split only.
    scaler = StandardScaler()
    # print pureTrainingData[0];
    #scaler.fit(pureTrainingData);
    pureTrainingData = scaler.fit_transform(pureTrainingData)
    # print pureTrainingData[0];

    # print pureTestingData[0];
    pureTestingData = scaler.transform(pureTestingData)
    # print pureTestingData[0];

    pcaImpl = PCAImpl(pureTrainingData)
    pcaImpl.getPCs(largestReducedFeature)

    dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData)
    dpWishartPCAImpl = DiffPrivPCAImpl(pureTrainingData)

    # Conventional delta = 1/n for (epsilon, delta)-differential privacy.
    delta = np.divide(1.0, numOfTrainingSamples)
    projTrainingData = pcaImpl.transform(pureTrainingData,
                                         largestReducedFeature)
    projTestingData = pcaImpl.transform(pureTestingData, largestReducedFeature)
    #print projTrainingData.shape;
    cprResult = []
    print "non noise PCA SVM training"
    # The noise-free baseline does not depend on epsilon, so it is trained
    # once here and its result repeated in every row of the sweep below.
    if isLinearSVM:
        pcaResult = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel,
                                               projTestingData, testingLabel)
    else:
        pcaResult = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                            projTestingData, testingLabel)

    # Shared random projector so every epsilon uses the same DPPro basis.
    randomProjector = GaussianRandomProjection(
        n_components=largestReducedFeature)
    randomProjector.fit(pureTrainingData)

    for k, targetEpsilon in np.ndenumerate(xEpsilons):
        #print pcaImpl.projMatrix[:,0];
        print "epsilon: %.2f, delta: %f" % (targetEpsilon, delta)
        cprResult.append(targetEpsilon)
        isGaussianDist = True
        dpGaussianPCAImpl.setEpsilonAndGamma(targetEpsilon, delta)
        dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature)

        isGaussianDist = False
        dpWishartPCAImpl.setEpsilonAndGamma(targetEpsilon, delta)
        dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature)
        '''
        We don't need to project the data multiple times.
        '''
        cprResult.extend(pcaResult)

        projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData,
                                                       largestReducedFeature)
        projTestingData = dpGaussianPCAImpl.transform(pureTestingData,
                                                      largestReducedFeature)
        print "Gaussian-DPDPCA SVM training"

        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)

        projTrainingData = dpWishartPCAImpl.transform(pureTrainingData,
                                                      largestReducedFeature)
        projTestingData = dpWishartPCAImpl.transform(pureTestingData,
                                                     largestReducedFeature)
        #print projTestingData.shape;
        print "Wishart-DPPCA SVM training"
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)

        projTrainingData, projTestingData = DPPro(
            pureTrainingData,
            pureTestingData,
            largestReducedFeature,
            targetEpsilon,
            randomProjector=randomProjector)
        # print projTestingData.shape;
        print "DPPro SVM training"
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)
    # One flat row was accumulated per epsilon; reshape to (epsilons, metrics).
    cprResult = np.asarray(cprResult)
    return cprResult.reshape((len(xEpsilons), -1))
# Example no. 6
def singleExp(xDimensions, trainingData, testingData, largestReducedFeature,
              epsilon, isLinearSVM):
    """Sweep the target dimensionality at a fixed privacy budget and compare
    SVM performance on four projections: noise-free PCA, Gaussian-DP PCA,
    Wishart-DP PCA and DPPro random projection.

    xDimensions           -- iterable of dimension counts to sweep.
    trainingData          -- array whose column 0 is the label.
    testingData           -- array whose column 0 is the label.
    largestReducedFeature -- number of PCs computed up front; each sweep
                             step projects onto the first targetDimension.
    epsilon               -- fixed privacy budget for the DP variants.
    isLinearSVM           -- True for a linear SVM, False for an RBF SVM.

    Returns an array with one row per dimension count:
    [dimension, plain-PCA results, Gaussian results, Wishart results, DPPro results].
    """
    pureTrainingData = trainingData[:, 1:]
    trainingLabel = trainingData[:, 0]

    pureTestingData = testingData[:, 1:]
    testingLabel = testingData[:, 0]

    # Standardize using statistics from the training split only.
    scaler = StandardScaler()
    # print pureTrainingData[0];
    #scaler.fit(pureTrainingData);
    pureTrainingData = scaler.fit_transform(pureTrainingData)
    # print pureTrainingData[0];

    # print pureTestingData[0];
    pureTestingData = scaler.transform(pureTestingData)
    # print pureTestingData[0];

    cprResult = []
    pcaImpl = PCAImpl(pureTrainingData)

    pcaImpl.getPCs(largestReducedFeature)
    numOfTrainingSamples = trainingData.shape[0]

    # Conventional delta = 1/n for (epsilon, delta)-differential privacy.
    delta = np.divide(1.0, numOfTrainingSamples)
    print "epsilon: %.2f, delta: %f" % (epsilon, delta)

    # The noisy PCs are computed once (epsilon is fixed); the sweep below
    # only varies how many leading components each projection keeps.
    isGaussianDist = True
    dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData)
    dpGaussianPCAImpl.setEpsilonAndGamma(epsilon, delta)
    dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature)

    isGaussianDist = False
    dpWishartPCAImpl = DiffPrivPCAImpl(pureTrainingData)
    dpWishartPCAImpl.setEpsilonAndGamma(epsilon, delta)
    dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature)

    for k, targetDimension in np.ndenumerate(xDimensions):
        #print pcaImpl.projMatrix[:,0];
        #result = SVMModule.SVMClf.rbfSVM(pureTrainingData,trainingLabel,pureTestingData,testingLabel);
        #print k;
        cprResult.append(targetDimension)
        projTrainingData = pcaImpl.transform(pureTrainingData, targetDimension)
        projTestingData = pcaImpl.transform(pureTestingData, targetDimension)
        print "Non-noise PCA %d" % targetDimension
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)

        cprResult.extend(result)

        isGaussianDist = True
        #dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist);
        projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData,
                                                       targetDimension)
        projTestingData = dpGaussianPCAImpl.transform(pureTestingData,
                                                      targetDimension)
        #print projTestingData.shape;
        print "Gaussian-noise PCA %d" % targetDimension
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)

        isGaussianDist = False
        #dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist);
        projTrainingData = dpWishartPCAImpl.transform(pureTrainingData,
                                                      targetDimension)
        projTestingData = dpWishartPCAImpl.transform(pureTestingData,
                                                     targetDimension)
        #print projTestingData.shape;
        print "Wishart-noise PCA %d" % targetDimension
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)

        projTrainingData, projTestingData = DPPro(pureTrainingData,
                                                  pureTestingData,
                                                  targetDimension, epsilon)

        print "DPPro %d" % targetDimension
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData,
                                                trainingLabel, projTestingData,
                                                testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel,
                                             projTestingData, testingLabel)
        cprResult.extend(result)
        """
        for result in cprResult:
            print "%f,%f,%f" % (result[0],result[1],result[2]);
        """
    #print(cprResult);
    # One flat row was accumulated per dimension; reshape to (dims, metrics).
    cprResult = np.asarray(cprResult)
    return cprResult.reshape((len(xDimensions), -1))
def singleExp(xSamples, trainingData, testingData, topK, epsilon, isLinearSVM):
    """Sweep the number of samples held by each simulated data owner and
    compare SVM performance on three projections: noise-free PCA, DPDPCA
    (Wishart-noised covariance) and PrivateLocalPCA.

    xSamples     -- iterable of samples-per-owner counts to sweep.
    trainingData -- array whose column 0 is the label.
    testingData  -- array whose column 0 is the label.
    topK         -- number of principal components for the global models.
    epsilon      -- privacy budget for the DP variants.
    isLinearSVM  -- True for a linear SVM, False for an RBF SVM.

    Returns an array with one row per entry of xSamples:
    [numOfSamples, plain-PCA results, DPDPCA results, PrivateLocalPCA results].
    """
    pureTrainingData = trainingData[:, 1:]
    trainingLabel = trainingData[:, 0]

    pureTestingData = testingData[:, 1:]
    testingLabel = testingData[:, 0]

    # Standardize using statistics from the training split only.
    scaler = StandardScaler()
    #print pureTrainingData[0];
    #scaler.fit(pureTrainingData);
    pureTrainingData = scaler.fit_transform(pureTrainingData)
    #print pureTrainingData[0];

    #print pureTestingData[0];
    pureTestingData = scaler.transform(pureTestingData)
    #print pureTestingData[0];

    # L2-normalize each sample in place after standardization.
    preprocessing.normalize(pureTrainingData, copy=False)
    preprocessing.normalize(pureTestingData, copy=False)

    numOfFeature = trainingData.shape[1] - 1

    pcaImpl = PCAImpl(pureTrainingData)
    pcaImpl.getPCs(topK)
    '''
    To get a Wishart Noise projection matrix.
    '''
    # DPDPCA: add symmetric Wishart noise to the covariance matrix, then
    # take the top-k singular vectors of the noised matrix as the basis.
    WishartNoiseMatrix = DiffPrivImpl.SymmWishart(epsilon, numOfFeature)
    noisyCovMatrix = pcaImpl.covMatrix + WishartNoiseMatrix

    noisyLeftSigVectors, noisyEigValues, noisyProjMatrix = sparse.linalg.svds(
        noisyCovMatrix, k=topK, tol=0.001)
    noisyProjMatrix = np.real(noisyProjMatrix.T)
    """
    projTrainingData2 = np.dot(pureTrainingData, noisyProjMatrix);
    projTestingData2 = np.dot(pureTestingData, noisyProjMatrix);

    print "DPDPCA %d" % topK;
    if isLinearSVM:
        result = SVMModule.SVMClf.linearSVM(projTrainingData2, trainingLabel, projTestingData2, testingLabel);
    else:
        result = SVMModule.SVMClf.rbfSVM(projTrainingData2, trainingLabel, projTestingData2, testingLabel);

    cprResult.append(result[2]);
    """

    cprResult = []

    for k, numOfSamples in np.ndenumerate(xSamples):

        cprResult.append(numOfSamples)
        # Project the data using different projection matrix.
        # NOTE(review): numOfSamples (a samples-per-owner count) is used here
        # and below as a *dimension* count for the PCA/DPDPCA projections,
        # while PrivateLocalPCA below treats it as a sample count -- confirm
        # this mixed usage is intended.
        projTrainingData1 = pcaImpl.transform(pureTrainingData, numOfSamples)
        projTestingData1 = pcaImpl.transform(pureTestingData, numOfSamples)
        print "Non-noise PCA %d" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData1,
                                                trainingLabel,
                                                projTestingData1, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData1, trainingLabel,
                                             projTestingData1, testingLabel)

        cprResult.extend(result)

        projTrainingData2 = np.dot(pureTrainingData,
                                   noisyProjMatrix[:, :numOfSamples])
        projTestingData2 = np.dot(pureTestingData,
                                  noisyProjMatrix[:, :numOfSamples])

        print "DPDPCA %d" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData2,
                                                trainingLabel,
                                                projTestingData2, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData2, trainingLabel,
                                             projTestingData2, testingLabel)

        cprResult.extend(result)

        # Simulate distributed owners each holding numOfSamples samples.
        pgProjMatrix = simulatePrivateGlobalPCA(pureTrainingData, numOfSamples,
                                                topK, epsilon)

        projTrainingData3 = np.dot(pureTrainingData, pgProjMatrix)
        projTestingData3 = np.dot(pureTestingData, pgProjMatrix)

        print "\nPrivateLocalPCA with %d data held by each data owner" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData3,
                                                trainingLabel,
                                                projTestingData3, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData3, trainingLabel,
                                             projTestingData3, testingLabel)
        cprResult.extend(result)

    # One flat row was accumulated per setting; reshape to (settings, metrics).
    resultArray = np.asarray(cprResult)
    resultArray = np.reshape(resultArray, (len(xSamples), -1))
    return resultArray
def singleExp(xEpsilons, trainingData, testingData, largestReducedFeature):
    pureTrainingData = trainingData[:, 1:]
    trainingLabel = trainingData[:, 0]

    numOfTrainingSamples = trainingData.shape[0]

    pureTestingData = testingData[:, 1:]
    testingLabel = testingData[:, 0]

    scaler = StandardScaler()
    # print pureTrainingData[0];
    # scaler.fit(pureTrainingData);
    pureTrainingData = scaler.fit_transform(pureTrainingData)
    # print pureTrainingData[0];

    # print pureTestingData[0];
    pureTestingData = scaler.transform(pureTestingData)
    # print pureTestingData[0];

    pcaImpl = PCAImpl(pureTrainingData)
    pcaImpl.getPCs(largestReducedFeature)

    projTrainingData = pcaImpl.transform(pureTrainingData,
                                         largestReducedFeature)
    projTestingData = pcaImpl.transform(pureTestingData, largestReducedFeature)
    pcaResult = fit_MLP(projTrainingData,
                        trainingLabel,
                        projTestingData,
                        testingLabel,
                        n_classes=40)

    dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData)
    delta = np.divide(1.0, numOfTrainingSamples)

    # print projTrainingData.shape;
    cprResult = []
    print "non noise PCA NN training"

    for k, targetEpsilon in np.ndenumerate(xEpsilons):
        #print pcaImpl.projMatrix[:,0];
        #print k;
        print "epsilon: %.2f, delta: %f" % (targetEpsilon, delta)
        cprResult.append(targetEpsilon)
        isGaussianDist = True
        dpGaussianPCAImpl.setEpsilonAndGamma(targetEpsilon, delta)
        dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature)

        cprResult.append(pcaResult)

        projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData,
                                                       largestReducedFeature)
        projTestingData = dpGaussianPCAImpl.transform(pureTestingData,
                                                      largestReducedFeature)

        result = fit_MLP(projTrainingData,
                         trainingLabel,
                         projTestingData,
                         testingLabel,
                         n_classes=40)

        cprResult.append(result)

    cprResult = np.asarray(cprResult)
    return cprResult.reshape((len(xEpsilons), -1))