def doExp(datasetPath, epsilon, varianceRatio, n_trails, numOfDimensions, logPath, isLinearSVM=True): if os.path.basename(datasetPath).endswith('npy'): data = np.load(datasetPath) else: #data = np.loadtxt(datasetPath, delimiter=","); data = pd.read_csv(datasetPath, delimiter=",", header=None).values scaler = StandardScaler() data_std = scaler.fit_transform(data[:, 1:]) globalPCA = PCAImpl(data_std) numOfFeature = data.shape[1] - 1 largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance( varianceRatio) print "%d/%d dimensions captures %.2f variance." % ( largestReducedFeature, numOfFeature, varianceRatio) xDimensions = None if numOfDimensions > numOfFeature: xDimensions = np.arange(1, numOfFeature) largestReducedFeature = numOfFeature else: xDimensions = np.arange( 1, largestReducedFeature, max(largestReducedFeature / numOfDimensions, 1)) cprResult = [] rs = StratifiedShuffleSplit(n_splits=n_trails, test_size=.2, random_state=0) rs.get_n_splits(data[:, 1:], data[:, 0]) for train_index, test_index in rs.split(data[:, 1:], data[:, 0]): trainingData = data[train_index] testingData = data[test_index] tmpResult = singleExp(xDimensions, trainingData, testingData, largestReducedFeature, epsilon, isLinearSVM) with open(logPath, "a") as f: np.savetxt(f, tmpResult, delimiter=",", fmt='%1.3f') cprResult.append(tmpResult) cprResult = np.vstack(cprResult) for result in cprResult: print ','.join(['%.3f' % num for num in result]) return cprResult
def doExp(datasetPath, epsilon, varianceRatio, numOfRounds, numOfPointsinXAxis, isLinearSVM=True): if os.path.basename(datasetPath).endswith('npy'): data = np.load(datasetPath) else: data = np.loadtxt(datasetPath, delimiter=",") numOfFeature = data.shape[1] - 1 scaler = StandardScaler() data_std = scaler.fit_transform(data[:, 1:]) globalPCA = PCAImpl(data_std) largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance( varianceRatio) print "%d/%d dimensions captures %.2f variance." % ( largestReducedFeature, numOfFeature, varianceRatio) cprResult = None #rs = StratifiedShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0); #rs.get_n_splits(data[:,1:],data[:,0]); rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0) rs.get_n_splits(data) for train_index, test_index in rs.split(data): #for train_index, test_index in rs.split(data[:,1:],data[:,0]): trainingData = data[train_index] testingData = data[test_index] print "number of training samples %d" % trainingData.shape[0] #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM)); #cprResult += tmpResult.get(); mostSamplesPerDataOwner = trainingData.shape[0] / 2 xSamples = np.arange( 2, mostSamplesPerDataOwner, max(mostSamplesPerDataOwner / numOfPointsinXAxis, 1)) print "number of samples be tested: %s" % xSamples tmpResult = singleExp(xSamples, trainingData, testingData, largestReducedFeature, epsilon, isLinearSVM) if cprResult is None: cprResult = tmpResult else: cprResult = np.concatenate((cprResult, tmpResult), axis=0) for result in cprResult: print ','.join(['%.3f' % num for num in result]) return cprResult
def doExp(datasetPath,varianceRatio,numOfRounds): if os.path.basename(datasetPath).endswith('npy'): data = np.load(datasetPath); else: data = np.loadtxt(datasetPath, delimiter=","); rs = ShuffleSplit(n_splits=numOfRounds, test_size=2, random_state=0); rs.get_n_splits(data); globalPCA = PCAImpl(data[:, 1:]); numOfFeature = data.shape[1] - 1; matrixRank = LA.matrix_rank(data[:, 1:]); print "Matrix rank of the data is %d." % matrixRank; largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(varianceRatio); print "%d/%d dimensions captures %.2f variance." % (largestReducedFeature, numOfFeature, varianceRatio); xEpsilons = np.arange(0.1, 1.1, 0.1); # print xDimensions; # p = Pool(numOfRounds); # allResults = []; cprResult = []; m = 0; for train_index, test_index in rs.split(data): print "Trail %d" % m; trainingData = data[train_index]; pureTrainingData = trainingData[:, 1:]; tmpResult = singleExp(xEpsilons, pureTrainingData, largestReducedFeature); cprResult.extend(tmpResult); m += 1; # print tmpResult.shape; # print tmpResult; # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature)); # cprResult += tmpResult.get(); """ for i in range(0,len(cprResult)): print "%.4f,%.4f,%.4f" % (cprResult[i][0],cprResult[i][1],cprResult[i][2]); print "******************************"; """ # Compute the average value after numOfRounds experiments. # avgCprResult = cprResult/numOfRounds; # p.close(); # p.join(); for result in cprResult: print ','.join(['%.3f' % num for num in result]); return np.asarray(cprResult, dtype=float);
def singleExp(xEpsilons, pureTrainingData, largestReducedFeature):
    """Compare eigenvalue-energy ratios of plain PCA against Gaussian- and
    Wishart-noise differentially private PCA for each epsilon in xEpsilons.

    Returns an ndarray whose first row is the non-private ratio vector
    (truncated to largestReducedFeature entries), followed by one Gaussian
    row per epsilon and then one Wishart row per epsilon.

    NOTE: standardizes pureTrainingData in place (StandardScaler(copy=False)),
    mutating the caller's array.
    """
    sampleCount = pureTrainingData.shape[0]
    normalizer = StandardScaler(copy=False)
    normalizer.fit(pureTrainingData)
    normalizer.transform(pureTrainingData)

    rank = LA.matrix_rank(pureTrainingData)
    pcaImpl = PCAImpl(pureTrainingData)
    gaussianPCA = DiffPrivPCAImpl(pureTrainingData)
    wishartPCA = DiffPrivPCAImpl(pureTrainingData)
    # Call kept for parity with the original flow (return value unused).
    pcaImpl.getEigValueEnergies()

    # delta = 1/n is the standard (epsilon, delta)-DP failure probability choice.
    delta = np.divide(1.0, sampleCount)

    cprResult = [calcEigRatios(pcaImpl.eigValues)[:largestReducedFeature]]
    gaussianRows = []
    wishartRows = []
    for _, targetEpsilon in np.ndenumerate(xEpsilons):
        gaussianPCA.setEpsilonAndGamma(targetEpsilon, delta)
        gaussianPCA.getDiffPrivPCs(True, rank, onlyEigvalues=True)
        gaussianRows.append(calcEigRatios(gaussianPCA.eigValues)[:largestReducedFeature])

        wishartPCA.setEpsilonAndGamma(targetEpsilon, delta)
        wishartPCA.getDiffPrivPCs(False, rank, onlyEigvalues=True)
        wishartRows.append(calcEigRatios(wishartPCA.eigValues)[:largestReducedFeature])

    # All Gaussian rows first, then all Wishart rows, matching the original layout.
    cprResult.extend(gaussianRows)
    cprResult.extend(wishartRows)
    return np.asarray(cprResult)
def singleExp(xEpsilons, trainingData, testingData, largestReducedFeature, isLinearSVM): pureTrainingData = trainingData[:, 1:] trainingLabel = trainingData[:, 0] numOfTrainingSamples = trainingData.shape[0] pureTestingData = testingData[:, 1:] testingLabel = testingData[:, 0] scaler = StandardScaler() # print pureTrainingData[0]; #scaler.fit(pureTrainingData); pureTrainingData = scaler.fit_transform(pureTrainingData) # print pureTrainingData[0]; # print pureTestingData[0]; pureTestingData = scaler.transform(pureTestingData) # print pureTestingData[0]; pcaImpl = PCAImpl(pureTrainingData) pcaImpl.getPCs(largestReducedFeature) dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData) dpWishartPCAImpl = DiffPrivPCAImpl(pureTrainingData) delta = np.divide(1.0, numOfTrainingSamples) projTrainingData = pcaImpl.transform(pureTrainingData, largestReducedFeature) projTestingData = pcaImpl.transform(pureTestingData, largestReducedFeature) #print projTrainingData.shape; cprResult = [] print "non noise PCA SVM training" if isLinearSVM: pcaResult = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: pcaResult = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) randomProjector = GaussianRandomProjection( n_components=largestReducedFeature) randomProjector.fit(pureTrainingData) for k, targetEpsilon in np.ndenumerate(xEpsilons): #print pcaImpl.projMatrix[:,0]; print "epsilon: %.2f, delta: %f" % (targetEpsilon, delta) cprResult.append(targetEpsilon) isGaussianDist = True dpGaussianPCAImpl.setEpsilonAndGamma(targetEpsilon, delta) dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature) isGaussianDist = False dpWishartPCAImpl.setEpsilonAndGamma(targetEpsilon, delta) dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature) ''' We don't need to project the data multiple times. 
''' cprResult.extend(pcaResult) projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData, largestReducedFeature) projTestingData = dpGaussianPCAImpl.transform(pureTestingData, largestReducedFeature) print "Gaussian-DPDPCA SVM training" if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) projTrainingData = dpWishartPCAImpl.transform(pureTrainingData, largestReducedFeature) projTestingData = dpWishartPCAImpl.transform(pureTestingData, largestReducedFeature) #print projTestingData.shape; print "Wishart-DPPCA SVM training" if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) projTrainingData, projTestingData = DPPro( pureTrainingData, pureTestingData, largestReducedFeature, targetEpsilon, randomProjector=randomProjector) # print projTestingData.shape; print "DPPro SVM training" if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) cprResult = np.asarray(cprResult) return cprResult.reshape((len(xEpsilons), -1))
def singleExp(xDimensions, trainingData, testingData, largestReducedFeature, epsilon, isLinearSVM): pureTrainingData = trainingData[:, 1:] trainingLabel = trainingData[:, 0] pureTestingData = testingData[:, 1:] testingLabel = testingData[:, 0] scaler = StandardScaler() # print pureTrainingData[0]; #scaler.fit(pureTrainingData); pureTrainingData = scaler.fit_transform(pureTrainingData) # print pureTrainingData[0]; # print pureTestingData[0]; pureTestingData = scaler.transform(pureTestingData) # print pureTestingData[0]; cprResult = [] pcaImpl = PCAImpl(pureTrainingData) pcaImpl.getPCs(largestReducedFeature) numOfTrainingSamples = trainingData.shape[0] delta = np.divide(1.0, numOfTrainingSamples) print "epsilon: %.2f, delta: %f" % (epsilon, delta) isGaussianDist = True dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData) dpGaussianPCAImpl.setEpsilonAndGamma(epsilon, delta) dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature) isGaussianDist = False dpWishartPCAImpl = DiffPrivPCAImpl(pureTrainingData) dpWishartPCAImpl.setEpsilonAndGamma(epsilon, delta) dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature) for k, targetDimension in np.ndenumerate(xDimensions): #print pcaImpl.projMatrix[:,0]; #result = SVMModule.SVMClf.rbfSVM(pureTrainingData,trainingLabel,pureTestingData,testingLabel); #print k; cprResult.append(targetDimension) projTrainingData = pcaImpl.transform(pureTrainingData, targetDimension) projTestingData = pcaImpl.transform(pureTestingData, targetDimension) print "Non-noise PCA %d" % targetDimension if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) isGaussianDist = True #dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist); projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData, targetDimension) projTestingData = 
dpGaussianPCAImpl.transform(pureTestingData, targetDimension) #print projTestingData.shape; print "Gaussian-noise PCA %d" % targetDimension if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) isGaussianDist = False #dpWishartPCAImpl.getDiffPrivPCs(isGaussianDist); projTrainingData = dpWishartPCAImpl.transform(pureTrainingData, targetDimension) projTestingData = dpWishartPCAImpl.transform(pureTestingData, targetDimension) #print projTestingData.shape; print "Wishart-noise PCA %d" % targetDimension if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) projTrainingData, projTestingData = DPPro(pureTrainingData, pureTestingData, targetDimension, epsilon) print "DPPro %d" % targetDimension if isLinearSVM: result = SVMModule.SVMClf.linearSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) else: result = SVMModule.SVMClf.rbfSVM(projTrainingData, trainingLabel, projTestingData, testingLabel) cprResult.extend(result) """ for result in cprResult: print "%f,%f,%f" % (result[0],result[1],result[2]); """ #print(cprResult); cprResult = np.asarray(cprResult) return cprResult.reshape((len(xDimensions), -1))
def singleExp(xSamples, trainingData, testingData, topK, epsilon, isLinearSVM):
    """Sweep the number of samples held per data owner and train/evaluate an
    SVM on three projections: non-private PCA, DPDPCA (Wishart-noise
    covariance), and simulated PrivateLocalPCA.

    Column 0 of trainingData/testingData is the label. Returns a matrix with
    one row per entry of xSamples:
    [numOfSamples, pca..., dpdpca..., privateLocalPCA...].
    """
    pureTrainingData = trainingData[:, 1:]
    trainingLabel = trainingData[:, 0]
    pureTestingData = testingData[:, 1:]
    testingLabel = testingData[:, 0]
    # Standardize with training-split statistics, then L2-normalize rows
    # in place.
    scaler = StandardScaler()
    #print pureTrainingData[0];
    #scaler.fit(pureTrainingData);
    pureTrainingData = scaler.fit_transform(pureTrainingData)
    #print pureTrainingData[0];
    #print pureTestingData[0];
    pureTestingData = scaler.transform(pureTestingData)
    #print pureTestingData[0];
    preprocessing.normalize(pureTrainingData, copy=False)
    preprocessing.normalize(pureTestingData, copy=False)
    numOfFeature = trainingData.shape[1] - 1
    pcaImpl = PCAImpl(pureTrainingData)
    pcaImpl.getPCs(topK)
    '''
    To get a Wishart Noise projection matrix.
    '''
    # Perturb the covariance matrix with symmetric Wishart noise, then take
    # the top-k right singular vectors of the noisy covariance as the DP
    # projection basis.
    WishartNoiseMatrix = DiffPrivImpl.SymmWishart(epsilon, numOfFeature)
    noisyCovMatrix = pcaImpl.covMatrix + WishartNoiseMatrix
    noisyLeftSigVectors, noisyEigValues, noisyProjMatrix = sparse.linalg.svds(
        noisyCovMatrix, k=topK, tol=0.001)
    # svds may return complex parts numerically; keep the real component.
    noisyProjMatrix = np.real(noisyProjMatrix.T)
    """
    projTrainingData2 = np.dot(pureTrainingData, noisyProjMatrix);
    projTestingData2 = np.dot(pureTestingData, noisyProjMatrix);
    print "DPDPCA %d" % topK;
    if isLinearSVM:
        result = SVMModule.SVMClf.linearSVM(projTrainingData2, trainingLabel, projTestingData2, testingLabel);
    else:
        result = SVMModule.SVMClf.rbfSVM(projTrainingData2, trainingLabel, projTestingData2, testingLabel);
    cprResult.append(result[2]);
    """
    cprResult = []
    for k, numOfSamples in np.ndenumerate(xSamples):
        cprResult.append(numOfSamples)
        # Project the data using different projection matrix.
        # NOTE(review): numOfSamples is a per-owner sample count (see the
        # caller's xSamples = arange(2, nTrain/2, ...)), yet here it is used
        # as the number of projection dimensions, and below as a column
        # slice of the topK-column noisyProjMatrix (a no-op slice whenever
        # numOfSamples >= topK). Looks like a deliberate dual use, but
        # confirm it is not a leftover from a dimension sweep.
        projTrainingData1 = pcaImpl.transform(pureTrainingData, numOfSamples)
        projTestingData1 = pcaImpl.transform(pureTestingData, numOfSamples)
        print "Non-noise PCA %d" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData1, trainingLabel,
                                                projTestingData1, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData1, trainingLabel,
                                             projTestingData1, testingLabel)
        cprResult.extend(result)
        projTrainingData2 = np.dot(pureTrainingData, noisyProjMatrix[:, :numOfSamples])
        projTestingData2 = np.dot(pureTestingData, noisyProjMatrix[:, :numOfSamples])
        print "DPDPCA %d" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData2, trainingLabel,
                                                projTestingData2, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData2, trainingLabel,
                                             projTestingData2, testingLabel)
        cprResult.extend(result)
        # Simulate PrivateLocalPCA with numOfSamples data points per owner.
        pgProjMatrix = simulatePrivateGlobalPCA(pureTrainingData, numOfSamples, topK, epsilon)
        projTrainingData3 = np.dot(pureTrainingData, pgProjMatrix)
        projTestingData3 = np.dot(pureTestingData, pgProjMatrix)
        print "\nPrivateLocalPCA with %d data held by each data owner" % numOfSamples
        if isLinearSVM:
            result = SVMModule.SVMClf.linearSVM(projTrainingData3, trainingLabel,
                                                projTestingData3, testingLabel)
        else:
            result = SVMModule.SVMClf.rbfSVM(projTrainingData3, trainingLabel,
                                             projTestingData3, testingLabel)
        cprResult.extend(result)
    resultArray = np.asarray(cprResult)
    resultArray = np.reshape(resultArray, (len(xSamples), -1))
    return resultArray
def singleExp(xEpsilons, trainingData, testingData, largestReducedFeature): pureTrainingData = trainingData[:, 1:] trainingLabel = trainingData[:, 0] numOfTrainingSamples = trainingData.shape[0] pureTestingData = testingData[:, 1:] testingLabel = testingData[:, 0] scaler = StandardScaler() # print pureTrainingData[0]; # scaler.fit(pureTrainingData); pureTrainingData = scaler.fit_transform(pureTrainingData) # print pureTrainingData[0]; # print pureTestingData[0]; pureTestingData = scaler.transform(pureTestingData) # print pureTestingData[0]; pcaImpl = PCAImpl(pureTrainingData) pcaImpl.getPCs(largestReducedFeature) projTrainingData = pcaImpl.transform(pureTrainingData, largestReducedFeature) projTestingData = pcaImpl.transform(pureTestingData, largestReducedFeature) pcaResult = fit_MLP(projTrainingData, trainingLabel, projTestingData, testingLabel, n_classes=40) dpGaussianPCAImpl = DiffPrivPCAImpl(pureTrainingData) delta = np.divide(1.0, numOfTrainingSamples) # print projTrainingData.shape; cprResult = [] print "non noise PCA NN training" for k, targetEpsilon in np.ndenumerate(xEpsilons): #print pcaImpl.projMatrix[:,0]; #print k; print "epsilon: %.2f, delta: %f" % (targetEpsilon, delta) cprResult.append(targetEpsilon) isGaussianDist = True dpGaussianPCAImpl.setEpsilonAndGamma(targetEpsilon, delta) dpGaussianPCAImpl.getDiffPrivPCs(isGaussianDist, largestReducedFeature) cprResult.append(pcaResult) projTrainingData = dpGaussianPCAImpl.transform(pureTrainingData, largestReducedFeature) projTestingData = dpGaussianPCAImpl.transform(pureTestingData, largestReducedFeature) result = fit_MLP(projTrainingData, trainingLabel, projTestingData, testingLabel, n_classes=40) cprResult.append(result) cprResult = np.asarray(cprResult) return cprResult.reshape((len(xEpsilons), -1))