def testModel(trainingResult):
    """Predict with a trained ridge regression model and print the outcome.

    Loads the CSV named by ``testDatasetFileName`` into a merged table
    (features plus ground truth), passes the feature part together with the
    model stored in ``trainingResult`` to ``prediction.Batch`` and prints the
    first 10 rows of the predictions and of the ground truth.
    """
    # CSV-backed data source for the test set
    source = FileDataSource(
        testDatasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )

    # Features and ground truth are two tables filled through one merged table
    testData = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    testGroundTruth = HomogenNumericTable(NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    merged = MergedNumericTable(testData, testGroundTruth)
    source.loadDataBlock(merged)

    # Batch prediction fed with the test features and the trained model
    predictor = prediction.Batch()
    predictor.input.setTable(prediction.data, testData)
    predictor.input.setModel(prediction.model, trainingResult.get(training.model))
    outcome = predictor.compute()

    # Show predictions next to the expected values
    predicted = outcome.get(prediction.prediction)
    printNumericTable(predicted, "Ridge Regression prediction results: (first 10 rows):", 10)
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
def predictReducedModelResults(self, trainingResult, trainData, nDependentVariables, reducedBeta):
    """Predict with selected beta coefficients temporarily forced to zero.

    The coefficients at the column indices in ``reducedBeta`` are zeroed in
    place inside the model held by ``trainingResult``, a prediction is run
    through ``LinearRegression.predict``, and the original coefficients are
    written back afterwards (also when the prediction raises).

    Args:
        trainingResult: training result that ``.get(training.model)`` is read from.
        trainData: data passed on to ``LinearRegression.predict``.
        nDependentVariables: number of rows of the beta table that are touched.
        reducedBeta: a single column index or an iterable of column indices.

    Returns:
        Whatever ``LinearRegression.predict`` returns for the reduced model.
    """
    model = trainingResult.get(training.model)
    betas = model.getBeta()
    nBetas = model.getNumberOfBetas()

    if isinstance(reducedBeta, int):
        reducedBeta = [reducedBeta]
    # Drop repeated indices: a second visit would save the already-zeroed
    # value and the original coefficient could not be restored.
    reducedBeta = list(dict.fromkeys(reducedBeta))

    savedBeta = np.zeros((nDependentVariables, nBetas))

    # Remember the selected coefficients, then zero them in the model
    block = BlockDescriptor()
    betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
    pBeta = block.getArray()
    for beta in reducedBeta:
        for i in range(nDependentVariables):
            savedBeta[i][beta] = pBeta[i][beta]
            pBeta[i][beta] = 0
    betas.releaseBlockOfRows(block)
    printNumericTable(betas)

    try:
        predictedResults = LinearRegression.predict(self, trainingResult, trainData)
    finally:
        # Write the saved coefficients back. The row count matches the block
        # requested above (the original asked for nBetas rows here).
        block = BlockDescriptor()
        betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
        pBeta = block.getArray()
        for beta in reducedBeta:
            for i in range(nDependentVariables):
                pBeta[i][beta] = savedBeta[i][beta]
        betas.releaseBlockOfRows(block)
    printNumericTable(betas)
    return predictedResults
def printAllQualityMetrics(self, qualityMetricSetResult):
    """Print the confusion matrix and the scalar binary-classification metrics.

    Every value is read from ``qualityMetricSetResult`` via ``.get(key)``.
    """
    printNumericTable(qualityMetricSetResult.get('confusionMatrix'), "Confusion matrix:")

    # (output template, result key), in print order
    report = (
        ("Accuracy: {0:.3f}", 'accuracy'),
        ("Precision: {0:.3f}", 'precision'),
        ("Recall: {0:.3f}", 'recall'),
        ("F1-score: {0:.3f}", 'fscore'),
        ("Specificity: {0:.3f}", 'specificity'),
        ("AUC: {0:.3f}", 'auc'),
    )
    for template, key in report:
        print(template.format(qualityMetricSetResult.get(key)))
def printAllQualityMetrics(self, qualityMetricSetResult):
    """Print the confusion matrix and the scalar multi-class metrics.

    Every value is read from ``qualityMetricSetResult`` via ``.get(key)``.
    """
    printNumericTable(qualityMetricSetResult.get('confusionMatrix'), "Confusion matrix:")

    # (output template, result key), in print order
    report = (
        ("Average accuracy: {0:.3f}", 'averageAccuracy'),
        ("Error rate: {0:.3f}", 'errorRate'),
        ("Micro precision: {0:.3f}", 'microPrecision'),
        ("Micro recall: {0:.3f}", 'microRecall'),
        ("Micro F-score: {0:.3f}", 'microFscore'),
        ("Macro precision: {0:.3f}", 'macroPrecision'),
        ("Macro recall: {0:.3f}", 'macroRecall'),
        ("Macro F-score: {0:.3f}", 'macroFscore'),
    )
    for template, key in report:
        print(template.format(qualityMetricSetResult.get(key)))
def printAllQualityMetrics(self, qualityMetricSetResult):
    """Print the binary confusion matrix and the metrics derived from it.

    The metrics are read from the first row of the ``binaryMetrics`` table of
    the confusion-matrix result and indexed with the
    ``binary_confusion_matrix`` constants.
    """
    cmResult = qualityMetricSetResult.getResult(quality_metric_set.confusionMatrix)
    printNumericTable(cmResult.get(binary_confusion_matrix.confusionMatrix), "Confusion matrix:")

    # Acquire row 0 of the metrics table as a flat array
    descriptor = BlockDescriptor()
    metricsTable = cmResult.get(binary_confusion_matrix.binaryMetrics)
    metricsTable.getBlockOfRows(0, 1, readOnly, descriptor)
    values = descriptor.getArray().flatten()

    # (output template, position in the metrics row), in print order
    report = (
        ("Accuracy: {0:.3f}", binary_confusion_matrix.accuracy),
        ("Precision: {0:.3f}", binary_confusion_matrix.precision),
        ("Recall: {0:.3f}", binary_confusion_matrix.recall),
        ("F1-score: {0:.3f}", binary_confusion_matrix.fscore),
        ("Specificity: {0:.3f}", binary_confusion_matrix.specificity),
        ("AUC: {0:.3f}", binary_confusion_matrix.AUC),
    )
    for template, position in report:
        print(template.format(values[position]))

    metricsTable.releaseBlockOfRows(descriptor)
def printAllQualityMetrics(self, qualityMetricSetResult):
    """Print the multi-class confusion matrix and the metrics derived from it.

    The metrics are read from the first row of the ``multiClassMetrics`` table
    of the confusion-matrix result and indexed with the
    ``multiclass_confusion_matrix`` constants.
    """
    cmResult = qualityMetricSetResult.getResult(
        multi_class_classifier.quality_metric_set.confusionMatrix)
    printNumericTable(cmResult.get(multiclass_confusion_matrix.confusionMatrix), "Confusion matrix:")

    # Acquire row 0 of the metrics table as a flat array
    descriptor = BlockDescriptor()
    metricsTable = cmResult.get(multiclass_confusion_matrix.multiClassMetrics)
    metricsTable.getBlockOfRows(0, 1, readOnly, descriptor)
    values = descriptor.getArray().flatten()

    # (output template, position in the metrics row), in print order
    report = (
        ("Average accuracy: {0:.3f}", multiclass_confusion_matrix.averageAccuracy),
        ("Error rate: {0:.3f}", multiclass_confusion_matrix.errorRate),
        ("Micro precision: {0:.3f}", multiclass_confusion_matrix.microPrecision),
        ("Micro recall: {0:.3f}", multiclass_confusion_matrix.microRecall),
        ("Micro F-score: {0:.3f}", multiclass_confusion_matrix.microFscore),
        ("Macro precision: {0:.3f}", multiclass_confusion_matrix.macroPrecision),
        ("Macro recall: {0:.3f}", multiclass_confusion_matrix.macroRecall),
        ("Macro F-score: {0:.3f}", multiclass_confusion_matrix.macroFscore),
    )
    for template, position in report:
        print(template.format(values[position]))

    metricsTable.releaseBlockOfRows(descriptor)
def printResultsM():
    """Print multi-class SVM labels and the confusion-matrix quality metrics.

    Reads the module-level ``groundTruthLabels``, ``predictedLabels`` and
    ``qualityMetricSetResult``.
    """
    # Ground truth next to the predicted labels
    printNumericTables(
        groundTruthLabels, predictedLabels,
        "Ground truth", "Classification results",
        "SVM classification results (first 20 observations):",
        20, interval=15, flt64=False)

    cmResult = qualityMetricSetResult.getResult(
        multi_class_classifier.quality_metric_set.confusionMatrix)
    printNumericTable(cmResult.get(multiclass_confusion_matrix.confusionMatrix), "Confusion matrix:")

    # Acquire row 0 of the metrics table as a flat array
    descriptor = BlockDescriptor()
    metricsTable = cmResult.get(multiclass_confusion_matrix.multiClassMetrics)
    metricsTable.getBlockOfRows(0, 1, readOnly, descriptor)
    values = descriptor.getArray().flatten()

    # (output template, position in the metrics row), in print order
    report = (
        ("Average accuracy: {0:.3f}", multiclass_confusion_matrix.averageAccuracy),
        ("Error rate: {0:.3f}", multiclass_confusion_matrix.errorRate),
        ("Micro precision: {0:.3f}", multiclass_confusion_matrix.microPrecision),
        ("Micro recall: {0:.3f}", multiclass_confusion_matrix.microRecall),
        ("Micro F-score: {0:.3f}", multiclass_confusion_matrix.microFscore),
        ("Macro precision: {0:.3f}", multiclass_confusion_matrix.macroPrecision),
        ("Macro recall: {0:.3f}", multiclass_confusion_matrix.macroRecall),
        ("Macro F-score: {0:.3f}", multiclass_confusion_matrix.macroFscore),
    )
    for template, position in report:
        print(template.format(values[position]))

    metricsTable.releaseBlockOfRows(descriptor)
def printResultsB():
    """Print binary SVM labels and the confusion-matrix quality metrics.

    Reads the module-level ``groundTruthLabels``, ``predictedLabels`` and
    ``qualityMetricSetResult``.
    """
    # Ground truth next to the predicted labels
    printNumericTables(
        groundTruthLabels, predictedLabels,
        "Ground truth", "Classification results",
        "SVM classification results (first 20 observations):",
        20, interval=15, flt64=False)

    cmResult = qualityMetricSetResult.getResult(svm.quality_metric_set.confusionMatrix)
    printNumericTable(cmResult.get(binary_confusion_matrix.confusionMatrix), "Confusion matrix:")

    # Acquire row 0 of the metrics table as a flat array
    descriptor = BlockDescriptor()
    metricsTable = cmResult.get(binary_confusion_matrix.binaryMetrics)
    metricsTable.getBlockOfRows(0, 1, readOnly, descriptor)
    values = descriptor.getArray().flatten()

    # (output template, position in the metrics row), in print order
    report = (
        ("Accuracy: {0:.3f}", binary_confusion_matrix.accuracy),
        ("Precision: {0:.3f}", binary_confusion_matrix.precision),
        ("Recall: {0:.3f}", binary_confusion_matrix.recall),
        ("F-score: {0:.3f}", binary_confusion_matrix.fscore),
        ("Specificity: {0:.3f}", binary_confusion_matrix.specificity),
        ("AUC: {0:.3f}", binary_confusion_matrix.AUC),
    )
    for template, position in report:
        print(template.format(values[position]))

    metricsTable.releaseBlockOfRows(descriptor)
def printAllQualityMetrics(self, resultSingleBeta, resultGroupBeta):
    """Print the single-beta and the group-of-betas quality metrics.

    Args:
        resultSingleBeta: result queried with the ``single_beta`` identifiers.
        resultGroupBeta: result queried with the ``group_of_betas`` identifiers.
    """
    print("Quality metrics for a single beta")
    singleBetaTables = (
        (single_beta.rms, "Root means square errors for each response (dependent variable):"),
        (single_beta.variance, "Variance for each response (dependent variable):"),
        (single_beta.zScore, "Z-score statistics:"),
        (single_beta.confidenceIntervals, "Confidence intervals for each beta coefficient:"),
        (single_beta.inverseOfXtX, "Inverse(Xt * X) matrix:"),
    )
    for resultId, caption in singleBetaTables:
        printNumericTable(resultSingleBeta.getResult(resultId), caption)

    # One variance-covariance table per entry of the beta-covariance collection
    covariances = resultSingleBeta.getResultDataCollection(single_beta.betaCovariances)
    for idx in range(covariances.size()):
        caption = "Variance-covariance matrix for betas of " + str(idx) + "-th response"
        printNumericTable(resultSingleBeta.get(single_beta.betaCovariances, idx), caption)

    # Print quality metrics for a group of betas
    print("Quality metrics for a group of betas")
    groupTables = (
        (group_of_betas.expectedMeans, "Means of expected responses for each dependent variable:"),
        (group_of_betas.expectedVariance, "Variance of expected responses for each dependent variable:"),
        (group_of_betas.regSS, "Regression sum of squares of expected responses:"),
        (group_of_betas.resSS, "Sum of squares of residuals for each dependent variable:"),
        (group_of_betas.tSS, "Total sum of squares for each dependent variable:"),
        (group_of_betas.determinationCoeff, "Determination coefficient for each dependent variable:"),
        (group_of_betas.fStatistics, "F-statistics for each dependent variable:"),
    )
    for resultId, caption in groupTables:
        printNumericTable(resultGroupBeta.get(resultId), caption, 0, 0, 20)
if __name__ == "__main__":
    # Driver: compute a QR decomposition of an HDFS data set with Spark and
    # write the printed result tables to 'QR.out'.

    # Create SparkContext that loads defaults from the system properties and
    # the classpath and sets the name
    sc = SparkContext(
        conf=SparkConf().setAppName("Spark QR").setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/QR/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Compute QR decomposition for dataRDD
    result = runQR(dataRDD, sc)

    # Redirect stdout to a file for correctness verification. The file is
    # closed by the `with` block and stdout is restored even if printing fails.
    stdout = sys.stdout
    with open('QR.out', 'w') as outFile:
        sys.stdout = outFile
        try:
            # Print the results
            ntRPList = result['Q'].collect()
            for key, table in ntRPList:
                deserialized_table = deserializeNumericTable(table)
                printNumericTable(deserialized_table,
                                  "Q (2 first vectors from node #{}):".format(key), 2)
            printNumericTable(result['R'], "R:")
        finally:
            # Restore stdout
            sys.stdout = stdout

    sc.stop()
trainDataFilesPath, trainDataLabelsFilesPath, sc) # Read the test data and labels from a specified path testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc) # Compute linear regression for dataRDD res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('LinearRegressionNormEq.out', 'w') # Print the results parts_list = testDataAndLabelsRDD.collect() for key, (_, h_table2) in parts_list: expected = h_table2 deserialized_expected = deserializeNumericTable(expected) printNumericTable(res['beta'], "Coefficients:") printNumericTable(res['predicted'], "First 10 rows of results (obtained): ", 10) printNumericTable(deserialized_expected, "First 10 rows of results (expected): ", 10) # Restore stdout sys.stdout = stdout sc.stop()
# Add a trailing axis to the test target
y_test = y_test_temp[:, np.newaxis]

# Wrap the arrays in numeric tables
trainData = HomogenNumericTable(x_train)
trainDependentVariables = HomogenNumericTable(y_train)
testData = HomogenNumericTable(x_test)
testGroundTruth = HomogenNumericTable(y_test)

# Linear regression helper object
lr = LinearRegression()

# Train on the training tables
trainingResult = lr.training(trainData, trainDependentVariables)

# Predict for the test table
prediction_nT = lr.predict(trainingResult, testData)

# Evaluate the predictions against the ground truth
qualityMet = lr.qualityMetrics(trainingResult, prediction_nT, testGroundTruth)
rmsTable = qualityMet.get('rms')
printNumericTable(rmsTable, "Root mean square")

# Print every available metric
lr.printAllQualityMetrics(qualityMet)

# Predict and evaluate in one call. Note that this method performs predictions
# on both the unrestricted and the restricted (reduced) model.
predRes, predResRed, qualityMet = lr.predictWithQualityMetrics(
    trainingResult, testData, testGroundTruth, [1, 2])

# Serialize the training result
lr.serialize(trainingResult, fileName='LR.npy')

# Deserialize it again
de_trainingResult = lr.deserialize(fileName="LR.npy")

# Print predicted responses and actual responses
printNumericTable(predRes, "Linear Regression prediction results: (first 10 rows):", 10)
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
'''
Instantiate Decision Forest object:
Classification(nClasses, nTrees = 100, observationsPerTreeFraction = 1,featuresPerNode=0,maxTreeDepth=0,
minObservationsInLeafNodes=1,impurityThreshold=0,varImportance='MDI')
'''
daal_DF = Classification(len(np.unique(y)), observationsPerTreeFraction=0.7,
                         varImportance='MDI', resultsToCompute=3)

# Train
trainingResult = daal_DF.training(trainData, trainDependentVariables)

# Predict
predictResults = daal_DF.predict(trainingResult, testData)

# Evaluate your model
qualityMet = daal_DF.qualityMetrics(predictResults, testGroundTruth)

# Print accuracy. The original template had no placeholder, so the value was
# silently dropped and only the word "Accuracy" was printed.
print("Accuracy: {0:.3f}".format(qualityMet.get('accuracy')))

# Print confusion matrix
printNumericTable(qualityMet.get('confusionMatrix'), "Confusion Matrix")

# Print all metrics
print("All available metrics")
daal_DF.printAllQualityMetrics(qualityMet)

# Serialize and save
daal_DF.serialize(trainingResult, fileName='DF', useCompression=True)

# Deserialize
# NOTE(review): reads 'DF.npy' although 'DF' was passed to serialize; presumably
# serialize appends the extension — confirm against the wrapper.
dese_trainingRes = daal_DF.deserialize(fileName='DF.npy', useCompression=True)

# Print predicted responses and actual responses
printNumericTables(
    testGroundTruth, predictResults,
    "Ground truth", "Classification results",
    "Decision Forest classification results (first 20 observations):",
    20, flt64=False
)
maximum = res.get(low_order_moments.maximum) sum = res.get(low_order_moments.sum) sumSquares = res.get(low_order_moments.sumSquares) sumSquaresCentered = res.get(low_order_moments.sumSquaresCentered) mean = res.get(low_order_moments.mean) secondOrderRawMoment = res.get(low_order_moments.secondOrderRawMoment) variance = res.get(low_order_moments.variance) standardDeviation = res.get(low_order_moments.standardDeviation) variation = res.get(low_order_moments.variation) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('LowOrderMomentsDense.out', 'w') print("Low order moments:") printNumericTable(minimum, "Min:") printNumericTable(maximum, "Max:") printNumericTable(sum, "Sum:") printNumericTable(sumSquares, "SumSquares:") printNumericTable(sumSquaresCentered, "SumSquaredDiffFromMean:") printNumericTable(mean, "Mean:") printNumericTable(secondOrderRawMoment, "SecondOrderRawMoment:") printNumericTable(variance, "Variance:") printNumericTable(standardDeviation, "StandartDeviation:") printNumericTable(variation, "Variation:") # Restore stdout sys.stdout = stdout sc.stop()
def printAllQualityMetrics(self, qualityMet):
    """Print the single-beta and the group-of-betas quality metrics.

    Every table is read from ``qualityMet`` via ``.get(key)``.
    """
    # Metrics for a single beta
    print("Quality metrics for a single beta")
    singleBetaTables = (
        ('rms', "Root means square errors for each response (dependent variable):"),
        ('variance', "Variance for each response (dependent variable):"),
        ('zScore', "Z-score statistics:"),
        ('confidenceIntervals', "Confidence intervals for each beta coefficient:"),
        ('inverseOfXtX', "Inverse(Xt * X) matrix:"),
    )
    for key, caption in singleBetaTables:
        printNumericTable(qualityMet.get(key), caption)

    # One variance-covariance table per response; captions count from 1
    covTables = qualityMet.get('betaCovariances')
    for idx in range(len(covTables)):
        caption = "Variance-covariance matrix for betas of " + str(idx + 1) + "-th response"
        printNumericTable(covTables[idx], caption)

    # Metrics for a group of betas
    print("Quality metrics for a group of betas")
    groupTables = (
        ('expectedMeans', "Means of expected responses for each dependent variable:"),
        ('expectedVariance', "Variance of expected responses for each dependent variable:"),
        ('regSS', "Regression sum of squares of expected responses:"),
        ('resSS', "Sum of squares of residuals for each dependent variable:"),
        ('tSS', "Total sum of squares for each dependent variable:"),
        ('determinationCoeff', "Determination coefficient for each dependent variable:"),
        ('fStatistics', "F-statistics for each dependent variable:"),
    )
    for key, caption in groupTables:
        printNumericTable(qualityMet.get(key), caption)
kmeansMaster.compute() # Finalize computations and retrieve the results res = kmeansMaster.finalizeCompute() return res.get(kmeans.centroids) if __name__ == "__main__": # Create SparkContext that loads defaults from the system properties and the classpath and sets the name sc = SparkContext(conf=SparkConf().setAppName("Spark Kmeans").setMaster('local[4]')) # Read from the distributed HDFS data set at a specified path dd = DistributedHDFSDataSet("/Spark/KmeansDense/data/") dataRDD = dd.getAsPairRDD(sc) # Compute k-means for dataRDD result = runKmeans(dataRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('KmeansDense.out', 'w') # Print the results printNumericTable(result, "First 10 dimensions of centroids:", 20, 10) # Restore stdout sys.stdout = stdout sc.stop()
def trainModel(comm, rankId):
    """Train a ridge regression model in two distributed steps.

    Every rank runs step 1 on its own CSV file
    (``trainDatasetFileNames[rankId]``), serializes the partial result and
    gathers it through ``comm``. The rank equal to ``MPI_ROOT`` then merges
    ``NUM_BLOCKS`` partial results in step 2 and prints the coefficients.

    Returns:
        The final training result on the root rank, ``None`` on all others.
    """
    # CSV-backed data source for this rank's share of the training set
    source = FileDataSource(
        trainDatasetFileNames[rankId],
        DataSourceIface.notAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext,
    )

    # Features and dependent variables are filled through one merged table
    trainData = HomogenNumericTable(NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    source.loadDataBlock(MergedNumericTable(trainData, trainDependentVariables))

    # Step 1: local training on this rank's data
    localAlgorithm = training.Distributed(step1Local)
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables, trainDependentVariables)
    partial = localAlgorithm.compute()

    # Serialize the partial result and gather all of them
    archive = InputDataArchive()
    partial.serialize(archive)
    gathered = comm.gather(archive.getArchiveAsArray())

    # Only the root rank builds the final model
    if rankId != MPI_ROOT:
        return None

    # Step 2: merge the partial models on the master node
    masterAlgorithm = training.Distributed(step2Master)
    for blockIdx in range(NUM_BLOCKS):
        restored = training.PartialResult()
        restored.deserialize(OutputDataArchive(gathered[blockIdx]))
        masterAlgorithm.input.add(training.partialModels, restored)

    masterAlgorithm.compute()
    trainingResult = masterAlgorithm.finalizeCompute()

    # Retrieve the algorithm results
    printNumericTable(
        trainingResult.get(training.model).getBeta(),
        "Ridge Regression coefficients:")
    return trainingResult
# Create a data archive to deserialize the numeric table dataArch = OutputDataArchive(buffer) # Deserialize the numeric table from the data archive object.deserialize(dataArch) return object if __name__ == "__main__": comm = MPI.COMM_WORLD rankId = comm.Get_rank() transposedDataTable = createSparseTable( transposedTrainDatasetFileNames[rankId]) step4LocalInput = KeyValueDataCollection() itemsPartialResultPrediction = KeyValueDataCollection() dataTable = initializeModel() trainModel(dataTable, transposedDataTable) testModel() if rankId == MPI_ROOT: for i in range(nBlocks): for j in range(nBlocks): print("prediction {}, {}".format(i, j)) printNumericTable(predictedRatingsMaster[i][j].get( ratings.prediction))
return result if __name__ == "__main__": # Create JavaSparkContext that loads defaults from the system properties and the classpath and sets the name sc = SparkContext(conf=SparkConf().setAppName( "Spark covariance(CSR)").setMaster("local[4]")) # Read from the distributed HDFS data set at a specified path dd = DistributedHDFSDataSet("/Spark/CovarianceCSR/data/") dataRDD = dd.getCSRAsPairRDD(sc) # Compute a sparse variance-covariance matrix for dataRDD final_result = runCovariance(dataRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('CovarianceCSR.out', 'w') # Print the results printNumericTable(final_result['covariance'], "Covariance matrix (upper left square 10*10) :", 10, 10, 9) printNumericTable(final_result['mean'], "Mean vector:", 1, 10, 9) # Restore stdout sys.stdout = stdout sc.stop()
# Transfer partial results to step 2 on the root node data = comm_size.gather(nodeResults, MPI_ROOT) if rankId == MPI_ROOT: # Create an algorithm to compute a sparse variance-covariance matrix on the master node masterAlgorithm = covariance.Distributed(step2Master, method=covariance.fastCSR) for i in range(nBlocks): # Deserialize partial results from step 1 dataArch = OutputDataArchive(data[i]) dataForStep2FromStep1 = covariance.PartialResult() dataForStep2FromStep1.deserialize(dataArch) # Set local partial results as input for the master-node algorithm masterAlgorithm.input.add(covariance.partialResults, dataForStep2FromStep1) # Merge and finalizeCompute a sparse variance-covariance matrix on the master node masterAlgorithm.compute() result = masterAlgorithm.finalizeCompute( ) # Retrieve the algorithm results # Print the results printNumericTable(result.get(covariance.covariance), "Covariance matrix (upper left square 10*10) :", 10, 10) printNumericTable(result.get(covariance.mean), "Mean vector:", 1, 10)
result['mean'] = res.get(covariance.mean) return result if __name__ == "__main__": # Create SparkContext that loads defaults from the system properties and the classpath and sets the name sc = SparkContext(conf=SparkConf().setAppName( "Spark covariance(dense)").setMaster("local[4]")) # Read from the distributed HDFS data set at a specified path dd = DistributedHDFSDataSet("/Spark/CovarianceDense/data/") dataRDD = dd.getAsPairRDD(sc) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('CovarianceDense.out', 'w') # Compute a dense variance-covariance matrix for dataRDD final_result = runCovariance(dataRDD) # Print the results printNumericTable(final_result['covariance'], "Covariance:", interval=9) printNumericTable(final_result['mean'], "Mean:", interval=9) # Restore stdout sys.stdout = stdout sc.stop()
# Serialize partial results required by step 2 dataArch = InputDataArchive() pres.serialize(dataArch) nodeResults = dataArch.getArchiveAsArray() # Transfer partial results to step 2 on the root node serializedData = comm.gather(nodeResults) if rankId == MPI_ROOT: # Create an algorithm for principal component analysis using the SVD method on the master node masterAlgorithm = pca.Distributed(step2Master, method=pca.svdDense) for i in range(nBlocks): # Deserialize partial results from step 1 dataArch = OutputDataArchive(serializedData[i]) dataForStep2FromStep1 = pca.PartialResult(pca.svdDense) dataForStep2FromStep1.deserialize(dataArch) # Set local partial results as input for the master-node algorithm masterAlgorithm.input.add(pca.partialResults, dataForStep2FromStep1) # Merge and finalizeCompute PCA decomposition on the master node masterAlgorithm.compute() res = masterAlgorithm.finalizeCompute() # Print the results printNumericTable(res.get(pca.eigenvalues), "Eigenvalues:") printNumericTable(res.get(pca.eigenvectors), "Eigenvectors:")
'eigenvectors': res.get(pca.eigenvectors), 'eigenvalues': res.get(pca.eigenvalues) } if __name__ == "__main__": # Create SparkContext that loads defaults from the system properties and the classpath and sets the name sc = SparkContext( conf=SparkConf().setAppName("Spark PCA(COR)").setMaster('local[4]')) # Read from the distributed HDFS data set at a specified path dd = DistributedHDFSDataSet("/Spark/PcaCorCSR/data/") dataRDD = dd.getCSRAsPairRDD(sc) # Compute PCA decomposition for dataRDD using the correlation method result = runPCA(dataRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('PcaCorCSR.out', 'w') # Print the results printNumericTable(result['eigenvalues'], "Eigenvalues:") printNumericTable(result['eigenvectors'], "Eigenvectors:") # Restore sdtout sys.stdout = stdout sc.stop()
# Create SparkContext that loads defaults from the system properties and the classpath and sets the name sc = SparkContext( conf=SparkConf().setAppName('Spark SVD').setMaster('local[4]')) # Read from the distributed HDFS data set at a specified path dd = DistributedHDFSDataSet("/Spark/Svd/data/") dataRDD = dd.getAsPairRDD(sc) # Compute SVD decomposition for dataRDD res = runSVD(dataRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('Svd.out', 'w') # Print the results ntRPList = res['U'].collect() for num, table in ntRPList: deserialized_table = deserializeNumericTable(table) printNumericTable(deserialized_table, "U (2 first vectors from node #{}):".format(num), 2) printNumericTable(res['Sigma'], "Sigma:") printNumericTable(res['V'], "V:") # Restore stdout sys.stdout = stdout sc.stop()
Ui = res.get(svd.leftSingularMatrix) if __name__ == "__main__": comm = MPI.COMM_WORLD comm_size = comm.Get_size() rankId = comm.Get_rank() if nBlocks != comm_size: if rankId == MPI_ROOT: frmt = "{} MPI ranks != {} datasets available, so please start exactly {} ranks." print(frmt.format(comm_size, nBlocks, nBlocks)) sys.exit(0) computestep1Local() if rankId == MPI_ROOT: computeOnMasterNode() finalizeComputestep1Local() # Print the results if rankId == MPI_ROOT: printNumericTable(Sigma, "Singular values:") printNumericTable(V, "Right orthogonal matrix V:") printNumericTable(Ui, "Part of left orthogonal matrix U from root node:", 10)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases — confirm the version this example is pinned to.
data = load_boston()
X = data.data
Y = data.target
x_train, x_test, y_train_temp, y_test_temp = train_test_split(
    X, Y, test_size=0.40, random_state=42)

# Add a trailing axis to the targets
y_train = y_train_temp[:, np.newaxis]
y_test = y_test_temp[:, np.newaxis]

# Wrap the arrays in numeric tables
trainData = HomogenNumericTable(x_train)
trainDependentVariables = HomogenNumericTable(y_train)
testData = HomogenNumericTable(x_test)
testGroundTruth = HomogenNumericTable(y_test)

# Instantiate Ridge Regression object
rigde = RidgeRegression(ridgeParameters=0.0005)

# Training
trainingResult = rigde.training(trainData, trainDependentVariables)

# Prediction. Run on the test table so the printed predictions line up with
# testGroundTruth below (the original predicted on trainData, which has
# different rows).
pred_nT = rigde.predict(trainingResult, testData)

# Serialize
rigde.serialize(trainingResult, fileName='RR.npy')

# Deserialize
de_trainingResult = rigde.deserialize(fileName="RR.npy")

# Print predicted responses and actual responses
printNumericTable(pred_nT, "Ridge Regression prediction results: (first 10 rows):", 10)
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
# Deserialize partial results from step 1 dataArch = OutputDataArchive(serializedData[i]) dataForStep2FromStep1 = low_order_moments.PartialResult() dataForStep2FromStep1.deserialize(dataArch) # Set local partial results as input for the master-node algorithm masterAlgorithm.input.add(low_order_moments.partialResults, dataForStep2FromStep1) # Merge and finalizeCompute low order moments on the master node masterAlgorithm.compute() res = masterAlgorithm.finalizeCompute() # Print the results printNumericTable(res.get(low_order_moments.minimum), "Minimum:") printNumericTable(res.get(low_order_moments.maximum), "Maximum:") printNumericTable(res.get(low_order_moments.sum), "Sum:") printNumericTable(res.get(low_order_moments.sumSquares), "Sum of squares:") printNumericTable(res.get(low_order_moments.sumSquaresCentered), "Sum of squared difference from the means:") printNumericTable(res.get(low_order_moments.mean), "Mean:") printNumericTable(res.get(low_order_moments.secondOrderRawMoment), "Second order raw moment:") printNumericTable(res.get(low_order_moments.variance), "Variance:") printNumericTable(res.get(low_order_moments.standardDeviation), "Standard deviation:") printNumericTable(res.get(low_order_moments.variation), "Variation:")
nodeResults = dataArch.getArchiveAsArray() # Transfer partial results to step 2 on the root node data = comm_size.gather(nodeResults, MPI_ROOT) if rankId == MPI_ROOT: # Create an algorithm to compute a variance-covariance matrix on the master node masterAlgorithm = covariance.Distributed(step2Master) for i in range(nBlocks): # Deserialize partial results from step 1 dataArch = OutputDataArchive(data[i]) dataForStep2FromStep1 = covariance.PartialResult() dataForStep2FromStep1.deserialize(dataArch) # Set local partial results as input for the master-node algorithm masterAlgorithm.input.add(covariance.partialResults, dataForStep2FromStep1) # Merge and finalizeCompute a dense variance-covariance matrix on the master node */ masterAlgorithm.compute() result = masterAlgorithm.finalizeCompute() # Print the results printNumericTable(result.get(covariance.covariance), "Covariance matrix:") printNumericTable(result.get(covariance.mean), "Mean vector:")
# Random training and test tables drawn from the `seeded` generator
trainData = HomogenNumericTable(seeded.rand(200, nFeatures))
trainDependentVariables = HomogenNumericTable(
    seeded.rand(200, nDependentVariables))
testData = HomogenNumericTable(seeded.rand(50, nFeatures))
testGroundTruth = HomogenNumericTable(seeded.rand(50, nDependentVariables))

# Linear regression helper object
lr = LinearRegression()

# Train on the training tables
trainingResult = lr.training(trainData, trainDependentVariables)

# Predict for the training table
pred_array = lr.predict(trainingResult, trainData)

# Serialize the training result
lr.serialize(trainingResult, fileName='trainingResult.npy')

# Deserialize it again
de_trainingResult = lr.deserialize(fileName="trainingResult.npy")

# Predict with quality metrics for the full model and the reduced model
predRes, predResRed, singleBeta, groupBeta = lr.predictWithQualityMetrics(
    trainingResult,
    trainData,
    trainDependentVariables,
    reducedBetaIndex=[2, 10])

# Print the metric results
lr.printAllQualityMetrics(singleBeta, groupBeta)

# Print predicted responses and actual responses
printNumericTable(predRes, "Linear Regression prediction results: (first 10 rows):", 10)
printNumericTable(predResRed, "Linear Regression prediction results: (first 10 rows):", 10)
printNumericTable(trainDependentVariables, "Ground truth (first 10 rows):", 10)
algorithm.compute() res = algorithm.finalizeCompute() Qi = res.get(qr.matrixQ) if __name__ == "__main__": comm = MPI.COMM_WORLD comm_size = comm.Get_size() rankId = comm.Get_rank() if nBlocks != comm_size: if rankId == MPI_ROOT: print( "{} MPI ranks != {} datasets available, so please start exactly {} ranks.", comm_size, nBlocks, nBlocks) sys.exit(0) computestep1Local() if rankId == MPI_ROOT: computeOnMasterNode() finalizeComputestep1Local() # Print the results if rankId == MPI_ROOT: printNumericTable(Qi, "Part of orthogonal matrix Q from 1st node:", 10) printNumericTable(R, "Triangular matrix R:")