def mapper(tup):
    """Build a partial multiple linear regression model for one partition.

    `tup` is unpacked as (key, (data_table, labels_table)). Both tables are
    passed through deserializeNumericTable before being set as input.
    Returns (key, partial result passed through serializeNumericTable).
    """
    key, (data_blob, labels_blob) = tup

    # Local step of distributed training, QR decomposition-based method
    trainer = training.Distributed(step1Local, method=training.qrDense)

    # Restore both tables, then attach them as the local-node input
    features = deserializeNumericTable(data_blob)
    responses = deserializeNumericTable(labels_blob)
    trainer.input.set(training.data, features)
    trainer.input.set(training.dependentVariables, responses)

    # Compute the partial model and return it in serialized form
    partial_model = trainer.compute()
    return (key, serializeNumericTable(partial_model))
def mapper(tup):
    """Run the local k-means step on one partition.

    `tup` is a (key, serialized table) pair. The current centroids are read
    from the enclosing-scope name `centroids` and deserialized here as well.
    Returns (key, partial result passed through serializeNumericTable).
    """
    key, data_blob = tup

    # Local-node k-means algorithm for nClusters clusters
    local_kmeans = kmeans.Distributed(step1Local, nClusters,
                                      method=kmeans.defaultDense)

    # Restore the partition data and the centroids, then set both inputs
    points = deserializeNumericTable(data_blob)
    current_centroids = deserializeNumericTable(centroids)
    local_kmeans.input.set(kmeans.data, points)
    local_kmeans.input.set(kmeans.inputCentroids, current_centroids)

    # Compute the local partial result and serialize it for transport
    partial = local_kmeans.compute()
    return (key, serializeNumericTable(partial))
def mapper(tup):
    """Train a partial Naive Bayes model on one partition.

    `tup` is unpacked as (key, (data_table, labels_table)); both tables are
    deserialized with deserializeNumericTable. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, (data_blob, labels_blob) = tup

    # Local-node training algorithm for nClasses classes
    trainer = training.Distributed(step1Local, nClasses)

    # Restore the tables and attach them as training data and labels
    observations = deserializeNumericTable(data_blob)
    labels = deserializeNumericTable(labels_blob)
    trainer.input.set(classifier.training.data, observations)
    trainer.input.set(classifier.training.labels, labels)

    # Train locally and return the serialized partial result
    partial = trainer.compute()
    return (key, serializeNumericTable(partial))
def mapper(tup):
    """Build a partial regression model for one partition (local step 1).

    `tup` is unpacked as (key, (data_table, labels_table)); both tables are
    deserialized with deserializeNumericTable. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, (data_blob, labels_blob) = tup

    # NOTE(review): the original comment described this as multiple linear
    # regression with the normal equations method, while the local variable
    # was named ridgeRegressionTraining - confirm which `training` module is
    # in scope here.
    trainer = training.Distributed(step1Local)

    # Restore both tables, then attach them as the local-node input
    features = deserializeNumericTable(data_blob)
    responses = deserializeNumericTable(labels_blob)
    trainer.input.set(training.data, features)
    trainer.input.set(training.dependentVariables, responses)

    # Compute the partial model and return it in serialized form
    partial_model = trainer.compute()
    return (key, serializeNumericTable(partial_model))
def mapper(tup):
    """Compute low order moments for one partition (local step 1).

    `tup` is a (key, serialized table) pair. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, data_blob = tup

    # Local-node low order moments algorithm, default dense method
    local_moments = low_order_moments.Distributed(
        step1Local, method=low_order_moments.defaultDense)

    # Restore the partition data and set it as the input
    table = deserializeNumericTable(data_blob)
    local_moments.input.set(low_order_moments.data, table)

    # Compute locally and return the serialized partial result
    partial = local_moments.compute()
    return (key, serializeNumericTable(partial))
def mapper(tup):
    """Run the local PCA step (SVD method) on one partition.

    `tup` is a (key, serialized table) pair. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, data_blob = tup

    # Local-node PCA decomposition using the SVD method
    local_pca = pca.Distributed(step1Local, method=pca.svdDense)

    # Restore the partition data and register it as the PCA dataset
    table = deserializeNumericTable(data_blob)
    local_pca.input.setDataset(pca.data, table)

    # Compute locally and return the serialized partial result
    partial = local_pca.compute()
    return (key, serializeNumericTable(partial))
def mapper(tup):
    """Run step 1 of distributed SVD on one partition.

    `tup` is a (key, serialized table) pair. Returns
    (key, (serialized output for step 2, serialized output for step 3)).
    """
    key, data_blob = tup

    # Local-node SVD algorithm (step 1)
    local_svd = svd.Distributed(step1Local)
    table = deserializeNumericTable(data_blob)
    local_svd.input.set(svd.data, table)

    # Compute step 1, then pull out and serialize each output in turn:
    # first the part consumed by step 2, then the part consumed by step 3
    step1_result = local_svd.compute()
    for_step2 = serializeNumericTable(
        step1_result.get(svd.outputOfStep1ForStep2))
    for_step3 = serializeNumericTable(
        step1_result.get(svd.outputOfStep1ForStep3))

    return (key, (for_step2, for_step3))
def testModel(testData, model):
    """Predict with a trained linear regression model on the test data.

    `testData` must provide collect() yielding (key, (data_table, other))
    pairs; only the first element of each value is used. Every collected
    data table is set as the prediction input in turn, so the table from
    the last collected pair is the one in effect when compute() runs.
    Returns the value stored under prediction.prediction in the result.
    """
    # Batch prediction algorithm with the default dense method
    predictor = prediction.Batch(method=prediction.defaultDense)

    # Feed the collected test tables to the algorithm
    for _, (data_blob, _) in testData.collect():
        test_table = deserializeNumericTable(data_blob)
        predictor.input.setTable(prediction.data, test_table)

    predictor.input.setModel(prediction.model, model)

    # Compute and retrieve the prediction results
    outcome = predictor.compute()
    return outcome.get(prediction.prediction)
def mapper(tup):
    """Run step 1 of distributed QR decomposition on one partition.

    `tup` is a (key, serialized table) pair. Returns
    (key, (serialized output for step 2, serialized output for step 3)).
    """
    key, data_blob = tup

    # Local-node QR decomposition algorithm (step 1), default dense method
    local_qr = qr.Distributed(step1Local, method=qr.defaultDense)
    table = deserializeNumericTable(data_blob)
    local_qr.input.set(qr.data, table)

    # Compute step 1, then pull out and serialize each output in turn:
    # first the part consumed by step 2, then the part consumed by step 3
    step1_result = local_qr.compute()
    for_step2 = serializeNumericTable(
        step1_result.get(qr.outputOfStep1ForStep2))
    for_step3 = serializeNumericTable(
        step1_result.get(qr.outputOfStep1ForStep3))

    return (key, (for_step2, for_step3))
def mapper(tup):
    """Run the local K-Means initialization step on one partition.

    `tup` is a (key, serialized table) pair. `key` is also used
    arithmetically (multiplied by nVectorsInBlock) as a constructor argument.
    Returns (key, partial result passed through serializeNumericTable).
    """
    key, data_blob = tup

    # Constructor arguments derived from the enclosing-scope block layout
    total_vectors = nBlocks * nVectorsInBlock
    block_offset = nVectorsInBlock * key

    # Local-node K-Means initialization with the random dense method
    local_init = init.Distributed(step1Local, nClusters, total_vectors,
                                  block_offset, method=init.randomDense)

    # Restore the partition data and set it as the input
    table = deserializeNumericTable(data_blob)
    local_init.input.set(init.data, table)

    # Initialize locally and return the serialized partial result
    partial = local_init.compute()
    return (key, serializeNumericTable(partial))
def mapper(tup):
    """Compute a partial dense variance-covariance result for one partition.

    `tup` is a (key, serialized table) pair. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, data_blob = tup

    # Local-node covariance algorithm, default dense method
    local_cov = covariance.Distributed(step=step1Local,
                                       method=covariance.defaultDense)

    # Restore the partition data and set it as the input
    table = deserializeNumericTable(data_blob)
    local_cov.input.set(covariance.data, table)

    # Compute locally and return the serialized partial result
    partial = local_cov.compute()
    return (key, serializeNumericTable(partial))
def testModel(testData, model):
    """Classify the test data with a trained Naive Bayes model.

    `testData` must provide collect() yielding (key, (data_table, labels))
    pairs; only the data table of each pair is used. Every collected data
    table is set as the prediction input in turn, so the table from the last
    collected pair is the one in effect when compute() runs.
    Returns the value stored under classifier.prediction.prediction.
    """
    # Batch prediction algorithm for nClasses classes
    predictor = prediction.Batch(nClasses)

    # Feed the collected test tables to the algorithm
    for _, (data_blob, _) in testData.collect():
        test_table = deserializeNumericTable(data_blob)
        predictor.input.setTable(classifier.prediction.data, test_table)

    predictor.input.setModel(classifier.prediction.model, model)

    # Compute the prediction results and retrieve them
    outcome = predictor.compute()
    return outcome.get(classifier.prediction.prediction)
def mapper(tup):
    """Train a partial Naive Bayes model on one CSR-format partition.

    `tup` is unpacked as (key, (csr_data_table, labels_table)). The data
    table is restored with deserializeCSRNumericTable and the labels with
    deserializeNumericTable. Returns
    (key, partial result passed through serializeNumericTable).
    """
    key, (csr_blob, labels_blob) = tup

    # Local-node training algorithm for nClasses classes, fast CSR method
    trainer = training.Distributed(step1Local, nClasses,
                                   method=training.fastCSR)

    # Restore the tables and attach them as training data and labels
    sparse_data = deserializeCSRNumericTable(csr_blob)
    labels = deserializeNumericTable(labels_blob)
    trainer.input.set(classifier.training.data, sparse_data)
    trainer.input.set(classifier.training.labels, labels)

    # Train locally and return the serialized partial result
    partial = trainer.compute()
    return (key, serializeNumericTable(partial))
if __name__ == "__main__":
    # Driver for the Spark QR sample: reads the data set, runs runQR and
    # writes the Q and R factors to 'QR.out' for correctness verification.

    # Create SparkContext that loads defaults from the system properties and
    # the classpath and sets the name
    sc = SparkContext(
        conf=SparkConf().setAppName("Spark QR").setMaster('local[4]'))

    try:
        # Read from the distributed HDFS data set at a specified path
        dd = DistributedHDFSDataSet("/Spark/QR/data/")
        dataRDD = dd.getAsPairRDD(sc)

        # Compute QR decomposition for dataRDD
        result = runQR(dataRDD, sc)

        # Redirect stdout to a file for correctness verification. The
        # `with` block closes (and flushes) the file, and the inner
        # `finally` restores stdout even if printing fails part-way.
        stdout = sys.stdout
        with open('QR.out', 'w') as out_file:
            sys.stdout = out_file
            try:
                # Print the results
                ntRPList = result['Q'].collect()
                for key, table in ntRPList:
                    deserialized_table = deserializeNumericTable(table)
                    printNumericTable(
                        deserialized_table,
                        "Q (2 first vectors from node #{}):".format(key), 2)
                printNumericTable(result['R'], "R:")
            finally:
                # Restore stdout
                sys.stdout = stdout
    finally:
        # Always release the Spark context, including on failure
        sc.stop()
trainDataLabelsFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_train_labels_?.csv" testDataFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_test_1.csv" testDataLabelsFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_test_labels_1.csv" # Read the training data and labels from a specified path trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(trainDataFilesPath, trainDataLabelsFilesPath, sc) # Read the test data and labels from a specified path testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc) # Compute linear regression for dataRDD res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('LinearRegressionQR.out', 'w') # Print the results parts_list = testDataAndLabelsRDD.collect() for key, (_, h_table2) in parts_list: deserialzied_expected = deserializeNumericTable(h_table2) printNumericTable(res['beta'], "Coefficients:") printNumericTable(res['predicted'], "First 10 rows of results (obtained): ", 10) printNumericTable(deserialzied_expected, "First 10 rows of results (expected): ", 10) # Restore stdout sys.stdout = stdout sc.stop()
trainDataFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_train_?.csv" trainDataLabelsFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_train_labels_?.csv" testDataFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_test_1.csv" testDataLabelsFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_test_labels_1.csv" # Read the training data and labels from a specified path trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(trainDataFilesPath, trainDataLabelsFilesPath, sc) # Read the test data and labels from a specified path testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc) # Compute the results of the Naive Bayes algorithm for dataRDD result = runNaiveBayes(trainDataAndLabelsRDD, testDataAndLabelsRDD) # Print the results parts_List = testDataAndLabelsRDD.collect() for _, (t1, t2) in parts_List: expected = deserializeNumericTable(t2) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('NaiveBayesDense.out', 'w') printNumericTables(expected, result, "Ground truth", "Classification results", "NaiveBayes classification results (first 20 observations):", 20, flt64=False) # Restore stdout sys.stdout = stdout sc.stop()
# Read the training data and labels from a specified path trainDataAndLabelsRDD = getMergedCSRDataAndLabelsRDD( trainDataFilesPath, trainDataLabelsFilesPath, sc) # Read the test data and labels from a specified path testDataAndLabelsRDD = getMergedCSRDataAndLabelsRDD( testDataFilesPath, testDataLabelsFilesPath, sc) # Compute the results of the Naive Bayes algorithm for dataRDD predicted = runNaiveBayes(trainDataAndLabelsRDD, testDataAndLabelsRDD) # Print the results parts_list = testDataAndLabelsRDD.collect() for _, (csr, homogen) in parts_list: expected = deserializeNumericTable(homogen) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('NaiveBayesCSR.out', 'w') printNumericTables( expected, predicted, "Ground truth", "Classification results", "NaiveBayes classification results (first 20 observations):", 20, flt64=False) # Restore stdout
trainDataFilesPath, trainDataLabelsFilesPath, sc) # Read the test data and labels from a specified path testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc) # Compute linear regression for dataRDD res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD) # Redirect stdout to a file for correctness verification stdout = sys.stdout sys.stdout = open('LinearRegressionNormEq.out', 'w') # Print the results parts_list = testDataAndLabelsRDD.collect() for key, (_, h_table2) in parts_list: expected = h_table2 deserialized_expected = deserializeNumericTable(expected) printNumericTable(res['beta'], "Coefficients:") printNumericTable(res['predicted'], "First 10 rows of results (obtained): ", 10) printNumericTable(deserialized_expected, "First 10 rows of results (expected): ", 10) # Restore stdout sys.stdout = stdout sc.stop()