def mapper(tup):

        key, tables = tup
        h_table1, h_table2 = tables

        # Create an algorithm object to train the multiple linear regression model with a QR decomposition-based method
        linearRegressionTraining = training.Distributed(step1Local, method=training.qrDense)
        # Set the input data on local nodes
        deserialized_h_table1 = deserializeNumericTable(h_table1)
        deserialized_h_table2 = deserializeNumericTable(h_table2)
        linearRegressionTraining.input.set(training.data, deserialized_h_table1)
        linearRegressionTraining.input.set(training.dependentVariables, deserialized_h_table2)

        # Build a partial multiple linear regression model
        pres = linearRegressionTraining.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
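
All of the mappers in these examples rely on serializeNumericTable/deserializeNumericTable (and, on the master side, on rebuilding partial results) to move DAAL objects through Spark as byte arrays. The helpers below are a minimal sketch of what such routines might look like with pydaal's data archives; the exact bodies, and the deserializePartialResult name, are assumptions rather than the samples' verbatim code.

import numpy as np
from daal.data_management import InputDataArchive, OutputDataArchive, HomogenNumericTable

def serializeNumericTable(dataTable):
    # Serialize the table (or partial result) into a DAAL archive and copy the bytes out
    dataArch = InputDataArchive()
    dataTable.serialize(dataArch)
    length = dataArch.getSizeOfArchive()
    buffer = np.zeros(length, dtype=np.ubyte)
    dataArch.copyArchiveToArray(buffer)
    return buffer

def deserializeNumericTable(buffer):
    # Restore a homogeneous numeric table from the serialized byte array
    dataArch = OutputDataArchive(buffer)
    dataTable = HomogenNumericTable()
    dataTable.deserialize(dataArch)
    return dataTable

def deserializePartialResult(buffer, module):
    # Assumed helper: restore an algorithm-specific partial result (e.g. kmeans.PartialResult)
    dataArch = OutputDataArchive(buffer)
    partialResult = module.PartialResult()
    partialResult.deserialize(dataArch)
    return partialResult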
Example #2
    def mapper(tup):

        key, val = tup

        # Create an algorithm to compute k-means on local nodes
        kmeansLocal = kmeans.Distributed(step1Local, nClusters, method=kmeans.defaultDense)

        # Set the input data on local nodes
        deserialized_val = deserializeNumericTable(val)
        deserialized_centroids = deserializeNumericTable(centroids)
        kmeansLocal.input.set(kmeans.data, deserialized_val)
        kmeansLocal.input.set(kmeans.inputCentroids, deserialized_centroids)

        # Compute k-means on local nodes
        pres = kmeansLocal.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
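
A matching master step typically gathers the serialized partial results and finalizes the clustering. The sketch below assumes the deserializePartialResult helper above and the nClusters value used in the mapper; the same add/compute/finalizeCompute pattern applies to the other step1Local mappers in this collection (covariance, low order moments, Naive Bayes, and so on), each with its own module and input id.

from daal import step2Master
from daal.algorithms import kmeans

def reducer(partsRDD, nClusters):
    # Create an algorithm to finalize k-means on the master node
    kmeansMaster = kmeans.Distributed(step2Master, nClusters, method=kmeans.defaultDense)

    # Add every node's serialized partial result to the master-side input
    for _, serialized_pres in partsRDD.collect():
        kmeansMaster.input.add(kmeans.partialResults,
                               deserializePartialResult(serialized_pres, kmeans))

    # Merge the partial results and retrieve the final centroids
    kmeansMaster.compute()
    res = kmeansMaster.finalizeCompute()
    return res.get(kmeans.centroids)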
    def mapper(tup):
        key, val = tup
        t1, t2 = val

        # Create an algorithm to train the Naive Bayes model on local nodes
        algorithm = training.Distributed(step1Local, nClasses)

        # Set the input data on local nodes
        deserialized_t1 = deserializeNumericTable(t1)
        deserialized_t2 = deserializeNumericTable(t2)
        algorithm.input.set(classifier.training.data, deserialized_t1)
        algorithm.input.set(classifier.training.labels, deserialized_t2)

        # Train the Naive Bayes model on local nodes
        pres = algorithm.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
    def mapper(tup):
        key, tables = tup
        homogen_table1, homogen_table2 = tables

        # Create an algorithm object to train the ridge regression model with the normal equations method
        ridgeRegressionTraining = training.Distributed(step1Local)
        # Set the input data on local nodes
        deserialized_homogen_table1 = deserializeNumericTable(homogen_table1)
        deserialized_homogen_table2 = deserializeNumericTable(homogen_table2)

        ridgeRegressionTraining.input.set(training.data,
                                          deserialized_homogen_table1)
        ridgeRegressionTraining.input.set(training.dependentVariables,
                                          deserialized_homogen_table2)

        # Build a partial ridge regression model
        pres = ridgeRegressionTraining.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
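
For the regression mappers (both the linear regression and the ridge regression one above), the master step collects partial models rather than partial results and returns a trained model. A hedged sketch for the ridge case, assuming the deserializePartialResult helper; the linear_regression.training module follows the same layout:

from daal import step2Master
from daal.algorithms.ridge_regression import training

def reducer(partsRDD):
    # Create an algorithm to finalize ridge regression training on the master node
    masterTraining = training.Distributed(step2Master)

    # Add the partial models produced by the local steps
    for _, serialized_pres in partsRDD.collect():
        masterTraining.input.add(training.partialModels,
                                 deserializePartialResult(serialized_pres, training))

    # Merge the partial models and retrieve the trained model
    masterTraining.compute()
    trainingResult = masterTraining.finalizeCompute()
    return trainingResult.get(training.model)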
    def mapper(tup):

        key, val = tup
        # Create an algorithm to compute low order moments on local nodes
        momentsLocal = low_order_moments.Distributed(
            step1Local, method=low_order_moments.defaultDense)

        # Set the input data on local nodes
        deserialized_val = deserializeNumericTable(val)
        momentsLocal.input.set(low_order_moments.data, deserialized_val)

        # Compute low order moments on local nodes
        pres = momentsLocal.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
Example #6
    def mapper(tup):

        key, homogen_table = tup

        # Create an algorithm to compute PCA decomposition using the SVD method on local nodes
        pcaLocal = pca.Distributed(step1Local, method=pca.svdDense)

        # Set the input data on local nodes
        deserialized_homogen_table = deserializeNumericTable(homogen_table)
        pcaLocal.input.setDataset(pca.data, deserialized_homogen_table)

        # Compute PCA decomposition on local nodes
        pres = pcaLocal.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
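
On the master, the PCA partial results would be combined and the eigenvalues and eigenvectors extracted; a sketch under the same serialization assumptions:

from daal import step2Master
from daal.algorithms import pca

def reducer(partsRDD):
    # Create an algorithm to finalize the SVD-based PCA on the master node
    pcaMaster = pca.Distributed(step2Master, method=pca.svdDense)

    # Add the partial results computed on the local nodes
    for _, serialized_pres in partsRDD.collect():
        pcaMaster.input.add(pca.partialResults,
                            deserializePartialResult(serialized_pres, pca))

    # Merge the partial results and retrieve the decomposition
    pcaMaster.compute()
    res = pcaMaster.finalizeCompute()
    return res.get(pca.eigenvalues), res.get(pca.eigenvectors)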
Example #7
    def mapper(tup):
        key, homogen_table = tup

        # Create an algorithm to compute SVD on local nodes
        svdStep1Local = svd.Distributed(step1Local)

        deserialized_homogen_table = deserializeNumericTable(homogen_table)
        svdStep1Local.input.set(svd.data, deserialized_homogen_table)

        # Compute SVD in step 1
        pres = svdStep1Local.compute()
        dataFromStep1ForStep2 = pres.get(svd.outputOfStep1ForStep2)
        serialized_data_1for2 = serializeNumericTable(dataFromStep1ForStep2)
        dataFromStep1ForStep3 = pres.get(svd.outputOfStep1ForStep3)
        serialized_data_1for3 = serializeNumericTable(dataFromStep1ForStep3)

        return (key, (serialized_data_1for2, serialized_data_1for3))
def testModel(testData, model):

    # Create an algorithm object to predict values of multiple linear regression with the default method
    linearRegressionPredict = prediction.Batch(method=prediction.defaultDense)

    # Pass the test data to the algorithm
    parts_list = testData.collect()
    for key, (h_table1, _) in parts_list:
        deserialized_h_table1 = deserializeNumericTable(h_table1)
        linearRegressionPredict.input.setTable(prediction.data, deserialized_h_table1)

    linearRegressionPredict.input.setModel(prediction.model, model)

    # Compute and retrieve the prediction results
    predictionResult = linearRegressionPredict.compute()

    return predictionResult.get(prediction.prediction)
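
The model passed into this function is the final model produced by the training master step (for example trainingResult.get(training.model) in the regression sketch above). A hypothetical wiring of the two halves, where trainModel and the RDD names are placeholders rather than the sample's actual identifiers:

# trainModel is assumed to run the step1Local mapper over the training RDD
# and finalize the partial models on the master, returning the trained model
model = trainModel(trainDataAndLabelsRDD)
predicted = testModel(testDataAndLabelsRDD, model)
printNumericTable(predicted, "First 10 rows of results (obtained): ", 10)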
Example #9
    def mapper(tup):

        key, homogen_table = tup

        # Create an algorithm to compute QR decomposition on local nodes
        qrStep1Local = qr.Distributed(step1Local, method=qr.defaultDense)
        deserialized_homogen_table = deserializeNumericTable(homogen_table)
        qrStep1Local.input.set(qr.data, deserialized_homogen_table)

        # Compute QR decomposition in step 1
        pres = qrStep1Local.compute()
        dataFromStep1ForStep2 = pres.get(qr.outputOfStep1ForStep2)
        serialized_1for2 = serializeNumericTable(dataFromStep1ForStep2)
        dataFromStep1ForStep3 = pres.get(qr.outputOfStep1ForStep3)
        serialized_1for3 = serializeNumericTable(dataFromStep1ForStep3)

        return (key, (serialized_1for2, serialized_1for3))
Example #10
    def mapper(tup):
        key, val = tup
        # Create an algorithm to initialize the K-Means algorithm on local nodes
        kmeansLocalInit = init.Distributed(step1Local,
                                           nClusters,
                                           nBlocks * nVectorsInBlock,
                                           nVectorsInBlock * key,
                                           method=init.randomDense)

        # Set the input data on local nodes
        deserialized_val = deserializeNumericTable(val)
        kmeansLocalInit.input.set(init.data, deserialized_val)

        # Initialize the K-Means algorithm on local nodes
        pres = kmeansLocalInit.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
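
Finalizing these initialization partial results on the master yields the starting centroids that the k-means mapper earlier in this collection deserializes from its centroids variable; a sketch under the same assumptions:

from daal import step2Master
from daal.algorithms.kmeans import init

def initReducer(partsRDD, nClusters):
    # Create an algorithm to finalize the K-Means initialization on the master node
    masterInit = init.Distributed(step2Master, nClusters, method=init.randomDense)

    # Add the partial results computed on the local nodes
    for _, serialized_pres in partsRDD.collect():
        masterInit.input.add(init.partialResults,
                             deserializePartialResult(serialized_pres, init))

    # Merge the partial results and retrieve the initial centroids
    masterInit.compute()
    res = masterInit.finalizeCompute()

    # Serialize again so the k-means step1Local mapper can deserialize them as its centroids input
    return serializeNumericTable(res.get(init.centroids))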
    def mapper(tup):

        key, val = tup

        # Create an algorithm to compute a dense variance-covariance matrix on local nodes
        covarianceLocal = covariance.Distributed(
            step=step1Local, method=covariance.defaultDense)

        # Set the input data on local nodes
        deserialized_val = deserializeNumericTable(val)
        covarianceLocal.input.set(covariance.data, deserialized_val)

        # Compute a dense variance-covariance matrix on local nodes
        pres = covarianceLocal.compute()

        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
def testModel(testData, model):

    # Create an algorithm object to predict labels with the Naive Bayes model using the default method
    algorithm = prediction.Batch(nClasses)

    # Pass the test data to the algorithm
    parts_List = testData.collect()
    for key, (t1, t2) in parts_List:
        deserialized_t1 = deserializeNumericTable(t1)
        algorithm.input.setTable(classifier.prediction.data, deserialized_t1)

    algorithm.input.setModel(classifier.prediction.model, model)

    # Compute the prediction results
    predictionResult = algorithm.compute()

    # Retrieve the results
    return predictionResult.get(classifier.prediction.prediction)
Example #13
    def mapper(tup):

        key, tables = tup
        csr_table, homogen_table = tables

        # Create an algorithm to train the Naive Bayes model on local nodes
        algorithm = training.Distributed(step1Local,
                                         nClasses,
                                         method=training.fastCSR)

        # Set the input data on local nodes
        deserialized_csr_table = deserializeCSRNumericTable(csr_table)
        deserialized_homogen_table = deserializeNumericTable(homogen_table)
        algorithm.input.set(classifier.training.data, deserialized_csr_table)
        algorithm.input.set(classifier.training.labels,
                            deserialized_homogen_table)

        # Train the Naive Bayes model on local nodes
        pres = algorithm.compute()
        serialized_pres = serializeNumericTable(pres)

        return (key, serialized_pres)
Example #14
if __name__ == "__main__":

    # Create a SparkContext that loads defaults from the system properties and the classpath, and set the application name
    sc = SparkContext(
        conf=SparkConf().setAppName("Spark QR").setMaster('local[4]'))

    # Read from the distributed HDFS data set at a specified path
    dd = DistributedHDFSDataSet("/Spark/QR/data/")
    dataRDD = dd.getAsPairRDD(sc)

    # Compute QR decomposition for dataRDD
    result = runQR(dataRDD, sc)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('QR.out', 'w')

    # Print the results
    ntRPList = result['Q'].collect()
    for key, table in ntRPList:
        deserialized_table = deserializeNumericTable(table)
        printNumericTable(deserialized_table,
                          "Q (2 first vectors from node #{}):".format(key), 2)

    printNumericTable(result['R'], "R:")

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
    trainDataLabelsFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_train_labels_?.csv"
    testDataFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_test_1.csv"
    testDataLabelsFilesPath = "/Spark/LinearRegressionQR/data/LinearRegressionQR_test_labels_1.csv"

    # Read the training data and labels from a specified path
    trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(trainDataFilesPath, trainDataLabelsFilesPath, sc)

    # Read the test data and labels from a specified path
    testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc)

    # Compute linear regression for dataRDD
    res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('LinearRegressionQR.out', 'w')

    # Print the results
    parts_list = testDataAndLabelsRDD.collect()
    for key, (_, h_table2) in parts_list:
        deserialized_expected = deserializeNumericTable(h_table2)

    printNumericTable(res['beta'], "Coefficients:")
    printNumericTable(res['predicted'], "First 10 rows of results (obtained): ", 10)
    printNumericTable(deserialized_expected, "First 10 rows of results (expected): ", 10)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
    trainDataFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_train_?.csv"
    trainDataLabelsFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_train_labels_?.csv"
    testDataFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_test_1.csv"
    testDataLabelsFilesPath = "/Spark/NaiveBayesDense/data/NaiveBayesDense_test_labels_1.csv"

    # Read the training data and labels from a specified path
    trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(trainDataFilesPath, trainDataLabelsFilesPath, sc)

    # Read the test data and labels from a specified path
    testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath, testDataLabelsFilesPath, sc)

    # Compute the results of the Naive Bayes algorithm for dataRDD
    result = runNaiveBayes(trainDataAndLabelsRDD, testDataAndLabelsRDD)

    # Print the results
    parts_List = testDataAndLabelsRDD.collect()
    for _, (t1, t2) in parts_List:
        expected = deserializeNumericTable(t2)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('NaiveBayesDense.out', 'w')

    printNumericTables(expected, result, "Ground truth", "Classification results",
                       "NaiveBayes classification results (first 20 observations):", 20, flt64=False)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()
Example #17
    # Read the training data and labels from a specified path
    trainDataAndLabelsRDD = getMergedCSRDataAndLabelsRDD(
        trainDataFilesPath, trainDataLabelsFilesPath, sc)

    # Read the test data and labels from a specified path
    testDataAndLabelsRDD = getMergedCSRDataAndLabelsRDD(
        testDataFilesPath, testDataLabelsFilesPath, sc)

    # Compute the results of the Naive Bayes algorithm for dataRDD
    predicted = runNaiveBayes(trainDataAndLabelsRDD, testDataAndLabelsRDD)

    # Print the results
    parts_list = testDataAndLabelsRDD.collect()
    for _, (csr, homogen) in parts_list:
        expected = deserializeNumericTable(homogen)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('NaiveBayesCSR.out', 'w')

    printNumericTables(
        expected,
        predicted,
        "Ground truth",
        "Classification results",
        "NaiveBayes classification results (first 20 observations):",
        20,
        flt64=False)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()

    # Read the training data and labels from a specified path
    trainDataAndLabelsRDD = getMergedDataAndLabelsRDD(
        trainDataFilesPath, trainDataLabelsFilesPath, sc)

    # Read the test data and labels from a specified path
    testDataAndLabelsRDD = getMergedDataAndLabelsRDD(testDataFilesPath,
                                                     testDataLabelsFilesPath,
                                                     sc)

    # Compute linear regression for dataRDD
    res = runLinearRegression(trainDataAndLabelsRDD, testDataAndLabelsRDD)

    # Redirect stdout to a file for correctness verification
    stdout = sys.stdout
    sys.stdout = open('LinearRegressionNormEq.out', 'w')

    # Print the results
    parts_list = testDataAndLabelsRDD.collect()
    for key, (_, h_table2) in parts_list:
        expected = h_table2
        deserialized_expected = deserializeNumericTable(expected)

    printNumericTable(res['beta'], "Coefficients:")
    printNumericTable(res['predicted'],
                      "First 10 rows of results (obtained): ", 10)
    printNumericTable(deserialized_expected,
                      "First 10 rows of results (expected): ", 10)

    # Restore stdout
    sys.stdout = stdout

    sc.stop()