コード例 #1
0
    def svd(self, Data_Path, target, n):
        '''
        daal4py SVD SPMD Mode

        Reads this process's shard ``Data_Path + <procid + 1> + ".csv"``,
        drops the ``target`` column(s) and runs a distributed SVD on the rest.
        The elapsed compute time is stored in
        ``self.latency["Parallel_SVD_SPMD_Time"]``. Returns None.

        ``n`` is forwarded to ``d4p.daalinit`` as ``nthreads``.
        '''

        # Bring up the distributed engine
        d4p.daalinit(nthreads=n)

        # Shard files are numbered from 1, process ids from 0
        shard_csv = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        features = pd.read_csv(shard_csv).drop(target, axis=1)

        svd_algo = d4p.svd(distributed=True)
        self.logger.info('Training the SVD in pydaal SPMD Mode')

        # Time only the distributed compute call
        t0 = time.time()
        svd_result = svd_algo.compute(features)
        self.latency["Parallel_SVD_SPMD_Time"] = time.time() - t0

        # Only the root process prints the result
        on_root = d4p.my_procid() == 0
        if on_root:
            print("SVD completed", svd_result)

        self.logger.info('Completed SVD in pydaal SPMD Mode')
        d4p.daalfini()
コード例 #2
0
    def pca(self, Data_Path, target, n):
        '''
        daal4py PCA SPMD Mode

        Reads this process's shard ``Data_Path + <procid + 1> + ".csv"``,
        drops the ``target`` column(s) and runs a distributed PCA
        (``svdDense`` method) on the rest. The compute time is stored in
        ``self.latency['Parallel_PCA_SPMD_Time']`` on every process; the root
        process additionally stores the time up to and including printing the
        result under ``"Overall Parallel PCA SPMD Time"``. Returns None.

        ``n`` is forwarded to ``d4p.daalinit`` as ``nthreads``.
        '''

        # Bring up the distributed engine
        d4p.daalinit(nthreads=n)

        # Shard files are numbered from 1, process ids from 0
        shard_csv = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        features = pd.read_csv(shard_csv).drop(target, axis=1)

        pca_algo = d4p.pca(method='svdDense', distributed=True)

        self.logger.info('Training the PCA in  pydaal SPMD Mode')

        t0 = time.time()
        pca_result = pca_algo.compute(features)
        self.latency['Parallel_PCA_SPMD_Time'] = time.time() - t0

        # Only the root process prints and records the overall time
        on_root = d4p.my_procid() == 0
        if on_root:
            print("PCA completed", pca_result)
            self.latency["Overall Parallel PCA SPMD Time"] = time.time() - t0

        d4p.daalfini()

        self.logger.info('Completed PCA in pydaal SPMD Mode')
コード例 #3
0
    def linearRegression(self, Data_Path, test_data_path, target, n):
        '''
        daal4py Linear Regression SPMD Mode

        Trains a distributed linear regression (``qrDense`` method) on this
        process's shard ``Data_Path + <procid + 1> + ".csv"``. The root
        process (procid 0) then predicts on the CSV at ``test_data_path`` and
        stores MSE and R2 in ``self.metrics``.

        ``target`` names the dependent-variable column(s) in both CSVs and
        ``n`` is forwarded to ``d4p.daalinit`` as ``nthreads``.

        Training time is written to ``self.latency`` on every process.
        Prediction time and the ``self.metrics`` entries are written only on
        the root process, because prediction runs only there. Returns None.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # Query the rank once; it selects the shard and the reporting process
        rank = d4p.my_procid()
        is_root = rank == 0

        # training setup
        file = Data_Path + str(rank + 1) + ".csv"
        data = pd.read_csv(file)
        X = data.drop(columns=target)
        y = data[target]

        train_algo = d4p.linear_regression_training(method='qrDense',
                                                    distributed=True)

        self.logger.info('Training the Linear Regression in pydaal SPMD Mode')

        start = time.time()

        train_result = train_algo.compute(X, y)

        self.latency['Parallel_LinearRegression_Pydaal_Time'] = time.time() - \
            start

        # test file setup
        test = pd.read_csv(test_data_path)

        y_test = test[target]
        X_test = test.drop(target, axis=1)

        # Stays None on non-root processes, which never run prediction
        predict_result = None
        if is_root:
            predict_algo = d4p.linear_regression_prediction()

            # now predict using the model from the training above
            predict_result = predict_algo.compute(X_test, train_result.model)
            self.latency[
                "Overall Parallel Linear Regression Prediction SPMD Time"] = time.time(
                ) - start

        d4p.daalfini()

        self.logger.info('Completed Linear Regression in pydaal SPMD Mode')

        # Metrics need the predictions, which exist only on the root process.
        # Computing them unconditionally raised NameError on every other rank.
        if predict_result is not None:
            mse = mean_squared_error(y_test, predict_result.prediction)
            r2score = r2_score(y_test, predict_result.prediction)

            # Store the model metrics
            self.metrics['MSE_Parallel_LinearRegression_Pydaal'] = mse
            self.metrics['r2score_Parallel_LinearRegression_Pydaal'] = r2score

        return
コード例 #4
0
    def naiveBayes(self, Data_Path, test_data_path, target, n):
        '''
        daal4py Naive Bayes SPMD Mode

        Trains a distributed multinomial Naive Bayes model on this process's
        shard ``Data_Path + <procid + 1> + ".csv"``. The root process
        (procid 0) then runs prediction on the CSV at ``test_data_path``.

        ``target`` names the label column in both CSVs and ``n`` is forwarded
        to ``d4p.daalinit`` as ``nthreads``.

        Only timings are recorded, in ``self.latency``; the predictions
        themselves are not kept. Returns None.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file)
        X = data.drop(columns=target)
        y = data[target]

        # test file setup; the labels are not needed because no metric is
        # computed here
        test = pd.read_csv(test_data_path)
        X_test = test.drop(target, axis=1)

        # Number of classes = number of distinct target values.
        # NOTE(review): counted on this process's shard only - confirm every
        # shard contains all classes, otherwise ranks disagree on the count.
        category_count = len(y.unique())

        # Configure a training object
        train_algo = d4p.multinomial_naive_bayes_training(
            category_count, method='defaultDense', distributed=True)
        self.logger.info('Training the Naive Bayes in pydaal SPMD Mode')

        start = time.time()

        train_result = train_algo.compute(X, y)
        self.latency['Parallel_NaiveBayes_Pydaal_Time'] = time.time() - start

        # Prediction runs on the root process only
        if d4p.my_procid() == 0:
            predict_algo = d4p.multinomial_naive_bayes_prediction(
                category_count)

            # Predict with the model trained above; the call is made for its
            # timing, the returned predictions are discarded
            predict_algo.compute(X_test, train_result.model)

            self.latency[
                "Overall Parallel Naive Bayes Prediction SPMD Time"] = time.time(
                ) - start

        d4p.daalfini()

        self.logger.info('Completed Naive Bayes in pydaal SPMD Mode')

        return
コード例 #5
0
    def ridgeRegression(self, Data_Path, test_data_path, target, n):
        '''
        daal4py Ridge Regression SPMD Mode

        Trains a distributed ridge regression (with intercept) on this
        process's shard ``Data_Path + <procid + 1> + ".csv"``. The root
        process (procid 0) then predicts on the CSV at ``test_data_path`` and
        stores MSE and R2 in ``self.metrics``.

        ``target`` names the dependent-variable column(s) in both CSVs and
        ``n`` is forwarded to ``d4p.daalinit`` as ``nthreads``.

        Training time is written to ``self.latency`` on every process. The
        ``self.metrics`` entries are written only on the root process,
        because prediction runs only there. Returns None.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # Query the rank once; it selects the shard and the reporting process
        rank = d4p.my_procid()
        is_root = rank == 0

        file = Data_Path + str(rank + 1) + ".csv"

        # training
        data = pd.read_csv(file)
        X = data.drop(columns=target)
        y = data[target]

        # test file setup
        test = pd.read_csv(test_data_path)
        y_test = test[target]
        X_test = test.drop(target, axis=1)

        # Configure a Ridge regression training object
        train_algo = d4p.ridge_regression_training(distributed=True,
                                                   interceptFlag=True)
        self.logger.info('Training the Ridge Regression in pydaal SPMD Mode')

        start_time = time.time()

        train_result = train_algo.compute(X, y)

        self.latency["Parallel Ridge Regression SPMD Time"] = time.time() - \
            start_time

        # Only process #0 runs prediction; elsewhere this stays None
        predict_result = None
        if is_root:
            predict_algo = d4p.ridge_regression_prediction()
            # now predict using the model from the training above
            predict_result = predict_algo.compute(X_test, train_result.model)

        self.logger.info('Completed Ridge Regression in pydaal SPMD Mode')
        d4p.daalfini()

        # Metrics need the predictions, which exist only on the root process.
        # Computing them unconditionally raised NameError on every other rank.
        if predict_result is not None:
            mse = mean_squared_error(y_test, predict_result.prediction)
            r2score = r2_score(y_test, predict_result.prediction)

            # Store the model metrics
            self.metrics["MSE For Parallel Ridge regression SPMD"] = mse
            self.metrics["R2 Score For Parallel Ridge regression SPMD"] = r2score

        return
コード例 #6
0
    def kMeans(self, Data_Path, n, nClusters=4, maxIter=25):
        '''
        daal4py KMeans Clustering SPMD Mode

        Reads this process's shard ``Data_Path + <procid + 1> + ".csv"``,
        computes initial centroids with the distributed ``plusPlusDense``
        initializer and then runs distributed K-Means from them. The time of
        the main K-Means compute call (initialization excluded) is stored in
        ``self.latency["Parallel_KMeans_SPMD_Time"]``. Returns None.

        ``n`` is forwarded to ``d4p.daalinit`` as ``nthreads``. ``nClusters``
        is the number of clusters and ``maxIter`` the maximum number of
        iterations; their defaults (4 and 25) are the values that used to be
        hard-coded.
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        init_algo = d4p.kmeans_init(nClusters=nClusters,
                                    distributed=True,
                                    method="plusPlusDense")

        self.logger.info('Training the KMeans in pydaal SPMD Mode')

        # Compute the initial centroids once. The previous code ran this
        # distributed step twice and discarded the first result.
        init_result = init_algo.compute(data)

        # configure kmeans main object
        algo = d4p.kmeans(nClusters, maxIter, distributed=True)
        kmeans_start_time = time.time()
        # compute the clusters/centroids
        result = algo.compute(data, init_result.centroids)
        self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - \
            kmeans_start_time

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("KMeans completed", result)

        self.logger.info('Completed KMeans in pydaal SPMD Mode')

        d4p.daalfini()

        return
コード例 #7
0
    #         method="plusPlusDense",
    #         distributed=True
    #     ).compute(data).centroids
    # )

    # Kmeans result objects provide centroids, goalFunction,
    # nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter
    # we need an extra call to kmeans to get the assignments
    # (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(nClusters, 0, assignFlag=True)
    # maxIt=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)


if __name__ == "__main__":
    # Start the distributed engine before any computation
    d4p.daalinit()
    assignments, result = main()
    # The result is available on every process; only the root reports it
    on_root = d4p.my_procid() == 0
    if on_root:
        print("\nFirst 10 cluster assignments:\n", assignments[0:10])
        leading_centroid_dims = result.centroids[:, 0:10]
        print("\nFirst 10 dimensions of centroids:\n", leading_centroid_dims)
        print("\nObjective function value:\n", result.objectiveFunction)
        print('All looks good!')
    # Shut the distributed engine down again
    d4p.daalfini()
コード例 #8
0
# Assign the data to clusters and save the results

# Run K-Means (3 clusters, maxIterations=5) from the initial centroids and
# request the per-row cluster assignments via assignFlag.
kmeans_algo = d4p.kmeans(nClusters=3, maxIterations=5, assignFlag=True)
kmeans_result = kmeans_algo.compute(X, init_result.centroids)

# The K-Means result object provides assignments, centroids, goalFunction
# (deprecated), nIterations and objectiveFunction.

# Retrieve the cluster assignments and show the first five
assignments = kmeans_result.assignments
print("Here is our cluster assignments for first 5 datapoints: \n\n",
      assignments[:5])

# Export the assignments to a CSV file named after this process
# (process id + 1), then stop the distribution engine.
process_number = str(d4p.my_procid() + 1)
results_filename = ("./results/daal4py_Distributed_Kmeans_results_" +
                    process_number + ".csv")
np.savetxt(results_filename, assignments, delimiter=",")

d4p.daalfini()  # stops the distribution engine
print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')
コード例 #9
0
ファイル: spmd_test_examples.py プロジェクト: rlnx/daal4py
 def tearDownClass(cls):
     """Finalize daal4py's distributed engine via ``d4p.daalfini()``.

     NOTE(review): presumably the unittest class-level teardown hook; the
     ``@classmethod`` decorator and the matching setup are not visible here.
     """
     d4p.daalfini()
コード例 #10
0
import daal4py
import hpat
import numpy as np

# Initialize daal4py at import time, before any of the jitted functions below
# run; the matching daalfini() call is the last statement of the script.
# NOTE(review): the exact effect of spmd=True is not visible here - confirm
# against the daal4py version in use.
daal4py.daalinit(spmd=True)


@hpat.jit
def lr_predict(N, D, model):
    """Predict with ``model`` on a random ``(N // 2, D)`` feature matrix.

    Returns the result object of daal4py's linear regression prediction.
    """
    # Array dimensions must be integers. ``N / 2`` is true division in
    # Python 3 and gives a float, so floor division is used instead.
    data = np.random.ranf((N // 2, D))
    return daal4py.linear_regression_prediction().compute(data, model)


@hpat.jit
def lr_train(N, D):
    """Train a linear regression on random data and return the result.

    The features are a random ``(N, D)`` matrix and the targets a random
    ``(N, 2)`` matrix. Training uses the ``qrDense`` method with an
    intercept.
    """
    features = np.random.ranf((N, D))
    targets = np.random.ranf((N, 2))
    trainer = daal4py.linear_regression_training(interceptFlag=True,
                                                 method='qrDense')
    return trainer.compute(features, targets)


# Train on N=1000 rows with D=10 features, then predict with the trained model
t_res = lr_train(1000, 10)
p_res = lr_predict(1000, 10, t_res.model)

# Show the first prediction row and the model's number of betas
print(p_res.prediction[0], t_res.model.NumberOfBetas)

# Print HPAT's distribution report for the jitted functions above
hpat.distribution_report()

# Finalize daal4py; pairs with the daalinit() call near the top of the script
daal4py.daalfini()