Example #1

# daal4py Linear Regression example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -n 4 python ./linreg_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    # Each process gets its own data
    infile = "./data/distributed/linear_regression_train_" + str(
        d4p.my_procid() + 1) + ".csv"

    # Configure a Linear regression training object
    train_algo = d4p.linear_regression_training(distributed=True)

    # Read data. Let's have 10 independent and 2 dependent variables (for each observation)
    indep_data = loadtxt(infile, delimiter=',', usecols=range(10))
    dep_data = loadtxt(infile, delimiter=',', usecols=range(10, 12))
    # Now train/compute; the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)

    # Now let's do some prediction
    # It runs only on a single node
    if d4p.my_procid() == 0:
        predict_algo = d4p.linear_regression_prediction()
        # read test data (with same #features)
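        # (The original snippet is truncated here. A plausible completion,
        # mirroring the standard daal4py flow; the test-file name and column
        # layout are assumptions, not part of the original.)
        pdata = loadtxt("./data/distributed/linear_regression_test.csv",
                        delimiter=',', usecols=range(10))
        predict_result = predict_algo.compute(pdata, train_result.model)
        print("Prediction (first 5 rows):\n", predict_result.prediction[0:5])

    d4p.daalfini()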
Example #2
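# (This snippet begins mid-function; the elided top presumably looked roughly
# like the following. The helper, file name, and parameter values are
# assumptions inferred from the code below, not part of the original.)
import daal4py as d4p
import numpy as np


def readcsv(f, c):
    # minimal CSV reader returning a 2D float array for the given columns
    return np.loadtxt(f, delimiter=',', usecols=c, ndmin=2)


def main(nClasses=2, nFeatures=20, method='defaultDense'):
    trainfile = "./data/batch/binary_cls_train.csv"
    # read training data and labels from file
    train_data = readcsv(trainfile, range(nFeatures))
    train_labels = readcsv(trainfile, range(nFeatures, nFeatures + 1))

    # configure a distributed logistic regression training object
    train_alg = d4p.logistic_regression_training(nClasses=nClasses,
                                                 distributed=True)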
    train_result = train_alg.compute(train_data, train_labels)

    # Now let's do some prediction
    # It operates on the same data on each process
    # read testing data from file with 20 features per observation
    testfile = "./data/batch/binary_cls_test.csv"
    predict_data = readcsv(testfile, range(nFeatures))
    predict_labels = readcsv(testfile, range(nFeatures, nFeatures + 1))
    
    # set parameters and compute predictions
    predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses, method=method)
    predict_result = predict_alg.compute(predict_data, train_result.model)
    
    # the prediction result provides the predicted labels
    assert predict_result.prediction.shape == (predict_data.shape[0], train_labels.shape[1])
    
    return (train_result, predict_result, predict_labels)

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    (train_result, predict_result, predict_labels) = main()
    if d4p.my_procid() == 0:
        print("\nLogistic Regression coefficients:\n", train_result.model.Beta)
        print("\nLogistic regression prediction results (first 10 rows):\n", predict_result.prediction[0:10])
        print("\nGround truth (first 10 rows):\n", predict_labels[0:10])
        print('All looks good!')

    d4p.daalfini()
Example #3
import daal4py as d4p
import pandas as pd
import numpy as np


# Now let's **load** the dataset and **organize** it as necessary to work with our model. For distributed processing, every file has a unique ID.
# 
# We will also **initialize the distribution engine**.

# In[3]:


d4p.daalinit() #initializes the distribution engine

# organizing variables used in the model for prediction
# each process gets its own data
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(d4p.my_procid()+1) + ".csv"

# read data
X = pd.read_csv(infile)


# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# In[4]:


# computing initial centroids
init_result = d4p.kmeans_init(nClusters=3, method="plusPlusDense").compute(X)
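# (The notebook snippet is truncated here. A plausible next step, following the
# usual daal4py distributed K-means flow; the cluster count mirrors the init
# call above, the iteration limit is an assumption.)
kmeans_result = d4p.kmeans(3, 25, distributed=True).compute(X, init_result.centroids)
print("Centroids on process", d4p.my_procid(), ":\n", kmeans_result.centroids)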
Example #4
import daal4py as d4p
import pandas as pd
import numpy as np
import pickle

# Now let's **load** the dataset and **organize** it as necessary to work with our model. For distributed processing, every file has a unique ID.
#
# We will also **initialize the distribution engine**.

# In[3]:

d4p.daalinit()  #initializes the distribution engine

# organizing variables used in the model for prediction
# each process gets its own data
infile = "./data/distributed_data/linear_regression_train_" + str(
    d4p.my_procid() + 1) + ".csv"

# read data
indep_data = pd.read_csv(infile).drop(["target"],
                                      axis=1)  # house characteristics
dep_data = pd.read_csv(infile)["target"]  # house price

# ## Training and Saving the Model

# Time to **train our model** and look at the model's features!

# In[4]:

# training the model for prediction
train_result = d4p.linear_regression_training(distributed=True).compute(
    indep_data, dep_data)
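
# (The notebook snippet is truncated here. Since pickle is imported and the
# heading mentions saving the model, a plausible continuation is to serialize
# the trained model; the file name is an assumption.)
with open("trained_linear_model.pickle", "wb") as f:
    pickle.dump(train_result.model, f)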
Example #5

# daal4py SVD example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -n 4 python ./svd_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    # Each process gets its own data
    infile = "./data/distributed/svd_{}.csv".format(d4p.my_procid() + 1)

    # configure a SVD object
    algo = d4p.svd(distributed=True)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = loadtxt(infile, delimiter=',')
    result2 = algo.compute(data)

    # SVD result objects provide leftSingularMatrix, rightSingularMatrix and singularValues
    # leftSingularMatrix not yet supported in dist mode
    assert result1.leftSingularMatrix is None and result2.leftSingularMatrix is None
    assert allclose(result1.rightSingularMatrix, result2.rightSingularMatrix)
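    # (Truncated in the original; a likely wrap-up, matching the other SPMD
    # examples in this collection.)
    assert allclose(result1.singularValues, result2.singularValues)
    print('All looks good!')
    d4p.daalfini()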
Example #6

# daal4py PCA example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -n 4 python ./pca_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    # Each process gets its own data
    infile = "./data/distributed/pca_normalized_" + str(d4p.my_procid() +
                                                        1) + ".csv"

    # configure a PCA object to use svd instead of default correlation
    algo = d4p.pca(method='svdDense', distributed=True)
    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = loadtxt(infile, delimiter=',')
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert allclose(result1.eigenvalues, result2.eigenvalues)
    assert allclose(result1.eigenvectors, result2.eigenvectors)
    assert result1.means is None and result2.means is None
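    # (Truncated in the original; a likely wrap-up. In distributed mode the
    # means and variances are not computed, hence the None checks.)
    assert result1.variances is None and result2.variances is None
    print('All looks good!')
    d4p.daalfini()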
Example #7

# daal4py Linear Regression example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -genv DIST_CNC=MPI -n 4 python ./linreg_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":

    # Initialize SPMD mode
    d4p.daalinit(spmd=True)
    
    # Each process gets its own data
    infile = "./data/distributed/linear_regression_train_" + str(d4p.my_procid()+1) + ".csv"

    # Configure a Linear regression training object
    train_algo = d4p.linear_regression_training(distributed=True)
    
    # Read data. Let's have 10 independent and 2 dependent variables (for each observation)
    indep_data = loadtxt(infile, delimiter=',', usecols=range(10))
    dep_data   = loadtxt(infile, delimiter=',', usecols=range(10,12))
    # Now train/compute; the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)

    # Now let's do some prediction
    # It runs only on a single node
    if d4p.my_procid() == 0:
        predict_algo = d4p.linear_regression_prediction()
        # read test data (with same #features)
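        # (Truncated here as well; a plausible completion is sketched at the
        # end of Example #1 above.)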
Example #8
import daal4py as d4p
import numpy as np

def main(method='defaultDense'):
    infile = "./data/batch/dbscan_dense.csv"
    epsilon = 0.02
    minObservations = 180

    # Load the data
    data = np.loadtxt(infile, delimiter=',')
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # configure dbscan main object
    algo = d4p.dbscan(minObservations=minObservations,
                      epsilon=epsilon,
                      distributed=True)
    # and compute
    result = algo.compute(data)

    return result


if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()
    result = main()
    print("\nResults on node with id = ", d4p.my_procid(), " :\n",
          "\nFirst 10 cluster assignments:\n", result.assignments[0:10],
          "\nNumber of clusters:\n", result.nClusters)
    d4p.daalfini()
Example #9

# daal4py SVD example for distributed memory systems; SPMD mode
# run like this:
#    mpirun -genv DIST_CNC=MPI -n 4 python ./svd_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":

    # Initialize SPMD mode
    d4p.daalinit(spmd=True)

    # Each process gets its own data
    infile = "./data/distributed/svd_" + str(d4p.my_procid() + 1) + ".csv"

    # configure a SVD object
    algo = d4p.svd(distributed=True)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = loadtxt(infile, delimiter=',')
    result2 = algo.compute(data)

    # SVD result objects provide leftSingularMatrix, rightSingularMatrix and singularValues
    # leftSingularMatrix not yet supported in dist mode
    assert result1.leftSingularMatrix is None and result2.leftSingularMatrix is None
    assert allclose(result1.rightSingularMatrix, result2.rightSingularMatrix)
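    # (Truncated here as well; see the wrap-up sketched under Example #5.)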
Example #10
import daal4py as d4p
from numpy import loadtxt

if __name__ == "__main__":
    # Initialize SPMD mode
    d4p.daalinit()

    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters,
                                method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data; it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
    #     data, d4p.kmeans_init(nClusters, method="plusPlusDense",
    #                           distributed=True).compute(data).centroids)

    # Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
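    # (The snippet ends here. A plausible wrap-up, mirroring the other SPMD
    # examples: report a few results and shut down the distribution engine.)
    print("First 3 centroids:\n", result.centroids[0:3])
    d4p.daalfini()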