def main():
    nClasses = 2
    nFeatures = 20

    # read training data from file with 20 features per observation and 1 class label
    # and use only a chunk per process
    trainfile = "./data/batch/binary_cls_train.csv"
    train_data = np.split(read_csv(trainfile, range(nFeatures)),
                          d4p.num_procs())[d4p.my_procid()]
    train_labels = np.split(read_csv(trainfile, range(nFeatures, nFeatures + 1)),
                            d4p.num_procs())[d4p.my_procid()]

    # set parameters and train
    train_alg = d4p.logistic_regression_training(nClasses=nClasses,
                                                 interceptFlag=True,
                                                 distributed=True)
    train_result = train_alg.compute(train_data, train_labels)

    # Now let's do some prediction
    # It operates on the same data on each process
    # read testing data from file with 20 features per observation
    testfile = "./data/batch/binary_cls_test.csv"
    predict_data = read_csv(testfile, range(nFeatures))
    predict_labels = read_csv(testfile, range(nFeatures, nFeatures + 1))

    # set parameters and compute predictions
    predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses)
    predict_result = predict_alg.compute(predict_data, train_result.model)

    # the prediction result provides prediction
    assert predict_result.prediction.shape == (predict_data.shape[0],
                                               train_labels.shape[1])

    return (train_result, predict_result, predict_labels)
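# A minimal driver sketch for the SPMD example above (an addition, not part of the
# original snippet): it assumes daal4py was built with MPI support and that the
# script is launched with one process per rank, e.g. `mpirun -n 4 python <script>.py`.
# daalinit()/daalfini() bracket the distributed computation.
import daal4py as d4p

if __name__ == "__main__":
    d4p.daalinit()
    train_result, predict_result, predict_labels = main()
    # every process holds the full prediction; print from the root rank only
    if d4p.my_procid() == 0:
        print("First 10 predictions:\n", predict_result.prediction[:10])
    d4p.daalfini()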
def main(): infile = "./data/batch/covcormoments_dense.csv" # We know the number of lines in the file and use this to separate data between processes skiprows, nrows = get_chunk_params(lines_count=200, chunks_count=d4p.num_procs(), chunk_number=d4p.my_procid()) # Each process reads its chunk of the file data = read_csv(infile, sr=skiprows, nr=nrows) # Create algorithm with distributed mode alg = d4p.low_order_moments(method='defaultDense', distributed=True) # Perform computation res = alg.compute(data) # result provides minimum, maximum, sum, sumSquares, sumSquaresCentered, # mean, secondOrderRawMoment, variance, standardDeviation, variation assert (all( getattr(res, name).shape == (1, data.shape[1]) for name in [ 'minimum', 'maximum', 'sum', 'sumSquares', 'sumSquaresCentered', 'mean', 'secondOrderRawMoment', 'variance', 'standardDeviation', 'variation' ])) return res
def main(method='plusPlusDense'):
    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters, method=method, distributed=True)

    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data; it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
    #     data,
    #     d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

    # Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter

    # we need an extra call to kmeans to get the assignments
    # (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(nClusters, 0, assignFlag=True)  # maxIter=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)
def test_dbscan_spmd(self):
    epsilon = 0.04
    minObservations = 45
    data = np_read_csv(os.path.join(".", 'data', 'batch', 'dbscan_dense.csv'))

    batch_algo = d4p.dbscan(minObservations=minObservations,
                            epsilon=epsilon,
                            resultsToCompute='computeCoreIndices')
    batch_result = batch_algo.compute(data)

    rpp = int(data.shape[0] / d4p.num_procs())
    node_stride = rpp * d4p.my_procid()
    node_range = range(node_stride, node_stride + rpp)
    node_data = data[node_range, :]

    spmd_algo = d4p.dbscan(minObservations=minObservations,
                           epsilon=epsilon,
                           distributed=True)
    spmd_result = spmd_algo.compute(node_data)

    # clusters can get different indices in the batch and SPMD algorithms,
    # so map each batch cluster label to its SPMD counterpart before comparing
    cluster_index_dict = {}

    for i in node_range:
        # border-point assignments can differ depending on the number of nodes,
        # but core points are assigned identically
        if i in batch_result.coreIndices:
            right = spmd_result.assignments[i - node_stride][0]
            if batch_result.assignments[i][0] not in cluster_index_dict:
                cluster_index_dict[batch_result.assignments[i][0]] = right
            left = cluster_index_dict[batch_result.assignments[i][0]]
            self.assertTrue(left == right)
def test_kmeans_spmd(self):
    nClusters = 10
    maxIter = 25

    data = np.loadtxt("./data/distributed/kmeans_dense.csv", delimiter=',')
    rpp = int(data.shape[0] / d4p.num_procs())
    spmd_data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    for init_method in ['plusPlusDense', 'parallelPlusDense', 'deterministicDense']:
        batch_init_res = d4p.kmeans_init(nClusters=nClusters,
                                         method=init_method).compute(data)
        spmd_init_res = d4p.kmeans_init(nClusters=nClusters,
                                        method=init_method,
                                        distributed=True).compute(spmd_data)

        if init_method in ['parallelPlusDense']:
            print("Warning: it is known that parallelPlusDense init "
                  "results do not match the batch algorithm")
        else:
            reason = "Initial centroids with " + init_method
            reason += " do not match the batch algorithm"
            self.assertTrue(
                np.allclose(batch_init_res.centroids, spmd_init_res.centroids),
                reason)

        batch_res = d4p.kmeans(nClusters=nClusters,
                               maxIterations=maxIter).compute(data, batch_init_res.centroids)
        spmd_res = d4p.kmeans(nClusters=nClusters,
                              maxIterations=maxIter,
                              distributed=True).compute(spmd_data, spmd_init_res.centroids)

        if init_method in ['parallelPlusDense']:
            print("Warning: it is known that parallelPlusDense init "
                  "results do not match the batch algorithm")
        else:
            reason = "Final centroids with " + init_method
            reason += " do not match the batch algorithm"
            self.assertTrue(
                np.allclose(batch_res.centroids, spmd_res.centroids),
                reason)
def test_dbscan_spmd(self):
    import dbscan_spmd as ex

    result = self.call(ex)
    test_data = np_read_csv(os.path.join(unittest_data_path, "dbscan_batch.csv"))
    rpp = int(test_data.shape[0] / d4p.num_procs())
    test_data = test_data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # clusters can get different indices in the batch and SPMD algorithms,
    # so map each reference cluster label to its SPMD counterpart before comparing
    cluster_index_dict = {}

    for i in range(test_data.shape[0]):
        if test_data[i][0] not in cluster_index_dict:
            cluster_index_dict[test_data[i][0]] = result.assignments[i][0]
        self.assertTrue(cluster_index_dict[test_data[i][0]] == result.assignments[i][0])
def main(method='defaultDense'):
    infile = "./data/batch/dbscan_dense.csv"
    epsilon = 0.04
    minObservations = 45

    # Load the data
    data = np.loadtxt(infile, delimiter=',')
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # configure dbscan main object
    algo = d4p.dbscan(minObservations=minObservations, epsilon=epsilon, distributed=True)

    # and compute
    result = algo.compute(data)

    return result
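# A small driver sketch for the DBSCAN SPMD example above (an addition, assuming
# the example's usual imports and an MPI launch such as `mpirun -n 4 python <script>.py`):
# note that each rank receives assignments only for its own slice of the rows.
import numpy as np
import daal4py as d4p

if __name__ == "__main__":
    d4p.daalinit()
    result = main()
    local_labels = np.unique(result.assignments)
    print("rank", d4p.my_procid(),
          "local assignments shape:", result.assignments.shape,
          "distinct labels on this rank:", len(local_labels))
    d4p.daalfini()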
def main():
    infile = "./data/batch/covcormoments_dense.csv"

    # We know the number of lines in the file and use this to separate data between processes
    skiprows, nrows = get_chunk_params(lines_count=200,
                                       chunks_count=d4p.num_procs(),
                                       chunk_number=d4p.my_procid())

    # Each process reads its chunk of the file
    data = read_csv(infile, sr=skiprows, nr=nrows)

    # Create algorithm with distributed mode
    alg = d4p.covariance(method="defaultDense", distributed=True)

    # Perform computation
    res = alg.compute(data)

    # covariance result objects provide correlation, covariance and mean
    assert res.covariance.shape == (data.shape[1], data.shape[1])
    assert res.mean.shape == (1, data.shape[1])
    assert res.correlation.shape == (data.shape[1], data.shape[1])

    return res
# Initialize SPMD mode
d4p.daalinit()

infile = "./data/distributed/kmeans_dense.csv"
nClusters = 10
maxIter = 25

# configure a kmeans-init
init_algo = d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True)

# Load the data
data = loadtxt(infile, delimiter=',')
# now slice the data; it would have been better to read only what we need, of course...
rpp = int(data.shape[0] / d4p.num_procs())
data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

# compute initial centroids
init_result = init_algo.compute(data)
# The result provides the initial centroids
assert init_result.centroids.shape[0] == nClusters

# configure kmeans main object
algo = d4p.kmeans(nClusters, maxIter, distributed=True)
# compute the clusters/centroids
result = algo.compute(data, init_result.centroids)

# Note: we could have done this in just one line:
# d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
#     data,
#     d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)
max_iter = 300
acc_tres = 1e-4

# https://commons.wikimedia.org/wiki/File:%E7%8E%89%E5%B1%B1%E4%B8%BB%E5%B3%B0_02.jpg
img = Image.open('./Yushan.jpg')
img.load()

# Load the image and transform it to a 2D numpy array.
china = np.array(img, dtype=np.float64) / 255
w, h, d = original_shape = tuple(china.shape)
assert d == 3
image_array = np.reshape(china, (w * h, d))
o_colors = 344038  # Yushan

# slice the pixel rows so that each MPI process works on its own chunk
n_slices = int(image_array.shape[0] / d4p.num_procs())
print("Number of MPI tasks: ", d4p.num_procs())
image_array = image_array[n_slices * d4p.my_procid():n_slices * d4p.my_procid() + n_slices, :]

print("Fitting model on the data")
t0 = time()

# compute initial centroids
init_result = init_algo.compute(image_array)
assert init_result.centroids.shape[0] == n_colors

# configure kmeans main object
algo = d4p.kmeans(n_colors, max_iter, distributed=True)
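# The snippet above references n_colors and init_algo, which are defined earlier in
# the full example. A plausible setup, mirroring the other K-Means SPMD examples in
# this section, is sketched here; the concrete value of n_colors is a placeholder,
# not taken from the source.
import daal4py as d4p

n_colors = 64  # number of colors (clusters) to quantize the image to; value assumed
init_algo = d4p.kmeans_init(n_colors, method="plusPlusDense", distributed=True)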
# Initialize SPMD mode
d4p.daalinit()

infile = "./data/distributed/kmeans_dense.csv"
nClusters = 10
maxIter = 25

# configure a kmeans-init
initrain_algo = d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True)

# Load the data
data = loadtxt(infile, delimiter=',')
# We need partitioned input data, so slice the data
rpp = int(data.shape[0] / d4p.num_procs())
data = [data[rpp * x:rpp * x + rpp, :] for x in range(d4p.num_procs())]
# Note: providing a list of files instead also distributes the file read!

# compute initial centroids
initrain_result = initrain_algo.compute(data)
# The result provides the initial centroids
assert initrain_result.centroids.shape[0] == nClusters

# configure kmeans main object
algo = d4p.kmeans(nClusters, maxIter, distributed=True)
# compute the clusters/centroids
result = algo.compute(data, initrain_result.centroids)

# Note: we could have done this in just one line:
# d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
#     data,
#     d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)
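# A hedged sketch of how such a script might finish (not shown in the excerpt above):
# print the results and shut down the distributed backend. The attribute names follow
# those listed for K-Means results earlier in this section (centroids, nIterations,
# objectiveFunction).
print("Initial centroids:\n", initrain_result.centroids)
print("Final centroids:\n", result.centroids)
print("Iterations:", result.nIterations)
print("Objective function value:", result.objectiveFunction)

d4p.daalfini()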