def main(readcsv=read_csv, method='defaultDense'):
    """Train and test a 20-class multinomial Naive Bayes model in batch mode.

    Reads dense CSV training/test data (20 feature columns, the label in
    column 20), trains with daal4py, predicts on the test set, and returns
    the prediction result together with the test labels.
    """
    # Input data files
    train_file = "./data/batch/naivebayes_train_dense.csv"
    test_file = "./data/batch/naivebayes_test_dense.csv"

    # Configure a training object (20 classes)
    train_algo = d4p.multinomial_naive_bayes_training(20, method=method)

    # Read training data: 20 features per observation, label in column 20
    features = readcsv(train_file, range(20))
    targets = readcsv(train_file, range(20, 21))
    train_result = train_algo.compute(features, targets)

    # Configure prediction with the same class count and method
    predict_algo = d4p.multinomial_naive_bayes_prediction(20, method=method)

    # Read test data (same number of features)
    test_features = readcsv(test_file, range(20))
    test_targets = readcsv(test_file, range(20, 21))

    # Predict using the model trained above
    predict_result = predict_algo.compute(test_features, train_result.model)

    # Prediction result provides one predicted label per observation
    assert predict_result.prediction.shape == (test_features.shape[0], 1)

    return (predict_result, test_targets)
def naiveBayes(self, X_train, X_test, y_train, y_test, target):
    """Train and predict multinomial Naive Bayes with daal4py in serial batch mode.

    Records the training latency and the overall (training + prediction)
    latency in ``self.latency``. ``y_test`` and ``target`` are accepted for
    interface symmetry with the distributed variant but are not used here.
    """
    # The number of distinct label values determines the class count
    # for both the training and the prediction algorithm objects.
    num_classes = len(y_train.unique())

    # Configure the batch training object
    trainer = d4p.multinomial_naive_bayes_training(
        num_classes, method='defaultDense')

    self.logger.info(
        'Training the Naive Bayes in pydaal Batch/Serial Mode')
    start = time.time()
    trained = trainer.compute(X_train, y_train)
    self.latency["Serial Naive Bayes Batch Time"] = time.time() - start

    # Predict on the held-out set using the trained model
    predictor = d4p.multinomial_naive_bayes_prediction(num_classes)
    result = predictor.compute(X_test, trained.model)

    # One predicted label per test observation
    assert result.prediction.shape == (X_test.shape[0], 1)

    # Overall time is measured from the same start point, so it covers
    # training plus prediction.
    self.latency[
        'Overall Serial Naive bayes Prediction Batch Time'] = time.time(
        ) - start
    self.logger.info('Completed Naive Bayes in pydaal Batch/Serial Mode')
    return
def naiveBayes(self, Data_Path, test_data_path, target, n):
    """daal4py multinomial Naive Bayes in SPMD (distributed) mode.

    Each process trains on its own shard ``<Data_Path><rank+1>.csv``;
    rank 0 additionally runs prediction on the shared test file.
    Latencies are recorded in ``self.latency``.
    """
    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # Per-process training shard, selected by rank (1-based file suffix)
    shard_file = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    shard = pd.read_csv(shard_file)
    X = shard.drop(columns=target)
    y = shard[target]

    # Shared test set, identical on every process
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Class count is taken from this shard's distinct label values
    num_classes = len(y.unique())

    # Configure a distributed training object
    trainer = d4p.multinomial_naive_bayes_training(
        num_classes, method='defaultDense', distributed=True)

    self.logger.info('Training the Naive Bayes in pydaal SPMD Mode')
    start = time.time()
    trained = trainer.compute(X, y)
    self.latency['Parallel_NaiveBayes_Pydaal_Time'] = time.time() - start

    # Prediction runs on a single node only
    if d4p.my_procid() == 0:
        predictor = d4p.multinomial_naive_bayes_prediction(num_classes)
        # Predict using the model from the distributed training above
        predictor.compute(X_test, trained.model)
        # Measured from the same start point: training + prediction
        self.latency[
            "Overall Parallel Naive Bayes Prediction SPMD Time"] = time.time(
            ) - start

    d4p.daalfini()
    self.logger.info('Completed Naive Bayes in pydaal SPMD Mode')
    return
def main(readcsv=read_csv, method='defaultDense'):
    """Train multinomial Naive Bayes (20 classes) in streaming mode, then predict.

    Training data is fed to the algorithm in chunks of ``chunk_size`` rows.
    Once the reader is exhausted, the partial results are finalized into a
    single model, which is then used for batch prediction on the test file.

    Returns:
        tuple: (prediction result, test labels read from the test file).
    """
    # Input data files
    infile = "./data/batch/naivebayes_train_dense.csv"
    testfile = "./data/batch/naivebayes_test_dense.csv"

    # Configure a streaming training object (20 classes)
    train_algo = d4p.multinomial_naive_bayes_training(20, streaming=True,
                                                      method=method)

    chunk_size = 250
    lines_read = 0
    # Read and feed chunk by chunk: 20 features per observation,
    # label in column 20.
    while True:
        try:
            data = readcsv(infile, range(20), lines_read, chunk_size)
            labels = readcsv(infile, range(20, 21), lines_read, chunk_size)
        except Exception:
            # Bugfix: this was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit. The reader raising here merely
            # signals end-of-input, so catch ordinary exceptions only.
            break
        # Feed the current chunk to the streaming trainer
        train_algo.compute(data, labels)
        lines_read += data.shape[0]

    # All chunks are done; finalize the partial results into a model
    train_result = train_algo.finalize()

    # Configure prediction with the same class count and method
    pred_algo = d4p.multinomial_naive_bayes_prediction(20, method=method)

    # Read test data (with the same number of features)
    pred_data = readcsv(testfile, range(20))
    pred_labels = readcsv(testfile, range(20, 21))

    # Predict using the model trained above
    pred_result = pred_algo.compute(pred_data, train_result.model)

    # Prediction result provides one predicted label per observation
    assert pred_result.prediction.shape == (pred_data.shape[0], 1)

    return (pred_result, pred_labels)
# Every process trains on the same dense training file in this example
infile = "./data/batch/naivebayes_train_dense.csv"

# Configure a distributed (SPMD) training object for 20 classes
talgo = d4p.multinomial_naive_bayes_training(20, distributed=True)

# Load training data: 20 feature columns, label in column 20
data = loadtxt(infile, delimiter=',', usecols=range(20))
labels = loadtxt(infile, delimiter=',', usecols=range(20, 21))
labels.shape = (labels.size, 1)  # daal4py requires labels as a 2d array
tresult = talgo.compute(data, labels)

# Prediction runs on a single node only
if d4p.my_procid() == 0:
    palgo = d4p.multinomial_naive_bayes_prediction(20)
    # Load test data with the same feature layout
    pdata = loadtxt("./data/batch/naivebayes_test_dense.csv",
                    delimiter=',', usecols=range(20))
    # Predict using the model trained above
    presult = palgo.compute(pdata, tresult.model)
    # One predicted label per test observation
    assert presult.prediction.shape == (pdata.shape[0], 1)
    print('All looks good!')

d4p.daalfini()