def getBlockOfCols(nT, *args):
    """Collect a half-open range of columns of ``nT`` into a merged table.

    Args:
        nT: Source numeric table. It must provide ``getNumberOfRows``,
            ``getBlockOfColumnValues`` and ``releaseBlockOfColumnValues``.
        *args: ``args[0]`` is the first column index to copy and ``args[1]``
            is one past the last column index.

    Returns:
        A ``MergedNumericTable`` with one table appended per requested
        column, in increasing column order.
    """
    # Function-scope import, in the same way getBlockOfNumericTable pulls its
    # table classes from daal.data_management.
    from daal.data_management import HomogenNumericTable_Float64

    nRows = nT.getNumberOfRows()
    mnT = MergedNumericTable()
    for idx in range(args[0], args[1]):
        doubleBlock = BlockDescriptor_Float64()
        nT.getBlockOfColumnValues(idx, 0, nRows, readOnly, doubleBlock)
        # Previously nothing was ever added, so the returned table was empty.
        mnT.addNumericTable(
            HomogenNumericTable_Float64(doubleBlock.getArray()))
        # The block was acquired from nT, so it must be released on nT
        # (the old code released it on the still-empty merged table).
        nT.releaseBlockOfColumnValues(doubleBlock)
    return mnT
def testModel(trainingResult):
    """Predict with a trained ridge regression model and print the outcome.

    Reads the file named by ``testDatasetFileName`` into a feature table and
    a ground-truth table, runs ``prediction.Batch`` with the model taken from
    ``trainingResult`` and prints the predictions next to the ground truth.

    Args:
        trainingResult: Training result object; the model is fetched from it
            with ``trainingResult.get(training.model)``.
    """
    # CSV-backed source for the test set.
    csvSource = FileDataSource(
        testDatasetFileName,
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext)

    # Zero-row tables; they are handed to the source through a merged table
    # so that a single load call is given both of them.
    features = HomogenNumericTable(
        NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    testGroundTruth = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    featuresAndTruth = MergedNumericTable(features, testGroundTruth)
    csvSource.loadDataBlock(featuresAndTruth)

    # Configure the batch predictor with the data and the trained model.
    predictor = prediction.Batch()
    predictor.input.setTable(prediction.data, features)
    trainedModel = trainingResult.get(training.model)
    predictor.input.setModel(prediction.model, trainedModel)

    predicted = predictor.compute().get(prediction.prediction)

    # Report predictions and the matching ground truth.
    printNumericTable(
        predicted,
        "Ridge Regression prediction results: (first 10 rows):",
        10)
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
def trainModel():
    """Train a model across MPI ranks and store it in ``trainingResult``.

    Every rank processes the files of ``trainDatasetFileNames`` at positions
    ``rankId, rankId + comm_size, ...``, folds their step-1 partial results
    into a rank-local step-2 algorithm, serializes that rank-level partial
    result and gathers it. The rank equal to ``MPI_ROOT`` deserializes all
    gathered results, merges them and assigns the finalized result to the
    module-level ``trainingResult``.
    """
    global trainingResult

    # Rank-local accumulator for the partial results of this rank's files.
    rankMaster = training.Distributed_Step2MasterFloat64NormEqDense()

    for csvPath in trainDatasetFileNames[rankId::comm_size]:
        csvSource = FileDataSource(
            csvPath,
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        features = HomogenNumericTable(
            nFeatures, 0, NumericTableIface.notAllocate)
        responses = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        featuresAndResponses = MergedNumericTable(features, responses)
        csvSource.loadDataBlock(featuresAndResponses)

        # Step 1 on this file only.
        fileStep = training.Distributed_Step1LocalFloat64NormEqDense()
        fileStep.input.set(training.data, features)
        fileStep.input.set(training.dependentVariables, responses)
        filePartial = fileStep.compute()
        rankMaster.input.add(training.partialModels, filePartial)

        # Drop the file's data before the next file is loaded.
        featuresAndResponses.freeDataMemory()
        features.freeDataMemory()
        responses.freeDataMemory()

    # Collapse this rank's files into one partial result and ship it.
    rankPartial = rankMaster.compute()
    outArchive = InputDataArchive()
    rankPartial.serialize(outArchive)
    serializedData = comm.gather(outArchive.getArchiveAsArray())

    if rankId != MPI_ROOT:
        return

    print("Number of processes is %d." % (len(serializedData)))
    finalMaster = training.Distributed_Step2MasterFloat64NormEqDense()
    for rank in range(comm_size):
        inArchive = OutputDataArchive(serializedData[rank])
        receivedPartial = training.PartialResult()
        receivedPartial.deserialize(inArchive)
        finalMaster.input.add(training.partialModels, receivedPartial)
    finalMaster.compute()
    trainingResult = finalMaster.finalizeCompute()
def _resolveRange(spec, upperBound, argName):
    """Turn a ``Rows``/``Columns`` argument into a ``(first, last)`` pair."""
    import warnings

    if isinstance(spec, str):
        if spec == 'All':
            return 0, upperBound
        warnings.warn(
            'Type error in "%s" arguments, Can be only int/list type'
            % argName)
        raise SystemExit
    if isinstance(spec, list):
        # [first] keeps the table's own end; [first, last] is clamped to it.
        last = min(spec[1], upperBound) if len(spec) == 2 else upperBound
        return spec[0], last
    # A bare number means "the first N"; clamp it like the list form so a
    # too-large request cannot reach past the end of the table.
    return 0, min(spec, upperBound)


def getBlockOfNumericTable(nT, Rows='All', Columns='All'):
    """Extract a rectangular block of ``nT`` as a new numeric table.

    Args:
        nT: Source numeric table.
        Rows: ``'All'`` for every row, an int ``n`` for rows ``0..n-1``, or a
            list ``[first]`` / ``[first, last]`` selecting rows from ``first``
            up to (not including) ``last``. Bounds above the table's row
            count are clamped to it.
        Columns: Same forms as ``Rows``, applied to columns.

    Returns:
        A ``HomogenNumericTable`` built from the selected rows and columns.

    Raises:
        SystemExit: If ``Rows`` or ``Columns`` is a string other than
            ``'All'`` (a warning is issued first).
    """
    # Validate the arguments before importing the library so that bad input
    # is reported without any table access.
    firstRow, lastRow = _resolveRange(Rows, nT.getNumberOfRows(), 'Rows')
    nStartDim, nEndDim = _resolveRange(
        Columns, nT.getNumberOfColumns(), 'Columns')

    from daal.data_management import HomogenNumericTable, \
        HomogenNumericTable_Float64, MergedNumericTable, readOnly, \
        BlockDescriptor

    # Read each requested column within the row window and append it to a
    # merged table, releasing every column block after it has been wrapped.
    nRows = lastRow - firstRow
    mnT = MergedNumericTable()
    for idx in range(nStartDim, nEndDim):
        block = BlockDescriptor()
        nT.getBlockOfColumnValues(idx, firstRow, nRows, readOnly, block)
        mnT.addNumericTable(HomogenNumericTable_Float64(block.getArray()))
        nT.releaseBlockOfColumnValues(block)

    # Re-read the merged columns as one block of rows and wrap the result.
    # NOTE(review): this row block is not released, as in the original code;
    # confirm whether HomogenNumericTable copies the array before adding a
    # release call here.
    block = BlockDescriptor()
    mnT.getBlockOfRows(0, mnT.getNumberOfRows(), readOnly, block)
    return HomogenNumericTable(block.getArray())
DATA_PREFIX = os.path.join(os.path.dirname(sys.executable), 'share', 'pydaal_examples', 'examples', 'data', 'batch') trainDatasetFileName = os.path.join(DATA_PREFIX, 'adaboost_train.csv') nFeatures = 20 # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file trainDataSource = FileDataSource(trainDatasetFileName, DataSourceIface.notAllocateNumericTable, DataSourceIface.doDictionaryFromContext) # Create Numeric Tables for training data and labels trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.doNotAllocate) trainGroundTruth = HomogenNumericTable(1, 0, NumericTableIface.doNotAllocate) mergedData = MergedNumericTable(trainData, trainGroundTruth) # Retrieve the data from the input file trainDataSource.loadDataBlock(mergedData) #default keyword arguments ''' GridSearch(<args>, tuned_parameters = None, score=None, best_score_criteria='high', create_best_training_model = False, save_model=False,nClasses=None ) ''' #create a dictionary of hyperparameter values in a list adaB_params = [{'accuracyThreshold': [0.99, 0.1], 'maxIterations': [1, 5]}] #Create GridSearch object clf = GridSearch(adaB,
def trainModel():
    """Train a Naive Bayes model across MPI ranks into ``trainingResult``.

    Every rank processes the files of ``trainDatasetFileNames`` at indices
    ``rankId, rankId + comm_size, ...``: it runs the step-1 algorithm on each
    file and adds the partial result to a rank-local step-2 algorithm. The
    rank-level partial result is serialized and gathered. On the rank equal
    to ``MPI_ROOT`` every gathered block is deserialized, merged, and the
    finalized result is assigned to the module-level ``trainingResult``.
    """
    global trainingResult

    nodeResults = []

    # Rank-local step-2 algorithm that merges this rank's per-file results.
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(
        nClasses)

    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(
            trainDatasetFileNames[filenameIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(
            nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(
            1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Train the Naive Bayes model on this file's data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(
            nClasses)
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels,
                                 trainDependentVariables)
        pres = localAlgorithm.compute()

        # The per-file result is merged in-process, so it is not serialized
        # here; only the rank-level result below is sent over MPI.
        masterAlgorithm.input.add(classifier.training.partialModels, pres)

        # Drop this file's data before the next file is loaded.
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    # Serialize the rank-level partial result and transfer it to the root.
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())

    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(
            nClasses)

        for rankBlocks in serializedData:
            for serializedBlock in rankBlocks:
                # Deserialize one gathered partial result
                dataArch = OutputDataArchive(serializedBlock)
                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Feed it to the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels,
                                          dataForStep2FromStep1)

        # Merge and finalize the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
def trainModel(comm, rankId):
    """Train a ridge regression model from one CSV file per rank.

    Each rank loads ``trainDatasetFileNames[rankId]``, runs the step-1
    algorithm on it and contributes the serialized partial result to
    ``comm.gather``. The rank equal to ``MPI_ROOT`` deserializes the first
    ``NUM_BLOCKS`` gathered results, merges them, prints the coefficients
    and returns the finalized result.

    Args:
        comm: Communicator object; only ``comm.gather`` is used.
        rankId: Rank of the calling process.

    Returns:
        The finalized training result on the root rank, ``None`` elsewhere.
    """
    # Load this rank's file into a feature table and a response table.
    csvSource = FileDataSource(
        trainDatasetFileNames[rankId],
        DataSourceIface.notAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext)
    features = HomogenNumericTable(
        NUM_FEATURES, 0, NumericTableIface.doNotAllocate)
    responses = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    featuresAndResponses = MergedNumericTable(features, responses)
    csvSource.loadDataBlock(featuresAndResponses)

    # Step 1: partial model from the local data.
    localStep = training.Distributed(step1Local)
    localStep.input.set(training.data, features)
    localStep.input.set(training.dependentVariables, responses)
    localPartial = localStep.compute()

    # Serialize the partial result and gather all of them.
    outArchive = InputDataArchive()
    localPartial.serialize(outArchive)
    gathered = comm.gather(outArchive.getArchiveAsArray())

    if rankId != MPI_ROOT:
        return None

    # Step 2: merge the partial results on the root rank.
    masterStep = training.Distributed(step2Master)
    for blockIndex in range(NUM_BLOCKS):
        inArchive = OutputDataArchive(gathered[blockIndex])
        receivedPartial = training.PartialResult()
        receivedPartial.deserialize(inArchive)
        masterStep.input.add(training.partialModels, receivedPartial)
    masterStep.compute()
    finalResult = masterStep.finalizeCompute()

    printNumericTable(
        finalResult.get(training.model).getBeta(),
        "Ridge Regression coefficients:")
    return finalResult
# Table widths and class count for the decision-tree data sets
nFeatures = 5
nClasses = 5

# Locations of the training and pruning CSV files
trainDatasetFileName = os.path.join(DATA_PREFIX, 'decision_tree_train.csv')
pruneDatasetFileName = os.path.join(DATA_PREFIX, 'decision_tree_prune.csv')

# --- Training set -----------------------------------------------------------
# CSV-backed data source for the training file
trainDataSource = FileDataSource(
    trainDatasetFileName,
    DataSourceIface.notAllocateNumericTable,
    DataSourceIface.doDictionaryFromContext)

# Zero-row feature and label tables, merged so one load call receives both
trainData = HomogenNumericTable(
    nFeatures, 0, NumericTableIface.notAllocate)
trainGroundTruth = HomogenNumericTable(
    1, 0, NumericTableIface.notAllocate)
mergedData = MergedNumericTable(trainData, trainGroundTruth)

# Read the training file
trainDataSource.loadDataBlock(mergedData)

# --- Pruning set ------------------------------------------------------------
# CSV-backed data source for the pruning file
pruneDataSource = FileDataSource(
    pruneDatasetFileName,
    DataSourceIface.notAllocateNumericTable,
    DataSourceIface.doDictionaryFromContext)

# Zero-row feature and label tables for the pruning data, merged as above
pruneData = HomogenNumericTable(
    nFeatures, 0, NumericTableIface.notAllocate)
pruneGroundTruth = HomogenNumericTable(
    1, 0, NumericTableIface.notAllocate)
pruneMergedData = MergedNumericTable(pruneData, pruneGroundTruth)

# Retrieve the data from the input file
def testModel():
    """Scan decision thresholds over regression predictions and print accuracy.

    For every file in ``testDatasetFileNames`` the function loads features
    and ground truth, predicts with the model stored in the module-level
    ``trainingResult``, and converts both ground truth and predictions to
    class vectors with ``getClassVector``. Ground truth uses threshold 0;
    predictions are evaluated at 101 evenly spaced thresholds in
    [-25, 25]. Correct-classification counts are accumulated over all files
    and the best threshold, the class balance, and the accuracy at
    threshold 0 are printed.
    """
    thresholdValues = np.linspace(-25.0, 25.0, num=101)
    numberOfCorrectlyClassifiedObjects = np.zeros(len(thresholdValues))
    numberOfObjectsInTestFiles = 0
    numberOfNonzeroObjectsInTestFiles = 0

    for testFileName in testDatasetFileNames:
        testDataSource = FileDataSource(
            testFileName,
            DataSourceIface.doAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        testData = HomogenNumericTable(
            nFeatures, 0, NumericTableIface.notAllocate)
        testGroundTruth = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(testData, testGroundTruth)
        testDataSource.loadDataBlock(mergedData)

        algorithm = prediction.Batch_Float64DefaultDense()
        algorithm.input.setNumericTableInput(prediction.data, testData)
        algorithm.input.setModelInput(prediction.model,
                                      trainingResult.get(training.model))
        predictionResult = algorithm.compute()
        predictionTable = predictionResult.get(prediction.prediction)

        nTestRows = testGroundTruth.getNumberOfRows()
        block1 = BlockDescriptor()
        block2 = BlockDescriptor()
        testGroundTruth.getBlockOfRows(0, nTestRows, readOnly, block1)
        predictionTable.getBlockOfRows(0, nTestRows, readOnly, block2)
        try:
            y_true = getClassVector(block1.getArray(), 0.000000000000)
            predictionRegression = block2.getArray()
            for thresholdIndex, threshold in enumerate(thresholdValues):
                y_pred = getClassVector(predictionRegression, threshold)
                numberOfCorrectlyClassifiedObjects[thresholdIndex] += \
                    accuracy_score(y_true, y_pred, normalize=False)
            numberOfObjectsInTestFiles += len(y_true)
            numberOfNonzeroObjectsInTestFiles += np.count_nonzero(y_true)
        finally:
            # Every getBlockOfRows needs a matching release; the blocks were
            # previously left acquired on each file iteration.
            testGroundTruth.releaseBlockOfRows(block1)
            predictionTable.releaseBlockOfRows(block2)

        # Drop this file's data before the next file is loaded.
        mergedData.freeDataMemory()
        testData.freeDataMemory()
        testGroundTruth.freeDataMemory()

    # Accuracy per threshold = correct count / total objects (elementwise).
    classificationAccuracyResult = (
        numberOfCorrectlyClassifiedObjects / numberOfObjectsInTestFiles)

    # Strict '>' keeps the first threshold that reaches the best accuracy.
    best_threshold = None
    best_accuracy = -1
    for threshold, accuracy in zip(thresholdValues,
                                   classificationAccuracyResult):
        if accuracy > best_accuracy:
            best_threshold = threshold
            best_accuracy = accuracy

    print('Best threshold:{:.4f}. Best accuracy:{:.4f}'.format(
        best_threshold, best_accuracy))
    print(
        'Test set. Number of objects of 0 class:{:.4f}.Number of objects of 1 class:{:.4f}. '
        'Frequency of 1 class:{:.4f}'.format(
            numberOfObjectsInTestFiles - numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles,
            numberOfNonzeroObjectsInTestFiles / numberOfObjectsInTestFiles))

    indexOfZeroThreshold = np.where(thresholdValues == 0.0)[0][0]
    print('Threshold=0. Classification accuracy:{:.4f}'.format(
        classificationAccuracyResult[indexOfZeroThreshold]))