Example #1
    def serializeTrainingResult(self):
        #  Create a data archive to serialize the training result
        dataArch = InputDataArchive()
        #  Serialize the training result into the data archive
        self.trainingResult.serialize(dataArch)
        #  Get the length of the serialized data in bytes
        length = dataArch.getSizeOfArchive()
        #  Store the serialized data in an array
        buffer = np.zeros(length, dtype=np.ubyte)
        dataArch.copyArchiveToArray(buffer)

        return buffer
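The reverse operation wraps the byte buffer in an OutputDataArchive and deserializes into an empty result object. A minimal sketch, hedged: the concrete result class depends on which algorithm produced self.trainingResult, and training.Result below stands in for it:

    def deserializeTrainingResult(self, buffer):
        #  Create a data archive backed by the serialized bytes
        dataArch = OutputDataArchive(buffer)
        #  Deserialize into an empty result object of the matching type
        #  (training.Result is a stand-in for the class serialized above)
        trainingResult = training.Result()
        trainingResult.deserialize(dataArch)

        return trainingResult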
def computeOnMasterNode():
    global R, serializedData

    # Create an algorithm to compute QR decomposition on the master node
    algorithm = qr.Distributed(step2Master)

    for i in range(nBlocks):
        # Deserialize partial results from step 1
        dataArch = OutputDataArchive(serializedData[i])

        dataForStep2FromStep1 = DataCollection()
        dataForStep2FromStep1.deserialize(dataArch)

        algorithm.input.add(qr.inputOfStep2FromStep1, i, dataForStep2FromStep1)

    # Compute QR decomposition
    pres = algorithm.compute()

    inputForStep3FromStep2 = pres.getCollection(qr.outputOfStep2ForStep3)

    for i in range(nBlocks):
        # Serialize partial results to transfer to local nodes for step 3
        dataArch = InputDataArchive()
        inputForStep3FromStep2[i].serialize(dataArch)
        length = dataArch.getSizeOfArchive()

        serializedData[i] = np.empty(length, dtype=np.uint8)
        dataArch.copyArchiveToArray(serializedData[i])

    # Result class from qr
    res = algorithm.getResult()

    R = res.get(qr.matrixR)
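The serializedData buffers filled above are consumed back on the local nodes in step 3. A hedged sketch of that side, assuming the DAAL identifiers step3Local, qr.inputOfStep3FromStep1/FromStep2 and qr.matrixQ, plus the step-1 collections kept locally in dataFromStep1ForStep3 (not shown in this excerpt):

def computeOnLocalNodeStep3(block):
    # Deserialize the step-2 output received from the master node
    dataArch = OutputDataArchive(serializedData[block])
    dataFromStep2ForStep3 = DataCollection()
    dataFromStep2ForStep3.deserialize(dataArch)

    # Finalize QR on the local node to obtain this block of Q
    algorithm = qr.Distributed(step3Local)
    algorithm.input.set(qr.inputOfStep3FromStep1, dataFromStep1ForStep3[block])
    algorithm.input.set(qr.inputOfStep3FromStep2, dataFromStep2ForStep3)
    algorithm.compute()
    res = algorithm.finalizeCompute()

    return res.get(qr.matrixQ)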
Example #3
def trainModel():
    global trainingResult
    # Algorithm that aggregates the partial models computed on this rank
    masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

    # Process this rank's share of the input files, assigned round-robin by rank
    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        trainDataSource = FileDataSource(
            trainDatasetFileNames[filenameIndex],
            DataSourceIface.notAllocateNumericTable,
            DataSourceIface.doDictionaryFromContext)
        trainData = HomogenNumericTable(nFeatures, 0,
                                        NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(
            nDependentVariables, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)
        trainDataSource.loadDataBlock(mergedData)

        # Train a partial model on this block of local data
        localAlgorithm = training.Distributed_Step1LocalFloat64NormEqDense()
        localAlgorithm.input.set(training.data, trainData)
        localAlgorithm.input.set(training.dependentVariables,
                                 trainDependentVariables)
        pres = localAlgorithm.compute()
        masterAlgorithm.input.add(training.partialModels, pres)

        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()

    # Aggregate this rank's partial models and serialize the partial result
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults = dataArch.getArchiveAsArray()

    # Transfer the per-rank partial results to the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        print("Number of processes is %d." % (len(serializedData)))
        # Build the final model from the gathered per-rank partial results
        masterAlgorithm = training.Distributed_Step2MasterFloat64NormEqDense()

        for i in range(comm_size):
            # Deserialize the partial result received from rank i
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)
        # Merge and finalize the model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
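With trainingResult set on the root, scoring is a plain batch call. A minimal sketch, assuming a testData numeric table and pydaal's linear_regression prediction module (Batch as the usual alias for Batch_Float64DefaultDense):

def testModel(testData):
    from daal.algorithms.linear_regression import prediction

    # Create a batch prediction algorithm and feed it the trained model
    predictionAlgorithm = prediction.Batch()
    predictionAlgorithm.input.setTable(prediction.data, testData)
    predictionAlgorithm.input.setModel(prediction.model,
                                       trainingResult.get(training.model))

    # Compute and return the predicted responses
    return predictionAlgorithm.compute().get(prediction.prediction)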
Example #4
File: Kmeans.py Project: ravi9/csp
 def serialize(self, data, fileName=None, useCompression=False):
     # Record the object's fully qualified type name, e.g.
     # 'daal.data_management.HomogenNumericTable()', so that deserialization
     # knows which class to instantiate
     buffArrObjName = (str(type(data)).split()[1].split('>')[0] +
                       '()').replace("'", '')
     # Serialize the DAAL object into a byte array
     dataArch = InputDataArchive()
     data.serialize(dataArch)
     length = dataArch.getSizeOfArchive()
     bufferArray = np.zeros(length, dtype=np.ubyte)
     dataArch.copyArchiveToArray(bufferArray)
     if useCompression:
         if fileName is not None:
             if len(fileName.rsplit('.', 1)) == 2:
                 fileName = fileName.rsplit('.', 1)[0]
             compressedData = self.compress(bufferArray)
             np.save(fileName, compressedData)
         else:
             comBufferArray = self.compress(bufferArray)
             serialObjectDict = {
                 'Array Object': comBufferArray,
                 'Object Information': buffArrObjName
             }
             return serialObjectDict
     else:
         if fileName is not None:
             if len(fileName.rsplit('.', 1)) == 2:
                 fileName = fileName.rsplit('.', 1)[0]
             np.save(fileName, bufferArray)
         else:
             serialObjectDict = {
                 'Array Object': bufferArray,
                 'Object Information': buffArrObjName
             }
             return serialObjectDict
     # Record the type name alongside the saved array
     with open(fileName + '.txt', 'w') as infoFile:
         infoFile.write(buffArrObjName)
     print("Data successfully serialized and saved as {} and {}".format(
         fileName, infoFile.name))
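A matching deserialize helper reverses these steps: load the saved array, decompress if needed, wrap it in an OutputDataArchive, and deserialize into an empty object of the recorded type. A minimal sketch; self.decompress and the HomogenNumericTable placeholder are assumptions, since the real class must be chosen from the recorded 'Object Information' string:

 def deserialize(self, serialObjectDict=None, fileName=None,
                 useCompression=False):
     # Take the buffer from disk (np.save appends '.npy') or from the dict
     if fileName is not None:
         bufferArray = np.load(fileName + '.npy')
     else:
         bufferArray = serialObjectDict['Array Object']
     if useCompression:
         bufferArray = self.decompress(bufferArray)  # hypothetical counterpart
     # Rebuild the DAAL object from the serialized bytes; the instance type
     # must match the recorded 'Object Information' entry
     dataArch = OutputDataArchive(bufferArray)
     obj = HomogenNumericTable()
     obj.deserialize(dataArch)
     return obj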
def gatherPartialResultsFromNodes(partialResult, partialResults,
                                  partialResultArchLength,
                                  partialResultLocalBuffer,
                                  partialResultMasterBuffer):

    dataArch = InputDataArchive()
    partialResult.serialize(dataArch)
    if partialResultArchLength == 0:
        partialResultArchLength = dataArch.getSizeOfArchive()

    # Serialized data is of equal size on every node
    if rankId == MPI_ROOT and len(partialResultMasterBuffer) == 0:
        partialResultMasterBuffer = np.zeros(partialResultArchLength * nNodes,
                                             dtype=np.uint8)

    if len(partialResultLocalBuffer) == 0:
        partialResultLocalBuffer = np.zeros(partialResultArchLength,
                                            dtype=np.uint8)

    dataArch.copyArchiveToArray(partialResultLocalBuffer)

    # Transfer partial results to step 2 on the root node.
    # Note: the pickle-based comm.gather returns a fresh list of per-rank
    # arrays on the root, so the preallocated master buffer above is only
    # needed for the buffer-based comm.Gather variant.
    partialResultMasterBuffer = comm.gather(partialResultLocalBuffer)

    if rankId == MPI_ROOT:
        for node in range(nNodes):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(partialResultMasterBuffer[node])

            partialResults[node] = training.PartialResult()
            partialResults[node].deserialize(dataArch)
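The equal-size buffers preallocated above only pay off with mpi4py's buffer-based collective; the pickle-based comm.gather ignores them. A hedged sketch of the buffer-based variant under the same equal-size invariant:

    # Every rank contributes exactly partialResultArchLength bytes, which
    # land contiguously in the preallocated master buffer on the root
    comm.Gather(partialResultLocalBuffer,
                partialResultMasterBuffer if rankId == MPI_ROOT else None,
                root=MPI_ROOT)

    if rankId == MPI_ROOT:
        for node in range(nNodes):
            start = node * partialResultArchLength
            dataArch = OutputDataArchive(
                partialResultMasterBuffer[start:start + partialResultArchLength])
            partialResults[node] = training.PartialResult()
            partialResults[node].deserialize(dataArch)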
def broadcastWeightsAndBiasesToNodes(wb):

    wbBuffer = None
    # Serialize weights and biases on the root node
    if rankId == MPI_ROOT:
        if not wb:
            # The weights and biases table must be valid and non-NULL on the
            # master; returning early here while the other ranks proceed to
            # the collective bcast below would leave them deadlocked
            return HomogenNumericTable()

        wbDataArch = InputDataArchive()
        wb.serialize(wbDataArch)
        wbBuffer = np.zeros(wbDataArch.getSizeOfArchive(), dtype=np.uint8)
        wbDataArch.copyArchiveToArray(wbBuffer)

    # Broadcast the serialized weights and biases
    wbBuffer = comm.bcast(wbBuffer)

    # Deserialize weights and biases
    wbDataArchLocal = OutputDataArchive(wbBuffer)

    wbLocal = HomogenNumericTable(ntype=np.float32)
    wbLocal.deserialize(wbDataArchLocal)

    return wbLocal
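Because comm.bcast is a collective call, every rank must enter this function; only the root needs a real table. A minimal usage sketch (wb comes from the surrounding training loop, not shown here):

# All ranks call collectively; non-root ranks may pass None
wbLocal = broadcastWeightsAndBiasesToNodes(wb if rankId == MPI_ROOT else None)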
def computeOnMasterNode():
    global serializedData, Sigma, V

    # Create an algorithm to compute SVD on the master node
    algorithm = svd.Distributed(step2Master)

    for i in range(nBlocks):
        # Deserialize partial results from step 1
        dataArch = OutputDataArchive(serializedData[i])

        dataForStep2FromStep1 = DataCollection()
        dataForStep2FromStep1.deserialize(dataArch)

        algorithm.input.add(svd.inputOfStep2FromStep1, i,
                            dataForStep2FromStep1)

    # Compute SVD
    # DistributedPartialResult class from svd
    pres = algorithm.compute()

    inputForStep3FromStep2 = pres.getCollection(svd.outputOfStep2ForStep3)

    for i in range(nBlocks):
        # Serialize partial results to transfer to local nodes for step 3
        dataArch = InputDataArchive()
        inputForStep3FromStep2[i].serialize(dataArch)
        length = dataArch.getSizeOfArchive()

        serializedData[i] = np.empty(length, dtype=np.uint8)
        dataArch.copyArchiveToArray(serializedData[i])

    # Result class from svd
    res = algorithm.getResult()

    Sigma = res.get(svd.singularValues)
    V = res.get(svd.rightSingularMatrix)
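For context, the serializedData consumed above is produced in step 1 on the local nodes. A hedged sketch of that side, assuming the DAAL identifiers step1Local and svd.outputOfStep1ForStep2/ForStep3, with each block's input table passed in as dataTable:

def computeStep1OnLocalNode(block, dataTable):
    # Compute the local part of the SVD
    algorithm = svd.Distributed(step1Local)
    algorithm.input.set(svd.data, dataTable)
    pres = algorithm.compute()

    # Keep the step-3 part locally; serialize the step-2 part for the master
    dataFromStep1ForStep3[block] = pres.get(svd.outputOfStep1ForStep3)
    dataArch = InputDataArchive()
    pres.get(svd.outputOfStep1ForStep2).serialize(dataArch)

    serializedData[block] = np.empty(dataArch.getSizeOfArchive(),
                                     dtype=np.uint8)
    dataArch.copyArchiveToArray(serializedData[block])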
Example #9
def trainModel():
    global trainingResult
    nodeResults = []
    # Create an algorithm object to aggregate the partial Naive Bayes models computed on this rank
    masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)
    for filenameIndex in range(rankId, len(trainDatasetFileNames), comm_size):
        # Initialize FileDataSource to retrieve the input data from a .csv file
        #print("The worker with rank %d will read %s." % (rankId, trainDatasetFileNames[filenameIndex]))
        trainDataSource = FileDataSource(trainDatasetFileNames[filenameIndex],
                                         DataSourceIface.notAllocateNumericTable,
                                         DataSourceIface.doDictionaryFromContext)

        # Create Numeric Tables for training data and labels
        trainData = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
        trainDependentVariables = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
        mergedData = MergedNumericTable(trainData, trainDependentVariables)

        # Retrieve the data from the input file
        trainDataSource.loadDataBlock(mergedData)

        # Create an algorithm object to train the Naive Bayes model based on the local-node data
        localAlgorithm = training.Distributed_Step1LocalFloat64DefaultDense(nClasses)

        # Pass a training data set and dependent values to the algorithm
        localAlgorithm.input.set(classifier.training.data, trainData)
        localAlgorithm.input.set(classifier.training.labels, trainDependentVariables)

        # Train the Naive Bayes model on local nodes
        pres = localAlgorithm.compute()
        # Add the local partial results to this rank's aggregating algorithm
        masterAlgorithm.input.add(classifier.training.partialModels, pres)
        # Alternative: serialize each local partial result and gather every
        # block individually instead of aggregating per rank first:
        # dataArch = InputDataArchive()
        # pres.serialize(dataArch)
        # nodeResults.append(dataArch.getArchiveAsArray().copy())
        # localAlgorithm.clean()
        mergedData.freeDataMemory()
        trainData.freeDataMemory()
        trainDependentVariables.freeDataMemory()
    # Aggregate this rank's partial results and serialize them
    pres = masterAlgorithm.compute()
    dataArch = InputDataArchive()
    pres.serialize(dataArch)
    nodeResults.append(dataArch.getArchiveAsArray().copy())

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm object to build the final Naive Bayes model on the master node
        masterAlgorithm = training.Distributed_Step2MasterFloat64DefaultDense(nClasses)

        for currentRank in range(len(serializedData)):
            for currentBlock in range(0, len(serializedData[currentRank])):
                # Deserialize partial results from step 1
                dataArch = OutputDataArchive(serializedData[currentRank][currentBlock])

                dataForStep2FromStep1 = classifier.training.PartialResult()
                dataForStep2FromStep1.deserialize(dataArch)

                # Set the local Naive Bayes model as input for the master-node algorithm
                masterAlgorithm.input.add(classifier.training.partialModels, dataForStep2FromStep1)

        # Merge and finalizeCompute the Naive Bayes model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()
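With trainingResult built on the root, scoring follows the usual batch pattern. A minimal sketch, assuming the multinomial_naive_bayes prediction module is imported as prediction and a testData table exists:

def testModel(testData):
    # Batch prediction with the trained Naive Bayes model (root only)
    predictionAlgorithm = prediction.Batch(nClasses)
    predictionAlgorithm.input.setTable(classifier.prediction.data, testData)
    predictionAlgorithm.input.setModel(
        classifier.prediction.model,
        trainingResult.get(classifier.training.model))

    return predictionAlgorithm.compute().get(classifier.prediction.prediction)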
def trainModel(comm, rankId):

    trainingResult = None

    # Initialize FileDataSource to retrieve the input data from a .csv file
    trainDataSource = FileDataSource(trainDatasetFileNames[rankId],
                                     DataSourceIface.notAllocateNumericTable,
                                     DataSourceIface.doDictionaryFromContext)

    # Create Numeric Tables for training data and labels
    trainData = HomogenNumericTable(NUM_FEATURES, 0,
                                    NumericTableIface.doNotAllocate)
    trainDependentVariables = HomogenNumericTable(
        NUM_DEPENDENT_VARS, 0, NumericTableIface.doNotAllocate)
    mergedData = MergedNumericTable(trainData, trainDependentVariables)

    # Retrieve the data from the input file
    trainDataSource.loadDataBlock(mergedData)

    # Create an algorithm object to train the ridge regression model based on the local-node data
    localAlgorithm = training.Distributed(step1Local)

    # Pass a training data set and dependent values to the algorithm
    localAlgorithm.input.set(training.data, trainData)
    localAlgorithm.input.set(training.dependentVariables,
                             trainDependentVariables)

    # Train the ridge regression model on local nodes
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    # Transfer partial results to step 2 on the root node
    nodeResults = dataArch.getArchiveAsArray()

    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:

        # Create an algorithm object to build the final ridge regression model on the master node
        masterAlgorithm = training.Distributed(step2Master)

        for i in range(NUM_BLOCKS):

            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])
            dataForStep2FromStep1 = training.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set the local ridge regression model as input for the master-node algorithm
            masterAlgorithm.input.add(training.partialModels,
                                      dataForStep2FromStep1)

        # Merge and finalizeCompute the ridge regression model on the master node
        masterAlgorithm.compute()
        trainingResult = masterAlgorithm.finalizeCompute()

        # Retrieve the algorithm results
        printNumericTable(
            trainingResult.get(training.model).getBeta(),
            "Ridge Regression coefficients:")

    return trainingResult
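A driver only has to hand this function the mpi4py communicator and rank; every rank participates in the gather, and only the root gets a non-None trainingResult back:

if __name__ == "__main__":
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rankId = comm.Get_rank()
    trainingResult = trainModel(comm, rankId)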
Example #11
    # Initialize FileDataSource to retrieve the input data from a .csv file
    # (constructor arguments assumed; the top of this snippet was truncated)
    dataSource = FileDataSource(
        datasetFileNames[rankId],
        DataSourceIface.doAllocateNumericTable,
        DataSourceIface.doDictionaryFromContext
    )

    # Retrieve the input data
    dataSource.loadDataBlock()

    # Create an algorithm to compute a variance-covariance matrix on local nodes
    localAlgorithm = covariance.Distributed(step1Local)

    # Set the input data set to the algorithm
    localAlgorithm.input.set(covariance.data, dataSource.getNumericTable())

    # Compute a variance-covariance matrix
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()

    pres.serialize(dataArch)
    perNodeArchLength = dataArch.getSizeOfArchive()

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults, MPI_ROOT)

    if rankId == MPI_ROOT:

        # Create an algorithm to compute a variance-covariance matrix on the master node
        masterAlgorithm = covariance.Distributed(step2Master)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            # (loop body assumed; the original snippet was truncated here)
            dataArch = OutputDataArchive(serializedData[i])

            dataForStep2FromStep1 = covariance.PartialResult()
            dataForStep2FromStep1.deserialize(dataArch)

            # Set local partial results as input for the master-node algorithm
            masterAlgorithm.input.add(covariance.partialResults,
                                      dataForStep2FromStep1)

        # Merge and finalize the variance-covariance matrix on the master node
        masterAlgorithm.compute()
        res = masterAlgorithm.finalizeCompute()

    # Retrieve the input data from a file
    dataTable = createSparseTable(datasetFileNames[rankId])

    # Create an algorithm to compute low order moments on local nodes
    localAlgorithm = low_order_moments.Distributed(
        step1Local, method=low_order_moments.fastCSR)

    # Set the input data set to the algorithm
    localAlgorithm.input.set(low_order_moments.data, dataTable)

    # Compute low order moments
    pres = localAlgorithm.compute()

    # Serialize partial results required by step 2
    dataArch = InputDataArchive()
    pres.serialize(dataArch)

    nodeResults = dataArch.getArchiveAsArray()

    # Transfer partial results to step 2 on the root node
    serializedData = comm.gather(nodeResults)

    if rankId == MPI_ROOT:
        # Create an algorithm to compute low order moments on the master node
        masterAlgorithm = low_order_moments.Distributed(
            step2Master, method=low_order_moments.fastCSR)

        for i in range(nBlocks):
            # Deserialize partial results from step 1
            dataArch = OutputDataArchive(serializedData[i])