Example #1
0
    def multiboxDataTasksSort(self, ct, workers=2, memoryLimit=100, pageSizeOverrideMB=1):
        """Sort a generated paged vector with the #Sort data task and verify it.

        The script builds a paged vector of ``ct`` tuples, sorts it through
        the distributed data operation, and evaluates ``sorting.isSorted`` on
        the output; the test passes only if that evaluates to true.

        ct -- number of elements to generate and sort.
        workers -- worker count handed to the simulation.
        memoryLimit -- forwarded to the simulation as ``memoryLimitMb``.
        pageSizeOverrideMB -- page size in MB; converted to bytes for the call.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let N = __ct__;
            let aPrime = 503

            let toSort = Vector.range(N, { ((_ * _) % aPrime, _) }).paged;

            let result = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(toSort))))

            sorting.isSorted(result)
            """.replace("__ct__", str(ct))

        # Gather the keyword settings first so the call itself stays short.
        settings = dict(
            timeout=TIMEOUT,
            memoryLimitMb=memoryLimit,
            pageSizeOverride=pageSizeOverrideMB * 1024 * 1024
            )
        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, storage, workers, **settings)

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
Example #2
0
    def test_sortVecOfVec(self):
        """Sort 500000 (key, vector) tuples and check size and key ordering.

        The script's final expression is true only when the sorted output has
        the same size as the input and the first tuple elements never
        decrease from one entry to the next.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let values = []
            let ct = 500000

            values = [(ix % 100, Vector.range(40)) for ix in sequence(ct)]

            let res = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values.paged))));

            let firstAreSorted = true;
            for ix in sequence(size(res)-1)
                if (res[ix][0] > res[ix+1][0])
                    firstAreSorted = false;

            size(res) == size(values) and firstAreSorted
            """

        # Single worker, with a 3000 MB memory limit.
        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script,
            storage,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=3000
            )

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
Example #3
0
    def test_takeFromLargeObjects(self):
        """Run #Take over a vector of large strings and bound the page sizes.

        The script builds 100 strings of about one million characters each,
        takes all of them by index with a 1 MB page-size override, and the
        test asserts that every page of the result is smaller than 2 MB.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let N = 100;

            //each string is 1 MB
            let takeFrom = [" " * 100 * 100 * 10 * 10 + " " * ix for ix in sequence(N)].paged;
            let indices = Vector.range(N,fun(x) { x }).paged;

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))))
            """

        # Create the simulation before entering the try block. If this call
        # raises there is nothing to tear down, and touching the still-unbound
        # `simulation` in `finally` would replace the real error with a
        # NameError.
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            text,
            s3,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=1000,
            returnSimulation=True,
            pageSizeOverride=1024 * 1024
            )

        try:
            self.assertTrue(result is not None)
            self.assertTrue(result.isResult(), result)

            # bytecount is converted to MB before comparing against the bound.
            for page in result.asResult.result.getVectorPageIds(simulation.getWorkerVdm(0)):
                self.assertLess(page.bytecount / 1024.0 / 1024.0, 2.0)
        finally:
            simulation.teardown()
Example #4
0
    def test_multiboxDataTasksTake_1(self):
        """Compare the #Take data task against direct indexing on two workers.

        The script evaluates to true when the distributed take over
        ``indices`` equals applying ``takeFrom[_[1]]`` to each index tuple.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let N = 10000000;
            let isPrime = fun(p) {
                let x = 2
                while (x*x <= p) {
                    if (p%x == 0)
                        return 0
                    x = x + 1
                    }
                return x
                }

            let takeFrom = Vector.range(N, isPrime).paged;
            let indices = Vector.range(N,fun(x) { (0, (x * 503) % N ) }).paged;

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom)))) ==
                indices ~~ { takeFrom[_[1]] }
            """

        workerCount = 2
        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, storage, workerCount, timeout=TIMEOUT, memoryLimitMb=1000)

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
Example #5
0
    def test_splitToRowMajor(self):
        """Time the split-to-row-major script and record it as a perf test.

        A setup script is computed first on a fresh single-worker simulation;
        its result is passed as ``chunks`` to the timed script. The simulation
        is always torn down afterwards.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        rowCount = 100000
        columnCount = 50

        setupResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.setupScript(rowCount, columnCount),
            storage,
            1,
            timeout=30,
            memoryLimitMb=8 * 1024,
            threadCount=4,
            returnSimulation=True,
            useInMemoryCache=False
            )

        try:
            self.assertTrue(setupResult.isResult())
            chunks = setupResult.asResult.result

            # Only the second computation is timed.
            startTime = time.time()
            splitResult = simulation.compute(
                self.splitToRowMajorScript(),
                timeout=360,
                chunks=chunks
                )
            elapsed = time.time() - startTime

            self.assertTrue(splitResult.isResult())

            metricName = "algorithms.text.splitToRowMajor.%srows_%scolumns" % (
                rowCount, columnCount)
            PerformanceTestReporter.recordTest(metricName, elapsed, None)
        finally:
            simulation.teardown()
Example #6
0
    def test_sortHeterogeneous(self):
        """Sort a vector mixing integers and Float64 values.

        The script appends each index both as-is and as a Float64, sorts the
        mixed vector, and then checks each half of the output: the script
        throws if neighbouring elements are not strictly increasing or differ
        in type, or if the output size does not match the input size.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let values = []
            let ct = 1000000
            for ix in sequence(ct)
                values = values :: ix :: Float64(ix)

            let sortedVals = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values.paged))))

            let sortedAndHomogenous = fun(v) {
                for ix in sequence(size(v)-1)
                    if (v[ix] >= v[ix+1] or `TypeJOV(v[ix]) is not `TypeJOV(v[ix+1]))
                        throw (ix, v[ix], v[ix+1])
                return true;
                }
            
            if (size(sortedVals) != size(values))
                throw "expected " + String(size(values)) + ", not " + String(size(sortedVals))
            sortedAndHomogenous(sortedVals[,ct]) and 
                sortedAndHomogenous(sortedVals[ct,])
            """

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, storage, 1, timeout=TIMEOUT, memoryLimitMb=1000)

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
    def test_vector_string_apply(self):
        """Run chained ``apply`` calls that turn a large int vector into strings.

        Intended to verify that the compiler survives many runs; the result
        of the computation is not inspected.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let v = Vector.range(10000000)

            let v2 = v.apply(String)

            let v3 = v2.apply({_ + "a"})

            v3.sum(size)
            """

        # Four workers, 240 second timeout.
        InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script,
            storage,
            4,
            timeout=240
            )
Example #8
0
    def test_sortVecOfVec(self):
        """Distributed sort of tuples whose second element is itself a vector.

        Passes when the script reports that the output size matches the input
        size and that the leading tuple elements are in non-decreasing order.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        foraSource = """
            let values = []
            let ct = 500000

            values = [(ix % 100, Vector.range(40)) for ix in sequence(ct)]

            let res = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values.paged))));

            let firstAreSorted = true;
            for ix in sequence(size(res)-1)
                if (res[ix][0] > res[ix+1][0])
                    firstAreSorted = false;

            size(res) == size(values) and firstAreSorted
            """

        runOptions = dict(timeout=TIMEOUT, memoryLimitMb=3000)
        computed = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            foraSource, s3Factory, 1, **runOptions)

        self.assertTrue(computed is not None)
        self.assertTrue(computed.isResult(), computed)
        self.assertTrue(computed.asResult.result.pyval == True, computed)
Example #9
0
    def basicTaskPathwayTest(self, sz, machines=1, memory=1000):
        """Check the #Sort data task against ``sorting.sort`` element by element.

        The script returns true when both sorts agree; otherwise it returns a
        message describing the size mismatch or the first differing index, so
        a failing assertion shows that message.

        sz -- number of elements in the vector to sort.
        machines -- worker count handed to the simulation.
        memory -- forwarded to the simulation as ``memoryLimitMb``.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        template = """
            let N = __size__;

            //let values = Vector.range(N,fun(x) { ((x * 503) % N, x) }).paged;
            let values = Vector.range(N).paged;

            let s1 = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values))))
            let s2 = sorting.sort(values)

            if (size(s1) != size(s2))
                return 'wrong size: %s != %s'.format(size(s1), size(s2))
            for ix in sequence(size(s1))
                if (s1[ix] != s2[ix])
                    return 'not equal: index=%s. %s != %s'.format(ix, s1[ix], s2[ix])
            return true
            """
        script = template.replace("__size__", str(sz))

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, storage, machines, timeout=TIMEOUT, memoryLimitMb=memory)

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
    def test_expansionWithVecOfVec(self):
        """Add a worker after a vector-of-paged-vectors computation has a result.

        Starts a four-worker simulation, submits the computation, waits for
        any result, adds one more worker and asserts that the handshake
        completes. The simulation is torn down in every case.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        initialWorkers = 4
        simulation = InMemoryCumulusSimulation.InMemoryCumulusSimulation(
            initialWorkers,
            1,
            memoryPerWorkerMB=100,
            threadsPerWorker=2,
            s3Service=storage
            )

        try:
            schedulerReady = simulation.waitForGlobalScheduler(timeout=2.0)
            self.assertTrue(schedulerReady)

            simulation.getGlobalScheduler().setCheckpointStatusInterval(0.0001)

            computation = "Vector.range(20, fun(ix) { Vector.range(100000+ix).paged }).paged"
            simulation.submitComputation(computation)

            simulation.waitForAnyResult()

            simulation.addWorker()

            handshakeDone = simulation.waitForHandshake()
            self.assertTrue(handshakeDone)
        finally:
            simulation.teardown()
Example #11
0
    def basic_gpu_works_helper(self, function, onGPU=True):
        """Apply a loop around a named builtin to a test vector and expect a result.

        function -- name spliced into the script after a backtick, called on
            each loop value.
        onGPU -- when true the function is applied with `CUDAVectorApply;
            otherwise it is applied to the vector with ``~~``.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        vectorExpr = "Vector.range(1024*4, {_+1000000})"

        definition = """
            let f = fun(ct) {
                let res = 0.0
                let x = 1.0
                while (x < ct)
                    {
                    x = x + 1.0
                    res = res + `""" + function + """(x)
                    }
                res
                }"""

        # The application is appended directly after the closing brace of f.
        if onGPU:
            application = """`CUDAVectorApply(f,""" + vectorExpr + """)"""
        else:
            application = vectorExpr + """ ~~ f"""
        script = definition + application

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script,
            storage,
            1,
            timeout=120,
            threadCount=4
            )
        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
Example #12
0
    def test_cumulusCanTriggerNewRegimes(self):
        """Trigger a regime change on worker 0 and expect a new regime hash.

        Reads the regime hash after the first handshake, triggers the change,
        sleeps for a second, waits for another handshake and asserts that the
        hash is set and differs from the first one.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        workerCount = 4
        simulation = InMemoryCumulusSimulation.InMemoryCumulusSimulation(
            workerCount,
            1,
            memoryPerWorkerMB=100,
            threadsPerWorker=2,
            s3Service=storage
            )

        try:
            self.assertTrue(simulation.waitForGlobalScheduler(timeout=2.0))

            simulation.waitForHandshake()
            hashBefore = simulation.getWorker(0).getRegimeHash()
            self.assertTrue(hashBefore is not None)

            simulation.getWorker(0).triggerRegimeChange()

            # Pause before waiting for the next handshake.
            time.sleep(1.0)
            simulation.waitForHandshake()

            hashAfter = simulation.getWorker(0).getRegimeHash()
            self.assertTrue(hashAfter is not None)
            self.assertTrue(hashAfter != hashBefore)
        finally:
            simulation.teardown()
Example #13
0
    def basic_gpu_works_helper(self, function, onGPU=True):
        """Build and run a script that sums ``function`` over a counting loop.

        function -- builtin name inserted after a backtick inside the loop.
        onGPU -- chooses between `CUDAVectorApply (true) and a plain ``~~``
            apply over the same test vector (false).

        Asserts only that the computation produced a result.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        inputVector = "Vector.range(1024*4, {_+1000000})"

        pieces = ["""
            let f = fun(ct) {
                let res = 0.0
                let x = 1.0
                while (x < ct)
                    {
                    x = x + 1.0
                    res = res + `""" + function + """(x)
                    }
                res
                }"""]

        # No separator is inserted between the function body and the apply.
        pieces.append(
            """`CUDAVectorApply(f,""" + inputVector + """)""" if onGPU
            else inputVector + """ ~~ f"""
            )
        program = "".join(pieces)

        computed = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program, s3Factory, 1, timeout=120, threadCount=4)
        self.assertIsNotNone(computed)
        self.assertTrue(computed.isResult(), computed)
Example #14
0
    def test_transposeToColumnMajor(self):
        """Time the row-major to column-major transpose and record the result.

        The setup script's result is handed to the timed script as
        ``rowMajor``. The simulation is torn down whether or not the
        assertions pass.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        rowCount = 100000
        columnCount = 50

        setupResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.transposeSetupScript(rowCount, columnCount),
            storage,
            1,
            timeout=300,
            memoryLimitMb=45 * 1024,
            threadCount=30,
            returnSimulation=True,
            useInMemoryCache=False
            )

        try:
            self.assertTrue(setupResult.isResult())
            rowMajorData = setupResult.asResult.result

            # Only the transpose itself is timed.
            startTime = time.time()
            transposed = simulation.compute(
                self.transposeRowMajorToColumnMajorScript(rowCount, columnCount),
                timeout=500,
                rowMajor=rowMajorData
                )
            elapsed = time.time() - startTime

            self.assertTrue(transposed.isResult())

            metricName = "algorithms.text.transposeRowMajorToColumnMajor.%srows_%scolumns" % (
                rowCount, columnCount)
            PerformanceTestReporter.recordTest(metricName, elapsed, None)
        finally:
            simulation.teardown()
Example #15
0
    def classSortingTest(self, sz, useClass = True, machines=1, memory=1000):
        """Sort a vector of class instances (or Int64 values) and check its size.

        sz -- number of elements to generate.
        useClass -- when true the elements are instances of a one-member
            class; otherwise ``Int64`` is applied to each index.
        machines -- worker count handed to the simulation.
        memory -- forwarded to the simulation as ``memoryLimitMb``.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        if useClass:
            classFlag = '1'
        else:
            classFlag = '0'

        template = """
            let N = __size__;

            let C = if (__use_class__) { class { member x; } } else { Int64 }

            let values = Vector.range(N, C).paged;

            let s1 = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values))))
            
            return size(s1) == N
            """
        script = template.replace("__size__", str(sz)).replace("__use_class__", classFlag)

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, storage, machines, timeout=TIMEOUT, memoryLimitMb=memory)

        self.assertTrue(outcome is not None)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertTrue(outcome.asResult.result.pyval == True, outcome)
    def test_vector_string_apply(self):
        """Exercise repeated ``apply`` calls over a ten-million element vector.

        Meant to verify that the compiler holds up during many runs. The
        value returned by the computation is discarded.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        foraSource = """
            let v = Vector.range(10000000)

            let v2 = v.apply(String)

            let v3 = v2.apply({_ + "a"})

            v3.sum(size)
            """

        workerCount = 4
        InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            foraSource, s3Factory, workerCount, timeout=240)
Example #17
0
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads,
                                 maxBoosts):
        """Record cumulative timings for successive GBM boosting rounds.

        After generating data and building an initial fitter, each round in
        ``range(1, maxBoosts)`` computes predictions and then the next fitter.
        Both steps report the time elapsed since the start of the first
        round, so the recorded values are cumulative across rounds.

        nRows, nColumns -- passed to the data generation script.
        depth -- passed to the regression script.
        nThreads -- thread count for the single worker.
        maxBoosts -- exclusive upper bound on the boosting round number.
        """
        testName = self.getTestName(nRows, nColumns, depth, maxBoosts,
                                    nThreads)

        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        generated, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            storage,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        try:
            self.assertTrue(generated.isResult())

            predictorFrame, responseFrame = generated.asResult.result

            initialFit = simulation.compute(
                self.regressionScript(depth, 1),
                timeout=360,
                dfResponse=responseFrame,
                dfPredictors=predictorFrame
                )
            currentFitter = initialFit.asResult.result

            # The clock starts once and is never reset inside the loop.
            startTime = time.time()

            for nBoosts in range(1, maxBoosts):
                testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                            nThreads)

                predictionStep = simulation.compute(
                    "fitter.predictionsAndPseudoresiduals()",
                    timeout=360,
                    fitter=currentFitter
                    )
                currentPredictions = predictionStep.asResult.result
                elapsed = time.time() - startTime

                PerformanceTestReporter.recordTest(
                    testName + "_predict", elapsed, None)

                fitStep = simulation.compute(
                    "fitter.nextGivenPredictions(predictions)",
                    timeout=360,
                    fitter=currentFitter,
                    predictions=currentPredictions
                    )
                currentFitter = fitStep.asResult.result
                elapsed = time.time() - startTime

                PerformanceTestReporter.recordTest(testName, elapsed, None)
        finally:
            simulation.teardown()
    def test_CalculationRicochet(self):
        """Time two passes of random lookups into a large distributed vector.

        A 125,000,000 element vector is computed on four workers. Each pass
        then runs a script that performs 1000 random index reads into it,
        using seed 1 for the first pass and seed 2 for the second, and records
        the elapsed time as a performance test.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let f = fun(ct, seed = 1) {
                let x = 0

                let res = []

                let it = iterator(math.random.UniformReal(0, size(v), seed))

                for ix in sequence(ct) {
                    let x = Int64(pull it)
                    res = res :: (x / Float64(size(v)), v[x])
                    }

                return res
                }

            v[2]
            f(__count__,__seed__)
            """

        vResult, sim = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "Vector.range(125000000, math.log)",
            storage,
            4,
            timeout=120,
            memoryLimitMb=400,
            threadCount=1,
            useInMemoryCache=True,
            returnSimulation=True
            )

        # (seed, metric name) for each timed pass, in execution order.
        passes = (
            ("1", "python.InMemoryCumulus.Ricochet1000.Pass1"),
            ("2", "python.InMemoryCumulus.Ricochet1000.Pass2"),
            )

        try:
            v = vResult.asResult.result

            for seed, metricName in passes:
                startTime = time.time()
                script = text.replace("__seed__", seed).replace("__count__", "1000")
                sim.compute(script, timeout=120, v=v)
                PerformanceTestReporter.recordTest(
                    metricName, time.time() - startTime, None)
        finally:
            sim.teardown()
Example #19
0
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads, maxBoosts):
        """Time GBM prediction and refit steps over increasing boost counts.

        Generates the data frames on a single worker, builds a first fitter,
        then for each boost count in ``range(1, maxBoosts)`` records the time
        since the loop started after the prediction step (``<name>_predict``)
        and again after the refit step (``<name>``).

        nRows, nColumns -- passed to the data generation script.
        depth -- passed to the regression script.
        nThreads -- thread count for the worker.
        maxBoosts -- exclusive upper bound for the boost count loop.
        """
        testName = self.getTestName(nRows, nColumns, depth, maxBoosts, nThreads)

        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        setupOptions = dict(
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        dataResult, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns), s3Factory, 1, **setupOptions)

        try:
            self.assertTrue(dataResult.isResult())

            predictors, response = dataResult.asResult.result

            model = simulation.compute(
                self.regressionScript(depth, 1),
                timeout=360,
                dfResponse=response,
                dfPredictors=predictors
                ).asResult.result

            # One shared start time: recorded durations accumulate over rounds.
            loopStart = time.time()

            for nBoosts in range(1, maxBoosts):
                testName = self.getTestName(nRows, nColumns, depth, nBoosts, nThreads)

                modelPredictions = simulation.compute(
                    "fitter.predictionsAndPseudoresiduals()",
                    timeout=360,
                    fitter=model
                    ).asResult.result
                sinceStart = time.time() - loopStart
                PerformanceTestReporter.recordTest(testName + "_predict", sinceStart, None)

                model = simulation.compute(
                    "fitter.nextGivenPredictions(predictions)",
                    timeout=360,
                    fitter=model,
                    predictions=modelPredictions
                    ).asResult.result
                sinceStart = time.time() - loopStart
                PerformanceTestReporter.recordTest(testName, sinceStart, None)
        finally:
            simulation.teardown()
Example #20
0
        def takeTest(indexExpr):
            """Run the take script with ``indexExpr`` substituted for ``__indices__``.

            Asserts that the computation returns a result whose value is true.
            """
            script = takeText.replace("__indices__", indexExpr)

            outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                script, s3, 1, timeout=TIMEOUT, memoryLimitMb=1000)

            self.assertTrue(outcome is not None)
            self.assertTrue(outcome.isResult(), outcome)
            self.assertTrue(outcome.asResult.result.pyval == True, outcome)
        def takeTest(indexExpr):
            """Check that the take script evaluates to true for ``indexExpr``.

            indexExpr -- text that replaces the ``__indices__`` placeholder.
            """
            runOptions = dict(timeout=TIMEOUT, memoryLimitMb=1000)
            computed = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                takeText.replace("__indices__", indexExpr),
                s3,
                1,
                **runOptions
                )

            self.assertTrue(computed is not None)
            self.assertTrue(computed.isResult(), computed)
            self.assertTrue(computed.asResult.result.pyval == True, computed)
Example #22
0
    def test_disk_scans(self):
        """Time writing several large vectors and then summing each of them.

        Uses the actual S3 interface with an S3-backed object store and a
        real disk cache (``useInMemoryCache=False``). Two performance numbers
        are recorded: the time to create the vectors, and the total time
        including the ``sum`` scan over every vector.
        """
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        objectStore = S3ObjectStore.S3ObjectStore(
            s3,
            Setup.config().userDataS3Bucket,
            prefix="test_object_cache/"
            )

        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            memoryLimitMb=1 * 1024,
            threadCount=30,
            returnSimulation=True,
            ioTaskThreadOverride=8,
            objectStore=objectStore,
            useInMemoryCache=False  #use an actual disk cache for this
            )

        try:
            # One vector is created per "gigabyte"; presumably each
            # 125,000,000-element vector is about 1 GB -- TODO confirm.
            gigabytes = 8

            t0 = time.time()

            resultVectors = []
            for ix in range(gigabytes):
                result = simulation.compute("Vector.range(125000000 + %s)" % ix, timeout=120)
                resultVectors.append(result.asResult.result)

            t1 = time.time()

            intResults = []
            for vec in resultVectors:
                result = simulation.compute("v.sum()", timeout=120, v=vec)
                intResults.append(result.asResult.result.pyval)

            self.assertTrue(len(intResults) == gigabytes)

            # The label is derived from `gigabytes`, as the WriteAndScan
            # metric below already does. It used to be a fixed "10GB", which
            # did not match the amount of data actually written.
            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write.%sGB" % gigabytes,
                t1 - t0,
                None
                )

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
                time.time() - t0,
                None
                )
        finally:
            simulation.teardown()
Example #23
0
    def loadCheckpointFromFreshSimulationTest(self,
                                              calculationText,
                                              timestampsPerPassList,
                                              clientCount=1,
                                              timestep=1.0):
        """Run one calculation across several fresh simulations, sampling checkpoints.

        Each pass starts a new four-worker simulation that reuses the same S3
        factory and the shared-state view factory of the previous pass,
        submits ``calculationText`` on every client, samples
        ``timeElapsedOfMostRecentCheckpoints`` once per timestep, triggers a
        full checkpoint, waits for it, and samples once more. Worker state is
        dumped to the log and the simulation torn down after every pass.

        calculationText -- computation submitted on each client.
        timestampsPerPassList -- one sample count per pass.
        clientCount -- number of clients in each simulation.
        timestep -- seconds slept before each sample.

        Returns a list with one list of samples per pass.
        """
        workerCount = 4
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        allStatuses = []
        sharedViewFactory = None

        for sampleCount in timestampsPerPassList:
            simulation = InMemoryCumulusSimulation.InMemoryCumulusSimulation(
                workerCount,
                clientCount,
                memoryPerWorkerMB=100,
                threadsPerWorker=2,
                s3Service=storage,
                sharedStateViewFactory=sharedViewFactory
                )

            # Carry the view factory over to the next pass's simulation.
            sharedViewFactory = simulation.sharedStateViewFactory

            passStatuses = []

            try:
                self.assertTrue(simulation.waitForGlobalScheduler(timeout=2.0))

                simulation.getGlobalScheduler().setCheckpointStatusInterval(0.1)

                for clientIndex in range(clientCount):
                    simulation.submitComputationOnClient(clientIndex, calculationText)

                for _ in range(sampleCount):
                    time.sleep(timestep)
                    sample = self.timeElapsedOfMostRecentCheckpoints(simulation)
                    passStatuses.append(sample)

                simulation.getGlobalScheduler().triggerFullCheckpointsOnOutstandingComputations()

                self.waitForFullCheckpoint(simulation)

                finalSample = self.timeElapsedOfMostRecentCheckpoints(simulation)
                passStatuses.append(finalSample)
            finally:
                for workerIndex in range(workerCount):
                    simulation.getWorker(workerIndex).dumpStateToLog()

                simulation.teardown()

            allStatuses.append(passStatuses)

        return allStatuses
    def largeDatasetJoinTest(self, mbOfData, columns, threads, machineCount, ratio = .5):
        """Time an outer join between the two halves of a generated dataset.

        The data is generated first (untimed); the join script then splits it
        in half, builds two data frames and joins them on column "C0". The
        elapsed time of the join is logged and recorded as a performance test.

        mbOfData -- passed to the data generation script; also used, divided
            by ``ratio`` and ``machineCount``, as the per-worker memory limit.
        columns -- passed to the data generation script.
        threads -- thread count per worker.
        machineCount -- number of workers.
        ratio -- divisor applied to ``mbOfData`` when sizing memory.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, columns),
                        s3,
                        machineCount,
                        timeout = 360,
                        memoryLimitMb = mbOfData / ratio / machineCount,
                        #channelThroughputMBPerSecond = 100.0,
                        threadCount = threads,
                        returnSimulation = True,
                        useInMemoryCache = False,
                        disableEventHandler = True
                        )

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result

            joinScript = """
                    let leftDF = dataframe.DataFrame(data[,size(data)/2])
                    let rightDF = dataframe.DataFrame(data[size(data)/2,])

                    size(leftDF.join(rightDF, on: "C0", how: `outer, chunkSize: 1000000, areSorted:true))
                    """

            # Only the join is timed; data generation above is excluded.
            t0 = time.time()
            result = simulation.compute(
                joinScript,
                timeout=1080,
                data=data
                )
            totalTimeToReturnResult = time.time() - t0

            logging.info("Total time to join: %s", totalTimeToReturnResult)

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(
                "algorithms.Join.inMemory_%sMB_%scols_%sthreads_%smachines" %
                    (mbOfData, columns,threads,machineCount),
                totalTimeToReturnResult,
                None
                )
        finally:
            # Drop the references this function actually holds before
            # teardown. The previous code cleared `dfResponse` and
            # `dfPredictors`, which this function never defines, and left
            # `data` alive. NOTE(review): presumably the intent is to release
            # computed values before the simulation shuts down -- confirm.
            data = None
            result = None
            simulation.teardown()
    def gbmRegressionFittingTest(self,
                                 nRows,
                                 nColumns,
                                 depth,
                                 nThreads,
                                 nBoosts,
                                 copies,
                                 report=True):
        """Time fitting ``copies`` GBM models from one builder in a single call.

        Generates the data frames, builds the regression builder, then times a
        computation that applies ``builder.fit`` over ``Vector.range(copies)``,
        each fit using a slice of the predictors and response.

        nRows, nColumns -- passed to the data generation script.
        depth, nBoosts -- passed to the regression script.
        nThreads -- thread count for the single worker.
        copies -- number of fits performed in the timed computation.
        report -- when true the elapsed time is recorded as a performance test.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        generated, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            storage,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        try:
            self.assertTrue(generated.isResult())

            predictorFrame, responseFrame = generated.asResult.result

            builderResult = simulation.compute(
                self.regressionScript(depth, nBoosts),
                timeout=360,
                dfResponse=responseFrame,
                dfPredictors=predictorFrame
                )
            fitBuilder = builderResult.asResult.result

            startTime = time.time()

            testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                        nThreads, copies)

            fitScript = (
                "Vector.range(%s).apply(fun(x) { builder.fit(dfPredictors[,-x-1], dfResponse[,-x-1]) })"
                % copies
                )
            generated = simulation.compute(
                fitScript,
                timeout=360,
                builder=fitBuilder,
                dfPredictors=predictorFrame,
                dfResponse=responseFrame
                ).asResult.result
            elapsed = time.time() - startTime

            if report:
                PerformanceTestReporter.recordTest(testName, elapsed, None)
        finally:
            simulation.teardown()
    def test_importanceSampling(self):
        """Run the ``importanceSampling`` script on four workers and expect a result."""
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        workerCount = 4
        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            importanceSampling,
            storage,
            workerCount,
            memoryLimitMb=1000,
            timeout=240,
            useInMemoryCache=False
            )

        self.assertTrue(outcome.isResult())
    def test_bigLmOnDataframe(self):
        """Run the ``importanceSampling`` script on one four-thread worker.

        Passes when the computation comes back as a result; the outcome is
        used as the assertion message otherwise.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        settings = dict(
            memoryLimitMb=4000,
            threadCount=4,
            timeout=240,
            useInMemoryCache=False
            )
        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            importanceSampling, storage, 1, **settings)

        self.assertTrue(outcome.isResult(), outcome)
Example #28
0
    def runOnGPU(self, funcExpr, vecExpr, captureExpr=""):
        """Evaluate ``#GpuApply(f, vec)`` for the given expressions.

        funcExpr -- text substituted for the function placeholder.
        vecExpr -- text substituted for the vector placeholder.
        captureExpr -- text placed in front of the script; placeholders
            inside it are not substituted.

        Asserts that a value came back and that it is not an exception, then
        returns it.
        """
        storage = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        template = """
            let f = __funcExpr__;
            let vec = __vecExpr__;
            cached`(#GpuApply(f, vec));
            """
        # Substitute into the template only, then prepend captureExpr as-is.
        filled = template.replace("__funcExpr__", funcExpr).replace("__vecExpr__", vecExpr)
        script = captureExpr + filled

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script,
            storage,
            1,
            timeout=120,
            threadCount=4,
            memoryLimitMb=1000
            )
        self.assertIsNotNone(outcome)
        self.assertFalse(outcome.isException(), "Failed with %s" % outcome)
        return outcome
    def largeDatasetJoinTest(self,
                             mbOfData,
                             columns,
                             threads,
                             machineCount,
                             ratio=.5):
        """Generate a dataset, then time an outer join of its two halves.

        Data generation runs first and is not timed. The join script splits
        the data in half, wraps each half in a data frame and joins them on
        column "C0"; the join's elapsed time is logged and recorded as a
        performance test.

        mbOfData -- passed to the data generation script; the per-worker
            memory limit is ``mbOfData / ratio / machineCount``.
        columns -- passed to the data generation script.
        threads -- thread count per worker.
        machineCount -- number of workers.
        ratio -- divisor applied to ``mbOfData`` when sizing memory.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            machineCount,
            timeout=360,
            memoryLimitMb=mbOfData / ratio / machineCount,
            #channelThroughputMBPerSecond = 100.0,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False,
            disableEventHandler=True)

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result

            joinScript = """
                    let leftDF = dataframe.DataFrame(data[,size(data)/2])
                    let rightDF = dataframe.DataFrame(data[size(data)/2,])

                    size(leftDF.join(rightDF, on: "C0", how: `outer, chunkSize: 1000000, areSorted:true))
                    """

            # The timer covers the join only, not the data generation above.
            t0 = time.time()
            result = simulation.compute(joinScript, timeout=1080, data=data)
            totalTimeToReturnResult = time.time() - t0

            logging.info("Total time to join: %s", totalTimeToReturnResult)

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(
                "algorithms.Join.inMemory_%sMB_%scols_%sthreads_%smachines" %
                (mbOfData, columns, threads, machineCount),
                totalTimeToReturnResult, None)
        finally:
            # Release what this function really holds before teardown. The
            # earlier version reset `dfResponse` and `dfPredictors`, names
            # that do not exist here, while `data` stayed referenced.
            # NOTE(review): presumably values should be dropped before the
            # simulation is torn down -- confirm.
            data = None
            result = None
            simulation.teardown()
    def test_bigLmOnDataframe(self):
        #Run the externally defined `importanceSampling` script on one worker;
        #the test passes when the computation comes back as a result.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        compute = InMemoryCumulusSimulation.computeUsingSeveralWorkers
        outcome = compute(
            importanceSampling,
            s3Factory,
            1,
            timeout=240,
            threadCount=4,
            memoryLimitMb=4000,
            useInMemoryCache=False)

        #pass the outcome along so a failure prints what was returned
        self.assertTrue(outcome.isResult(), outcome)
Example #31
0
 def check_precision_of_function_on_GPU(self, function, input):
     """Compare a `CUDAVectorApply evaluation against Python's math module.

     `function` names a function that is invoked as a backquoted symbol in
     the generated script and looked up as an attribute of Python's `math`
     module; `input` is the single value it is applied to. The test fails
     unless the two values differ by less than 1e-10.
     """
     s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()
     text = """
         let f = fun(x) {
             `""" + function + """(x)
             }
         `CUDAVectorApply(f, [""" + str(input) + """])[0]
         """
     res = InMemoryCumulusSimulation.computeUsingSeveralWorkers(text, s3, 1, timeout=120, threadCount=4)
     self.assertIsNotNone(res)
     self.assertTrue(res.isResult(), res)
     gpuValue = res.asResult.result.pyval
     pythonValue = getattr(math, function)(input)
     # assertLess plus an explicit message reports both values on failure;
     # assertTrue on a pre-computed boolean only says "False is not true".
     self.assertLess(
         abs(gpuValue - pythonValue),
         1e-10,
         "%s(%r): GPU returned %r, Python returned %r" %
         (function, input, gpuValue, pythonValue))
Example #32
0
 def check_precision_of_function_on_GPU(self, function, input):
     """Compare a `#GpuApply` evaluation against Python's math module.

     `function` names a function that is invoked as a backquoted symbol in
     the generated script and looked up as an attribute of Python's `math`
     module; `input` is the single value it is applied to. The test fails
     unless the two values differ by less than 1e-10.
     """
     s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()
     text = """
         let f = fun(x) {
             `""" + function + """(x)
             }
         cached`(#GpuApply(f, [""" + str(input) + """]))[0]
         """
     res = InMemoryCumulusSimulation.computeUsingSeveralWorkers(text, s3, 1, timeout=120, threadCount=4)
     self.assertIsNotNone(res)
     self.assertTrue(res.isResult(), res)
     gpuValue = res.asResult.result.pyval
     pythonValue = getattr(math, function)(input)
     # assertLess plus an explicit message reports both values on failure;
     # assertTrue on a pre-computed boolean only says "False is not true".
     self.assertLess(
         abs(gpuValue - pythonValue),
         1e-10,
         "%s(%r): GPU returned %r, Python returned %r" %
         (function, input, gpuValue, pythonValue))
Example #33
0
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, testName):
        """Time data generation and a linear-regression fit on one worker.

        Data comes from self.dataGenerationScript(mbOfData, columns). When
        `testName` is not None, the generation time is recorded under
        testName + "_create" and the regression time under testName.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        creationStart = time.time()

        outcome, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3Factory,
            1,
            memoryLimitMb=50 * 1024,
            threadCount=threads,
            timeout=360,
            useInMemoryCache=False,
            returnSimulation=True)

        if testName is not None:
            creationSeconds = time.time() - creationStart
            PerformanceTestReporter.recordTest(
                testName + "_create", creationSeconds, None)

        try:
            self.assertTrue(outcome.isResult())

            response, predictors = outcome.asResult.result

            script = """
                let model = math.regression.LinearRegression(dfPredictors, dfResponse, fitIntercept: false);
                let coefficients = model.coefficients();
                coefficients[0]
                """

            fitStart = time.time()
            outcome = simulation.compute(
                script,
                timeout=1080,
                dfResponse=response,
                dfPredictors=predictors)
            fitSeconds = time.time() - fitStart

            self.assertTrue(outcome.isResult())

            if testName is not None:
                PerformanceTestReporter.recordTest(testName, fitSeconds, None)
        finally:
            # Release local references to computed values before teardown.
            response = None
            predictors = None
            outcome = None
            simulation.teardown()
Example #34
0
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, testName):
        """Generate a dataset, fit a linear regression on it, and time both.

        Both steps run on a single-worker simulation. Timings are reported
        through PerformanceTestReporter only when `testName` is not None:
        generation under testName + "_create", regression under testName.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        startedAt = time.time()

        simulationSettings = dict(
            timeout=360,
            memoryLimitMb=50 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False
            )
        computed, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3Factory,
            1,
            **simulationSettings
            )

        if testName is not None:
            PerformanceTestReporter.recordTest(testName + "_create", time.time() - startedAt, None)

        try:
            self.assertTrue(computed.isResult())

            responseFrame, predictorFrame = computed.asResult.result

            regressionScript = """
                let model = math.regression.LinearRegression(dfPredictors, dfResponse, fitIntercept: false);
                let coefficients = model.coefficients();
                coefficients[0]
                """

            startedAt = time.time()
            computed = simulation.compute(
                regressionScript,
                timeout=1080,
                dfPredictors=predictorFrame,
                dfResponse=responseFrame
                )
            regressionSeconds = time.time() - startedAt

            self.assertTrue(computed.isResult())

            if testName is not None:
                PerformanceTestReporter.recordTest(testName, regressionSeconds, None)
        finally:
            # Clear local handles on computed values, then shut down.
            responseFrame = None
            predictorFrame = None
            computed = None
            simulation.teardown()
    def regressionTreePredictionTest(self,
                                     mbOfData,
                                     columns,
                                     testName,
                                     treeDepth,
                                     threads,
                                     minSamplesSplit=50):
        """Time regression-tree prediction on a generated dataset.

        A tree is fit with self.regressionScript(treeDepth,
        minSamplesSplit - 1) on data from
        self.dataGenerationScript(mbOfData, columns). Only the subsequent
        `predict` call is timed, and it is recorded under `testName`.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            1,
            timeout=360,
            memoryLimitMb=45 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(result.isResult(), result)

            dfResponse, dfPredictors = result.asResult.result

            fitTree = simulation.compute(
                self.regressionScript(treeDepth, minSamplesSplit - 1),
                timeout=120,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors)

            # Fail here with a readable message if fitting did not succeed,
            # rather than on the `.asResult.result` access further down.
            self.assertTrue(fitTree.isResult(), fitTree)

            def predictionScript(dirtyFlag=1):
                return ";(%s; fitRegressionTree.predict(dfPredictors));" % dirtyFlag

            t0 = time.time()
            result = simulation.compute(
                predictionScript(),
                timeout=120,
                dfPredictors=dfPredictors,
                fitRegressionTree=fitTree.asResult.result)
            totalTimeToReturnResult = time.time() - t0

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(testName,
                                               totalTimeToReturnResult, None)

        finally:
            simulation.teardown()
Example #36
0
    def test_vector_transpose(self):
        #check that the compiler keeps working across many runs.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #"1+1" is only a warm-up computation; its value is discarded and the
        #returned simulation is what the test uses.
        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3Factory,
            2,
            timeout=10,
            memoryLimitMb=500,
            channelThroughputMBPerSecond=50.0,
            returnSimulation=True)

        transposeScript = """
                let arrangedContiguously = fun (vecs) {
                    let res = vecs.sum().paged;

                    let tr = []
                    let low = 0
                    for v in vecs {
                        tr = tr :: res[low,low+size(v)]
                        low = low + size(v)
                        }

                    tr
                    };

                let transpose = fun(vecOfIndexable) {
                    let vecs = arrangedContiguously(vecOfIndexable)

                    let n = size(vecs[0]);

                    [[vecs[jx][ix] for jx in sequence(size(vecs))] for ix in sequence(n)]
                    };

                let vectors = Vector.range(5000, {Vector.range(300)})

                transpose(vectors)
                """

        try:
            outcome = simulation.compute(transposeScript, timeout=30.0)

            self.assertTrue(outcome.isResult())
        finally:
            simulation.teardown()
    def test_vector_transpose(self):
        #make sure repeated runs do not break the compiler.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        simulationSettings = dict(
            memoryLimitMb=500,
            timeout=10,
            returnSimulation=True,
            channelThroughputMBPerSecond=50.0
            )

        #the "1+1" value itself is ignored; only the simulation is kept
        warmup, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1", s3Factory, 2, **simulationSettings)

        script = """
                let arrangedContiguously = fun (vecs) {
                    let res = vecs.sum().paged;

                    let tr = []
                    let low = 0
                    for v in vecs {
                        tr = tr :: res[low,low+size(v)]
                        low = low + size(v)
                        }

                    tr
                    };

                let transpose = fun(vecOfIndexable) {
                    let vecs = arrangedContiguously(vecOfIndexable)

                    let n = size(vecs[0]);

                    [[vecs[jx][ix] for jx in sequence(size(vecs))] for ix in sequence(n)]
                    };

                let vectors = Vector.range(5000, {Vector.range(300)})

                transpose(vectors)
                """

        try:
            transposed = simulation.compute(script, timeout=30.0)
            self.assertTrue(transposed.isResult())
        finally:
            simulation.teardown()
    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads, nBoosts, copies, report=True):
        """Time `copies` fits of a builder produced by self.regressionScript.

        Data comes from self.dataGenerationScript(nRows, nColumns) on a
        single worker with `nThreads` threads. The builder is created from
        self.regressionScript(depth, nBoosts). The timing is recorded under
        self.getTestName(...) only when `report` is true.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        outcome, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(outcome.isResult())

            predictors, response = outcome.asResult.result

            builderOutcome = simulation.compute(
                self.regressionScript(depth, nBoosts),
                timeout=360,
                dfResponse=response,
                dfPredictors=predictors)
            builder = builderOutcome.asResult.result

            startedAt = time.time()

            testName = self.getTestName(nRows, nColumns, depth, nBoosts, nThreads, copies)

            fitScript = (
                "Vector.range(%s).apply(fun(x) { builder.fit(dfPredictors[,-x-1], dfResponse[,-x-1]) })"
                % copies)
            fitOutcome = simulation.compute(
                fitScript,
                timeout=360,
                builder=builder,
                dfPredictors=predictors,
                dfResponse=response)
            # The fitted value is unwrapped inside the timed region, as before.
            outcome = fitOutcome.asResult.result
            elapsedSeconds = time.time() - startedAt

            if report:
                PerformanceTestReporter.recordTest(testName, elapsedSeconds, None)
        finally:
            simulation.teardown()
Example #39
0
 def createSimulation(self,
                      useHdfsObjectStore=False,
                      objectStore=None,
                      sharedStateViewFactory=None,
                      workerCount=4,
                      machineIdHashSeed=None,
                      s3Service=None):
     """Build an InMemoryCumulusSimulation with `workerCount` workers.

     A fresh in-memory S3 factory is used when `s3Service` is falsy. Each
     worker gets memoryPerWorkerMB=100 and threadsPerWorker=2. The
     `useHdfsObjectStore` argument is accepted but not used by this method.
     """
     if s3Service:
         s3 = s3Service
     else:
         s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

     simulationOptions = dict(
         memoryPerWorkerMB=100,
         threadsPerWorker=2,
         s3Service=s3,
         objectStore=objectStore,
         sharedStateViewFactory=sharedStateViewFactory,
         machineIdHashSeed=machineIdHashSeed)

     return InMemoryCumulusSimulation.InMemoryCumulusSimulation(
         workerCount, 1, **simulationOptions)
    def test_CalculationRicochet(self):
        #Build a large vector on four workers, then time two passes of random
        #lookups into it (seeds 1 and 2, 1000 lookups each). The outcomes of
        #the timed computations are not inspected.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let f = fun(ct, seed = 1) {
                let x = 0

                let res = []

                let it = iterator(math.random.UniformReal(0, size(v), seed))

                for ix in sequence(ct) {
                    let x = Int64(pull it)
                    res = res :: (x / Float64(size(v)), v[x])
                    }

                return res
                }

            v[2]
            f(__count__,__seed__)
            """

        vResult, sim = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "Vector.range(125000000, math.log)",
            s3Factory,
            4,
            memoryLimitMb=400,
            threadCount=1,
            timeout=120,
            returnSimulation=True,
            useInMemoryCache=True)

        passes = (
            ("1", "python.InMemoryCumulus.Ricochet1000.Pass1"),
            ("2", "python.InMemoryCumulus.Ricochet1000.Pass2"),
            )

        try:
            v = vResult.asResult.result

            for seed, reportName in passes:
                startedAt = time.time()
                script = text.replace("__seed__", seed).replace("__count__", "1000")
                sim.compute(script, timeout=120, v=v)
                PerformanceTestReporter.recordTest(reportName, time.time() - startedAt, None)
        finally:
            sim.teardown()
Example #41
0
    def test_disk_scans(self):
        #Create `gigabytes` large vectors with the in-memory cache disabled,
        #then sum each of them, recording the creation time and the total time.
        s3 = ActualS3Interface.ActualS3InterfaceFactory()
        bucket = Setup.config().userDataS3Bucket
        objectStore = S3ObjectStore.S3ObjectStore(
            s3, bucket, prefix="test_object_cache/")

        _, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "1+1",
            s3,
            1,
            threadCount=30,
            memoryLimitMb=1 * 1024,
            ioTaskThreadOverride=8,
            objectStore=objectStore,
            returnSimulation=True,
            useInMemoryCache=False  #use an actual disk cache for this
        )

        try:
            gigabytes = 8

            startedAt = time.time()

            #adding ix makes every vector distinct
            resultVectors = [
                simulation.compute("Vector.range(125000000 + %s)" % ix,
                                   timeout=120).asResult.result
                for ix in range(gigabytes)
                ]

            writesDoneAt = time.time()

            intResults = [
                simulation.compute("v.sum()", timeout=120, v=vec).asResult.result.pyval
                for vec in resultVectors
                ]

            self.assertTrue(len(intResults) == gigabytes)

            #NOTE(review): this label says 10GB while `gigabytes` is 8; it is
            #left unchanged so the recorded metric name stays the same --
            #confirm whether it should follow `gigabytes` like the one below.
            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.Write.10GB", writesDoneAt - startedAt, None)

            PerformanceTestReporter.recordTest(
                "python.BigBox.Disk.WriteAndScan.%sGB" % gigabytes,
                time.time() - startedAt,
                None)
        finally:
            simulation.teardown()
Example #42
0
    def compareCudaToCPU(self, funcExpr, vecExpr):
        """Require that `CUDAVectorApply and direct evaluation agree.

        The generated script applies `funcExpr` to `vecExpr` both ways and
        throws a "cuda != cpu" string when the two values differ; the test
        fails unless the computation comes back as a result.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        script = """
            let f = __funcExpr__;
            let i = __vecExpr__;
            let cuda = `CUDAVectorApply(f, [i])[0];
            let cpu = f(i)

            if (cuda == cpu)
                true
            else
                throw String(cuda) + " != " + String(cpu)
            """
        # Substitute in the same order as before: function first, then input.
        substitutions = (("__funcExpr__", funcExpr), ("__vecExpr__", vecExpr))
        for placeholder, replacement in substitutions:
            script = script.replace(placeholder, replacement)

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, s3Factory, 1, timeout=120, threadCount=4)

        self.assertIsNotNone(outcome)
        failureMessage = "Failed with %s on %s: %s" % (funcExpr, vecExpr, outcome)
        self.assertTrue(outcome.isResult(), failureMessage)
Example #43
0
    def compareCudaToCPUnoCheck(self, funcExpr, vecExpr, captureExpr=""):
        """Run a `#GpuApply` versus CPU comparison and return the raw outcome.

        `captureExpr` is prepended verbatim to the generated script. Unlike
        compareCudaToCPU, this only asserts that an outcome came back; the
        caller decides whether it should be a result or an exception.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        template = """
            let f = __funcExpr__;
            let vec = __vecExpr__;
            let cuda = cached`(#GpuApply(f, vec));
            let cpu = [f(x) for x in vec]

            if (cuda == cpu)
                true
            else
                throw String(cuda) + " != " + String(cpu)
            """
        # Only the template is subject to substitution; captureExpr is
        # concatenated untouched.
        body = template.replace("__funcExpr__", funcExpr)
        body = body.replace("__vecExpr__", vecExpr)
        script = captureExpr + body

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            script, s3Factory, 1, timeout=120, threadCount=4)

        self.assertIsNotNone(outcome)
        return outcome
    def regressionTreePredictionTest(self, mbOfData, columns, testName,
                                     treeDepth, threads, minSamplesSplit=50):
        """Time regression-tree prediction on a generated dataset.

        A tree is fit with self.regressionScript(treeDepth,
        minSamplesSplit - 1) on data from
        self.dataGenerationScript(mbOfData, columns). Only the subsequent
        `predict` call is timed, and it is recorded under `testName`.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, columns),
                        s3,
                        1,
                        timeout = 360,
                        memoryLimitMb = 45 * 1024,
                        threadCount = threads,
                        returnSimulation = True,
                        useInMemoryCache = False
                        )
        try:
            self.assertTrue(result.isResult(), result)

            dfResponse, dfPredictors = result.asResult.result

            fitTree = simulation.compute(
                self.regressionScript(treeDepth, minSamplesSplit - 1),
                timeout=120,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            # Fail here with a readable message if fitting did not succeed,
            # rather than on the `.asResult.result` access further down.
            self.assertTrue(fitTree.isResult(), fitTree)

            def predictionScript(dirtyFlag=1):
                return ";(%s; fitRegressionTree.predict(dfPredictors));" % dirtyFlag

            t0 = time.time()
            result = simulation.compute(
                predictionScript(),
                timeout=120,
                dfPredictors=dfPredictors,
                fitRegressionTree=fitTree.asResult.result
                )
            totalTimeToReturnResult = time.time() - t0

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(testName, totalTimeToReturnResult, None)

        finally:
            simulation.teardown()
Example #45
0
    def test_takeFromLargeObjectsAsymmetric(self):
        #Run a distributed #Take that selects only the large strings out of a
        #mostly-empty vector, then check that every page of the resulting
        #vector is smaller than 5 MB.
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let N = 20;

            //every thousandth string is 1 MB. Just take those.
            let takeFrom = [
                if (ix % 1000 == 0)
                    (" " * 100 * 100 * 10 * 10 + " " * (ix / 1000))
                else
                    ""
                for ix in sequence(N * 1000)].paged;

            let indices = Vector.range(N,fun(x) { x * 1000 }).paged;

            let result = cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))))

            let targetResult = indices ~~ {takeFrom[_]};

            assertions.assertEqual(size(result), size(targetResult))
            assertions.assertEqual(result, targetResult)

            result
            """

        #Start the simulation outside the try block: if start-up raises there
        #is nothing to tear down, and the real exception propagates instead of
        #being masked by an UnboundLocalError on `simulation` in `finally`.
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            text,
            s3,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=1000,
            returnSimulation = True,
            pageSizeOverride = 1024 * 1024
            )

        try:
            self.assertIsNotNone(result)
            self.assertTrue(result.isResult(), result)

            for page in result.asResult.result.getVectorPageIds(simulation.getWorkerVdm(0)):
                #bytecount converted to megabytes
                self.assertLess(page.bytecount / 1024.0 / 1024.0, 5.0)
        finally:
            simulation.teardown()
Example #46
0
    def test_multiboxDataTasksTake_2(self):
        #Run a distributed #Take across eight workers and check both the
        #outcome and the largest mmapped-bytes high-water mark of any worker.
        workerCount = 8
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let N = 10 * 1000000;
            let takeFrom = Vector.range(N)
            let indices = Vector.range(N,fun(x) { (0, (x * 503) % N ) });

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))))[0]
            """

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            text,
            s3Factory,
            workerCount,
            memoryLimitMb=200,
            timeout=TIMEOUT,
            returnSimulation=True)

        logging.info("Simulation completed")

        peakBytes = 0

        try:
            for workerIx in range(workerCount):
                worker = simulation.getWorkerVdm(workerIx)
                memoryManager = worker.getMemoryManager()

                logging.info(
                    "Total bytes: %s",
                    memoryManager.getTotalBytesMmappedHighWaterMark())
                peakBytes = max(
                    peakBytes,
                    memoryManager.getTotalBytesMmappedHighWaterMark())

                #drop the handles before moving on to the next worker
                worker = None
                memoryManager = None

            self.assertTrue(result is not None)
            self.assertTrue(result.isResult(), result)
            self.assertTrue(
                isinstance(result.asResult.result.pyval, int), result)
        finally:
            simulation.teardown()

        self.assertTrue(peakBytes < 265 * 1024 * 1024)
Example #47
0
    def DISABLEDtest_canTriggerCheckpointOfCompleted(self):
        #Submit "1+2", wait for its result, trigger full checkpoints, and check
        #that the first reported status is a successful storage checkpoint of
        #a finished root computation.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        simulation = InMemoryCumulusSimulation.InMemoryCumulusSimulation(
            4,  #worker count
            1,
            s3Service=s3Factory,
            memoryPerWorkerMB=100,
            threadsPerWorker=2)

        try:
            schedulerReady = simulation.waitForGlobalScheduler(timeout=2.0)
            self.assertTrue(schedulerReady)

            simulation.waitForHandshake()

            simulation.submitComputation("1+2")

            outcome = simulation.waitForAnyResult()
            self.assertTrue(outcome.isResult())

            scheduler = simulation.getGlobalScheduler()
            scheduler.triggerFullCheckpointsOnOutstandingComputations()

            ts = self.waitForFullCheckpoint(simulation, onlyUnfinished=False)
            self.assertTrue(ts is not None)

            scheduler = simulation.getGlobalScheduler()
            statuses = scheduler.currentOutstandingCheckpointStatuses(False, False)

            firstStatus = statuses[0]
            compId = firstStatus[0]
            checkpointStatus, checkpointRequest = firstStatus[1][0], firstStatus[1][1]

            #verify that it's a storage checkpoint
            self.assertTrue(checkpointRequest.writeToStorage)
            self.assertTrue(checkpointStatus.checkpointSuccessful)
            self.assertTrue(checkpointStatus.isRootComputationFinished)
        finally:
            simulation.teardown()
    def test_multiboxDataTasksTake_2(self):
        #Distributed #Take on eight workers: the outcome must be an int result
        #and no worker's mmapped-bytes high-water mark may reach 265 MB.
        machineCount = 8
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let N = 10 * 1000000;
            let takeFrom = Vector.range(N)
            let indices = Vector.range(N,fun(x) { (0, (x * 503) % N ) });

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))))[0]
            """

        simulationSettings = dict(
            timeout=TIMEOUT,
            memoryLimitMb=200,
            returnSimulation=True
            )
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            text, s3Factory, machineCount, **simulationSettings)

        logging.info("Simulation completed")

        highestWatermark = 0

        try:
            for machineIx in range(machineCount):
                vdm = simulation.getWorkerVdm(machineIx)
                manager = vdm.getMemoryManager()

                logging.info("Total bytes: %s", manager.getTotalBytesMmappedHighWaterMark())
                highestWatermark = max(highestWatermark, manager.getTotalBytesMmappedHighWaterMark())

                #release the per-worker handles before the next iteration
                vdm = None
                manager = None

            self.assertTrue(result is not None)
            self.assertTrue(result.isResult(), result)
            self.assertTrue(isinstance(result.asResult.result.pyval,int), result)
        finally:
            simulation.teardown()

        self.assertTrue(highestWatermark < 265 * 1024 * 1024)
Example #49
0
 def computeUsingSeveralWorkers(self, *args, **kwds):
     """Forward all arguments unchanged to
     InMemoryCumulusSimulation.computeUsingSeveralWorkers and return its value.
     """
     delegate = InMemoryCumulusSimulation.computeUsingSeveralWorkers
     return delegate(*args, **kwds)
    def dataframeSumTest(self, mbOfData, colCount, threadCount, recordResults = True):
        """Time pseudo-random column access over rows of a generated dataset.

        Data comes from self.dataGenerationScript(mbOfData, colCount) on one
        worker with `threadCount` threads. The timed script sums, for
        1000000 * threadCount rows, 10 pseudo-randomly chosen columns per row.
        When `recordResults` is true, the derived time per 10 million accessed
        values per thread is reported through PerformanceTestReporter.

        Returns the wall-clock duration of the timed computation, as a
        difference of time.time() values.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        randomColumnsToPick = 10
        totalRowsToSum = 1000000

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, colCount),
                        s3,
                        count=1,
                        timeout = 360,
                        memoryLimitMb = 10000,
                        threadCount = threadCount,
                        returnSimulation = True,
                        useInMemoryCache = False,
                        channelThroughputMBPerSecond = None
                        )

        try:
            self.assertTrue(result.isResult(), result)

            data = result.asResult.result
            executionScript = ("""
                let randomRowwiseSumFun = fun (row, randomColumnsToPick, baseSeed){
                    let rng = iterator(math.random.MultiplyWithCarry(baseSeed + row.rowIndex()));
                    let tr = nothing;
                    let ix = 0;
                    let rowSize = size(row)
                    while (ix < randomColumnsToPick) {
                        let nextIx = (pull rng) % rowSize;
                        tr = tr + row[nextIx]
                        ix = ix + 1
                        }
                    tr
                }
                let randomColumnsToPick = __subsetSize__;
                let baseSeed = 5;
                sum(0, __rows_to_sum__, fun(ix) { randomRowwiseSumFun(data[ix % size(data)], randomColumnsToPick, baseSeed) })
                """
                .replace("__subsetSize__",str(randomColumnsToPick))
                .replace("__rows_to_sum__",str(totalRowsToSum * threadCount))
                )

            t0 = time.time()
            result = simulation.compute(
                executionScript,
                timeout=1080,
                data=data
                )
            computeDuration = time.time() - t0

            # Verify success before reporting, so a failed computation is
            # never recorded as a valid timing.
            self.assertTrue(result.isResult(), result)

            totalValuesAccessed = totalRowsToSum * randomColumnsToPick * threadCount

            totalValuesPerSecondPerThread = totalValuesAccessed * 2 / computeDuration / threadCount

            secondsToDo10MillionPerThread = 10 * 1000000 / totalValuesPerSecondPerThread

            if recordResults:
                PerformanceTestReporter.recordTest(
                    "python.BigBox.RandomColumnAccess.access10mm_%smb_%scols_%sthreads" % (
                        mbOfData,
                        colCount,
                        threadCount
                        ),
                    secondsToDo10MillionPerThread,
                    None
                    )

            return computeDuration

        finally:
            # Drop the references this test actually holds before teardown
            # (the previous cleanup cleared names that never existed here and
            # left `data` alive).
            data = None
            result = None
            simulation.teardown()
    def largeDatasetBigLMTest(self, mbOfData, columns, threads, machineCount, ratio = .5):
        """Time two consecutive linear regressions on a generated dataset.

        Data comes from self.dataGenerationScript(mbOfData, columns) on
        `machineCount` workers with `threads` threads each; each worker's
        memory limit is mbOfData / ratio / machineCount. The second regression
        adds two derived sine columns to the predictors. The recorded time
        spans both regressions.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                        self.dataGenerationScript(mbOfData, columns),
                        s3,
                        machineCount,
                        timeout = 360,
                        memoryLimitMb = mbOfData / ratio / machineCount,
                        channelThroughputMBPerSecond = 100.0,
                        threadCount = threads,
                        returnSimulation = True,
                        useInMemoryCache = False
                        )

        try:
            self.assertTrue(result.isResult(), result)

            dfResponse, dfPredictors = result.asResult.result

            regressionScript = """
                    let model = math.regression.LinearRegression(dfPredictors, dfResponse,coefficientsOnly:true);
                    let coefficients = model.coefficients();
                    coefficients[0]
                    """

            t0 = time.time()
            result = simulation.compute(
                regressionScript,
                timeout=1080,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            self.assertTrue(result.isResult(), result)

            # Parenthesised single-argument form prints the same text under
            # both the Python 2 print statement and the print function.
            print("Done with the first regression")

            regressionScript2 = """
                    let newCol = dfPredictors.rowApply(fun(row) { math.sin(row[0] ) })
                    let newCol2 = dfPredictors.rowApply(fun(row) { math.sin(row[0] + 1) })
                    let model2 = math.regression.LinearRegression(dfPredictors.addColumn(newCol).addColumn(newCol2), dfResponse, coefficientsOnly:true)
                    model2.coefficients()[0]
                    """

            result2 = simulation.compute(
                regressionScript2,
                timeout=1080,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors
                )

            # Measured from the start of the first regression, so it covers both.
            totalTimeToReturnResult = time.time() - t0

            self.assertTrue(result2.isResult(), result2)

            PerformanceTestReporter.recordTest(
                "algorithms.linearRegression.inMemory_%sMB_%scols_%sthreads_%smachines" %
                    (mbOfData, columns,threads,machineCount),
                totalTimeToReturnResult,
                None
                )
        finally:
            # Release every local reference to computed values before teardown.
            dfResponse = None
            dfPredictors = None
            result = None
            result2 = None
            simulation.teardown()