Python InMemoryS3Interface.InMemoryS3InterfaceFactory Beispiele, ufora.distributed.S3.InMemoryS3Interface.InMemoryS3InterfaceFactory Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def weirdStringSort(self, sz, machines=1, memory=1000):
        """Check the distributed sort on a vector of all-space strings.

        The submitted program builds a paged vector whose element at index
        ix is " " * ix, sorts it once through the
        #DistributedDataOperation(#Sort(...)) external task and once through
        sorting.sort, and returns true only when both results agree in size
        and element by element. Otherwise it returns a string describing
        the first mismatch, which makes the final assertion fail.

        Args:
            sz: value substituted for __size__ in the program.
            machines: worker count handed to computeUsingSeveralWorkers.
            memory: forwarded as memoryLimitMb.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        programTemplate = """
            let N = __size__;

            let values = Vector.range(N, fun(ix) { " " * ix }).paged;

            let s1 = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values))))
            let s2 = sorting.sort(values)

            if (size(s1) != size(s2))
                return 'wrong size: %s != %s'.format(size(s1), size(s2))

            for ix in sequence(size(s1))
                if (s1[ix] != s2[ix])
                    return 'not equal: index=%s. %s != %s'.format(ix, s1[ix], s2[ix])
            return true
            """
        program = programTemplate.replace("__size__", str(sz))

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program,
            s3Factory,
            machines,
            timeout=TIMEOUT,
            memoryLimitMb=memory)

        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #2

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def test_sortManySimilarValues(self):
        """Distributed-sort a vector that holds only the values 0 and 1.

        The program appends ix % 2 for one million indices, sorts the paged
        vector through the #Sort external task, and then checks that the
        output has the same size and that adjacent elements are ordered and
        share the same `TypeJOV.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let values = []
            let ct = 1000000
            for ix in sequence(ct)
                values = values :: (ix%2)

            let sortedVals = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values.paged))))

            let sortedAndHomogenous = fun(v) {
                for ix in sequence(size(v)-1)
                    if (v[ix] > v[ix+1] or `TypeJOV(v[ix]) is not `TypeJOV(v[ix+1]))
                        throw (ix, v[ix], v[ix+1], `TypeJOV(v[ix]), `TypeJOV(v[ix+1]))
                return true;
                }

            if (size(sortedVals) != size(values))
                throw "expected " + String(size(values)) + ", not " + String(size(sortedVals))
            sortedAndHomogenous(sortedVals)
            """

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program,
            s3Factory,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=1000)

        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #3

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def test_sortVecOfVec(self):
        """Distributed-sort tuples that each carry a 40-element vector.

        The program sorts 500000 tuples of the form
        (ix % 100, Vector.range(40)) and evaluates to true when the output
        size matches the input and the first tuple components never
        decrease.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let values = []
            let ct = 500000

            values = [(ix % 100, Vector.range(40)) for ix in sequence(ct)]

            let res = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(values.paged))));

            let firstAreSorted = true;
            for ix in sequence(size(res)-1)
                if (res[ix][0] > res[ix+1][0])
                    firstAreSorted = false;

            size(res) == size(values) and firstAreSorted
            """

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program,
            s3Factory,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=3000)

        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #4

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def test_takeFromLargeObjects(self):
        """Check page sizes produced by a #Take over roughly 1 MB strings.

        The program builds 100 strings of a little over one million
        characters each, takes every one of them by index through the
        #Take external task, and the test then asserts that each vector
        page backing the result is smaller than 2 MB. The simulation runs
        with a 1 MB page size override.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        text = """
            let N = 100;

            //each string is 1 MB
            let takeFrom = [" " * 100 * 100 * 10 * 10 + " " * ix for ix in sequence(N)].paged;
            let indices = Vector.range(N,fun(x) { x }).paged;

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))))
            """

        # Create the simulation *before* entering the try block. When the
        # call lived inside the try, a failure during construction left
        # `simulation` unbound, so the finally clause raised a NameError
        # that replaced the real exception.
        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            text,
            s3,
            1,
            timeout=TIMEOUT,
            memoryLimitMb=1000,
            returnSimulation=True,
            pageSizeOverride=1024 * 1024)

        try:
            self.assertTrue(result is not None)
            self.assertTrue(result.isResult(), result)

            for page in result.asResult.result.getVectorPageIds(
                    simulation.getWorkerVdm(0)):
                # bytecount is converted to MB before comparing
                self.assertLess(page.bytecount / 1024.0 / 1024.0, 2.0)
        finally:
            simulation.teardown()

Beispiel #5

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def test_multiboxDataTasksTake_1(self):
        """Compare a two-worker #Take against direct indexing.

        The program builds a ten-million-element paged vector and a paged
        vector of (0, (x * 503) % N) index pairs, then checks that the
        #Take external task returns the same values as applying
        takeFrom[_[1]] to every index pair.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let N = 10000000;
            let isPrime = fun(p) {
                let x = 2
                while (x*x <= p) {
                    if (p%x == 0)
                        return 0
                    x = x + 1
                    }
                return x
                }

            let takeFrom = Vector.range(N, isPrime).paged;
            let indices = Vector.range(N,fun(x) { (0, (x * 503) % N ) }).paged;

            cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom)))) ==
                indices ~~ { takeFrom[_[1]] }
            """

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program,
            s3Factory,
            2,
            timeout=TIMEOUT,
            memoryLimitMb=1000)

        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #6

0

Datei anzeigen

Datei: DistributedDataTasks_test.py Projekt: ufora/ufora

    def multiboxDataTasksSort(self,
                              ct,
                              workers=2,
                              memoryLimit=100,
                              pageSizeOverrideMB=1):
        """Run a distributed sort over `ct` tuples and check it is sorted.

        The program sorts tuples of the form ((i * i) % 503, i) through the
        #Sort external task and evaluates sorting.isSorted on the output.

        Args:
            ct: value substituted for __ct__ in the program.
            workers: worker count handed to computeUsingSeveralWorkers.
            memoryLimit: forwarded as memoryLimitMb.
            pageSizeOverrideMB: page size in MB; multiplied by 1024 * 1024
                before being passed as pageSizeOverride.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        programTemplate = """
            let N = __ct__;
            let aPrime = 503

            let toSort = Vector.range(N, { ((_ * _) % aPrime, _) }).paged;

            let result = cached`(#ExternalIoTask(#DistributedDataOperation(#Sort(toSort))))

            sorting.isSorted(result)
            """
        program = programTemplate.replace("__ct__", str(ct))

        pageSizeBytes = pageSizeOverrideMB * 1024 * 1024

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program,
            s3Factory,
            workers,
            timeout=TIMEOUT,
            memoryLimitMb=memoryLimit,
            pageSizeOverride=pageSizeBytes)

        self.assertIsNotNone(outcome)
        self.assertTrue(outcome.isResult(), outcome)
        self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #7

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def disable_createVectorAndReferenceInMultipleComputations(self):
        """Share one big vector computation between three dependents.

        A computation that creates ten paged Float32 vectors is referenced
        by three further computations. After waiting for three results the
        test sums the byte counts of all active pages seen by worker 0 and
        asserts that the total stays below 2 GB.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        initialResult, simulation = self.computeUsingSeveralWorkers(
            "1+1",
            s3Factory,
            2,
            memoryLimitMb=1000,
            returnSimulation=True,
            useInMemoryCache=False)

        try:
            vecComputation = simulation.createComputation("""
                let count = 1000 * 1000 * 40

                let fpow = fun(p) {
                    fun(x) {
                        Float32( (x / 1000000000.0) )
                        }
                    };

                Vector.range(10) ~~ fun(p) {
                    Vector.range(count, fpow(p)).paged
                    }
                """)

            #we want to verify that all of these computations use the same copy of the
            #bigvec that we create in the 'vecComputation' instance
            predComp = simulation.createComputation(
                "dataframe.DataFrame(vecs[1,])",
                vecs=vecComputation)
            regComp = simulation.createComputation(
                "dataframe.DataFrame(vecs[,1])",
                vecs=vecComputation)

            predCompStr = simulation.createComputation(
                "String(pred)", pred=predComp)
            regCompStr = simulation.createComputation(
                "String(reg)", reg=regComp)

            vecSumComp = simulation.createComputation(
                "vecs ~~ {_.sum()}", vecs=vecComputation)

            for submitted in (predCompStr, regCompStr, vecSumComp):
                simulation.submitComputation(submitted)

            # Wait three times, once per submitted computation. The results
            # are kept referenced (as the individual locals were before)
            # until the method returns.
            finishedResults = [
                simulation.waitForAnyResult(timeout=60.0),
                simulation.waitForAnyResult(timeout=60.0),
                simulation.waitForAnyResult(timeout=60.0),
            ]

            #verify that simulation didn't write to disk
            tracker = simulation.getWorker(0).getSystemwidePageRefcountTracker()

            activeBytes = sum(
                page.bytecount for page in tracker.getAllActivePages())
            totalGb = activeBytes / 1024.0 / 1024.0 / 1024.0

            logging.critical("%s", tracker.getViewOfSystem())
            self.assertTrue(totalGb < 2.0, totalGb)
        finally:
            simulation.teardown()

Beispiel #8

0

Datei anzeigen

    def stringToInt64ParsingTest(self, threads, testName):
        """Time a string-to-Int64 parsing loop and report the elapsed time.

        The program runs `threads` independent loops, each of which
        repeatedly converts a string to Int64 and accumulates the result.
        The wall-clock time of the single simulation.compute call is
        recorded under `testName`.

        Args:
            threads: value substituted for __thread_count__ (number of
                parallel loops in the program).
            testName: name handed to PerformanceTestReporter.recordTest.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        text = """
            let doALoop = fun(x) {
                //pass 's' through a vector so that the compiler can't tell what it is
                let s = ["2013"][0];

                let res = 0
                for ix in sequence(x) {
                    if (ix == 0)
                        s = s + String(ix)

                    res = res + Int64(s) + ix
                    }
                res
                };

            Vector.range(__thread_count__) ~~ {doALoop(20000000 + _)}
            """.replace("__thread_count__", str(threads))

        # The simulation used to be created twice with identical arguments.
        # The first instance was overwritten and never torn down, leaking a
        # whole simulation while the timed run executed. Create exactly one.
        _, simulation = \
            self.computeUsingSeveralWorkers(
                "1+1",
                s3,
                1,
                timeout = 240,
                memoryLimitMb = 55 * 1024,
                threadCount = 30,
                returnSimulation = True,
                useInMemoryCache = False
                )

        try:
            t0 = time.time()
            # Only the elapsed time is reported; the value is not inspected.
            simulation.compute(text, timeout=240)
            totalTimeToReturnResult = time.time() - t0

            PerformanceTestReporter.recordTest(testName,
                                               totalTimeToReturnResult, None)
        finally:
            simulation.teardown()

Beispiel #9

0

Datei anzeigen

    def test_sortVec2(self):
        """Sort a 50000-element vector on four workers and check the order."""
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let v = Vector.range(50000, fun(ix) { ix  / 10 } );
            sorting.isSorted(sort(v))
            """
        outcome = self.computeUsingSeveralWorkers(program, s3Factory, 4)

        self.assertEqual(outcome.asResult.result.pyval, True)

Beispiel #10

0

Datei anzeigen

Datei: testGbmRegression.py Projekt: vishnur/ufora

    def gbmRegressionFittingTest(self, nRows, nColumns, depth, nThreads,
                                 maxBoosts):
        """Time successive boosting rounds of a GBM regression fit.

        Data is generated by self.dataGenerationScript, a fitter is built
        from self.regressionScript(depth, 1), and then for every nBoosts in
        range(1, maxBoosts) the prediction step and the following boosting
        step are run. After each step the time elapsed since the start of
        the loop (not since the previous step) is reported.

        Args:
            nRows, nColumns: forwarded to self.dataGenerationScript.
            depth: forwarded to self.regressionScript.
            nThreads: threadCount of the single simulated worker.
            maxBoosts: exclusive upper bound of the boosting loop.
        """
        testName = self.getTestName(nRows, nColumns, depth, maxBoosts,
                                    nThreads)

        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(result.isResult())

            dfPredictors, dfResponse = result.asResult.result

            fitterComputation = simulation.compute(
                self.regressionScript(depth, 1),
                timeout=360,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors)
            fitter = fitterComputation.asResult.result

            # The clock starts once; every reported time is cumulative.
            t0 = time.time()

            for nBoosts in range(1, maxBoosts):
                testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                            nThreads)

                predictions = simulation.compute(
                    "fitter.predictionsAndPseudoresiduals()",
                    timeout=360,
                    fitter=fitter).asResult.result
                PerformanceTestReporter.recordTest(testName + "_predict",
                                                   time.time() - t0,
                                                   None)

                fitter = simulation.compute(
                    "fitter.nextGivenPredictions(predictions)",
                    timeout=360,
                    fitter=fitter,
                    predictions=predictions).asResult.result
                PerformanceTestReporter.recordTest(testName,
                                                   time.time() - t0,
                                                   None)

        finally:
            simulation.teardown()

Beispiel #11

0

Datei anzeigen

    def test_effectiveParallelism(self):
        """Measure effective parallelism of a windowed-sum program.

        The same program is run twice on two workers: once as a burn-in
        whose stats are discarded, and once timed. Effective parallelism is
        reported as (interpreter time + compiler time) / wall-clock time.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
                let v = Vector.range(5000000, { (1,_) } );

                let f = fun(ix) {
                    let res = 0
                    for x in sequence( (ix - 2000) >>> 0, ix )
                        res = res + size(v[x])
                    res
                    }

                Vector.range(size(v),  f).sum()

                """

        def runOnce():
            # Element [1] of the return value is the stats object that
            # wantsStats=True asks for.
            return self.computeUsingSeveralWorkers(
                program,
                s3Factory,
                2,
                wantsStats=True,
                timeout=240,
                memoryLimitMb=500)[1]

        #do a burn-in run
        runOnce()

        t0 = time.time()
        stats = runOnce()
        timeElapsed = time.time() - t0

        busyTime = stats.timeSpentInInterpreter + stats.timeSpentInCompiler
        effParallelism = busyTime / timeElapsed

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.elapsed",
            timeElapsed,
            None)

        PerformanceTestReporter.recordTest(
            "python.cumulus.EffectiveParallelism.effectiveCores",
            effParallelism,
            {},
            units='count')

Beispiel #12

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def test_CalculationRicochet(self):
        """Time random lookups into a large vector spread over four workers.

        A 125-million-element vector is created first. The lookup program
        is then run twice, with seeds 1 and 2 and 1000 lookups each, and
        the wall-clock time of each pass is reported separately.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        programTemplate = """
            let f = fun(ct, seed = 1) {
                let x = 0

                let res = []

                let it = iterator(math.random.UniformReal(0, size(v), seed))

                for ix in sequence(ct) {
                    let x = Int64(pull it)
                    res = res :: (x / Float64(size(v)), v[x])
                    }

                return res
                }

            v[2]
            f(__count__,__seed__)
            """

        vResult, sim = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            "Vector.range(125000000, math.log)",
            s3Factory,
            4,
            timeout=120,
            memoryLimitMb=400,
            threadCount=1,
            useInMemoryCache=True,
            returnSimulation=True)

        try:
            v = vResult.asResult.result

            passes = (
                ("1", "python.InMemoryCumulus.Ricochet1000.Pass1"),
                ("2", "python.InMemoryCumulus.Ricochet1000.Pass2"),
            )

            for seed, reportName in passes:
                program = programTemplate.replace("__seed__", seed)
                program = program.replace("__count__", "1000")

                t0 = time.time()
                sim.compute(program, timeout=120, v=v)
                PerformanceTestReporter.recordTest(
                    reportName, time.time() - t0, None)
        finally:
            sim.teardown()

Beispiel #13

0

Datei anzeigen

    def test_map_with_common(self):
        """Sum lookups into one shared paged vector across eight workers."""
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let v1 = Vector.range(1000000).paged;

            let v2 = Vector.range(30000000)

            v2.sum(fun(i) { v1[(i * 100) % size(v1)] })
            """

        outcome = self.computeUsingSeveralWorkers(
            program,
            s3Factory,
            8,
            timeout=240,
            memoryLimitMb=100)

        self.assertTrue(outcome.isResult())

Beispiel #14

0

Datei anzeigen

Datei: testBigboxGbmSimultaneous.py Projekt: vishnur/ufora

    def gbmRegressionFittingTest(self,
                                 nRows,
                                 nColumns,
                                 depth,
                                 nThreads,
                                 nBoosts,
                                 copies,
                                 report=True):
        """Time `copies` simultaneous GBM fits inside one simulation.

        Data comes from self.dataGenerationScript and the builder from
        self.regressionScript(depth, nBoosts). The timed section starts
        after the builder is available and covers computing the test name
        plus a single computation that calls builder.fit once per copy.

        Args:
            nRows, nColumns: forwarded to self.dataGenerationScript.
            depth, nBoosts: forwarded to self.regressionScript.
            nThreads: threadCount of the single simulated worker.
            copies: number of fits launched by the timed computation.
            report: when true, the elapsed time is recorded with
                PerformanceTestReporter.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(nRows, nColumns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=30 * 1024,
            threadCount=nThreads,
            returnSimulation=True,
            useInMemoryCache=False)
        try:
            self.assertTrue(result.isResult())

            dfPredictors, dfResponse = result.asResult.result

            builderComputation = simulation.compute(
                self.regressionScript(depth, nBoosts),
                timeout=360,
                dfResponse=dfResponse,
                dfPredictors=dfPredictors)
            builder = builderComputation.asResult.result

            t0 = time.time()

            testName = self.getTestName(nRows, nColumns, depth, nBoosts,
                                        nThreads, copies)

            fitProgram = (
                "Vector.range(%s).apply(fun(x) { builder.fit(dfPredictors[,-x-1], dfResponse[,-x-1]) })"
                % copies)

            result = simulation.compute(
                fitProgram,
                timeout=360,
                builder=builder,
                dfPredictors=dfPredictors,
                dfResponse=dfResponse,
            ).asResult.result
            totalTimeToReturnResult = time.time() - t0

            if report:
                PerformanceTestReporter.recordTest(testName,
                                                   totalTimeToReturnResult,
                                                   None)

        finally:
            simulation.teardown()

Beispiel #15

0

Datei anzeigen

    def loadCheckpointFromFreshSimulationTest(self,
                                              calculationText,
                                              timestampsPerPassList,
                                              clientCount=1,
                                              timestep=1.0):
        """Run a calculation across several freshly built simulations.

        One four-worker simulation is created per entry of
        timestampsPerPassList. Every simulation after the first reuses the
        shared-state view factory of the previous one and the same S3
        factory. In each pass the calculation is submitted on every client,
        checkpoint status is sampled once per timestep, a full checkpoint
        is triggered and awaited, and a final sample is taken.

        Args:
            calculationText: program submitted on every client.
            timestampsPerPassList: one entry per pass, giving the number of
                status samples taken before the full checkpoint.
            clientCount: number of clients per simulation.
            timestep: seconds slept before each status sample.

        Returns:
            A list with one list of status samples per pass.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        allStatuses = []
        viewFactory = None

        for timestampsThisPass in timestampsPerPassList:
            simulation = InMemoryCumulusSimulation.InMemoryCumulusSimulation(
                4,  #worker count
                clientCount,
                memoryPerWorkerMB=100,
                threadsPerWorker=2,
                s3Service=s3Factory,
                sharedStateViewFactory=viewFactory)

            # carried over so the next pass sees the same shared state
            viewFactory = simulation.sharedStateViewFactory

            passStatuses = []

            try:
                self.assertTrue(simulation.waitForGlobalScheduler(timeout=2.0))

                scheduler = simulation.getGlobalScheduler()
                scheduler.setCheckpointStatusInterval(0.1)

                for clientIx in range(clientCount):
                    simulation.submitComputationOnClient(clientIx,
                                                         calculationText)

                for _ in range(timestampsThisPass):
                    time.sleep(timestep)
                    passStatuses.append(
                        self.timeElapsedOfMostRecentCheckpoints(simulation))

                simulation.getGlobalScheduler(
                ).triggerFullCheckpointsOnOutstandingComputations()

                self.waitForFullCheckpoint(simulation)

                passStatuses.append(
                    self.timeElapsedOfMostRecentCheckpoints(simulation))
            finally:
                # dump all four workers' state before tearing down
                for workerIx in range(4):
                    simulation.getWorker(workerIx).dumpStateToLog()

                simulation.teardown()

            allStatuses.append(passStatuses)

        return allStatuses

Beispiel #16

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def test_CreateManySmallVectors(self):
        """Build a vector by one million single-element appends and sum it."""
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let v = []; for ix in sequence(1000000) { v = v + [ix] }; v.sum()
            """

        outcome = self.computeUsingSeveralWorkers(program, s3Factory, 4)

        self.assertTrue(outcome.isResult())

        # 0 + 1 + ... + 999999
        self.assertEqual(outcome.asResult.result.pyval, 499999500000)

Beispiel #17

0

Datei anzeigen

    def test_takeLookupSemantics(self):
        """Compare the #Take task against direct indexing for many inputs.

        For each index expression the program runs #Take over
        [1,2,3,4].paged and compares it element by element (using
        `is not`) with directTake, which indexes directly and maps any
        failure to `nothing`. The program returns true on full agreement
        and a descriptive string otherwise.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        programTemplate = """
            let directTake = fun(v, i) {
                i ~~ fun
                    ((filters.IsInteger(...) ix1,filters.IsInteger(...) ix2)) {
                        try { [v][ix1][ix2] } catch(...) { nothing }
                        }
                    (ix) { try { v[ix] } catch (...) { nothing } }
                };

            let takeFrom = [1,2,3,4].paged;
            let indices = __indices__.paged;

            let result = cached`(#ExternalIoTask(#DistributedDataOperation(#Take(indices, takeFrom))));
            let targetResult = directTake(takeFrom, indices)

            assertions.assertEqual(size(result), size(targetResult))

            for ix in sequence(size(result))
                if (result[ix] is not targetResult[ix])
                    return "Expected %s to yield %s, but got %s".format(
                        indices[ix],
                        targetResult[ix],
                        result[ix]
                        );

            return true;
            """

        # Index expressions substituted for __indices__, one run each.
        indexExpressions = (
            "[0,1,2,3]",
            "[0,-1,2,3]",
            "[0,1,2,30]",
            "[(0,0),(0,1),(0,2),(0,3)]",
            "[(0,0),(0,1),(0,2),(0,30)]",
            "[(0,0),(0,1),(0,2),(3,0)]",
            "[(0u8,0u16),(0u32,1u64),(0s32,2s8),(0s16,3s64)]",
            "[0,-1,(), (0,0), (0,0.0), nothing, (1,0), (0u8,6u16), (-1,2)]",
        )

        for indexExpr in indexExpressions:
            outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
                programTemplate.replace("__indices__", indexExpr),
                s3Factory,
                1,
                timeout=TIMEOUT,
                memoryLimitMb=1000)

            self.assertIsNotNone(outcome)
            self.assertTrue(outcome.isResult(), outcome)
            self.assertEqual(outcome.asResult.result.pyval, True, outcome)

Beispiel #18

0

Datei anzeigen

    def runOnGPU(self, funcExpr, vecExpr, captureExpr=""):
        """Apply a function to a vector through #GpuApply.

        Args:
            funcExpr: source text substituted for __funcExpr__.
            vecExpr: source text substituted for __vecExpr__.
            captureExpr: source text prepended verbatim to the program.
                Placeholders are not substituted inside it.

        Returns:
            The computation result, after asserting that it is not None
            and is not an exception.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        applyProgram = """
            let f = __funcExpr__;
            let vec = __vecExpr__;
            cached`(#GpuApply(f, vec));
            """
        applyProgram = applyProgram.replace("__funcExpr__", funcExpr)
        applyProgram = applyProgram.replace("__vecExpr__", vecExpr)

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            captureExpr + applyProgram,
            s3Factory,
            1,
            timeout=120,
            threadCount=4,
            memoryLimitMb=1000)

        self.assertIsNotNone(outcome)
        self.assertFalse(outcome.isException(), "Failed with %s" % outcome)
        return outcome

Beispiel #19

0

Datei anzeigen

    def test_DataFanout(self):
        """Run eight long loops over one-element paged vectors on four workers.

        Only checks that the computation returns something other than None.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        program = """
            let v = [[x].paged for x in sequence(8)]

            let q = v ~~ { let r = 0; for ix in sequence(10**8.5) r = r + _[0]; r }

            q.sum()
            """

        outcome = self.computeUsingSeveralWorkers(
            program, s3Factory, 4, timeout=120)
        self.assertIsNotNone(outcome)

Beispiel #20

0

Datei anzeigen

    def test_gcOfPagedVectors(self):
        """Create and sum 100 short-lived paged vectors on four workers.

        The outcome is not inspected; the test passes as long as the call
        returns without raising.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let res = 0

            for ix in sequence(100) {
                res = res + Vector.range(1000000+ix).paged.sum()
                }

            res
            """

        self.computeUsingSeveralWorkers(program, s3Factory, 4, timeout=240)

Beispiel #21

0

Datei anzeigen

Datei: BigLmOnDataframeTest_test.py Projekt: vishnur/ufora

    def test_bigLmOnDataframe(self):
        """Run the `importanceSampling` program on a single four-thread worker.

        `importanceSampling` is defined outside this method; the test only
        asserts that the computation yields a result.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            importanceSampling,
            s3Factory,
            1,
            memoryLimitMb=4000,
            threadCount=4,
            timeout=240,
            useInMemoryCache=False)

        self.assertTrue(outcome.isResult(), outcome)

Beispiel #22

0

Datei anzeigen

Datei: MultimachineJoinSimulation_test.py Projekt: vishnur/ufora

    def largeDatasetJoinTest(self,
                             mbOfData,
                             columns,
                             threads,
                             machineCount,
                             ratio=.5):
        """Time an outer join of the two halves of a generated dataset.

        Data is produced by self.dataGenerationScript, split in half into
        two dataframes, and joined on column "C0". The wall-clock time of
        the join computation is logged and reported.

        Args:
            mbOfData: forwarded to self.dataGenerationScript; also used to
                size worker memory.
            columns: forwarded to self.dataGenerationScript.
            threads: threadCount per worker.
            machineCount: number of simulated workers.
            ratio: per-worker memory is mbOfData / ratio / machineCount.
        """
        s3 = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3,
            machineCount,
            timeout=360,
            memoryLimitMb=mbOfData / ratio / machineCount,
            #channelThroughputMBPerSecond = 100.0,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False,
            disableEventHandler=True)

        try:
            self.assertTrue(result.isResult())

            data = result.asResult.result

            joinScript = """
                    let leftDF = dataframe.DataFrame(data[,size(data)/2])
                    let rightDF = dataframe.DataFrame(data[size(data)/2,])

                    size(leftDF.join(rightDF, on: "C0", how: `outer, chunkSize: 1000000, areSorted:true))
                    """

            t0 = time.time()
            result = simulation.compute(joinScript, timeout=1080, data=data)
            totalTimeToReturnResult = time.time() - t0

            logging.info("Total time to join: %s", totalTimeToReturnResult)

            self.assertTrue(result.isResult(), result)

            PerformanceTestReporter.recordTest(
                "algorithms.Join.inMemory_%sMB_%scols_%sthreads_%smachines" %
                (mbOfData, columns, threads, machineCount),
                totalTimeToReturnResult, None)
        finally:
            # Release the references this method actually holds before
            # teardown. The previous cleanup reset `dfResponse` and
            # `dfPredictors`, names this method never defines, and left
            # `data` alive.
            data = None
            result = None
            simulation.teardown()

Beispiel #23

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def test_VectorsAndSums(self):
        """Sum over 100 paged vectors on four workers within 20 seconds."""
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let v = Vector.range(100).apply(fun(ix) {
                [ix * x for x in sequence(10000)].paged
                })

            v.sum(fun(vElt) { sum(0, 10**7) + vElt.sum() })
            """

        outcome = self.computeUsingSeveralWorkers(
            program, s3Factory, 4, timeout=20)

        self.assertTrue(outcome.isResult(), outcome)

Beispiel #24

0

Datei anzeigen

 def check_precision_of_function_on_GPU(self, function, input):
     """Compare a builtin evaluated through #GpuApply with Python's math.

     Args:
         function: name of the function; used both as the backticked
             builtin in the program and as the attribute looked up on
             Python's math module.
         input: the single value the function is applied to.

     Asserts that the two results differ by less than 1e-10.
     """
     s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()
     program = """
         let f = fun(x) {
             `""" + function + """(x)
             }
         cached`(#GpuApply(f, [""" + str(input) + """]))[0]
         """
     outcome = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
         program, s3Factory, 1, timeout=120, threadCount=4)
     self.assertIsNotNone(outcome)
     self.assertTrue(outcome.isResult(), outcome)
     gpuValue = outcome.asResult.result.pyval
     pythonValue = getattr(math, function)(input)
     difference = abs(gpuValue - pythonValue)
     self.assertTrue(difference < 1e-10)

Beispiel #25

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def test_PythonIoTaskService2(self):
        """Populate an in-memory bucket and read from it on one worker.

        Stores 400 keys named key_<ix1>_<ix2>, each holding 1024 lines of
        "ix1,ix2,ix3", then asserts that reading through datasets.s3
        returns something other than None.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        for ix1 in range(20):
            for ix2 in range(20):
                keyName = "key_%s_%s" % (ix1, ix2)
                rows = ["%s,%s,%s\n" % (ix1, ix2, ix3) for ix3 in range(1024)]
                s3Factory().setKeyValue("bucketname", keyName, "".join(rows))

        # NOTE(review): the program reads 'key_0', which is not one of the
        # exact key names stored above -- confirm whether datasets.s3 is
        # expected to match by prefix or whether this relies on the weak
        # not-None assertion below.
        program = """
            datasets.s3('bucketname', 'key_0').dataAsString
            """

        outcome = self.computeUsingSeveralWorkers(program, s3Factory, 1)
        self.assertIsNotNone(outcome)

Beispiel #26

0

Datei anzeigen

Datei: bigLM_test.py Projekt: vishnur/ufora

    def largeDatasetBigLMTest(self, mbOfData, columns, threads, testName):
        """Time data creation and a linear regression on a generated dataset.

        Args:
            mbOfData, columns: forwarded to self.dataGenerationScript.
            threads: threadCount of the single simulated worker.
            testName: base name for the performance records. When None,
                nothing is reported. Creation time is recorded under
                testName + "_create", regression time under testName.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        creationStart = time.time()

        result, simulation = InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            self.dataGenerationScript(mbOfData, columns),
            s3Factory,
            1,
            timeout=360,
            memoryLimitMb=50 * 1024,
            threadCount=threads,
            returnSimulation=True,
            useInMemoryCache=False)

        shouldReport = testName is not None

        if shouldReport:
            PerformanceTestReporter.recordTest(testName + "_create",
                                               time.time() - creationStart,
                                               None)

        try:
            self.assertTrue(result.isResult())

            dfResponse, dfPredictors = result.asResult.result

            regressionScript = """
                let model = math.regression.LinearRegression(dfPredictors, dfResponse, fitIntercept: false);
                let coefficients = model.coefficients();
                coefficients[0]
                """

            regressionStart = time.time()
            result = simulation.compute(regressionScript,
                                        timeout=1080,
                                        dfResponse=dfResponse,
                                        dfPredictors=dfPredictors)
            totalTimeToReturnResult = time.time() - regressionStart

            self.assertTrue(result.isResult())

            if shouldReport:
                PerformanceTestReporter.recordTest(testName,
                                                   totalTimeToReturnResult,
                                                   None)
        finally:
            # drop local references to simulation values before teardown
            dfResponse = None
            dfPredictors = None
            result = None
            simulation.teardown()

Beispiel #27

0

Datei anzeigen

Datei: InMemoryS3Interface_test.py Projekt: vishnur/ufora

    def test_multipart(self):
        """Upload three parts and check that the key holds their concatenation."""
        factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()
        s3 = factory()

        bucket, key = "aBucket", "aKey"

        uploadID = s3.initiateMultipartUpload(bucket, key)

        # parts are numbered from 1 and uploaded in order
        s3.setMultipartUploadPart(bucket, key, uploadID, 1, "this ")
        s3.setMultipartUploadPart(bucket, key, uploadID, 2, "is ")
        s3.setMultipartUploadPart(bucket, key, uploadID, 3, "multipart")

        s3.completeMultipartUpload(bucket, key, uploadID)

        self.assertEqual(s3.getKeyValue(bucket, key), "this is multipart")

Beispiel #28

0

Datei anzeigen

    def test_largeVectorRange(self):
        """Create a 50-million-element range ten times and total the sizes."""
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let res = 0;
            for ix in sequence(10) {
                let v1 = Vector.range(50000000);
                res = res + size(v1);
                }
            res
            """
        outcome = self.computeUsingSeveralWorkers(
            program, s3Factory, 4, timeout=200)

        # Fail first, with the outcome as the message, if it is not a result.
        if not outcome.isResult():
            self.assertTrue(False, outcome)
        else:
            expectedTotal = 50000000 * 10
            self.assertEqual(outcome.asResult.result.pyvalOrNone,
                             expectedTotal,
                             outcome)

Beispiel #29

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def test_vector_string_apply(self):
        """Apply String and a string append over ten million elements.

        The outcome is not inspected; the test passes as long as the call
        returns without raising.
        """
        #verify that the compiler doesn't crap out during many runs.
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        program = """
            let v = Vector.range(10000000)

            let v2 = v.apply(String)

            let v3 = v2.apply({_ + "a"})

            v3.sum(size)
            """

        InMemoryCumulusSimulation.computeUsingSeveralWorkers(
            program, s3Factory, 4, timeout=240)

Beispiel #30

0

Datei anzeigen

Datei: CumulusWorkerDatasetLoadServiceIntegrationTest_test.py Projekt: data-processing/ufora

    def dataCreationTest(self, totalMB, workers=1, threadsPerWorker=4):
        """Create a vector of about `totalMB` megabytes and check it returns.

        The element count is totalMB * 1024 * 1024 / 8, and each worker is
        given totalMB / workers * 1.3 MB of memory.

        Args:
            totalMB: target size used to derive the element count.
            workers: number of simulated workers.
            threadsPerWorker: forwarded as threadCount.
        """
        s3Factory = InMemoryS3Interface.InMemoryS3InterfaceFactory()

        #we wish we could actually test that we achieve saturation here but we can't yet.
        elementCount = totalMB * 1024 * 1024 / 8
        program = """size(Vector.range(%s, {_*_}))""" % (elementCount)

        perWorkerLimitMb = totalMB / workers * 1.3

        outcome = self.computeUsingSeveralWorkers(
            program,
            s3Factory,
            workers,
            timeout=120,
            memoryLimitMb=perWorkerLimitMb,
            threadCount=threadsPerWorker,
            useInMemoryCache=False)

        self.assertIsNotNone(outcome)