Example 1
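Mars' dataserializer test: round-trips NumPy arrays (contiguous, non-contiguous, scalar, and structured), file dumps with LZ4 and GZIP compression, scipy sparse matrices and vectors, pandas GroupBy objects, categoricals, and an IntervalIndex.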
    def testDataSerialize(self):
        for type_, compress in itertools.product(
                (None,) + tuple(dataserializer.SerialType.__members__.values()),
                (None,) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(), 'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')  # populate the cached 'indices' attribute before serializing
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')  # populate the cached 'indices' attribute before serializing
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)
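
A condensed sketch of the round-trip pattern this test exercises, assuming dataserializer can be imported from mars.serialize (the exact module path may differ across Mars versions); every call used below appears in the test above:

    import numpy as np
    from mars.serialize import dataserializer  # assumed import path; may differ across Mars versions

    array = np.random.rand(10, 10)

    # in-memory round trip, optionally compressed
    buf = dataserializer.dumps(array, compress=dataserializer.CompressType.LZ4)
    np.testing.assert_array_equal(array, dataserializer.loads(buf))

    # file-based round trip
    with open('array.bin', 'wb') as f:
        dataserializer.dump(array, f)
    with open('array.bin', 'rb') as f:
        np.testing.assert_array_equal(array, dataserializer.load(f))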
Example 2
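An earlier, Python-2-aware variant of the same test: NumPy and scipy are optional imports, and the GZIP cases are skipped under Python 2 (guarded by six.PY2).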
    def testDataSerialize(self):
        try:
            import numpy as np
            from numpy.testing import assert_array_equal
        except ImportError:
            np = None

        try:
            import scipy.sparse as sps
        except ImportError:
            sps = None

        if np:
            array = np.random.rand(1000, 100)
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            array = np.random.rand(1000, 100)
            assert_array_equal(
                array,
                dataserializer.load(BytesIO(dataserializer.dumps(array))))
            assert_array_equal(
                array,
                dataserializer.load(
                    BytesIO(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.LZ4))))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.load(
                        BytesIO(
                            dataserializer.dumps(
                                array,
                                compress=dataserializer.CompressType.GZIP))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            array = np.float64(0.2345)
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            # test structured arrays.
            rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'),
                                  ('c', '<U8')])
            array = np.ones((100, ), dtype=rec_dtype)
            array_loaded = dataserializer.loads(dataserializer.dumps(array))
            self.assertEqual(array.dtype, array_loaded.dtype)
            assert_array_equal(array, array_loaded)

            fn = os.path.join(tempfile.gettempdir(),
                              'test_dump_file_%d.bin' % id(self))
            try:
                array = np.random.rand(1000, 100).T  # test non c-contiguous
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))

                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(
                        array,
                        dump_file,
                        compress=dataserializer.CompressType.LZ4)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))

                if not six.PY2:
                    with open(fn, 'wb') as dump_file:
                        dataserializer.dump(
                            array,
                            dump_file,
                            compress=dataserializer.CompressType.GZIP)
                    with open(fn, 'rb') as dump_file:
                        assert_array_equal(array,
                                           dataserializer.load(dump_file))
            finally:
                if os.path.exists(fn):
                    os.unlink(fn)

        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(mat,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            if not six.PY2:
                des_mat = dataserializer.loads(
                    dataserializer.dumps(
                        mat, compress=dataserializer.CompressType.GZIP))
                self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(vector,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            if not six.PY2:
                des_vector = dataserializer.loads(
                    dataserializer.dumps(
                        vector, compress=dataserializer.CompressType.GZIP))
                self.assertTrue(
                    (vector.spmatrix != des_vector.spmatrix).nnz == 0)
Example 3
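An integration test that submits tensor graphs to a session actor, waits for termination, and decodes the fetched results with loads() before comparing them against NumPy ground truth.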
    def testMainWithoutEtcd(self):
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        # TODO: this behavior may change when eager mode is introduced
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key,
                                        target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add,
                          [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)
Example 4
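A small helper that drains a reader and deserializes its full contents.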
def _read_data_all(reader):
    with reader:
        return dataserializer.loads(reader.read())
Example 5
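Submits DataFrame-addition graphs built from pandas data, including misaligned indexes and columns, and checks the fetched results with pd.testing.assert_frame_equal.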
    def testMainDataFrameWithoutEtcd(self):
        import pandas as pd
        from mars.dataframe.expressions.datasource.dataframe import from_pandas
        from mars.dataframe.expressions.arithmetic import add

        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=(10, 5))
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=(10, 6))

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))
Example 6
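A fail-over test: once computation is underway, it kills one worker's whole process tree (cleaning up the plasma socket directory) and verifies the graph still completes successfully.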
    def testWorkerFailOver(self):
        def kill_process_tree(proc):
            import psutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                try:
                    if 'plasma' in p.name():
                        socks = [
                            conn.laddr for conn in p.connections('unix')
                            if 'plasma' in conn.laddr
                        ]
                        if socks:
                            plasma_sock_dir = os.path.dirname(socks[0])
                    p.kill()
                except psutil.NoSuchProcess:
                    continue
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()

        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                             log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
Example 7
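The tensor counterpart of Example 3, using submit_tileable_graph and adding a shuffle-based reshape case at the end.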
    def testMainTensorWithoutEtcd(self):
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = reduce(operator.add,
                          [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)

        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key,
                                          target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, b.key)
        assert_allclose(loads(result), np.ones((27, 31)))
Example 8
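A later revision of Example 1 that additionally covers unserializable objects, complex scalars, negative-stride arrays, Arrow-backed string/list dtypes, and pandas SparseDtype data.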
    def testDataSerialize(self):
        for type_, compress in itertools.product(
                (None, ) + tuple(dataserializer.SerialType.__members__.values()),
                (None, ) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(array,
                                         serial_type=type_,
                                         compress=compress)))

            array = np.random.rand(1000, 100)
            assert_array_equal(
                array,
                dataserializer.load(
                    BytesIO(
                        dataserializer.dumps(array,
                                             serial_type=type_,
                                             compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(array,
                                         serial_type=type_,
                                         compress=compress)))

            array = np.float64(0.2345)
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(array,
                                         serial_type=type_,
                                         compress=compress)))

        # test non-serializable object
        if pyarrow:
            non_serial = type('non_serial', (object, ), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100, ), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(),
                          f'test_dump_file_{id(self)}.bin')
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array,
                                    dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array,
                                    dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(mat,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(
                    mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(vector,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(
                    vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')  # populate the cached 'indices' attribute before serializing
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')  # populate the cached 'indices' attribute before serializing
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex
        s = complex(10 + 5j)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        s = np.complex64(10 + 5j)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({
            'a': ['s1', 's2', 's3'],
            'b': [['s1', 's2'], ['s3'], ['s4', 's5']]
        })
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        pd.testing.assert_frame_equal(df, dest_df)
Example 9
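Reads back a buffered result from the matching data writer and deserializes it.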
    def get_result_data(self, session_id, chunk_key):
        buf = self._data_writers[(session_id, chunk_key)].getvalue()
        return dataserializer.loads(buf)
Example 10
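Exercises FileBufferIO recompression: data compressed with codec c1 is read back whole, partially, or in fixed-size chunks (or written chunk-wise) as codec c2, and every result must still deserialize to the original array.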
    def testFileBufferIO(self):
        if not np:
            return
        from numpy.testing import assert_array_equal

        compressions = ([dataserializer.CompressType.NONE] +
                        list(dataserializer.get_supported_compressions()))

        for c1 in compressions:
            for c2 in compressions:
                data = np.random.random((1000, 100))

                # test complete read
                compressed_read_file = BytesIO(
                    dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file,
                                      'r',
                                      compress_out=c2)
                compressed = reader.read()
                self.assertEqual(
                    c2,
                    dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))

                # test partial read
                compressed_read_file = BytesIO(
                    dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file,
                                      'r',
                                      compress_out=c2)
                block = reader.read(128)
                data_left = reader.read()
                assert_array_equal(data,
                                   dataserializer.loads(block + data_left))

                # test read by chunks
                bio = BytesIO()
                compressed_read_file = BytesIO(
                    dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file,
                                      'r',
                                      compress_out=c2)
                while True:
                    block = reader.read(128)
                    if not block:
                        break
                    bio.write(block)

                compressed = bio.getvalue()
                self.assertEqual(
                    c2,
                    dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))

                # test write by chunks
                compressed_read_file.seek(0)
                compressed_write_file = BytesIO()
                writer = FileBufferIO(compressed_write_file,
                                      'w',
                                      compress_in=c2,
                                      managed=False)
                while True:
                    block = compressed_read_file.read(128)
                    if not block:
                        break
                    writer.write(block)
                writer.close()

                compressed = compressed_write_file.getvalue()
                self.assertEqual(
                    c2,
                    dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))
Example 11
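Tests iterative tiling: graphs involving sort, slicing, concatenation, and mt.histogram are submitted and their fetched results compared against the NumPy equivalents.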
    def testIterativeTilingWithoutEtcd(self):
        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        actor_client = new_client()
        rs = np.random.RandomState(0)

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        raw = rs.rand(100)
        a = mt.tensor(raw, chunk_size=10)
        a.sort()
        c = a[:5]

        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = np.sort(raw)[:5]
        assert_allclose(loads(result), expected)

        with self.assertRaises(KeyError):
            session_ref.fetch_result(graph_key, a.key, check=False)

        raw1 = rs.rand(20)
        raw2 = rs.rand(20)
        a = mt.tensor(raw1, chunk_size=10)
        a.sort()
        b = mt.tensor(raw2, chunk_size=15) + 1
        c = mt.concatenate([a[:10], b])
        c.sort()
        d = c[:5]

        graph = d.build_graph()
        targets = [d.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, d.key)
        expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        assert_allclose(loads(result), expected)

        raw = rs.randint(100, size=(100,))
        a = mt.tensor(raw, chunk_size=53)
        a.sort()
        b = mt.histogram(a, bins='scott')

        graph = build_tileable_graph(b, set())
        targets = [b[0].key, b[1].key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        res = (session_ref.fetch_result(graph_key, b[0].key),
               session_ref.fetch_result(graph_key, b[1].key))
        expected = np.histogram(np.sort(raw), bins='scott')
        assert_allclose(loads(res[0]), expected[0])
        assert_allclose(loads(res[1]), expected[1])