Example 1
    def test_register_object(self):
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        mymenu = CronusObject()
        mymenu.name = "menu"
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5  # type code used for the Arrow payloads throughout these examples
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(store[id_].address)
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
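
For reference, a minimal sketch of the Arrow IPC round trip this test relies on, stripped of the store machinery, to make the `num_record_batches` assertion concrete (pyarrow API only):

    import pyarrow as pa

    # Write the same batch ten times to an in-memory Arrow file...
    batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3, 4])], ["f0"])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchFileWriter(sink, batch.schema) as writer:
        for _ in range(10):
            writer.write_batch(batch)

    # ...then reading the buffer back exposes one record batch per write.
    reader = pa.ipc.open_file(sink.getvalue())
    assert reader.num_record_batches == 10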
Example 2
    def execute(self):
        """
        Execute simulates creating data
        creating associating metaobject
        storing data and metadata

        returns a serialized dataset object for updating
        a final store
        """
        self.__logger.info("Running job %s", self.job_id)
        data = [pa.array(np.random.rand(100000)) for _ in range(6)]
        batch = pa.RecordBatch.from_arrays(
            data, ["f0", "f1", "f2", "f3", "f4", "f5"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()

        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        ids_ = []
        for key in self.parts:
            ids_.append(
                self.store.register_content(
                    buf,
                    fileinfo,
                    dataset_id=self.dataset_id,
                    job_id=self.job_id,
                    partition_key=key,
                ).uuid)
            self.store.put(ids_[-1], buf)
        buf_ds = self.store[self.dataset_id].dataset.SerializeToString()
        self.buf = buf_ds
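
On the collecting side, the serialized dataset stashed in `self.buf` can be parsed back with the standard protobuf `ParseFromString`. A hedged sketch; `Dataset_pb` is an assumed stand-in for the project's generated dataset message class, and `job` for a finished job instance:

    # Assumption: Dataset_pb is the generated protobuf class behind
    # store[dataset_id].dataset; ParseFromString is standard protobuf API.
    dataset = Dataset_pb()
    dataset.ParseFromString(job.buf)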
Example 3
    def test_validation(self):
        print("Simulate production")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            # Following puts the menu and config to the datastore
            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)

            # Multiple streams
            store.new_partition(dataset.uuid, "key1")
            store.new_partition(dataset.uuid, "key2")
            store.new_partition(dataset.uuid, "key3")

            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            ids_ = []
            parts = store.list_partitions(dataset.uuid)
            # reload menu and config
            newmenu = Menu_pb()
            store.get(menu_uuid, newmenu)
            newconfig = Configuration()
            store.get(config_uuid, newconfig)
            print(parts)
            for _ in range(10):
                job_id = store.new_job(dataset.uuid)

                for key in parts:
                    ids_.append(
                        store.register_content(
                            buf,
                            fileinfo,
                            dataset_id=dataset.uuid,
                            job_id=job_id,
                            partition_key=key,
                        ).uuid)
                    store.put(ids_[-1], buf)

            for id_ in ids_:
                buf = pa.py_buffer(store.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            # Save the store, reload
            store.save_store()
            newstore = BaseObjectStore(str(_path),
                                       store._name,
                                       store_uuid=store.store_uuid)
            for id_ in ids_:
                print("Get object %s", id_)
                print(type(id_))
                buf = pa.py_buffer(newstore.get(id_))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)
            print(newmenu)
            print(newconfig)
            print("Simulation Test Done ===========================")
Example 4
    def test_register_dataset(self):

        # Create a fake dataset
        # from a menu_id and menu msg
        # from a config_id and config msg
        # add files
        # add tables

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.num_columns = 3

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(str(_path), "test")
            store_id = store.store_uuid
            print(store.store_info.created.ToDatetime())

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            print(menu_uuid)
            print(config_uuid)
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            job_id = store.new_job(dataset.uuid)
            store.register_content(
                buf,
                fileinfo,
                dataset_id=dataset.uuid,
                partition_key="key",
                job_id=job_id,
            )

            ds = store.list(suffix="dataset")
            print(ds)
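
Before handing `buf` to the store, one could also sanity-check that it parses as an Arrow IPC file; a pyarrow-only snippet that would fit right after `writer.close()`:

    # The buffer must open as an Arrow file with the expected contents.
    reader = pa.ipc.open_file(buf)
    assert reader.num_record_batches == 10
    assert reader.schema.equals(batch.schema)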
Example 5
    def test_dir_glob(self):
        print("Testing directory globbing")
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        mymenu = Menu_pb()
        mymenu.uuid = str(uuid.uuid4())
        mymenu.name = f"{mymenu.uuid}.menu.dat"

        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        for _ in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        store_id = str(uuid.uuid4())
        mystore = CronusObjectStore()
        mystore.name = "test"
        mystore.uuid = str(store_id)
        mystore.parent_uuid = ""  # top level store

        with tempfile.TemporaryDirectory() as dirpath:
            mystore.address = dirpath + "/test"
            _path = Path(mystore.address)
            _path.mkdir()
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            path = dirpath + "/test/dummy2.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())

            objs_ = store.register_content(
                mystore.address,
                fileinfo,
                glob="*arrow",
                dataset_id=dataset.uuid,
                partition_key="key",
            )
            for obj_ in objs_:
                print(obj_.uuid, store[obj_.uuid].address)
                buf = pa.py_buffer(store.get(obj_.uuid))
                reader = pa.ipc.open_file(buf)
                self.assertEqual(reader.num_record_batches, 10)

            ds = store.list(suffix="dataset")
            for d in ds:
                p = d.uuid + ".part_key"
                f = store.list(prefix=p, suffix="arrow")
                print(f)
        print("Test Done ===========================")