def test_register_object(self):
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    mymenu = CronusObject()
    mymenu.name = "menu"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        id_ = store.register_content(
            path, fileinfo, dataset_id=dataset.uuid, partition_key="key"
        ).uuid
        print(store[id_].address)
        buf = pa.py_buffer(store.get(id_))
        reader = pa.ipc.open_file(buf)
        self.assertEqual(reader.num_record_batches, 10)
def __init__(self, root, store_name, store_id, menu_id, config_id, dataset_id, job_id):
    self.dataset_id = dataset_id
    self.job_id = job_id
    # Connect to the metastore
    # Setup a datastore
    self.store = BaseObjectStore(str(root), store_name, store_uuid=store_id)
    self.parts = self.store.list_partitions(dataset_id)
    # Get the menu and config to run the job
    self.menu = Menu_pb()
    self.store.get(menu_id, self.menu)
    self.config = Configuration()
    self.store.get(config_id, self.config)
    self.buf = None
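# Hedged construction sketch for the class that owns the __init__ above. The
# class name "JobRunner" is a placeholder (the real class is not part of this
# excerpt); the argument values are the uuids produced elsewhere in this file
# by BaseObjectStore.register_content / register_dataset / new_job.
#
#     runner = JobRunner(
#         root=dirpath,
#         store_name=store.store_name,
#         store_id=store.store_uuid,
#         menu_id=menu_uuid,
#         config_id=config_uuid,
#         dataset_id=dataset.uuid,
#         job_id=str(job_id),
#     )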
def test_config(self):
    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        config_uuid = store.register_content(myconfig, configinfo).uuid
        store.put(config_uuid, myconfig)

        aconfig = Configuration()
        store.get(config_uuid, aconfig)
        self.assertEqual(myconfig.name, aconfig.name)
        self.assertEqual(myconfig.uuid, aconfig.uuid)
def test_validation(self):
    print("Simulate production")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message

        # Following puts the menu and config to the datastore
        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)

        # Multiple streams
        store.new_partition(dataset.uuid, "key1")
        store.new_partition(dataset.uuid, "key2")
        store.new_partition(dataset.uuid, "key3")

        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        ids_ = []
        parts = store.list_partitions(dataset.uuid)
        # reload menu and config
        newmenu = Menu_pb()
        store.get(menu_uuid, newmenu)
        newconfig = Configuration()
        store.get(config_uuid, newconfig)
        print(parts)

        for _ in range(10):
            job_id = store.new_job(dataset.uuid)
            for key in parts:
                ids_.append(
                    store.register_content(
                        buf,
                        fileinfo,
                        dataset_id=dataset.uuid,
                        job_id=job_id,
                        partition_key=key,
                    ).uuid
                )
                store.put(ids_[-1], buf)

        for id_ in ids_:
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        # Save the store, reload
        store.save_store()
        newstore = BaseObjectStore(
            str(_path), store._name, store_uuid=store.store_uuid
        )
        for id_ in ids_:
            print(f"Get object {id_}")
            print(type(id_))
            buf = pa.py_buffer(newstore.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        print(newmenu)
        print(newconfig)
        print("Simulation Test Done ===========================")
def test_register_dataset(self):
    # Create a fake dataset
    #   from a menu_id and menu msg
    #   from a config_id and config msg
    #   add files
    #   add tables
    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top level store

    print("Testing dataset registration")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    # schema = batch.schema.to_pybytes()
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    fileinfo = FileObjectInfo()
    fileinfo.type = 5
    fileinfo.aux.num_columns = 3

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        store = BaseObjectStore(str(_path), "test")
        store_id = store.store_uuid
        print(store.store_info.created.ToDatetime())

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        print(menu_uuid)
        print(config_uuid)

        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")
        job_id = store.new_job(dataset.uuid)

        store.register_content(
            buf,
            fileinfo,
            dataset_id=dataset.uuid,
            partition_key="key",
            job_id=job_id,
        )
        ds = store.list(suffix="dataset")
        print(ds)
def test_dir_glob(self):
    print("Testing directory globbing")
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)

    mymenu = Menu_pb()
    mymenu.uuid = str(uuid.uuid4())
    mymenu.name = f"{mymenu.uuid}.menu.dat"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    for i in range(10):
        writer.write_batch(batch)
    writer.close()
    buf = sink.getvalue()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    store_id = str(uuid.uuid4())
    mystore = CronusObjectStore()
    mystore.name = "test"
    mystore.uuid = str(store_id)
    mystore.parent_uuid = ""  # top level store

    with tempfile.TemporaryDirectory() as dirpath:
        mystore.address = dirpath + "/test"
        _path = Path(mystore.address)
        _path.mkdir()
        store = BaseObjectStore(
            str(_path), "test"
        )  # wrapper to the CronusStore message
        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())
        path = dirpath + "/test/dummy2.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        objs_ = store.register_content(
            mystore.address,
            fileinfo,
            glob="*arrow",
            dataset_id=dataset.uuid,
            partition_key="key",
        )
        for obj_ in objs_:
            print(obj_.uuid, store[obj_.uuid].address)
            buf = pa.py_buffer(store.get(obj_.uuid))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)

        ds = store.list(suffix="dataset")
        for d in ds:
            p = d.uuid + ".part_key"
            f = store.list(prefix=p, suffix="arrow")
            print(f)
        print("Test Done ===========================")
def example_configuration(table_id, seed=42):
    # First define a data generator using SimuTable
    max_malloc = 2147483648  # Maximum memory allowed in Arrow memory pool
    max_buffer_size = 2147483648  # Maximum size serialized ipc message
    write_csv = True  # Output csv files for each arrow output file
    sample_ndatums = 1  # Preprocess job to sample files from dataset
    sample_nchunks = 10  # Preprocess job to sample chunks from a file
    linesep = "\r\n"  # Line delimiter to scan for on csv input
    delimiter = ","  # Field delimiter
    blocksize = 2**16  # Size of chunked data in-memory
    header = ""  # Predefined header
    footer = ""  # Predefined footer
    header_offset = 0  # N bytes to scan past header
    footer_size = 0  # N bytes size of footer
    schema = []  # Predefined list of field names on input
    encoding = "utf8"  # Input text encoding
    gen_nbatches = 5  # Number of batches to generate
    gen_nrows = 1000  # Number of rows per batch

    config = Configuration()  # Cronus Configuration message
    config.uuid = str(uuid.uuid4())
    config.name = f"{config.uuid}.config.pb"
    config.max_malloc_size_bytes = max_malloc

    generator = SimuTableGen(
        "generator",
        nbatches=gen_nbatches,
        num_rows=gen_nrows,
        file_type=1,  # Output type cronus.proto filetype
        table_id=table_id,
        seed=seed,
    )
    # Set the generator configuration
    config.input.generator.config.CopyFrom(generator.to_msg())

    filehandler = FileHandlerTool(
        "filehandler",
        filetype="csv",  # TBD use filetype metadata
        blocksize=blocksize,
        delimiter=delimiter,
        linesep=linesep,
        header=header,
        footer=footer,
        header_offset=header_offset,
        footer_size=footer_size,
        schema=schema,
        encoding=encoding,
        seed=seed,
    )
    # Add to the tools
    config.tools[filehandler.name].CopyFrom(filehandler.to_msg())

    csvtool = CsvTool("csvtool", block_size=(2 * blocksize))
    config.tools[csvtool.name].CopyFrom(csvtool.to_msg())

    # filtercoltool = FilterColTool(
    #     "filtercoltool", columns=["record_id", "Product", "Unit"]
    # )
    # filtercoltool = FilterColTool('filtercoltool',
    #                               columns=['record_id', 'SIN', 'DOB'])
    # config.tools[filtercoltool.name].CopyFrom(filtercoltool.to_msg())

    writer = BufferOutputWriter(
        "bufferwriter", BUFFER_MAX_SIZE=max_buffer_size, write_csv=write_csv
    )
    config.tools[writer.name].CopyFrom(writer.to_msg())

    tdigesttool = TDigestTool("tdigesttool")
    config.tools[tdigesttool.name].CopyFrom(tdigesttool.to_msg())

    sampler = config.sampler
    sampler.ndatums = sample_ndatums
    sampler.nchunks = sample_nchunks

    return config
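# Usage sketch (non-authoritative): the returned Configuration is itself a
# protobuf, so it can be registered in the object store exactly like the menu
# and config messages in the tests above. "store" and "table_id" are assumed
# to come from a prior BaseObjectStore setup as in example_job below.
#
#     config = example_configuration(table_id)
#     configinfo = ConfigObjectInfo()
#     configinfo.created.GetCurrentTime()
#     config_uuid = store.register_content(config, configinfo).uuid
#     store.save_store()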
def example_job(location):
    # Artemis Job requirements:
    #   BaseObjectStore - name, path and id
    #   Menu
    #   Configuration
    #   Input Dataset
    #   Dataset partitions
    #   Table schemas for each dataset partition

    # Build the Menu
    mb = ExampleMenu()
    msgmenu = mb.build()
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()

    # Read schema and generator names
    xlstool = XlsTool("xlstool", location=location)
    ds_schema = xlstool.execute(location)
    # Example job only has one table
    table = ds_schema.tables[0]

    # Build the Configuration
    # Build the partition Table schemas
    # Register all inputs in the Cronus object store
    # Build the job

    # To use the local directory:
    # dirpath = os.getcwd()
    with tempfile.TemporaryDirectory() as dirpath:
        # All jobs now require an object store
        # All outputs are persisted in the object store path
        # See github.com/mbr/simplekv
        # Factory class for simplekv provided by blueyonder/storefact
        store = BaseObjectStore(dirpath, "artemis")

        # Requires registering a parent dataset
        # Generator data is written to disk with the parent dataset uuid
        # Register the 'generator' partition -- required
        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # The table schema which defines the model for the generator
        # is persisted first to the object store as a protobuf file
        tinfo = TableObjectInfo()
        table_id = store.register_content(
            table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        ).uuid
        store.save_store()

        # Now configure all tools and algorithms, including IO tools
        config = example_configuration(table_id)

        # Algorithms need to be added from the menu to the configuration
        for key in mb._algos:
            msg = config.algos.add()
            msg.CopyFrom(mb._algos[key].to_msg())

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        # Store the menu and configuration protobufs
        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_uuid = store.register_content(config, configinfo).uuid

        # Register an output dataset
        dataset = store.register_dataset(menu_id=menu_uuid, config_id=config_uuid)
        # Copy metadata from xlstool
        store[dataset.uuid].dataset.aux.CopyFrom(ds_schema.dataset.aux)
        store.save_store()

        # Now define the actual Artemis job
        # Again the input is a protobuf
        # All other information is read in from the object store
        # inputs = store.list(prefix=g_dataset.uuid)
        ds_results = []
        for _ in range(2):
            job_id = store.new_job(dataset.uuid)
            config = Configuration()
            store.get(config_uuid, config)
            for p in config.input.generator.config.properties.property:
                if p.name == "glob":
                    p.value = dirpath.split(".")[-2] + "csv"
            store._put_message(config_uuid, config)
            store.get(config_uuid, config)
            ds_results.append(
                runjob(
                    dirpath,
                    store.store_name,
                    store.store_uuid,
                    menu_uuid,
                    config_uuid,
                    dataset.uuid,
                    g_dataset.uuid,
                    str(job_id),
                )
            )

        results = dask.compute(*ds_results, scheduler="single-threaded")
        store.new_partition(dataset.uuid, "seqA")
        store.new_partition(dataset.uuid, "seqB")
        store.save_store()

        for buf in results:
            ds = DatasetObjectInfo()
            ds.ParseFromString(buf)
            store.update_dataset(dataset.uuid, buf)
        store.save_store()

        dqtool = PlotlyTool(store=store, uuid=dataset.uuid)
        dqtool.visualize(output="{}/test".format(os.getcwd()), show=True, check=False)
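# Hedged invocation sketch: "location" is the path to the spreadsheet read by
# XlsTool above; the file name below is hypothetical and only illustrates the
# call.
#
#     if __name__ == "__main__":
#         example_job("dataset_schema.xlsx")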
def test_distributed(self):
    with tempfile.TemporaryDirectory() as dirpath:
        mb = MenuFactory("csvgen")
        msgmenu = mb.build()
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()

        store = BaseObjectStore(dirpath, "artemis")

        config = JobConfigFactory(
            "csvio",
            msgmenu,
            jobname="arrowproto",
            generator_type="file",
            filehandler_type="csv",
            nbatches=1,
            num_rows=10000,
            max_file_size=1073741824,
            write_csv=True,
            input_glob=".csv",
        )
        config.configure()
        config.add_algos(mb.algos)
        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_obj = store.register_content(config._msg, configinfo)
        config_uuid = config_obj.uuid

        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # define the schema for the data
        g_table = Table()
        g_table.name = "generator"
        g_table.uuid = str(uuid.uuid4())
        g_table.info.schema.name = "csv"
        g_table.info.schema.uuid = str(uuid.uuid4())

        fields = list(itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
        for f in fields:
            field = g_table.info.schema.info.fields.add()
            field.name = f

        tinfo = TableObjectInfo()
        tinfo.fields.extend(fields)
        store.register_content(
            g_table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        )

        generator = GenCsvLikeArrow(
            "generator",
            nbatches=10,
            num_cols=20,
            num_rows=1000,
            suffix=".csv",
            prefix="testio",
            path=dirpath,
            table_id=g_table.uuid,
        )
        generator.gate.meta.parentset_id = g_dataset.uuid
        generator.gate.meta.job_id = str(job_id)
        generator.gate.store = store
        generator.initialize()
        generator.write()

        dataset = store.register_dataset(menu_uuid, config_uuid)
        job_id = store.new_job(dataset.uuid)
        store.save_store()

        #######################################
        inputs = store.list(prefix=g_dataset.uuid, suffix="csv")
        store_name = store._name
        store_uuid = store.store_uuid
        dataset_uuid = dataset.uuid
        ds_results = []
        for datum in inputs:
            job_id = store.new_job(dataset.uuid)
            url_data = urllib.parse.urlparse(datum.address)
            dpath = urllib.parse.unquote(url_data.path)
            print(datum)
            config = Configuration()
            store.get(config_uuid, config)
            for p in config.input.generator.config.properties.property:
                if p.name == "glob":
                    p.value = dpath.split(".")[-2] + ".csv"
            store._put_message(config_uuid, config)
            store.get(config_uuid, config)
            print(config)
            ds_results.append(
                runjob(
                    dirpath,
                    store_name,
                    store_uuid,
                    menu_uuid,
                    config_uuid,
                    dataset_uuid,
                    g_dataset.uuid,
                    job_id,
                )
            )

        results = dask.compute(*ds_results, scheduler="single-threaded")
        # Workaround to fix error in dataset merging
        store.new_partition(dataset.uuid, "seqY")
        # Update the dataset
        for buf in results:
            ds = DatasetObjectInfo()
            ds.ParseFromString(buf)
            store.update_dataset(dataset.uuid, buf)
        # Save the store, reload
        store.save_store()
        print(store[dataset.uuid].dataset)