def test(self):
    model = Table()
    model.name = "EvolveModel"
    schema = model.info.schema.info
    field = schema.fields.add()
    field.name = "Name"
    field.info.type = "String"
    field.info.length = 10
    field.info.aux.generator.name = "name"
    print(model)
def test_rbgen_csv(self):
    """
    Generate a test record batch as CSV and display the rows
    on the console.
    """
    # Define the schema for the data
    g_table = Table()  # Define a table instance
    g_table.name = "EvolveModel"  # Name the table
    g_table.uuid = str(uuid.uuid4())  # Create a UUID
    schema = g_table.info.schema.info  # Access the schema unit
    field = schema.fields.add()  # Add a field
    field.name = "Name"  # Set the field name
    field.info.type = "float"  # Set the field type
    field.info.length = 10  # Set the field length
    field.info.aux.generator.name = "normal"  # TODO: trace how this generator name is resolved

    # Generator parameters; these mimic the tests found in the
    # ArtemisFaker module itself.
    # params = field.info.aux.generator.parameters.add()
    # params.name = "Mean"
    # params.value = 3
    # params.type = "int"
    # params2 = field.info.aux.generator.parameters.add()
    # params2.name = "STD"
    # params2.value = 3
    # params2.type = "int"

    g_table_msg = g_table.SerializeToString()  # Serialize the table message

    # The record batch generator: all configuration is set here
    # and drives the output it produces.
    generator = RecordBatchGen(
        "generator",  # Generator instance name (positional)
        nbatches=1,  # Total number of batches to generate
        num_rows=10,  # Total rows to be generated
        file_type=1,  # Encode the data as CSV
        table_id=g_table.uuid,  # Set the table UUID
        table_msg=g_table_msg,  # Set the table message
    )
    generator.initialize()  # Initialize the generator

    # Each batch comes back as a pyarrow buffer: convert it to raw
    # Python bytes, wrap it in a text stream, and read it as CSV.
    for batch in generator:
        data = batch.to_pybytes()
        with io.TextIOWrapper(io.BytesIO(data)) as textio:
            for row in csv.reader(textio):
                print(row)
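# Illustrative sketch, not part of the original suite: the buffer-to-rows
# decoding at the end of test_rbgen_csv can be factored into a small helper.
# The helper name and standalone form are assumptions for illustration; it
# relies only on the stdlib io/csv calls used above.
def _rows_from_batch(batch):
    """Decode a pyarrow buffer of CSV bytes into a list of rows."""
    data = batch.to_pybytes()
    with io.TextIOWrapper(io.BytesIO(data)) as textio:
        return list(csv.reader(textio))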
def test_gen_from_proto(self):
    """
    Build a one-field table model and generate synthetic data
    from it with the Synthesizer.
    """
    model = Table()
    model.name = "EvolveModel"
    schema = model.info.schema.info
    field = schema.fields.add()
    field.name = "Name"
    field.info.type = "String"
    field.info.length = 10
    field.info.aux.generator.name = "name"

    s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
    print(s2.generate())
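# Illustrative sketch, not part of the original suite: the tests above all
# build the same one-field "EvolveModel" table by hand. A hypothetical helper
# (name and signature are assumptions) could consolidate that setup; it uses
# only the Table message API already exercised in these tests.
def _evolve_model_table(field_name, field_type, length, generator_name):
    table = Table()
    table.name = "EvolveModel"
    field = table.info.schema.info.fields.add()
    field.name = field_name
    field.info.type = field_type
    field.info.length = length
    field.info.aux.generator.name = generator_name
    return table

# For example, test_gen_from_proto's model could then be written as:
#     model = _evolve_model_table("Name", "String", 10, "name")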
def test_table(self):
    """
    Build a table with schema metadata, per-field auxiliary info, and a
    codeset, then round-trip it through its serialized wire format.
    """
    table = Table()
    table.name = "Attachment"
    # table.uuid = str(uuid.uuid4())
    schema = table.info.schema.info
    schema.aux.frequency = 3
    schema.aux.description = "This table is for ..."

    field1 = schema.fields.add()
    field1.name = "record_id"
    field1.info.type = "String"
    field1.info.length = 10

    field2 = schema.fields.add()
    field2.name = "field2"
    field2.info.type = "String"
    field2.info.length = 20
    aux2 = field2.info.aux
    aux2.generator.name = "name"
    aux2.meta["Bool1"].bool_val = True
    aux2.meta["Bool2"].bool_val = False
    aux2.meta["String1"].string_val = "System"
    aux2.description = "Blah"

    field3 = schema.fields.add()
    field3.name = "field3"
    field3.info.type = "String"
    field3.info.length = 24
    aux3 = field3.info.aux
    aux3.generator.name = "province"
    code = aux3.codeset
    code.name = "Codeset Name"
    code.version = "2016VR1"
    value1 = code.codevalues.add()
    value1.code = "1A"
    value1.description = "what 1a stands for"
    value2 = code.codevalues.add()
    value2.code = "2A"
    value2.description = "what 2a stands for"
    value2.lable = "label for 2a"  # NB: 'lable' is the attribute name as spelled in the proto
    aux3.meta["Bool1"].bool_val = True
    aux3.meta["Bool2"].bool_val = True
    aux3.description = "Blah blah blah"
    aux3.meta["String1"].string_val = "Rule for variable population"

    # Round-trip the message through its wire format
    tem2 = table.SerializeToString()
    print(tem2)
    table2 = Table()
    table2.ParseFromString(tem2)
    print(table2)
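# Illustrative sketch, not part of the original suite: test_table prints the
# round-tripped message but never asserts on it. Protobuf messages compare by
# value, so the round trip could be checked directly; this helper is a
# hypothetical addition.
def _assert_roundtrip(table):
    clone = Table()
    clone.ParseFromString(table.SerializeToString())
    assert clone == table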
def test_fileio(self):
    """
    Write CSV to disk, then read it back in through Artemis.
    """
    with tempfile.TemporaryDirectory() as dirpath:
        mb = MenuFactory("csvgen")
        msgmenu = mb.build()
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        store = BaseObjectStore(dirpath, "artemis")

        config = JobConfigFactory(
            "csvio",
            msgmenu,
            jobname="arrowproto",
            generator_type="file",
            filehandler_type="csv",
            nbatches=1,
            num_rows=10000,
            max_file_size=1073741824,
            write_csv=True,
            # input_repo=dirpath,
            input_glob=".csv",
            # output_repo=dirpath,
        )
        config.configure()
        config.add_algos(mb.algos)
        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_uuid = store.register_content(config._msg, configinfo).uuid
        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # Define the schema for the data
        g_table = Table()
        g_table.name = "generator"
        g_table.uuid = str(uuid.uuid4())
        g_table.info.schema.name = "csv"
        g_table.info.schema.uuid = str(uuid.uuid4())
        fields = list(itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
        for f in fields:
            field = g_table.info.schema.info.fields.add()
            field.name = f
        tinfo = TableObjectInfo()
        tinfo.fields.extend(fields)
        store.register_content(
            g_table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        )

        generator = GenCsvLikeArrow(
            "generator",
            nbatches=1,
            num_cols=20,
            num_rows=10000,
            suffix=".csv",
            prefix="testio",
            path=dirpath,
            table_id=g_table.uuid,
        )
        generator.gate.meta.parentset_id = g_dataset.uuid
        generator.gate.meta.job_id = str(job_id)
        generator.gate.store = store
        generator.initialize()
        generator.write()

        dataset = store.register_dataset(menu_uuid, config_uuid)
        job_id = store.new_job(dataset.uuid)
        store.save_store()

        job = JobInfo_pb()
        job.name = "arrowproto"
        job.job_id = "example"  # placeholder; overwritten with the real id below
        job.store_path = dirpath
        job.store_id = store.store_uuid
        job.store_name = store.store_name
        job.menu_id = menu_uuid
        job.config_id = config_uuid
        job.dataset_id = dataset.uuid
        job.job_id = str(job_id)
        # print(job)
        bow = Artemis(job, loglevel="INFO")
        bow.control()
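# Illustrative sketch, not part of the original suite: the JobInfo message
# assembled at the end of test_fileio is the same bundle of identifiers a
# per-job runner would need (cf. runjob in test_distributed). A hypothetical
# builder, using only fields already set above:
def _job_info(name, dirpath, store, menu_uuid, config_uuid, dataset_uuid, job_id):
    job = JobInfo_pb()
    job.name = name
    job.store_path = dirpath
    job.store_id = store.store_uuid
    job.store_name = store.store_name
    job.menu_id = menu_uuid
    job.config_id = config_uuid
    job.dataset_id = dataset_uuid
    job.job_id = str(job_id)
    return job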
def test_distributed(self):
    """
    Generate CSV inputs, run one job per input file through dask,
    and merge the per-job results back into the dataset.
    """
    with tempfile.TemporaryDirectory() as dirpath:
        mb = MenuFactory("csvgen")
        msgmenu = mb.build()
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        store = BaseObjectStore(dirpath, "artemis")

        config = JobConfigFactory(
            "csvio",
            msgmenu,
            jobname="arrowproto",
            generator_type="file",
            filehandler_type="csv",
            nbatches=1,
            num_rows=10000,
            max_file_size=1073741824,
            write_csv=True,
            input_glob=".csv",
        )
        config.configure()
        config.add_algos(mb.algos)
        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_obj = store.register_content(config._msg, configinfo)
        config_uuid = config_obj.uuid
        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # Define the schema for the data
        g_table = Table()
        g_table.name = "generator"
        g_table.uuid = str(uuid.uuid4())
        g_table.info.schema.name = "csv"
        g_table.info.schema.uuid = str(uuid.uuid4())
        fields = list(itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
        for f in fields:
            field = g_table.info.schema.info.fields.add()
            field.name = f
        tinfo = TableObjectInfo()
        tinfo.fields.extend(fields)
        store.register_content(
            g_table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        )

        generator = GenCsvLikeArrow(
            "generator",
            nbatches=10,
            num_cols=20,
            num_rows=1000,
            suffix=".csv",
            prefix="testio",
            path=dirpath,
            table_id=g_table.uuid,
        )
        generator.gate.meta.parentset_id = g_dataset.uuid
        generator.gate.meta.job_id = str(job_id)
        generator.gate.store = store
        generator.initialize()
        generator.write()

        dataset = store.register_dataset(menu_uuid, config_uuid)
        job_id = store.new_job(dataset.uuid)
        store.save_store()

        # Launch one job per generated input file
        inputs = store.list(prefix=g_dataset.uuid, suffix="csv")
        store_name = store._name
        store_uuid = store.store_uuid
        dataset_uuid = dataset.uuid
        ds_results = []
        for datum in inputs:
            job_id = store.new_job(dataset.uuid)
            url_data = urllib.parse.urlparse(datum.address)
            dpath = urllib.parse.unquote(url_data.path)
            print(datum)
            # Point the generator's glob at this input file only
            config = Configuration()
            store.get(config_uuid, config)
            for p in config.input.generator.config.properties.property:
                if p.name == "glob":
                    p.value = dpath.split(".")[-2] + ".csv"
            store._put_message(config_uuid, config)
            store.get(config_uuid, config)
            print(config)
            ds_results.append(
                runjob(
                    dirpath,
                    store_name,
                    store_uuid,
                    menu_uuid,
                    config_uuid,
                    dataset_uuid,
                    g_dataset.uuid,
                    job_id,
                )
            )

        results = dask.compute(*ds_results, scheduler="single-threaded")

        # Workaround to fix an error in dataset merging
        store.new_partition(dataset.uuid, "seqY")
        # Update the dataset with each job's result
        for buf in results:
            ds = DatasetObjectInfo()
            ds.ParseFromString(buf)  # parse to validate the payload
            store.update_dataset(dataset.uuid, buf)
        # Save the store, then reload
        store.save_store()
        print(store[dataset.uuid].dataset)
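# Illustrative sketch, not part of the original suite: test_distributed
# rewrites the generator's "glob" property inline for every input file. A
# hypothetical helper (name is an assumption) makes that per-file patch
# explicit; it uses only the Configuration fields traversed in the loop above.
def _patch_glob(store, config_uuid, dpath):
    config = Configuration()
    store.get(config_uuid, config)
    for p in config.input.generator.config.properties.property:
        if p.name == "glob":
            # Match just this one file, e.g. ".../testio_x.csv"
            p.value = dpath.split(".")[-2] + ".csv"
    store._put_message(config_uuid, config)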