def test(self):
    model = Table()
    model.name = "EvolveModel"
    schema = model.info.schema.info
    field = schema.fields.add()
    field.name = "Name"
    field.info.type = "String"
    field.info.length = 10
    field.info.aux.generator.name = "name"
    print(model)
def test_rbgen_csv(self):
    """
    Generate a test record batch as CSV and print the rows to the console.
    """
    # Define the schema for the data
    g_table = Table()  # Define a table instance
    g_table.name = "EvolveModel"  # Name the table
    g_table.uuid = str(uuid.uuid4())  # Create a UUID
    schema = g_table.info.schema.info  # Access the schema unit
    field = schema.fields.add()  # Add a field
    field.name = "Name"  # Set the field name
    field.info.type = "float"  # Set the type of the field
    field.info.length = 10  # Set the field length
    field.info.aux.generator.name = "normal"  # Generator name (normal-distribution sampler)
    """
    # We add the generator parameters here. These mimic the tests
    # found in the ArtemisFaker module itself.
    params = field.info.aux.generator.parameters.add()
    params.name = "Mean"
    params.value = 3
    params.type = "int"
    params2 = field.info.aux.generator.parameters.add()
    params2.name = "STD"
    params2.value = 3
    params2.type = "int"
    """
    g_table_msg = g_table.SerializeToString()  # Serialize the table message

    # The record batch generator: all configuration is passed to the
    # generator, which produces the output buffers.
    generator = RecordBatchGen(
        "generator",  # Algorithm name
        nbatches=1,  # Total number of batches to generate
        num_rows=10,  # Total rows per batch
        file_type=1,  # Encode the data as CSV
        table_id=g_table.uuid,  # Set the table UUID
        table_msg=g_table_msg,  # Set the serialized table message
    )
    generator.initialize()  # Initialize the generator

    # Data is returned as a pyarrow buffer. Convert it to raw Python bytes,
    # wrap it in a text IO object, and read it back as CSV.
    for batch in generator:  # The generator is iterable
        data = batch.to_pybytes()  # Access the batch, convert to bytes
        with io.TextIOWrapper(io.BytesIO(data)) as textio:
            for row in csv.reader(textio):  # Read each row from the buffer
                print(row)  # Print the row
def test_gen_from_proto(self):
    model = Table()
    model.name = "EvolveModel"
    schema = model.info.schema.info
    field = schema.fields.add()
    field.name = "Name"
    field.info.type = "String"
    field.info.length = 10
    field.info.aux.generator.name = "name"
    s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
    print(s2.generate())
def test_xduplicates(self):
    model = Table()
    model.info.aux.duplicate.probability = 1
    model.info.aux.duplicate.distribution = "uniform"
    model.info.aux.duplicate.maximum = 1
    schema = model.info.schema.info
    field1 = schema.fields.add()
    field1.name = "record_id"
    field1.info.type = "String"
    field1.info.length = 10
    field2 = schema.fields.add()
    field2.name = "Name"
    field2.info.type = "String"
    field2.info.length = 10
    field2.info.aux.generator.name = "name"
    field3 = schema.fields.add()
    field3.name = "UPC"
    field3.info.type = "Integer"
    field3.info.length = 13
    field3.info.aux.generator.name = "ean"
    parm = field3.info.aux.generator.parameters.add()
    parm.name = "ndigits"
    parm.value = 13
    parm.type = "int"
    s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
    print(s2.generate())
def test_glm_proto(self):
    model = Table()
    schema = model.info.schema.info
    field1 = schema.fields.add()
    field1.name = "Value1"
    field1.info.type = "Float"
    field1.info.length = 10
    field1.info.aux.generator.name = "random_int"
    field1.info.aux.dependent = "Prediction"
    field2 = schema.fields.add()
    field2.name = "Value2"
    field2.info.type = "Float"
    field2.info.length = 10
    field2.info.aux.generator.name = "random_int"
    field2.info.aux.dependent = "Prediction"
    field3 = schema.fields.add()
    field3.name = "Prediction"
    field3.info.type = "Float"
    field3.info.length = 10
    field3.info.aux.generator.name = "glm"
    beta1 = field3.info.aux.generator.parameters.add()
    beta1.name = "beta1"
    beta1.value = 10
    beta1.type = "int"
    beta2 = field3.info.aux.generator.parameters.add()
    beta2.name = "beta2"
    beta2.value = 0.1
    beta2.type = "float"
    beta3 = field3.info.aux.generator.parameters.add()
    beta3.name = "beta3"
    beta3.value = 100
    beta3.type = "int"
    sigma = field3.info.aux.generator.parameters.add()
    sigma.name = "sigma"
    sigma.value = 1
    sigma.type = "int"
    var1 = field3.info.aux.generator.parameters.add()
    var1.name = "Value1"
    var1.type = "Field"
    var1.variable.CopyFrom(field1)
    var2 = field3.info.aux.generator.parameters.add()
    var2.name = "Value2"
    var2.type = "Field"
    var2.variable.CopyFrom(field2)
    s2 = Synthesizer(model, "en_CA")
    print(s2.generate())
def test_table(self):
    table = Table()
    table.name = "Attachment"
    # table.uuid = str(uuid.uuid4())
    schema = table.info.schema.info
    schema.aux.frequency = 3
    schema.aux.description = "This table is for ..."
    field1 = schema.fields.add()
    field1.name = "record_id"
    field1.info.type = "String"
    field1.info.length = 10
    field2 = schema.fields.add()
    field2.name = "field2"
    field2.info.type = "String"
    field2.info.length = 20
    aux2 = field2.info.aux
    aux2.generator.name = "name"
    aux2.meta["Bool1"].bool_val = True
    aux2.meta["Bool2"].bool_val = False
    aux2.meta["String1"].string_val = "System"
    aux2.description = "Blah"
    field3 = schema.fields.add()
    field3.name = "field3"
    field3.info.type = "String"
    field3.info.length = 24
    aux3 = field3.info.aux
    aux3.generator.name = "province"
    code = aux3.codeset
    code.name = "Codeset Name"
    code.version = "2016VR1"
    value1 = code.codevalues.add()
    value1.code = "1A"
    value1.description = "what 1a stands for"
    value2 = code.codevalues.add()
    value2.code = "2A"
    value2.description = "What 2a stands for"
    value2.lable = "label for 2a"
    aux3.meta["Bool1"].bool_val = True
    aux3.meta["Bool2"].bool_val = True
    aux3.description = "Blah blah blah"
    aux3.meta["String1"].string_val = "Rule for variable population"
    tem2 = table.SerializeToString()
    print(tem2)
    table2 = Table()
    table2.ParseFromString(tem2)
    print(table2)
class RecordBatchGen:
    """
    Record batch generator: produces synthetic record batches
    (CSV, fixed-width, or Arrow) from a Table message.
    """

    def __init__(self, name, **kwargs):
        Logger.configure(self, **kwargs)
        self.__name = name
        self.properties = Properties()
        options = dict(RecordBatchGenOptions())
        options.update(kwargs)
        for key in options:
            self.properties.add_property(key, options[key])

        if hasattr(self.properties, "seed"):
            self.rnd = check_random_state(seed=self.properties.seed)
        else:
            self.rnd = check_random_state(seed=None)

        if hasattr(self.properties, "nbatches"):
            self._nbatches = self.properties.nbatches
            self._batch_iter = iter(range(self.properties.nbatches))
        else:
            self.__logger.warning("Number of batches not defined")

        self._batchidx = 0
        self.table_id = self.properties.table_id
        self.table = Table()
        if hasattr(self.properties, "table_msg"):
            self.table.ParseFromString(self.properties.table_msg)
        else:
            self.__logger.warning("No table message to deserialize")

        self.num_rows = self.properties.num_rows
        self.linesep = self.properties.linesep
        # self.header = self.properties.header
        self.nsamples = self.properties.nsamples
        self.file_type = self.properties.file_type
        self.codec = self.properties.codec

        self.synthesizer = None
        self.num_cols = None
        self.write_batch = None
        self.header = None

        # FWF: sign-overpunch character maps for the final digit of signed integers
        self.pos_char = {
            "0": "{",
            "1": "A",
            "2": "B",
            "3": "C",
            "4": "D",
            "5": "E",
            "6": "F",
            "7": "G",
            "8": "H",
            "9": "I",
        }
        self.neg_char = {
            "0": "}",
            "1": "J",
            "2": "K",
            "3": "L",
            "4": "M",
            "5": "N",
            "6": "O",
            "7": "P",
            "8": "Q",
            "9": "R",
        }
        # header = ''
        self.header_offset = 0
        self.footer = ""
        self.footer_size = 0

        self.__logger.info("Initialized %s", self.__class__.__name__)
        self.__logger.info("%s properties: %s", self.__class__.__name__, self.properties)
        print("Initialize RecordBatchGen")

    @property
    def random_state(self):
        return self._builtin_generator.rnd

    @property
    def name(self):
        """
        Algorithm name
        """
        return self.__name

    def reset(self):
        if hasattr(self, "_nbatches"):
            self._batch_iter = iter(range(self._nbatches))
        else:
            self.__logger.warning("Override reset in concrete class")

    def to_msg(self):
        message = Algo_pb()
        message.name = self.name
        message.klass = self.__class__.__name__
        message.module = self.__module__
        message.properties.CopyFrom(self.properties.to_msg())
        return message

    @staticmethod
    def from_msg(logger, msg):
        logger.info("Loading Algo from msg %s", msg.name)
        try:
            module = importlib.import_module(msg.module)
        except ImportError:
            logger.error("Unable to load module %s", msg.module)
            raise
        except Exception as e:
            logger.error("Unknown error loading module: %s" % e)
            raise
        try:
            class_ = getattr(module, msg.klass)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, msg.klass))
            raise
        except Exception as e:
            logger.error("Reason: %s" % e)
            raise

        properties = Properties.from_msg(msg.properties)
        logger.debug(pformat(properties))

        # Update the logging level of algorithms if loglevel is not set.
        # Ensures user-defined algos get the artemis-level logging.
        if "loglevel" not in properties:
            properties["loglevel"] = logger.getEffectiveLevel()

        try:
            instance = class_(msg.name, **properties)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, "properties"))
            raise
        except Exception as e:
            logger.error("%s: Cannot initialize %s" % (msg.name, e))
            raise
        return instance

    @property
    def num_batches(self):
        return self._nbatches

    @num_batches.setter
    def num_batches(self, n):
        self._nbatches = n

    def __iter__(self):
        return self

    def initialize(self):
        # Acts as a switch selecting the write method for the requested file type
        method_switch = {
            1: self.write_batch_csv,
            2: self.write_batch_fwf,
            5: self.write_batch_arrow,
        }
        self.__logger.info("RecordBatchGenerator")

        # Set the number of fields programmatically
        self.num_cols = len(self.table.info.schema.info.fields)
        names = []  # Field name array
        for field in self.table.info.schema.info.fields:  # Iterate over the schema fields
            names.append(field.name)
        self.header = names  # Use the field names as the header

        if hasattr(self.properties, "seed"):
            # Initialize the synthesizer with a fixed seed
            self.synthesizer = Synthesizer(self.table, idx=0, seed=self.properties.seed)
        else:
            # Otherwise, initialize without a seed
            self.synthesizer = Synthesizer(self.table, idx=0)

        try:
            # Select the write method for the requested file type
            self.write_batch = method_switch[self.file_type]
        except KeyError:
            # Unsupported file type; no write method was selected
            self.__logger.info("RecordBatchGenerator: Filetype not 1, 2, or 5")

    def chunk(self):
        """
        Allow for concurrent generate during write
        """
        for _ in range(self.num_rows):
            try:
                yield tuple(self.synthesizer.generate())
            except TypeError:
                self.__logger.error("Generator function must return list")
                raise
            except Exception as error:
                self.__logger.error("Unknown error in chunk")
                self.__logger.error(error)

    def fwf_encode_row(self, row):
        record = ""
        # Create data of specific unit types.
        fields = list(self.table.info.schema.fields)
        for i, dpoint in enumerate(row):
            # encode, pad to field width, append to record
            field_schema = fields[i]
            dpoint = str(dpoint)
            # Signed integers require sign-overpunch encoding;
            # all other fields are expected to be string-like.
            if field_schema["utype"] == "int":
                if dpoint.startswith("-"):
                    # Convert negative integers: drop the sign and
                    # overpunch the final digit.
                    dpoint = dpoint.replace("-", "")
                    dpoint = dpoint[:-1] + self.neg_char[dpoint[-1]]
                else:
                    # Convert positive integers: overpunch the final digit.
                    dpoint = dpoint[:-1] + self.pos_char[dpoint[-1]]
            # Ensure the generated field is within the schema length
            dpoint = dpoint[:field_schema["length"]]
            # Pad up to the required length
            if field_schema["utype"] == "int" or field_schema["utype"] == "uint":
                dpoint = ("0" * (field_schema["length"] - len(dpoint))) + dpoint
            else:
                dpoint = dpoint + (" " * (field_schema["length"] - len(dpoint)))
            # Append the field to the record
            record += dpoint
        return record

    def write_batch_fwf(self):
        """
        Generate a batch of records,
        convert rows to fixed-width fields,
        and encode to bytes in the configured codec.
        """
        fwf = io.StringIO()
        for row in list(self.chunk()):
            if len(row) != len(self.header):
                raise ValueError
            fwf.write(self.fwf_encode_row(row))
        fwf = fwf.getvalue().encode(self.codec)
        return pa.py_buffer(fwf)

    def write_batch_csv(self):
        """
        Generate a batch of records
        and encode to CSV in bytes.
        """
        csv = io.StringIO()
        if self.header:
            csv.write(",".join(self.header))
            csv.write(self.linesep)
        for row in list(self.chunk()):
            csv.write(",".join(map(str, row)))
            csv.write(self.linesep)
        csv = csv.getvalue().encode(self.codec)
        return pa.py_buffer(csv)

    def write_batch_arrow(self):
        """
        Generate a batch of records,
        convert to pyarrow arrays,
        and return a RecordBatch.
        """
        data = list(self.chunk())
        data = zip(*data)
        arrays = []
        for i, column in enumerate(data):
            arrays.append(pa.array(column, self.pa_schema[i].type))
        batch = pa.RecordBatch.from_arrays(arrays, names=self.pa_schema.names)
        return batch

    def write_csv(self):
        """
        Write n chunks to CSV.
        Write file to disk.
        """
        csv = b""
        while (len(csv) // 1024**2) < self.maxfilesize:
            csv += self.write_batch_csv()
            if self.checkcount():
                break
        return csv

    def write_fwf(self):
        """
        Write fwf with all records.
        """
        fwf = b""
        while (len(fwf) // 1024**2) < self.maxfilesize:
            fwf += self.write_batch_fwf()
            if self.checkcount():
                break
        return fwf

    def write_recordbatchfile(self):
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)
        batches_size = 0
        while (batches_size // 1024**2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break
        writer.close()
        buf = sink.getvalue()
        return buf

    def __next__(self):
        next(self._batch_iter)
        self.__logger.info("%s: Generating datum " % (self.__class__.__name__))
        data = self.write_batch()
        self._batchidx += 1
        return data
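# Illustrative sketch (not part of the original module): the fixed-width writer
# above encodes the sign of an integer field by "overpunching" its final digit,
# replacing it with a letter from pos_char/neg_char so the field keeps a constant
# width. The helper below is hypothetical and only mirrors that mapping for
# demonstration; it is not used by RecordBatchGen.
_POS_CHAR = {"0": "{", "1": "A", "2": "B", "3": "C", "4": "D",
             "5": "E", "6": "F", "7": "G", "8": "H", "9": "I"}
_NEG_CHAR = {"0": "}", "1": "J", "2": "K", "3": "L", "4": "M",
             "5": "N", "6": "O", "7": "P", "8": "Q", "9": "R"}


def _overpunch_example(value, width):
    """Encode a signed integer as a zero-padded, sign-overpunched string."""
    digits = str(abs(value))
    table = _NEG_CHAR if value < 0 else _POS_CHAR
    encoded = digits[:-1] + table[digits[-1]]  # overpunch the last digit
    return encoded.rjust(width, "0")  # zero-pad to the field width


# Expected behaviour under the assumptions above:
#   _overpunch_example(123, 5)  -> "0012C"
#   _overpunch_example(-45, 5)  -> "0004N"
#   _overpunch_example(0, 3)    -> "00{"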
def test_xmodifer(self):
    model = Table()
    schema = model.info.schema.info
    field1 = schema.fields.add()
    field1.name = "record_id"
    field1.info.type = "String"
    field1.info.length = 10
    field2 = schema.fields.add()
    field2.name = "Name"
    field2.info.type = "String"
    field2.info.length = 10
    field2.info.aux.generator.name = "name"
    field3 = schema.fields.add()
    field3.name = "SIN"
    field3.info.type = "String"
    field3.info.length = 10
    field3.info.aux.generator.name = "ssn"
    field4 = schema.fields.add()
    field4.name = "StreetNumber"
    field4.info.type = "String"
    field4.info.length = 40
    field4.info.aux.generator.name = "building_number"
    field5 = schema.fields.add()
    field5.name = "Street"
    field5.info.type = "String"
    field5.info.length = 40
    field5.info.aux.generator.name = "street_name"
    field6 = schema.fields.add()
    field6.name = "City"
    field6.info.type = "String"
    field6.info.length = 40
    field6.info.aux.generator.name = "city"
    field7 = schema.fields.add()
    field7.name = "Province"
    field7.info.type = "String"
    field7.info.length = 40
    field7.info.aux.generator.name = "province"
    field8 = schema.fields.add()
    field8.name = "PostalCode"
    field8.info.type = "String"
    field8.info.length = 40
    field8.info.aux.generator.name = "postcode"
    field9 = schema.fields.add()
    field9.name = "DOB"
    field9.info.type = "DateTime"
    field9.info.length = 40
    field9.info.aux.generator.name = "date"
    field10 = schema.fields.add()
    field10.name = "PhoneNum"
    field10.info.type = "String"
    field10.info.length = 11
    field10.info.aux.generator.name = "phone_number"

    model.info.aux.duplicate.probability = 1
    model.info.aux.duplicate.distribution = "uniform"
    model.info.aux.duplicate.maximum = 5

    modifier = model.info.aux.record_modifier
    modifier.max_modifications_in_record = 1
    modifier.max_field_modifiers = 1
    modifier.max_record_modifiers = 1

    name_mod = modifier.fields.add()
    name_mod.selection = 0.1
    name_mod.name = "Name"
    prob = name_mod.probabilities
    prob.insert = 0.1  # insert a character in the field
    prob.delete = 0.1  # delete a character in the field
    prob.substitute = 0.1  # substitute a character in the field
    prob.misspell = 0.0  # use the misspelling dictionary
    prob.transpose = 0.1  # transpose adjacent characters
    prob.replace = 0.1  # replace with another value of the same fake
    prob.swap = 0.1  # swap two words/values in the field
    prob.split = 0.1  # split a field
    prob.merge = 0.1  # merge a field
    prob.nullify = 0.1  # convert to null
    prob.fill = 0.1  # fill an empty field with the expected type

    street_mod = modifier.fields.add()
    street_mod.selection = 0.9
    street_mod.name = "Street"
    prob2 = street_mod.probabilities
    prob2.insert = 0.1  # insert a character in the field
    prob2.delete = 0.1  # delete a character in the field
    prob2.substitute = 0.1  # substitute a character in the field
    prob2.misspell = 0.0  # use the misspelling dictionary
    prob2.transpose = 0.1  # transpose adjacent characters
    prob2.replace = 0.1  # replace with another value of the same fake
    prob2.swap = 0.1  # swap two words/values in the field
    prob2.split = 0.1  # split a field
    prob2.merge = 0.1  # merge a field
    prob2.nullify = 0.1  # convert to null
    prob2.fill = 0.1  # fill an empty field with the expected type

    s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
    protorows = []
    for _ in range(10):
        protorows.append(s2.generate())
    print(protorows)
def test_fileio(self):
    """
    Write csv to disk.
    Read back in artemis.
    """
    with tempfile.TemporaryDirectory() as dirpath:
        mb = MenuFactory("csvgen")
        msgmenu = mb.build()
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        store = BaseObjectStore(dirpath, "artemis")

        config = JobConfigFactory(
            "csvio",
            msgmenu,
            jobname="arrowproto",
            generator_type="file",
            filehandler_type="csv",
            nbatches=1,
            num_rows=10000,
            max_file_size=1073741824,
            write_csv=True,
            # input_repo=dirpath,
            input_glob=".csv",
            # output_repo=dirpath
        )
        config.configure()
        config.add_algos(mb.algos)
        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_uuid = store.register_content(config._msg, configinfo).uuid
        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # Define the schema for the data
        g_table = Table()
        g_table.name = "generator"
        g_table.uuid = str(uuid.uuid4())
        g_table.info.schema.name = "csv"
        g_table.info.schema.uuid = str(uuid.uuid4())
        fields = list(itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
        for f in fields:
            field = g_table.info.schema.info.fields.add()
            field.name = f
        tinfo = TableObjectInfo()
        tinfo.fields.extend(fields)
        store.register_content(
            g_table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        )

        generator = GenCsvLikeArrow(
            "generator",
            nbatches=1,
            num_cols=20,
            num_rows=10000,
            suffix=".csv",
            prefix="testio",
            path=dirpath,
            table_id=g_table.uuid,
        )
        generator.gate.meta.parentset_id = g_dataset.uuid
        generator.gate.meta.job_id = str(job_id)
        generator.gate.store = store
        generator.initialize()
        generator.write()

        dataset = store.register_dataset(menu_uuid, config_uuid)
        job_id = store.new_job(dataset.uuid)
        store.save_store()

        job = JobInfo_pb()
        job.name = "arrowproto"
        job.job_id = "example"
        job.store_path = dirpath
        job.store_id = store.store_uuid
        job.store_name = store.store_name
        job.menu_id = menu_uuid
        job.config_id = config_uuid
        job.dataset_id = dataset.uuid
        job.job_id = str(job_id)
        # print(job)
        bow = Artemis(job, loglevel="INFO")
        bow.control()
def test_distributed(self):
    with tempfile.TemporaryDirectory() as dirpath:
        mb = MenuFactory("csvgen")
        msgmenu = mb.build()
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        store = BaseObjectStore(dirpath, "artemis")

        config = JobConfigFactory(
            "csvio",
            msgmenu,
            jobname="arrowproto",
            generator_type="file",
            filehandler_type="csv",
            nbatches=1,
            num_rows=10000,
            max_file_size=1073741824,
            write_csv=True,
            input_glob=".csv",
        )
        config.configure()
        config.add_algos(mb.algos)
        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()

        menu_uuid = store.register_content(msgmenu, menuinfo).uuid
        config_obj = store.register_content(config._msg, configinfo)
        config_uuid = config_obj.uuid
        g_dataset = store.register_dataset()
        store.new_partition(g_dataset.uuid, "generator")
        job_id = store.new_job(g_dataset.uuid)

        # Define the schema for the data
        g_table = Table()
        g_table.name = "generator"
        g_table.uuid = str(uuid.uuid4())
        g_table.info.schema.name = "csv"
        g_table.info.schema.uuid = str(uuid.uuid4())
        fields = list(itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
        for f in fields:
            field = g_table.info.schema.info.fields.add()
            field.name = f
        tinfo = TableObjectInfo()
        tinfo.fields.extend(fields)
        store.register_content(
            g_table,
            tinfo,
            dataset_id=g_dataset.uuid,
            job_id=job_id,
            partition_key="generator",
        )

        generator = GenCsvLikeArrow(
            "generator",
            nbatches=10,
            num_cols=20,
            num_rows=1000,
            suffix=".csv",
            prefix="testio",
            path=dirpath,
            table_id=g_table.uuid,
        )
        generator.gate.meta.parentset_id = g_dataset.uuid
        generator.gate.meta.job_id = str(job_id)
        generator.gate.store = store
        generator.initialize()
        generator.write()

        dataset = store.register_dataset(menu_uuid, config_uuid)
        job_id = store.new_job(dataset.uuid)
        store.save_store()

        #######################################
        inputs = store.list(prefix=g_dataset.uuid, suffix="csv")
        store_name = store._name
        store_uuid = store.store_uuid
        dataset_uuid = dataset.uuid
        ds_results = []
        for datum in inputs:
            job_id = store.new_job(dataset.uuid)
            url_data = urllib.parse.urlparse(datum.address)
            dpath = urllib.parse.unquote(url_data.path)
            print(datum)
            config = Configuration()
            store.get(config_uuid, config)
            for p in config.input.generator.config.properties.property:
                if p.name == "glob":
                    p.value = dpath.split(".")[-2] + ".csv"
            store._put_message(config_uuid, config)
            store.get(config_uuid, config)
            print(config)
            ds_results.append(
                runjob(
                    dirpath,
                    store_name,
                    store_uuid,
                    menu_uuid,
                    config_uuid,
                    dataset_uuid,
                    g_dataset.uuid,
                    job_id,
                )
            )

        results = dask.compute(*ds_results, scheduler="single-threaded")
        # Workaround to fix an error in dataset merging
        store.new_partition(dataset.uuid, "seqY")
        # Update the dataset
        for buf in results:
            ds = DatasetObjectInfo()
            ds.ParseFromString(buf)
            store.update_dataset(dataset.uuid, buf)
        # Save the store and reload
        store.save_store()
        print(store[dataset.uuid].dataset)