Example #1
    def test(self):
        model = Table()
        model.name = "EvolveModel"
        schema = model.info.schema.info
        field = schema.fields.add()
        field.name = "Name"
        field.info.type = "String"
        field.info.length = 10
        field.info.aux.generator.name = "name"
        print(model)
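
The print(model) call renders the protobuf text format of the message. Given the assignments above, the output should look roughly like the following (a sketch; exact field names and ordering depend on the Table/Schema .proto definitions used by these tests):

name: "EvolveModel"
info {
  schema {
    info {
      fields {
        name: "Name"
        info {
          type: "String"
          length: 10
          aux {
            generator {
              name: "name"
            }
          }
        }
      }
    }
  }
}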
Example #2
    def test_rbgen_csv(self):
        """
        This method generates a test
        record batch as a CSV, and displays
        the results to the console.
        """
        # define the schema for the data
        g_table = Table()  # Define a table instance
        g_table.name = "EvolveModel"  # Name the table
        g_table.uuid = str(uuid.uuid4())  # Create a UUID
        schema = g_table.info.schema.info  # Access the schema unit

        field = schema.fields.add()  # Add a field
        field.name = "Name"  # Set the field name
        field.info.type = "float"  # Set the type of the field
        field.info.length = 10  # Set the field length
        field.info.aux.generator.name = "normal"  # Generator used to populate the field (normal distribution)
        """
        # We're adding in the parameters here. These mimic the tests that
        # are found in the ArtemisFaker module itself
        params = field.info.aux.generator.parameters.add()
        params.name = "Mean"
        params.value = 3
        params.type = "int"

        params2 = field.info.aux.generator.parameters.add()
        params2.name = "STD"
        params2.value = 3
        params2.type = "int"
        """
        # Serialize the table message to bytes
        g_table_msg = g_table.SerializeToString()

        # This is the record batch generator
        # All the configurations are set in the
        # generator to produce the output.
        generator = RecordBatchGen(
            "generator",  # Unknown parameter
            nbatches=1,  # Total number of batches that are used
            num_rows=10,  # Total rows to be generated
            file_type=1,  # Encodes the data as csv
            table_id=g_table.uuid,  # Sets the table UUID
            table_msg=g_table_msg,  # Sets the table message
        )

        generator.initialize()  # Configure the generator (header, synthesizer, write method)
        # Data returned as a pyarrow buffer
        # Convert to raw python bytes objects
        # Use io wrapper and read as csv
        for batch in generator:  # The generator yields one pyarrow buffer per batch
            data = batch.to_pybytes()  # Access the batch, convert to bytes
            # Wrap the bytes in a text stream so csv.reader can parse it
            with io.TextIOWrapper(io.BytesIO(data)) as textio:
                for row in csv.reader(textio):
                    print(row)  # Print each parsed row
Example #3
    def test_gen_from_proto(self):

        model = Table()
        model.name = "EvolveModel"
        schema = model.info.schema.info
        field = schema.fields.add()
        field.name = "Name"
        field.info.type = "String"
        field.info.length = 10
        field.info.aux.generator.name = "name"

        s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
        print(s2.generate())
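
Synthesizer.generate() produces one synthetic record per call (the modifier test further below loops over it to build a batch). A minimal follow-on sketch using the s2 instance from above:

rows = [s2.generate() for _ in range(5)]  # five synthetic records
print(rows)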
Example #4
    def test_xduplicates(self):

        model = Table()

        model.info.aux.duplicate.probability = 1
        model.info.aux.duplicate.distribution = "uniform"
        model.info.aux.duplicate.maximum = 1
        schema = model.info.schema.info

        field1 = schema.fields.add()
        field1.name = "record_id"
        field1.info.type = "String"
        field1.info.length = 10

        field2 = schema.fields.add()
        field2.name = "Name"
        field2.info.type = "String"
        field2.info.length = 10
        field2.info.aux.generator.name = "name"

        field3 = schema.fields.add()
        field3.name = "UPC"
        field3.info.type = "Integer"
        field3.info.length = 13
        field3.info.aux.generator.name = "ean"

        parm = field3.info.aux.generator.parameters.add()
        parm.name = "ndigits"
        parm.value = 13
        parm.type = "int"

        s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
        print(s2.generate())
Example #5
    def test_glm_proto(self):
        model = Table()
        schema = model.info.schema.info
        field1 = schema.fields.add()
        field1.name = "Value1"
        field1.info.type = "Float"
        field1.info.length = 10
        field1.info.aux.generator.name = "random_int"
        field1.info.aux.dependent = "Prediction"

        field2 = schema.fields.add()
        field2.name = "Value2"
        field2.info.type = "Float"
        field2.info.length = 10
        field2.info.aux.generator.name = "random_int"
        field2.info.aux.dependent = "Prediction"

        field3 = schema.fields.add()
        field3.name = "Prediction"
        field3.info.type = "Float"
        field3.info.length = 10
        field3.info.aux.generator.name = "glm"

        beta1 = field3.info.aux.generator.parameters.add()
        beta1.name = "beta1"
        beta1.value = 10
        beta1.type = "int"

        beta2 = field3.info.aux.generator.parameters.add()
        beta2.name = "beta2"
        beta2.value = 0.1
        beta2.type = "float"

        beta3 = field3.info.aux.generator.parameters.add()
        beta3.name = "beta3"
        beta3.value = 100
        beta3.type = "int"

        sigma = field3.info.aux.generator.parameters.add()
        sigma.name = "sigma"
        sigma.value = 1
        sigma.type = "int"

        var1 = field3.info.aux.generator.parameters.add()
        var1.name = "Value1"
        var1.type = "Field"
        var1.variable.CopyFrom(field1)

        var2 = field3.info.aux.generator.parameters.add()
        var2.name = "Value2"
        var2.type = "Field"
        var2.variable.CopyFrom(field2)

        s2 = Synthesizer(model, "en_CA")
        print(s2.generate())
Example #6
    def test_table(self):
        table = Table()
        table.name = "Attachment"
        # table.uuid = str(uuid.uuid4())

        schema = table.info.schema.info
        schema.aux.frequency = 3
        schema.aux.description = "This table is for ..."

        field1 = schema.fields.add()
        field1.name = "record_id"
        field1.info.type = "String"
        field1.info.length = 10

        field2 = schema.fields.add()
        field2.name = "field2"
        field2.info.type = "String"
        field2.info.length = 20
        aux2 = field2.info.aux
        aux2.generator.name = "name"
        aux2.meta["Bool1"].bool_val = True
        aux2.meta["Bool2"].bool_val = False
        aux2.meta["String1"].string_val = "System"
        aux2.description = "Blah"

        field3 = schema.fields.add()
        field3.name = "field3"
        field3.info.type = "String"
        field3.info.length = 24
        aux3 = field3.info.aux
        aux3.generator.name = "province"
        code = aux3.codeset
        code.name = "Codeset Name"
        code.version = "2016VR1"
        value1 = code.codevalues.add()
        value1.code = "1A"
        value1.description = "what 1a stands for"
        value2 = code.codevalues.add()
        value2.code = "2A"
        value2.description = "What 2a stands for"
        value2.lable = "lable for 2a"
        aux3.meta["Bool1"].bool_val = True
        aux3.meta["Bool2"].bool_val = True
        aux3.description = "Blah blah blah"
        aux3.meta["String1"].string_val = "Rule for variable population"

        tem2 = table.SerializeToString()
        print(tem2)
        table2 = Table()
        table2.ParseFromString(tem2)
        print(table2)
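
print(tem2) shows raw serialized bytes, which are hard to read. A small readability tweak, assuming Table is a standard protobuf message (its SerializeToString/ParseFromString/CopyFrom API suggests it is): pretty-print the round-tripped message with the protobuf text-format helper.

from google.protobuf import text_format

# Render the round-tripped message as human-readable text instead of raw bytes
print(text_format.MessageToString(table2))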
Example #7
class RecordBatchGen:
    """
    Class is a record-generator
    """
    def __init__(self, name, **kwargs):
        Logger.configure(self, **kwargs)
        self.__name = name
        self.properties = Properties()
        options = dict(RecordBatchGenOptions())
        options.update(kwargs)
        for key in options:
            self.properties.add_property(key, options[key])

        if hasattr(self.properties, "seed"):
            self.rnd = check_random_state(seed=self.properties.seed)
        else:
            self.rnd = check_random_state(seed=None)

        if hasattr(self.properties, "nbatches"):
            self._nbatches = self.properties.nbatches
            self._batch_iter = iter(range(self.properties.nbatches))
        else:
            self.__logger.warning("Number of batches not defined")

        self._batchidx = 0

        self.table_id = self.properties.table_id

        self.table = Table()
        if hasattr(self.properties, "table_msg"):
            self.table.ParseFromString(self.properties.table_msg)
        else:
            self.__logger.warning("No table message to deserialize")

        self.num_rows = self.properties.num_rows
        self.linesep = self.properties.linesep
        # self.header = self.properties.header
        self.nsamples = self.properties.nsamples
        self.file_type = self.properties.file_type
        self.codec = self.properties.codec

        self.synthesizer = None
        self.num_cols = None
        self.write_batch = None
        self.header = None

        # FWF
        self.pos_char = {
            "0": "{",
            "1": "A",
            "2": "B",
            "3": "C",
            "4": "D",
            "5": "E",
            "6": "F",
            "7": "G",
            "8": "H",
            "9": "I",
        }
        self.neg_char = {
            "0": "}",
            "1": "J",
            "2": "K",
            "3": "L",
            "4": "M",
            "5": "N",
            "6": "O",
            "7": "P",
            "8": "Q",
            "9": "R",
        }
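        # Assumption based on the mappings above: these tables implement a
        # signed-overpunch convention for fixed-width output. The final digit
        # of an integer is replaced by a character that encodes both the digit
        # and the sign, so no separate sign column is needed,
        # e.g. 125 -> "12E" and -125 -> "12N" (see fwf_encode_row below).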
        # header = ''
        self.header_offset = 0
        self.footer = ""
        self.footer_size = 0

        self.__logger.info("Initialized %s", self.__class__.__name__)
        self.__logger.info("%s properties: %s", self.__class__.__name__,
                           self.properties)
        print("Initialize RecordBatchGen")

    @property
    def random_state(self):
        return self._builtin_generator.rnd

    @property
    def name(self):
        """
        Algorithm name
        """
        return self.__name

    def reset(self):
        if hasattr(self, "_nbatches"):
            self._batch_iter = iter(range(self._nbatches))
        else:
            self.__logger.warning("Override reset in concrete class")

    def to_msg(self):
        message = Algo_pb()
        message.name = self.name
        message.klass = self.__class__.__name__
        message.module = self.__module__
        message.properties.CopyFrom(self.properties.to_msg())
        return message

    @staticmethod
    def from_msg(logger, msg):
        logger.info("Loading Algo from msg %s", msg.name)
        try:
            module = importlib.import_module(msg.module)
        except ImportError:
            logger.error("Unable to load module %s", msg.module)
            raise
        except Exception as e:
            logger.error("Unknow error loading module: %s" % e)
            raise
        try:
            class_ = getattr(module, msg.klass)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, msg.klass))
            raise
        except Exception as e:
            logger.error("Reason: %s" % e)
            raise

        properties = Properties.from_msg(msg.properties)
        logger.debug(pformat(properties))

        # Update the logging level of
        # algorithms if loglevel not set
        # Ensures user-defined algos get the artemis level logging
        if "loglevel" not in properties:
            properties["loglevel"] = logger.getEffectiveLevel()

        try:
            instance = class_(msg.name, **properties)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, "properties"))
            raise
        except Exception as e:
            logger.error("%s: Cannot initialize %s" % e)
            raise

        return instance

    @property
    def num_batches(self):
        return self._nbatches

    @num_batches.setter
    def num_batches(self, n):
        self._nbatches = n

    def __iter__(self):
        return self

    def initialize(self):
        # Behaves like a switch case controlling the writing method
        method_switch = {
            1: self.write_batch_csv,
            2: self.write_batch_fwf,
            5: self.write_batch_arrow
        }

        self.__logger.info("RecordBatchGenerator")  # Get the logger info
        # Set the number of fields programmatically
        self.num_cols = len(self.table.info.schema.info.fields)
        names = []  # Field name array
        for field in self.table.info.schema.info.fields:  # Iterate over the schema fields
            names.append(field.name)  # Push to array
        self.header = names  # Set the header as the names

        # Seed the synthesizer if a seed property was provided
        if hasattr(self.properties, "seed"):
            self.synthesizer = Synthesizer(
                self.table, idx=0, seed=self.properties.seed)
        else:
            # Otherwise, initialize without a seed
            self.synthesizer = Synthesizer(self.table, idx=0)

        try:
            # Handle specific filetypes
            self.write_batch = method_switch[self.file_type]
        except KeyError:
            # Alert that this process did not execute
            self.__logger.info("RecordBatchGenerator: Filetype not 1, 2, or 5")

    def chunk(self):
        """
        Allow for concurrent generate during write
        """
        for _ in range(self.num_rows):
            try:
                yield tuple(self.synthesizer.generate())
            except TypeError:
                self.__logger.error("Generator function must return list")
                raise
            except Exception as error:
                self.__logger.error("Unknown error in chunk")
                self.__logger.error(error)

    def fwf_encode_row(self, row):
        record = ""
        #  Create data of specific unit types.
        fields = list(self.table.info.schema.fields)
        for i, dpoint in enumerate(row):
            # encode
            # pad to field width
            # append to record
            field_schema = fields[i]

            dpoint = str(dpoint)
            # signed integers require encoding
            # all other fields expected to be string-like
            if field_schema["utype"] == "int":
                if dpoint.startswith("-"):
                    # Convert negative integers: drop the sign and
                    # overpunch the final digit.
                    dpoint = dpoint.lstrip("-")
                    dpoint = dpoint[:-1] + self.neg_char[dpoint[-1]]
                else:
                    # Convert positive integers: overpunch the final digit.
                    dpoint = dpoint[:-1] + self.pos_char[dpoint[-1]]

            # ensure generated field is within schema length
            dpoint = dpoint[:field_schema["length"]]

            # pad up to required length
            if field_schema["utype"] == "int" or field_schema[
                    "utype"] == "uint":
                dpoint = ("0" *
                          (field_schema["length"] - len(dpoint))) + dpoint
            else:
                dpoint = dpoint + (" " *
                                   (field_schema["length"] - len(dpoint)))

            # append field to record
            record += dpoint

        return record

    def write_batch_fwf(self):
        """
        Generate a batch of records
        convert rows to fixed width fields
        encode to ascii format in bytes
        """
        fwf = io.StringIO()
        for row in list(self.chunk()):
            if len(row) != len(self.header):
                raise ValueError("Generated row length does not match header")
            fwf.write(self.fwf_encode_row(row))

        fwf = fwf.getvalue().encode(self.codec)
        return pa.py_buffer(fwf)

    def write_batch_csv(self):
        """
        Generate batch of records
        encode to csv in bytes
        """
        csv = io.StringIO()
        if self.header:
            csv.write(",".join(self.header))
            csv.write(self.linesep)
        for row in list(self.chunk()):
            csv.write(",".join(map(str, row)))
            csv.write(self.linesep)

        csv = csv.getvalue().encode(self.codec)
        return pa.py_buffer(csv)

    def write_batch_arrow(self):
        """
        Generate a batch of records
        convert to pyarrow arrays
        convert to RecordBatch
        """
        data = list(self.chunk())
        data = zip(*data)
        arrays = []
        for i, column in enumerate(data):
            arrays.append(pa.array(column, self.pa_schema[i].type))

        batch = pa.RecordBatch.from_arrays(arrays, names=self.pa_schema.names)
        return batch

    def write_csv(self):
        """
        Write n chunks to csv
        Write file to disk
        """
        csv = b""

        while (len(csv) // 1024**2) < self.maxfilesize:
            csv += self.write_batch_csv()
            if self.checkcount():
                break
        return csv

    def write_fwf(self):
        """
        Write fwf with all records
        """
        fwf = b""

        while (len(fwf) // 1024**2) < self.maxfilesize:
            fwf += self.write_batch_fwf()
            if self.checkcount():
                break
        return fwf

    def write_recordbatchfile(self):
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)

        batches_size = 0
        while (batches_size // 1024**2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break

        writer.close()
        buf = sink.getvalue()
        return buf

    def __next__(self):
        next(self._batch_iter)
        self.__logger.info("%s: Generating datum " % (self.__class__.__name__))
        data = self.write_batch()
        self._batchidx += 1
        return data
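
A minimal driving sketch, assuming the g_table message and constructor arguments from the CSV test above: the class is its own iterator, __next__ stops via the underlying range iterator, and reset() rewinds it for another pass.

gen = RecordBatchGen(
    "generator",
    nbatches=2,
    num_rows=5,
    file_type=1,  # CSV writer, per the method_switch in initialize()
    table_id=g_table.uuid,
    table_msg=g_table.SerializeToString(),
)
gen.initialize()
buffers = list(gen)  # one pyarrow buffer per batch; StopIteration ends the loop
gen.reset()  # rewind the batch iterator to reuse the same instance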
Example #8
    def test_xmodifer(self):

        model = Table()
        schema = model.info.schema.info

        field1 = schema.fields.add()
        field1.name = "record_id"
        field1.info.type = "String"
        field1.info.length = 10

        field2 = schema.fields.add()
        field2.name = "Name"
        field2.info.type = "String"
        field2.info.length = 10
        field2.info.aux.generator.name = "name"

        field3 = schema.fields.add()
        field3.name = "SIN"
        field3.info.type = "String"
        field3.info.length = 10
        field3.info.aux.generator.name = "ssn"

        field4 = schema.fields.add()
        field4.name = "StreetNumber"
        field4.info.type = "String"
        field4.info.length = 40
        field4.info.aux.generator.name = "building_number"

        field5 = schema.fields.add()
        field5.name = "Street"
        field5.info.type = "String"
        field5.info.length = 40
        field5.info.aux.generator.name = "street_name"

        field6 = schema.fields.add()
        field6.name = "City"
        field6.info.type = "String"
        field6.info.length = 40
        field6.info.aux.generator.name = "city"

        field7 = schema.fields.add()
        field7.name = "Province"
        field7.info.type = "String"
        field7.info.length = 40
        field7.info.aux.generator.name = "province"

        field8 = schema.fields.add()
        field8.name = "PostalCode"
        field8.info.type = "String"
        field8.info.length = 40
        field8.info.aux.generator.name = "postcode"

        field9 = schema.fields.add()
        field9.name = "DOB"
        field9.info.type = "DateTime"
        field9.info.length = 40
        field9.info.aux.generator.name = "date"

        field10 = schema.fields.add()
        field10.name = "PhoneNum"
        field10.info.type = "String"
        field10.info.length = 11
        field10.info.aux.generator.name = "phone_number"

        model.info.aux.duplicate.probability = 1
        model.info.aux.duplicate.distribution = "uniform"
        model.info.aux.duplicate.maximum = 5

        modifier = model.info.aux.record_modifier

        modifier.max_modifications_in_record = 1
        modifier.max_field_modifiers = 1
        modifier.max_record_modifiers = 1

        name_mod = modifier.fields.add()
        name_mod.selection = 0.1
        name_mod.name = "Name"
        prob = name_mod.probabilities

        prob.insert = 0.1  # insert character in field
        prob.delete = 0.1  # delete character in field
        prob.substitute = 0.1  # substitute character in field
        prob.misspell = 0.0  # use misspelling dictionary
        prob.transpose = 0.1  # transpose adjacent characters
        prob.replace = 0.1  # replace with another value of same fake
        prob.swap = 0.1  # swap two words/values in field
        prob.split = 0.1  # split a field
        prob.merge = 0.1  # merge a field
        prob.nullify = 0.1  # convert to null
        prob.fill = 0.1  # fill empty field with expected type

        street_mod = modifier.fields.add()
        street_mod.selection = 0.9
        street_mod.name = "Street"
        prob2 = street_mod.probabilities

        prob2.insert = 0.1  # insert character in field
        prob2.delete = 0.1  # delete character in field
        prob2.substitute = 0.1  # substitute character in field
        prob2.misspell = 0.0  # use misspelling dictionary
        prob2.transpose = 0.1  # transpose adjacent characters
        prob2.replace = 0.1  # replace with another value of same fake
        prob2.swap = 0.1  # swap two words/values in field
        prob2.split = 0.1  # split a field
        prob2.merge = 0.1  # merge a field
        prob2.nullify = 0.1  # convert to null
        prob2.fill = 0.1  # fill empty field with expected type
        s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
        protorows = []
        for _ in range(10):
            protorows.append(s2.generate())
        print(protorows)

    def test_fileio(self):
        """
        Write csv to disk
        Read back in artemis
        """
        with tempfile.TemporaryDirectory() as dirpath:
            mb = MenuFactory("csvgen")
            msgmenu = mb.build()
            menuinfo = MenuObjectInfo()
            menuinfo.created.GetCurrentTime()

            store = BaseObjectStore(dirpath, "artemis")

            config = JobConfigFactory(
                "csvio",
                msgmenu,
                jobname="arrowproto",
                generator_type="file",
                filehandler_type="csv",
                nbatches=1,
                num_rows=10000,
                max_file_size=1073741824,
                write_csv=True,
                # input_repo=dirpath,
                input_glob=".csv",
                # output_repo=dirpath
            )

            config.configure()
            config.add_algos(mb.algos)
            configinfo = ConfigObjectInfo()
            configinfo.created.GetCurrentTime()

            menu_uuid = store.register_content(msgmenu, menuinfo).uuid
            config_uuid = store.register_content(config._msg, configinfo).uuid

            g_dataset = store.register_dataset()
            store.new_partition(g_dataset.uuid, "generator")
            job_id = store.new_job(g_dataset.uuid)

            # define the schema for the data
            g_table = Table()
            g_table.name = "generator"
            g_table.uuid = str(uuid.uuid4())
            g_table.info.schema.name = "csv"
            g_table.info.schema.uuid = str(uuid.uuid4())

            fields = list(
                itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
            for f in fields:
                field = g_table.info.schema.info.fields.add()
                field.name = f

            tinfo = TableObjectInfo()
            tinfo.fields.extend(fields)
            store.register_content(
                g_table,
                tinfo,
                dataset_id=g_dataset.uuid,
                job_id=job_id,
                partition_key="generator",
            )

            generator = GenCsvLikeArrow(
                "generator",
                nbatches=1,
                num_cols=20,
                num_rows=10000,
                suffix=".csv",
                prefix="testio",
                path=dirpath,
                table_id=g_table.uuid,
            )

            generator.gate.meta.parentset_id = g_dataset.uuid
            generator.gate.meta.job_id = str(job_id)
            generator.gate.store = store
            generator.initialize()
            generator.write()

            dataset = store.register_dataset(menu_uuid, config_uuid)
            job_id = store.new_job(dataset.uuid)
            store.save_store()

            job = JobInfo_pb()
            job.name = "arrowproto"
            job.job_id = "example"
            job.store_path = dirpath
            job.store_id = store.store_uuid
            job.store_name = store.store_name
            job.menu_id = menu_uuid
            job.config_id = config_uuid
            job.dataset_id = dataset.uuid
            job.job_id = str(job_id)
            # print(job)
            bow = Artemis(job, loglevel="INFO")
            bow.control()

    def test_distributed(self):
        with tempfile.TemporaryDirectory() as dirpath:
            mb = MenuFactory("csvgen")
            msgmenu = mb.build()
            menuinfo = MenuObjectInfo()
            menuinfo.created.GetCurrentTime()

            store = BaseObjectStore(dirpath, "artemis")

            config = JobConfigFactory(
                "csvio",
                msgmenu,
                jobname="arrowproto",
                generator_type="file",
                filehandler_type="csv",
                nbatches=1,
                num_rows=10000,
                max_file_size=1073741824,
                write_csv=True,
                input_glob=".csv",
            )

            config.configure()
            config.add_algos(mb.algos)
            configinfo = ConfigObjectInfo()
            configinfo.created.GetCurrentTime()

            menu_uuid = store.register_content(msgmenu, menuinfo).uuid
            config_obj = store.register_content(config._msg, configinfo)
            config_uuid = config_obj.uuid

            g_dataset = store.register_dataset()
            store.new_partition(g_dataset.uuid, "generator")
            job_id = store.new_job(g_dataset.uuid)

            # define the schema for the data
            g_table = Table()
            g_table.name = "generator"
            g_table.uuid = str(uuid.uuid4())
            g_table.info.schema.name = "csv"
            g_table.info.schema.uuid = str(uuid.uuid4())

            fields = list(
                itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
            for f in fields:
                field = g_table.info.schema.info.fields.add()
                field.name = f

            tinfo = TableObjectInfo()
            tinfo.fields.extend(fields)
            store.register_content(
                g_table,
                tinfo,
                dataset_id=g_dataset.uuid,
                job_id=job_id,
                partition_key="generator",
            )

            generator = GenCsvLikeArrow(
                "generator",
                nbatches=10,
                num_cols=20,
                num_rows=1000,
                suffix=".csv",
                prefix="testio",
                path=dirpath,
                table_id=g_table.uuid,
            )

            generator.gate.meta.parentset_id = g_dataset.uuid
            generator.gate.meta.job_id = str(job_id)
            generator.gate.store = store
            generator.initialize()
            generator.write()

            dataset = store.register_dataset(menu_uuid, config_uuid)
            job_id = store.new_job(dataset.uuid)
            store.save_store()

            #######################################
            inputs = store.list(prefix=g_dataset.uuid, suffix="csv")

            store_name = store._name
            store_uuid = store.store_uuid
            dataset_uuid = dataset.uuid
            ds_results = []
            for datum in inputs:
                job_id = store.new_job(dataset.uuid)
                url_data = urllib.parse.urlparse(datum.address)
                dpath = urllib.parse.unquote(url_data.path)
                print(datum)
                config = Configuration()
                store.get(config_uuid, config)
                for p in config.input.generator.config.properties.property:
                    if p.name == "glob":
                        p.value = dpath.split(".")[-2] + ".csv"
                store._put_message(config_uuid, config)
                store.get(config_uuid, config)
                print(config)
                ds_results.append(
                    runjob(
                        dirpath,
                        store_name,
                        store_uuid,
                        menu_uuid,
                        config_uuid,
                        dataset_uuid,
                        g_dataset.uuid,
                        job_id,
                    ))

            results = dask.compute(*ds_results, scheduler="single-threaded")
            # Workaround to fix error in dataset merging
            store.new_partition(dataset.uuid, "seqY")
            # Update the dataset
            for buf in results:
                ds = DatasetObjectInfo()
                ds.ParseFromString(buf)
                store.update_dataset(dataset.uuid, buf)
            # Save the store, reload
            store.save_store()

            print(store[dataset.uuid].dataset)