コード例 #1
0
    def initialize(self):
        """Prepare the generator.

        Derives the header and column count from the table schema, builds
        the synthesizer (seeded when a seed property exists), and binds the
        batch-writing method matching the configured file type.
        """
        self.__logger.info("RecordBatchGenerator")

        # Header and column count come straight from the schema fields.
        schema_fields = self.table.info.schema.info.fields
        self.num_cols = len(schema_fields)
        self.header = [field.name for field in schema_fields]

        # Seed the synthesizer only when a seed property was supplied.
        if hasattr(self.properties, "seed"):
            self.synthesizer = Synthesizer(
                self.table, idx=0, seed=self.properties.seed)
        else:
            self.synthesizer = Synthesizer(self.table, idx=0)

        # Dispatch on file type: 1 = csv, 2 = fixed-width, 5 = arrow.
        writer = {
            1: self.write_batch_csv,
            2: self.write_batch_fwf,
            5: self.write_batch_arrow,
        }.get(self.file_type)
        if writer is None:
            # Alert that no writer was selected for this file type
            self.__logger.info("RecordBatchGenerator: Filetype not 1, 2, or 5")
        else:
            self.write_batch = writer
コード例 #2
0
    def test_xduplicates(self):
        """Exercise duplicate-record generation with a seeded synthesizer."""
        model = Table()

        # Always duplicate (probability 1), at most one duplicate per record.
        dup = model.info.aux.duplicate
        dup.probability = 1
        dup.distribution = "uniform"
        dup.maximum = 1

        schema = model.info.schema.info
        # (name, type, length, generator) — generator None means plain field.
        for fname, ftype, flen, generator in (
                ("record_id", "String", 10, None),
                ("Name", "String", 10, "name"),
                ("UPC", "Integer", 13, "ean")):
            fld = schema.fields.add()
            fld.name = fname
            fld.info.type = ftype
            fld.info.length = flen
            if generator is not None:
                fld.info.aux.generator.name = generator

        # The EAN generator on the last field ("UPC") takes a digit count.
        parm = fld.info.aux.generator.parameters.add()
        parm.name = "ndigits"
        parm.value = 13
        parm.type = "int"

        synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
        print(synth.generate())
コード例 #3
0
    def test_glm_proto(self):
        """Build a GLM-driven "Prediction" field fed by two input fields."""
        model = Table()
        schema = model.info.schema.info

        # Two independent float inputs that the prediction depends on.
        inputs = {}
        for fname in ("Value1", "Value2"):
            fld = schema.fields.add()
            fld.name = fname
            fld.info.type = "Float"
            fld.info.length = 10
            fld.info.aux.generator.name = "random_int"
            fld.info.aux.dependent = "Prediction"
            inputs[fname] = fld

        pred = schema.fields.add()
        pred.name = "Prediction"
        pred.info.type = "Float"
        pred.info.length = 10
        pred.info.aux.generator.name = "glm"

        # Scalar GLM coefficients; insertion order is preserved in the
        # repeated parameters field.
        for pname, pvalue, ptype in (("beta1", 10, "int"),
                                     ("beta2", 0.1, "float"),
                                     ("beta3", 100, "int"),
                                     ("sigma", 1, "int")):
            coeff = pred.info.aux.generator.parameters.add()
            coeff.name = pname
            coeff.value = pvalue
            coeff.type = ptype

        # Field-valued parameters referencing the input field definitions.
        for fname in ("Value1", "Value2"):
            var = pred.info.aux.generator.parameters.add()
            var.name = fname
            var.type = "Field"
            var.variable.CopyFrom(inputs[fname])

        synth = Synthesizer(model, "en_CA")
        print(synth.generate())
コード例 #4
0
    def test_gen_from_proto(self):
        """Round-trip a minimal one-field table through the synthesizer."""
        model = Table()
        model.name = "EvolveModel"

        name_field = model.info.schema.info.fields.add()
        name_field.name = "Name"
        name_field.info.type = "String"
        name_field.info.length = 10
        name_field.info.aux.generator.name = "name"

        synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
        print(synth.generate())
コード例 #5
0
    def initialize(self):
        """Prepare the generator.

        Caches the header/column count from the table schema, builds the
        "en_CA"-locale synthesizer (seeded when a seed property exists) and
        binds the batch writer for the configured file type
        (1 = csv, 2 = fixed-width, 5 = arrow).
        """
        self.__logger.info("RecordBatchGenerator")
        schema_fields = self.table.info.schema.info.fields
        self.num_cols = len(schema_fields)
        self.header = [field.name for field in schema_fields]

        if hasattr(self.properties, "seed"):
            self.synthesizer = Synthesizer(self.table,
                                           "en_CA",
                                           idx=0,
                                           seed=self.properties.seed)
        else:
            self.synthesizer = Synthesizer(self.table, "en_CA", idx=0)

        if self.file_type == 1:
            self.write_batch = self.write_batch_csv
        elif self.file_type == 2:
            self.write_batch = self.write_batch_fwf
        elif self.file_type == 5:
            self.write_batch = self.write_batch_arrow
        else:
            # Fix: previously an unsupported file_type fell through silently,
            # leaving self.write_batch as None and failing only later when
            # the writer was invoked.
            self.__logger.warning(
                "RecordBatchGenerator: Filetype not 1, 2, or 5")
コード例 #6
0
class RecordBatchGen:
    """
    Generator of synthetic record batches.

    Drives a Synthesizer built from a deserialized Table message and acts
    as an iterator: each ``next()`` call produces one batch encoded as csv
    bytes, fixed-width-field bytes, or an Arrow RecordBatch, depending on
    the configured ``file_type`` (1 = csv, 2 = fwf, 5 = arrow).
    """
    def __init__(self, name, **kwargs):
        # Configure logging first so later warnings can be emitted.
        Logger.configure(self, **kwargs)
        self.__name = name
        self.properties = Properties()
        # Defaults from RecordBatchGenOptions, overridden by caller kwargs.
        options = dict(RecordBatchGenOptions())
        options.update(kwargs)
        for key in options:
            self.properties.add_property(key, options[key])

        # Seed the random state when a seed was supplied.
        if hasattr(self.properties, "seed"):
            self.rnd = check_random_state(seed=self.properties.seed)
        else:
            self.rnd = check_random_state(seed=None)

        if hasattr(self.properties, "nbatches"):
            self._nbatches = self.properties.nbatches
            self._batch_iter = iter(range(self.properties.nbatches))
        else:
            self.__logger.warning("Number of batches not defined")

        self._batchidx = 0

        self.table_id = self.properties.table_id

        # Deserialize the table model, if one was passed in the properties.
        self.table = Table()
        if hasattr(self.properties, "table_msg"):
            self.table.ParseFromString(self.properties.table_msg)
        else:
            self.__logger.warning("No table message to deserialize")

        self.num_rows = self.properties.num_rows
        self.linesep = self.properties.linesep
        # self.header = self.properties.header
        self.nsamples = self.properties.nsamples
        self.file_type = self.properties.file_type
        self.codec = self.properties.codec

        # Filled in by initialize().
        self.synthesizer = None
        self.num_cols = None
        self.write_batch = None
        self.header = None

        # FWF signed-overpunch lookup tables: the final digit of a signed
        # integer is replaced by a character that encodes digit + sign.
        self.pos_char = {
            "0": "{",
            "1": "A",
            "2": "B",
            "3": "C",
            "4": "D",
            "5": "E",
            "6": "F",
            "7": "G",
            "8": "H",
            "9": "I",
        }
        self.neg_char = {
            "0": "}",
            "1": "J",
            "2": "K",
            "3": "L",
            "4": "M",
            "5": "N",
            "6": "O",
            "7": "P",
            "8": "Q",
            "9": "R",
        }
        # header = ''
        self.header_offset = 0
        self.footer = ""
        self.footer_size = 0

        self.__logger.info("Initialized %s", self.__class__.__name__)
        self.__logger.info("%s properties: %s", self.__class__.__name__,
                           self.properties)
        print("Initialize RecordBatchGen")

    @property
    def random_state(self):
        # NOTE(review): self._builtin_generator is never assigned in this
        # class — presumably set by a subclass or mixin; confirm before use.
        return self._builtin_generator.rnd

    @property
    def name(self):
        """
        Algorithm name
        """
        return self.__name

    def reset(self):
        """Rewind the batch iterator so generation can start over."""
        if hasattr(self, "_nbatches"):
            self._batch_iter = iter(range(self._nbatches))
        else:
            self.__logger.warning("Override reset in concrete class")

    def to_msg(self):
        """Serialize this algo's identity and properties to an Algo_pb."""
        message = Algo_pb()
        message.name = self.name
        message.klass = self.__class__.__name__
        message.module = self.__module__
        message.properties.CopyFrom(self.properties.to_msg())
        return message

    @staticmethod
    def from_msg(logger, msg):
        """Reconstruct an algo instance from an Algo_pb message.

        Dynamically imports ``msg.module``, resolves ``msg.klass`` and
        instantiates it with the deserialized properties.

        Raises whatever import/attribute/instantiation error occurred,
        after logging it.
        """
        logger.info("Loading Algo from msg %s", msg.name)
        try:
            module = importlib.import_module(msg.module)
        except ImportError:
            logger.error("Unable to load module %s", msg.module)
            raise
        except Exception as e:
            # Fix: "Unknow" typo in the log message.
            logger.error("Unknown error loading module: %s", e)
            raise
        try:
            class_ = getattr(module, msg.klass)
        except AttributeError:
            logger.error("%s: missing attribute %s", msg.name, msg.klass)
            raise
        except Exception as e:
            logger.error("Reason: %s", e)
            raise

        properties = Properties.from_msg(msg.properties)
        logger.debug(pformat(properties))

        # Update the logging level of
        # algorithms if loglevel not set
        # Ensures user-defined algos get the artemis level logging
        if "loglevel" not in properties:
            properties["loglevel"] = logger.getEffectiveLevel()

        try:
            instance = class_(msg.name, **properties)
        except AttributeError:
            logger.error("%s: missing attribute %s", msg.name, "properties")
            raise
        except Exception as e:
            # Fix: the original used "%s ... %s" % e (two placeholders, one
            # argument), which raised TypeError while handling the error.
            logger.error("%s: Cannot initialize %s", msg.name, e)
            raise

        return instance

    @property
    def num_batches(self):
        """Number of batches this generator will yield."""
        return self._nbatches

    @num_batches.setter
    def num_batches(self, n):
        self._nbatches = n

    def __iter__(self):
        return self

    def initialize(self):
        """Derive header info from the schema, build the synthesizer and
        select the batch-writing method for the configured file type."""
        # Behaves like a switch case controlling the writing method
        method_switch = {
            1: self.write_batch_csv,
            2: self.write_batch_fwf,
            5: self.write_batch_arrow
        }

        self.__logger.info("RecordBatchGenerator")
        # Set the number of fields programmatically
        self.num_cols = len(self.table.info.schema.info.fields)
        self.header = [
            field.name for field in self.table.info.schema.info.fields
        ]

        # Seed the synthesizer only when a seed property was supplied.
        # NOTE(review): unlike other initialize() variants in this file, no
        # locale ("en_CA") is passed here — confirm the intended default.
        if hasattr(self.properties, "seed"):
            self.synthesizer = Synthesizer(
                self.table,
                idx=0,
                seed=self.properties.seed)
        else:
            self.synthesizer = Synthesizer(self.table, idx=0)

        try:
            # Handle specific filetypes
            self.write_batch = method_switch[self.file_type]
        except KeyError:
            # Alert that this process did not execute
            self.__logger.info("RecordBatchGenerator: Filetype not 1, 2, or 5")

    def chunk(self):
        """
        Allow for concurrent generate during write

        Yields one generated row (as a tuple) per requested row.
        """
        for _ in range(self.num_rows):
            try:
                yield tuple(self.synthesizer.generate())
            except TypeError:
                self.__logger.error("Generator function must return list")
                raise
            except Exception as error:
                self.__logger.error("Unknown error in chunk")
                self.__logger.error(error)

    def fwf_encode_row(self, row):
        """Encode one row as a fixed-width-field record string.

        Signed integers get their trailing digit replaced by an overpunch
        character (pos_char/neg_char); every field is truncated and padded
        to its schema length (zero-padded for ints, space-padded otherwise).
        """
        record = ""
        #  Create data of specific unit types.
        # NOTE(review): other code in this file reads
        # table.info.schema.info.fields and proto fields are not normally
        # dict-subscriptable — confirm this access path against the schema.
        fields = list(self.table.info.schema.fields)
        for i, dpoint in enumerate(row):
            # encode
            # pad to field width
            # append to record
            field_schema = fields[i]

            dpoint = str(dpoint)
            # signed integers require encoding
            # all other fields expected to be string-like
            if field_schema["utype"] == "int":
                # Fix: the original compared the str against 0 (TypeError)
                # and used str.replace, which rewrote every occurrence of
                # the last digit instead of just the trailing position.
                if dpoint.startswith("-"):
                    # Convert negative integers: drop the sign, overpunch
                    # the final digit.
                    dpoint = dpoint[1:]
                    dpoint = dpoint[:-1] + self.neg_char[dpoint[-1]]
                else:
                    # Convert positive integers.
                    dpoint = dpoint[:-1] + self.pos_char[dpoint[-1]]

            # ensure generated field is within schema length
            dpoint = dpoint[:field_schema["length"]]

            # pad up to required length
            if field_schema["utype"] == "int" or field_schema[
                    "utype"] == "uint":
                dpoint = ("0" *
                          (field_schema["length"] - len(dpoint))) + dpoint
            else:
                dpoint = dpoint + (" " *
                                   (field_schema["length"] - len(dpoint)))

            # append field to record
            record += dpoint

        return record

    def write_batch_fwf(self):
        """
        Generate a batch of records
        convert rows to fixed width fields
        encode to ascii format in bytes
        """
        fwf = io.StringIO()
        for row in self.chunk():
            if len(row) != len(self.header):
                raise ValueError("row length does not match header length")
            fwf.write(self.fwf_encode_row(row))

        fwf = fwf.getvalue().encode(self.codec)
        return pa.py_buffer(fwf)

    def write_batch_csv(self):
        """
        Generate batch of records
        encode to csv in bytes
        """
        csv = io.StringIO()
        if self.header:
            csv.write(",".join(self.header))
            csv.write(self.linesep)
        for row in self.chunk():
            csv.write(",".join(map(str, row)))
            csv.write(self.linesep)

        csv = csv.getvalue().encode(self.codec)
        return pa.py_buffer(csv)

    def write_batch_arrow(self):
        """
        Generate a batch of records
        convert to pyarrow arrays
        convert to RecordBatch
        """
        # NOTE(review): self.pa_schema is never assigned in this class —
        # presumably provided by a subclass; confirm before use.
        data = list(self.chunk())
        data = zip(*data)  # transpose rows -> columns
        arrays = []
        for i, column in enumerate(data):
            arrays.append(pa.array(column, self.pa_schema[i].type))

        batch = pa.RecordBatch.from_arrays(arrays, names=self.pa_schema.names)
        return batch

    def write_csv(self):
        """
        Write n chunks to csv
        Write file to disk

        Accumulates batches until self.maxfilesize (MiB) is reached or
        checkcount() signals completion.
        """
        csv = b""

        while (len(csv) // 1024**2) < self.maxfilesize:
            csv += self.write_batch_csv()
            if self.checkcount():
                break
        return csv

    def write_fwf(self):
        """
        Write fwf with all records

        Accumulates batches until self.maxfilesize (MiB) is reached or
        checkcount() signals completion.
        """
        fwf = b""

        while (len(fwf) // 1024**2) < self.maxfilesize:
            fwf += self.write_batch_fwf()
            if self.checkcount():
                break
        return fwf

    def write_recordbatchfile(self):
        """Write Arrow batches to an in-memory RecordBatch file buffer."""
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)

        batches_size = 0
        while (batches_size // 1024**2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break

        writer.close()
        buf = sink.getvalue()
        return buf

    def __next__(self):
        """Produce the next batch; raises StopIteration when exhausted."""
        next(self._batch_iter)
        self.__logger.info("%s: Generating datum ", self.__class__.__name__)
        data = self.write_batch()
        self._batchidx += 1
        return data
コード例 #7
0
    def test_xmodifer(self):
        """Exercise record duplication combined with per-field modifiers."""
        model = Table()
        schema = model.info.schema.info

        # (name, type, length, generator) — generator None means plain field.
        field_specs = (
            ("record_id", "String", 10, None),
            ("Name", "String", 10, "name"),
            ("SIN", "String", 10, "ssn"),
            ("StreetNumber", "String", 40, "building_number"),
            ("Street", "String", 40, "street_name"),
            ("City", "String", 40, "city"),
            ("Province", "String", 40, "province"),
            ("PostalCode", "String", 40, "postcode"),
            ("DOB", "DateTime", 40, "date"),
            ("PhoneNum", "String", 11, "phone_number"),
        )
        for fname, ftype, flen, generator in field_specs:
            fld = schema.fields.add()
            fld.name = fname
            fld.info.type = ftype
            fld.info.length = flen
            if generator is not None:
                fld.info.aux.generator.name = generator

        # Always duplicate, up to five copies per record.
        dup = model.info.aux.duplicate
        dup.probability = 1
        dup.distribution = "uniform"
        dup.maximum = 5

        modifier = model.info.aux.record_modifier
        modifier.max_modifications_in_record = 1
        modifier.max_field_modifiers = 1
        modifier.max_record_modifiers = 1

        # Identical modification probabilities for both modified fields.
        mod_probs = {
            "insert": 0.1,      # insert character in field
            "delete": 0.1,      # delete character in field
            "substitute": 0.1,  # substitute character in field
            "misspell": 0.0,    # use mispelling dictionary
            "transpose": 0.1,   # transpose adjacent characters
            "replace": 0.1,     # replace with another value of same fake
            "swap": 0.1,        # swap two words/values in field
            "split": 0.1,       # split a field
            "merge": 0.1,       # merge a field
            "nullify": 0.1,     # convert to null
            "fill": 0.1,        # fill empty field with expected type
        }
        for mod_name, selection in (("Name", 0.1), ("Street", 0.9)):
            field_mod = modifier.fields.add()
            field_mod.selection = selection
            field_mod.name = mod_name
            for attr, value in mod_probs.items():
                setattr(field_mod.probabilities, attr, value)

        synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
        protorows = [synth.generate() for _ in range(10)]
        print(protorows)