Example #1
    def test(self):
        model = Table()
        model.name = "EvolveModel"
        schema = model.info.schema.info
        field = schema.fields.add()
        field.name = "Name"
        field.info.type = "String"
        field.info.length = 10
        field.info.aux.generator.name = "name"
        print(model)
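Example #1 builds the Table message and only prints it. The same message can be serialized and parsed back, as Example #4 does with SerializeToString and ParseFromString. A minimal round-trip sketch, assuming the same Table protobuf import used in these tests:

    def test_roundtrip_sketch(self):
        # Hedged sketch: assumes the same Table protobuf message as above.
        model = Table()
        model.name = "EvolveModel"
        field = model.info.schema.info.fields.add()
        field.name = "Name"
        field.info.type = "String"
        field.info.length = 10

        payload = model.SerializeToString()  # protobuf -> bytes
        clone = Table()
        clone.ParseFromString(payload)  # bytes -> protobuf
        assert clone.name == model.name  # round trip preserves content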
Example #2
    def test_rbgen_csv(self):
        """
        This method generates a test
        record batch as a CSV, and displays
        the results to the console.
        """
        # define the schema for the data
        g_table = Table()  # Define a table instance
        g_table.name = "EvolveModel"  # Name the table
        g_table.uuid = str(uuid.uuid4())  # Create a UUID
        schema = g_table.info.schema.info  # Access the schema unit

        field = schema.fields.add()  # Add a field
        field.name = "Name"  # Set the field name
        field.info.type = "float"  # Set the type of the field
        field.info.length = 10  # Set the field length
        field.info.aux.generator.name = "normal"  # Generator name, we need to trace this
        """
        # We're adding in the parameters here. These mimic the tests that are found in the ArtemisFaker module itself
        params = field.info.aux.generator.parameters.add()
        params.name = "Mean"
        params.value = 3
        params.type = "int"

        params2 = field.info.aux.generator.parameters.add()
        params2.name = "STD"
        params2.value = 3
        params2.type = "int"
        """
        g_table_msg = g_table.SerializeToString()  # Serialize the table message to bytes

        # This is the record batch generator
        # All the configurations are set in the
        # generator to produce the output.
        generator = RecordBatchGen(
            "generator",  # Unknown parameter
            nbatches=1,  # Total number of batches that are used
            num_rows=10,  # Total rows to be generated
            file_type=1,  # Encodes the data as csv
            table_id=g_table.uuid,  # Sets the table UUID
            table_msg=g_table_msg,  # Sets the table message
        )

        generator.initialize()  # Create the generator
        # Data returned as a pyarrow buffer
        # Convert to raw python bytes objects
        # Use io wrapper and read as csv
        for batch in generator:  # Generator is some kind of iterator
            data = batch.to_pybytes()  # Access the batch, convert to bytes
            # Create a text output, this turns it into a string
            with io.TextIOWrapper(io.BytesIO(data)) as textio:
                for row in csv.reader(textio):  # Read each CSV row from the buffer
                    print(row)  # Print the row
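The loop at the end of Example #2 decodes a pyarrow buffer by converting it to bytes, wrapping it in io.TextIOWrapper, and reading it with csv.reader. That pipeline can be factored into a small standalone helper. A minimal sketch, assuming only the standard-library io and csv modules; rows_from_csv_bytes is an illustrative name, not part of the project:

import csv
import io

def rows_from_csv_bytes(data):
    """Decode raw CSV bytes (e.g. batch.to_pybytes()) into a list of rows."""
    with io.TextIOWrapper(io.BytesIO(data)) as textio:
        return list(csv.reader(textio))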
Example #3
    def test_gen_from_proto(self):
        model = Table()
        model.name = "EvolveModel"
        schema = model.info.schema.info
        field = schema.fields.add()
        field.name = "Name"
        field.info.type = "String"
        field.info.length = 10
        field.info.aux.generator.name = "name"

        s2 = Synthesizer(model, "en_CA", idx=0, seed=4053)
        print(s2.generate())
Example #4
    def test_table(self):
        table = Table()
        table.name = "Attachment"
        # table.uuid = str(uuid.uuid4())

        schema = table.info.schema.info
        schema.aux.frequency = 3
        schema.aux.description = "This table is for ..."

        field1 = schema.fields.add()
        field1.name = "record_id"
        field1.info.type = "String"
        field1.info.length = 10

        field2 = schema.fields.add()
        field2.name = "field2"
        field2.info.type = "String"
        field2.info.length = 20
        aux2 = field2.info.aux
        aux2.generator.name = "name"
        aux2.meta["Bool1"].bool_val = True
        aux2.meta["Bool2"].bool_val = False
        aux2.meta["String1"].string_val = "System"
        aux2.description = "Blah"

        field3 = schema.fields.add()
        field3.name = "fieldl3"
        field3.info.type = "String"
        field3.info.length = 24
        aux3 = field3.info.aux
        aux3.generator.name = "province"
        code = aux3.codeset
        code.name = "Codeset Name"
        code.version = "2016VR1"
        value1 = code.codevalues.add()
        value1.code = "1A"
        value1.description = "what 1a stands for"
        value2 = code.codevalues.add()
        value2.code = "2A"
        value2.description = "What 2a stands for"
        value2.lable = "label for 2a"
        aux3.meta["Bool1"].bool_val = True
        aux3.meta["Bool2"].bool_val = True
        aux3.description = "Blah blah blah"
        aux3.meta["String1"].string_val = "Rule for variable population"

        tem2 = table.SerializeToString()
        print(tem2)
        table2 = Table()
        table2.ParseFromString(tem2)
        print(table2)

    def test_fileio(self):
        """
        Write csv to disk
        Read back in artemis
        """
        with tempfile.TemporaryDirectory() as dirpath:
            mb = MenuFactory("csvgen")
            msgmenu = mb.build()
            menuinfo = MenuObjectInfo()
            menuinfo.created.GetCurrentTime()

            store = BaseObjectStore(dirpath, "artemis")

            config = JobConfigFactory(
                "csvio",
                msgmenu,
                jobname="arrowproto",
                generator_type="file",
                filehandler_type="csv",
                nbatches=1,
                num_rows=10000,
                max_file_size=1073741824,  # 1 GiB
                write_csv=True,
                # input_repo=dirpath,
                input_glob=".csv",
                # output_repo=dirpath
            )

            config.configure()
            config.add_algos(mb.algos)
            configinfo = ConfigObjectInfo()
            configinfo.created.GetCurrentTime()

            menu_uuid = store.register_content(msgmenu, menuinfo).uuid
            config_uuid = store.register_content(config._msg, configinfo).uuid

            g_dataset = store.register_dataset()
            store.new_partition(g_dataset.uuid, "generator")
            job_id = store.new_job(g_dataset.uuid)

            # define the schema for the data
            g_table = Table()
            g_table.name = "generator"
            g_table.uuid = str(uuid.uuid4())
            g_table.info.schema.name = "csv"
            g_table.info.schema.uuid = str(uuid.uuid4())

            fields = list(
                itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
            for f in fields:
                field = g_table.info.schema.info.fields.add()
                field.name = f

            tinfo = TableObjectInfo()
            tinfo.fields.extend(fields)
            store.register_content(
                g_table,
                tinfo,
                dataset_id=g_dataset.uuid,
                job_id=job_id,
                partition_key="generator",
            )

            generator = GenCsvLikeArrow(
                "generator",
                nbatches=1,
                num_cols=20,
                num_rows=10000,
                suffix=".csv",
                prefix="testio",
                path=dirpath,
                table_id=g_table.uuid,
            )

            generator.gate.meta.parentset_id = g_dataset.uuid
            generator.gate.meta.job_id = str(job_id)
            generator.gate.store = store
            generator.initialize()
            generator.write()

            dataset = store.register_dataset(menu_uuid, config_uuid)
            job_id = store.new_job(dataset.uuid)
            store.save_store()

            job = JobInfo_pb()
            job.name = "arrowproto"
            job.job_id = "example"
            job.store_path = dirpath
            job.store_id = store.store_uuid
            job.store_name = store.store_name
            job.menu_id = menu_uuid
            job.config_id = config_uuid
            job.dataset_id = dataset.uuid
            job.job_id = str(job_id)
            # print(job)
            bow = Artemis(job, loglevel="INFO")
            bow.control()
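test_fileio depends on GenCsvLikeArrow writing its CSV batches into the temporary directory before Artemis reads them back through input_glob=".csv". A hedged helper for inspecting that hand-off, assuming the generator names its output files with the prefix and suffix it was given; written_csvs is illustrative and not part of the test suite:

import glob
import os

def written_csvs(dirpath, prefix="testio", suffix=".csv"):
    # Assumption: the generator writes files named <prefix>*<suffix>
    # under `path`; adjust the pattern if the naming scheme differs.
    return sorted(glob.glob(os.path.join(dirpath, prefix + "*" + suffix)))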

    def test_distributed(self):
        with tempfile.TemporaryDirectory() as dirpath:
            mb = MenuFactory("csvgen")
            msgmenu = mb.build()
            menuinfo = MenuObjectInfo()
            menuinfo.created.GetCurrentTime()

            store = BaseObjectStore(dirpath, "artemis")

            config = JobConfigFactory(
                "csvio",
                msgmenu,
                jobname="arrowproto",
                generator_type="file",
                filehandler_type="csv",
                nbatches=1,
                num_rows=10000,
                max_file_size=1073741824,
                write_csv=True,
                input_glob=".csv",
            )

            config.configure()
            config.add_algos(mb.algos)
            configinfo = ConfigObjectInfo()
            configinfo.created.GetCurrentTime()

            menu_uuid = store.register_content(msgmenu, menuinfo).uuid
            config_obj = store.register_content(config._msg, configinfo)
            config_uuid = config_obj.uuid

            g_dataset = store.register_dataset()
            store.new_partition(g_dataset.uuid, "generator")
            job_id = store.new_job(g_dataset.uuid)

            # define the schema for the data
            g_table = Table()
            g_table.name = "generator"
            g_table.uuid = str(uuid.uuid4())
            g_table.info.schema.name = "csv"
            g_table.info.schema.uuid = str(uuid.uuid4())

            fields = list(
                itertools.islice(GenCsvLikeArrow.generate_col_names(), 20))
            for f in fields:
                field = g_table.info.schema.info.fields.add()
                field.name = f

            tinfo = TableObjectInfo()
            tinfo.fields.extend(fields)
            store.register_content(
                g_table,
                tinfo,
                dataset_id=g_dataset.uuid,
                job_id=job_id,
                partition_key="generator",
            )

            generator = GenCsvLikeArrow(
                "generator",
                nbatches=10,
                num_cols=20,
                num_rows=1000,
                suffix=".csv",
                prefix="testio",
                path=dirpath,
                table_id=g_table.uuid,
            )

            generator.gate.meta.parentset_id = g_dataset.uuid
            generator.gate.meta.job_id = str(job_id)
            generator.gate.store = store
            generator.initialize()
            generator.write()

            dataset = store.register_dataset(menu_uuid, config_uuid)
            job_id = store.new_job(dataset.uuid)
            store.save_store()

            #######################################
            # Gather the CSV files registered under the generator dataset
            inputs = store.list(prefix=g_dataset.uuid, suffix="csv")

            store_name = store._name
            store_uuid = store.store_uuid
            dataset_uuid = dataset.uuid
            ds_results = []
            for datum in inputs:
                job_id = store.new_job(dataset.uuid)
                url_data = urllib.parse.urlparse(datum.address)
                dpath = urllib.parse.unquote(url_data.path)
                print(datum)
                config = Configuration()
                store.get(config_uuid, config)
                # Point the generator's file glob at this specific input file
                for p in config.input.generator.config.properties.property:
                    if p.name == "glob":
                        p.value = dpath.split(".")[-2] + ".csv"
                store._put_message(config_uuid, config)  # Write the updated config back
                store.get(config_uuid, config)  # Re-read the stored config
                print(config)
                ds_results.append(
                    runjob(
                        dirpath,
                        store_name,
                        store_uuid,
                        menu_uuid,
                        config_uuid,
                        dataset_uuid,
                        g_dataset.uuid,
                        job_id,
                    ))

            results = dask.compute(*ds_results, scheduler="single-threaded")
            # Workaround to fix error in dataset merging
            store.new_partition(dataset.uuid, "seqY")
            # Update the dataset
            for buf in results:
                ds = DatasetObjectInfo()
                ds.ParseFromString(buf)
                store.update_dataset(dataset.uuid, buf)
            # Save the store, reload
            store.save_store()

            print(store[dataset.uuid].dataset)
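test_distributed collects the runjob calls into ds_results and evaluates them with dask.compute on the single-threaded scheduler, then parses each returned buffer as a DatasetObjectInfo. A minimal sketch of how such a delayed task could be declared, assuming runjob wraps one Artemis job; the body below is a placeholder, not the project's actual implementation:

import dask

@dask.delayed
def runjob(dirpath, store_name, store_uuid, menu_uuid,
           config_uuid, dataset_uuid, parent_uuid, job_id):
    # Placeholder body: the real task would configure and run an Artemis
    # job against the given store/menu/config IDs and return the bytes
    # of the resulting DatasetObjectInfo message.
    return b""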