Beispiel #1
0
    def test_create_dataset_should_load_parquet(self):
        """A parquet config loads a dataset with columns id/text and 4 rows."""
        # Arrange: configuration pointing at the parquet fixture.
        parquet_conf = dict(
            path="resources/datasets/test.parquet",
            format="parquet",
        )

        # Act
        loaded = DatasetFactory.create_dataset(self.spark, parquet_conf)

        # Assert
        expected_columns = ["id", "text"]
        self.assertEqual(loaded.columns, expected_columns)
        self.assertEqual(loaded.count(), 4)
Beispiel #2
0
    def test_create_dataset_should_load_csv(self):
        """A CSV config loads a dataset with columns id/text and 4 rows."""
        # Arrange: configuration pointing at the comma-separated fixture,
        # whose first line is declared to be a header.
        csv_conf = dict(
            path="resources/datasets/test.csv",
            format="csv",
            sep=",",
            header=True,
        )

        # Act
        loaded = DatasetFactory.create_dataset(self.spark, csv_conf)

        # Assert
        expected_columns = ["id", "text"]
        self.assertEqual(loaded.columns, expected_columns)
        self.assertEqual(loaded.count(), 4)
Beispiel #3
0
    def create_stage(spark, stage_conf):
        """Build a stage from its configuration.

        Args:
            spark: Session object forwarded to
                ``DatasetFactory.create_dataset`` and
                ``StageFactory.set_params``.
            stage_conf: Either a dict with ``"name"`` and ``"params"``
                entries, or any non-dict object, which is treated as an
                already built stage.

        Returns:
            ``stage_conf`` itself when it is not a dict; otherwise the stage
            returned by ``StageFactory.get_stage(name)`` after
            ``StageFactory.set_params`` has been applied to it.

        Raises:
            KeyError: If ``stage_conf`` is a dict without a ``"name"`` or
                ``"params"`` entry.
        """
        if not isinstance(stage_conf, dict):
            # Stage conf is already a stage
            return stage_conf

        name = stage_conf["name"]
        params = stage_conf["params"]

        if "dataset" in params and isinstance(params["dataset"], dict):
            # Create dataset if needed in stage parameters. Build a new
            # mapping instead of assigning into ``params`` so the caller's
            # configuration keeps its original dataset description.
            dataset = DatasetFactory.create_dataset(spark, params["dataset"])
            params = {**params, "dataset": dataset}

        stage = StageFactory.get_stage(name)
        StageFactory.set_params(spark, stage, params)

        return stage
Beispiel #4
0
    def create_step(spark, step_conf):
        """Build a ``Step`` from its configuration.

        Args:
            spark: Session object forwarded to the stage, step and dataset
                factories.
            step_conf: Mapping with ``"name"``, ``"params"`` and ``"stage"``
                entries. ``"stage"`` may be a nested step configuration (a
                dict that itself has a ``"stage"`` entry) or anything
                accepted by ``StageFactory.create_stage``.

        Returns:
            ``Step(name, params, stage)``, where a dict-valued ``"dataset"``
            parameter has been replaced by the result of
            ``DatasetFactory.create_dataset``.

        Raises:
            KeyError: If ``step_conf`` lacks ``"name"``, ``"params"`` or
                ``"stage"``.
        """
        name = step_conf["name"]
        params = step_conf["params"]
        stage = step_conf["stage"]

        if isinstance(stage, dict) and "stage" in stage:
            # Stage comes from an executed step
            stage = StepFactory.create_step(spark, stage).execute()
        else:
            # Create a stage
            stage = StageFactory.create_stage(spark, stage)

        if "dataset" in params and isinstance(params["dataset"], dict):
            # Create dataset if needed in step parameters. Build a new
            # mapping instead of assigning into ``params`` so that
            # ``step_conf["params"]`` is not modified by this function.
            dataset = DatasetFactory.create_dataset(spark, params["dataset"])
            params = {**params, "dataset": dataset}

        return Step(name, params, stage)