def test_create_dataset_should_load_parquet(self):
    """A parquet config should produce a dataset with the expected columns and row count."""
    conf = {
        "format": "parquet",
        "path": "resources/datasets/test.parquet",
    }
    loaded = DatasetFactory.create_dataset(self.spark, conf)
    self.assertEqual(loaded.columns, ["id", "text"])
    self.assertEqual(loaded.count(), 4)
def test_create_dataset_should_load_csv(self):
    """A CSV config (with separator and header options) should load the same schema and rows."""
    conf = {
        "format": "csv",
        "path": "resources/datasets/test.csv",
        "sep": ",",
        "header": True,
    }
    loaded = DatasetFactory.create_dataset(self.spark, conf)
    self.assertEqual(loaded.columns, ["id", "text"])
    self.assertEqual(loaded.count(), 4)
def create_stage(spark, stage_conf):
    """Build a stage from a configuration dict.

    If ``stage_conf`` is not a dict it is assumed to be an already-built
    stage and is returned untouched. Otherwise the stage named by
    ``stage_conf["name"]`` is instantiated and configured with
    ``stage_conf["params"]``; a nested dataset config in the params is
    materialized first via DatasetFactory. Note: ``params`` is mutated
    in place when a dataset is created.
    """
    if not isinstance(stage_conf, dict):
        # Already a concrete stage — pass it straight through.
        return stage_conf

    name = stage_conf["name"]
    params = stage_conf["params"]

    dataset_spec = params.get("dataset")
    if isinstance(dataset_spec, dict):
        # Replace the nested dataset config with the loaded dataset.
        params["dataset"] = DatasetFactory.create_dataset(spark, dataset_spec)

    stage = StageFactory.get_stage(name)
    StageFactory.set_params(spark, stage, params)
    return stage
def create_step(spark, step_conf):
    """Build a Step from a configuration dict.

    The step's stage may be given three ways: as a nested step config
    (a dict containing a ``"stage"`` key), in which case that step is
    built and executed and its result becomes the stage; as a plain
    stage config dict; or as an already-built stage. A nested dataset
    config in ``params`` is materialized via DatasetFactory (mutating
    ``params`` in place).
    """
    name = step_conf["name"]
    params = step_conf["params"]
    stage_spec = step_conf["stage"]

    if isinstance(stage_spec, dict) and "stage" in stage_spec:
        # The stage is the output of another step: build and run it.
        stage = StepFactory.create_step(spark, stage_spec).execute()
    else:
        # Plain stage config (or an existing stage — handled by the factory).
        stage = StageFactory.create_stage(spark, stage_spec)

    dataset_spec = params.get("dataset")
    if isinstance(dataset_spec, dict):
        # Replace the nested dataset config with the loaded dataset.
        params["dataset"] = DatasetFactory.create_dataset(spark, dataset_spec)

    return Step(name, params, stage)