def run(self, processor_context: ProcessorContext) -> None:
    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    write_options = processor_context.get_property_group(
        self.WRITE_OPTIONS_GROUP)

    path = default_options.get_property(self.PATH)
    write_format = default_options.get_property(self.FORMAT)
    mode = default_options.get_property(self.MODE)
    partition_by = default_options.get_property(self.PARTITION_BY)
    trigger_type = default_options.get_property(
        self.TRIGGER_TYPE, self.TRIGGER_TYPE.default_value)
    trigger_value = default_options.get_property(
        self.TRIGGER_VALUE, self.TRIGGER_VALUE.default_value)

    # The once trigger expects a boolean, so coerce string property values.
    if trigger_type == ONCE_TRIGGER_TYPE:
        trigger_value = get_boolean_value(trigger_value, True)

    source_df = processor_context.dependencies[0].df
    trigger_params = {trigger_type: trigger_value}

    # TODO check query name
    source_df \
        .writeStream \
        .trigger(**trigger_params) \
        .start(path=path,
               format=write_format,
               outputMode=mode,
               partitionBy=partition_by,
               **write_options) \
        .awaitTermination()

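# Illustrative sketch (not part of the processor): how the trigger dict above
# maps onto pyspark's DataStreamWriter.trigger() keyword arguments. The paths,
# options and 'rate' source below are made up for the example; 'rate' is
# Spark's built-in streaming test source.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
rate_df = spark.readStream.format('rate').load()

# {'once': True} is equivalent to .trigger(once=True); a processing-time
# trigger would instead be {'processingTime': '10 seconds'}.
trigger_params = {'once': True}

rate_df.writeStream \
    .trigger(**trigger_params) \
    .start(path='/tmp/stream_out',
           format='parquet',
           outputMode='append',
           checkpointLocation='/tmp/stream_checkpoint') \
    .awaitTermination()
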
def test_load_stream_processor(spark_session: SparkSession):
    schema = {
        'type': 'struct',
        'fields': [
            {
                'name': 'name',
                'type': 'string',
                'nullable': False,
                'metadata': {}
            },
            {
                'name': 'contact',
                'type': 'integer',
                'nullable': False,
                'metadata': {}
            },
        ]
    }
    load_options = {
        'header': 'true',
        'inferSchema': 'true',
        'checkpointLocation': f'{TEST_DIR}/checkpoint'
    }

    default_props = PropertyGroup()
    default_props.set_property(LoadStreamProcessor.PATH,
                               f'{FIXTURE_DIR}/sample_load.csv')
    default_props.set_property(LoadStreamProcessor.FORMAT, 'csv')
    default_props.set_property(LoadStreamProcessor.SCHEMA, schema)

    property_groups = PropertyGroups()
    property_groups.set_property_group(LoadStreamProcessor.LOAD_OPTIONS_GROUP,
                                       load_options)
    property_groups.set_property_group(LoadStreamProcessor.DEFAULT_PROPS_GROUP,
                                       default_props)

    processor_context = ProcessorContext(spark_session,
                                         property_groups=property_groups)

    processor = LoadStreamProcessor()
    output = processor.run(processor_context)

    output_dir = f'{TEST_DIR}/stream_output'
    output.df.createOrReplaceTempView('input')
    output.df.writeStream.trigger(once=True) \
        .start(path=output_dir, format='csv', outputMode='append',
               **load_options) \
        .awaitTermination()

    actual = spark_session.read.options(**load_options) \
        .csv(output_dir).collect()
    expected_data = [{'name': 'xyz', 'contact': 123}]
    expected = spark_session.createDataFrame(expected_data).select(
        'name', 'contact').collect()
    assert actual == expected

def run(self, processor_context: ProcessorContext) -> Dependency:
    dependency_config = {}
    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    load_options = processor_context.get_property_group(
        self.LOAD_OPTIONS_GROUP)

    view_name = default_options.get_property(self.VIEW_NAME)
    if view_name is not None:
        dependency_config['view_name'] = view_name

    path = default_options.get_property(self.PATH)
    load_format = default_options.get_property(self.FORMAT)

    spark_session = processor_context.spark_session
    df = spark_session.read.load(path=path, format=load_format, **load_options)
    return Dependency(df, dependency_config)

def run(self, processor_context: ProcessorContext) -> None:
    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    write_options = processor_context.get_property_group(
        self.WRITE_OPTIONS_GROUP)

    path = default_options.get_property(self.PATH)
    write_format = default_options.get_property(self.FORMAT)
    mode = default_options.get_property(self.MODE)
    partition_by = default_options.get_property(self.PARTITION_BY)

    source_df = processor_context.dependencies[0].df
    source_df.write.save(path=path,
                         format=write_format,
                         mode=mode,
                         partitionBy=partition_by,
                         **write_options)

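# Illustrative sketch of the batch write call the processor above issues once
# its properties are resolved; the session, DataFrame, path and options here
# are assumptions made up for the example.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('xyz', 123)], ['name', 'contact'])

df.write.save(path='/tmp/write_processor_out',
              format='csv',
              mode='overwrite',
              partitionBy='name',
              header='true')
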
def test_load_processor(spark_session: SparkSession):
    load_options = {
        'header': 'true'
    }

    default_props = PropertyGroup()
    default_props.set_property(
        LoadProcessor.PATH, f'{FIXTURE_DIR}/sample_load.csv')
    default_props.set_property(LoadProcessor.FORMAT, 'csv')

    property_groups = PropertyGroups()
    property_groups.set_property_group(
        LoadProcessor.LOAD_OPTIONS_GROUP, load_options)
    property_groups.set_property_group(
        LoadProcessor.DEFAULT_PROPS_GROUP, default_props)

    processor_context = ProcessorContext(spark_session,
                                         property_groups=property_groups)

    processor = LoadProcessor()
    output = processor.run(processor_context)

    actual = output.df.collect()
    expected_data = [{'name': 'xyz', 'contact': '123'}]
    expected = spark_session.createDataFrame(expected_data) \
        .select('name', 'contact').collect()
    assert actual == expected

def run(self, processor_context: ProcessorContext) -> Dependency:
    dependency_config = {}
    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    load_options = processor_context.get_property_group(
        self.LOAD_OPTIONS_GROUP)

    view_name = default_options.get_property(self.VIEW_NAME)
    if view_name is not None:
        dependency_config['view_name'] = view_name

    path = default_options.get_property(self.PATH)
    load_format = default_options.get_property(self.FORMAT)
    schema = default_options.get_property(self.SCHEMA)
    struct_type = StructType.fromJson(schema)

    df = processor_context.spark_session.readStream.load(
        path=path, format=load_format, schema=struct_type, **load_options)
    return Dependency(df, dependency_config)

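# Illustrative sketch of the schema handling above: StructType.fromJson()
# accepts the same dict layout the tests pass for the SCHEMA property.
from pyspark.sql.types import StructType

schema_dict = {
    'type': 'struct',
    'fields': [
        {'name': 'name', 'type': 'string', 'nullable': False, 'metadata': {}},
        {'name': 'contact', 'type': 'integer', 'nullable': False, 'metadata': {}},
    ]
}

struct_type = StructType.fromJson(schema_dict)
assert struct_type.fieldNames() == ['name', 'contact']
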
def run(self, processor_context: ProcessorContext) -> Dependency:
    # Register every upstream dependency as a temp view so the configured
    # query can reference it by name.
    for dependency in processor_context.dependencies:
        assert 'view_name' in dependency.config, \
            'Missing view_name in dependency.'
        view_name = dependency.config['view_name']
        df = dependency.df
        df.createOrReplaceTempView(view_name)

    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    spark = processor_context.spark_session
    df = spark.sql(default_options.get_property(self.QUERY))

    dependency_config = {}
    return Dependency(df, dependency_config)

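# Illustrative sketch of the temp-view flow the SQL processor relies on: each
# upstream dependency is registered under its view_name, then the configured
# query reads from those views. The view name, data and query are made up here.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
input_df = spark.createDataFrame([('xyz', 123)], ['name', 'contact'])

input_df.createOrReplaceTempView('input')
result = spark.sql('SELECT name, contact FROM input WHERE contact > 100')
assert result.count() == 1
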
def get_dependency(cls, workflow: Workflow, processor_id: str,
                   graph: DiGraph,
                   spark: SparkSession) -> Union[Dependency, None]:
    processor_config = workflow.get_processor(processor_id)

    predecessors = []
    if graph.number_of_edges() > 0:
        predecessors = list(graph.predecessors(processor_id))

    # Resolve upstream processors first so their outputs are available as
    # dependencies of this processor; source processors have no predecessors.
    dependencies = []
    for predecessor in predecessors:
        dependencies.append(
            cls.get_dependency(workflow, predecessor, graph, spark))

    processor_context = ProcessorContext(
        spark_session=spark,
        property_groups=processor_config.property_groups,
        dependencies=dependencies)
    processor = SparkProcessor.get_spark_processor(processor_config.type)
    return processor.run(processor_context)

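# Illustrative sketch of the networkx calls get_dependency relies on:
# predecessors() yields the upstream processors of a node, and an empty result
# terminates the recursion at source processors. Node ids are made up here.
from networkx import DiGraph

graph = DiGraph()
graph.add_edge('load-1', 'sql-1')    # load-1 feeds sql-1
graph.add_edge('sql-1', 'write-1')   # sql-1 feeds write-1

assert list(graph.predecessors('write-1')) == ['sql-1']
assert list(graph.predecessors('load-1')) == []
assert graph.number_of_edges() == 2
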
def test_write_stream_processor(spark_session: SparkSession):
    write_options = {
        'header': 'true',
        'inferSchema': 'true',
        'checkpointLocation': f'{TEST_DIR}/checkpoint'
    }
    schema = {
        'type': 'struct',
        'fields': [
            {
                'name': 'name',
                'type': 'string',
                'nullable': False,
                'metadata': {}
            },
            {
                'name': 'contact',
                'type': 'integer',
                'nullable': False,
                'metadata': {}
            },
        ]
    }
    input_path = f'{FIXTURE_DIR}/sample_load.csv'
    output_path = f'{TEST_DIR}/sample_load.csv'

    default_props = PropertyGroup()
    default_props.set_property(WriteStreamProcessor.PATH, output_path)
    default_props.set_property(WriteStreamProcessor.FORMAT, 'csv')

    property_groups = PropertyGroups()
    property_groups.set_property_group(
        WriteStreamProcessor.WRITE_OPTIONS_GROUP, write_options)
    property_groups.set_property_group(
        WriteStreamProcessor.DEFAULT_PROPS_GROUP, default_props)

    dependency_df = spark_session.readStream.load(
        path=input_path, format='csv',
        schema=StructType.fromJson(schema), **write_options)
    dependency = Dependency(dependency_df, {})

    processor_context = ProcessorContext(spark_session, property_groups,
                                         [dependency])

    processor = WriteStreamProcessor()
    processor.run(processor_context)

    actual = spark_session \
        .read \
        .options(**write_options) \
        .csv(output_path) \
        .collect()
    expected = spark_session \
        .read \
        .options(**write_options) \
        .csv(input_path) \
        .collect()
    assert actual == expected
