Example #1
    def run(self, processor_context: ProcessorContext) -> None:
        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP)

        write_options = processor_context.get_property_group(
            self.WRITE_OPTIONS_GROUP)

        path = default_options.get_property(self.PATH)
        write_format = default_options.get_property(self.FORMAT)
        mode = default_options.get_property(self.MODE)
        partition_by = default_options.get_property(self.PARTITION_BY)
        trigger_type = default_options.get_property(
            self.TRIGGER_TYPE, self.TRIGGER_TYPE.default_value)
        trigger_value = default_options.get_property(
            self.TRIGGER_VALUE, self.TRIGGER_VALUE.default_value)
        if trigger_type == ONCE_TRIGGER_TYPE:
            trigger_value = get_boolean_value(trigger_value, True)

        source_df = processor_context.dependencies[0].df

        trigger_params = {trigger_type: trigger_value}

        # TODO check query name

        source_df \
            .writeStream \
            .trigger(**trigger_params) \
            .start(path=path,
                   format=write_format,
                   outputMode=mode,
                   partitionBy=partition_by,
                   **write_options) \
            .awaitTermination()
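
For context, trigger_params above is just the keyword argument that PySpark's DataStreamWriter.trigger expects. A minimal standalone sketch of the two shapes this processor can produce, using the built-in rate source and made-up paths (none of these values come from the original project):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
streaming_df = spark.readStream.format('rate').load()  # built-in test source

trigger_params = {'processingTime': '10 seconds'}  # periodic micro-batches
# trigger_params = {'once': True}                  # ONCE_TRIGGER_TYPE: one batch, then stop

query = streaming_df.writeStream \
    .trigger(**trigger_params) \
    .start(path='/tmp/stream_out',
           format='parquet',
           outputMode='append',
           checkpointLocation='/tmp/stream_checkpoint')
query.awaitTermination(timeout=10)
query.stop()
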
Example #2
def test_load_stream_processor(spark_session: SparkSession):

    schema = {
        'type': 'struct',
        'fields': [
            {
                'name': 'name',
                'type': 'string',
                'nullable': False,
                'metadata': {}
            },
            {
                'name': 'contact',
                'type': 'integer',
                'nullable': False,
                'metadata': {}
            },
        ]
    }

    load_options = {
        'header': 'true',
        'inferSchema': 'true',
        'checkpointLocation': f'{TEST_DIR}/checkpoint'
    }

    default_props = PropertyGroup()
    default_props.set_property(LoadStreamProcessor.PATH,
                               f'{FIXTURE_DIR}/sample_load.csv')
    default_props.set_property(LoadStreamProcessor.FORMAT, 'csv')
    default_props.set_property(LoadStreamProcessor.SCHEMA, schema)

    property_groups = PropertyGroups()
    property_groups.set_property_group(LoadStreamProcessor.LOAD_OPTIONS_GROUP,
                                       load_options)
    property_groups.set_property_group(LoadStreamProcessor.DEFAULT_PROPS_GROUP,
                                       default_props)

    processor_context = ProcessorContext(spark_session,
                                         property_groups=property_groups)

    processor = LoadStreamProcessor()
    output = processor.run(processor_context)
    output_dir = f'{TEST_DIR}/stream_output'
    output.df.createOrReplaceTempView('input')
    output.df.writeStream.trigger(once=True) \
        .start(path=output_dir,
               format='csv',
               outputMode='append',
               **load_options) \
        .awaitTermination()

    actual = spark_session.read.options(**load_options) \
        .csv(output_dir).collect()
    expected_data = [{'name': 'xyz', 'contact': 123}]
    expected = spark_session.createDataFrame(expected_data).select(
        'name', 'contact').collect()
    assert actual == expected
Example #3
    def run(self,
            processor_context: ProcessorContext) -> Dependency:
        dependency_config = {}

        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP)
        load_options = processor_context.get_property_group(
            self.LOAD_OPTIONS_GROUP)

        view_name = default_options.get_property(self.VIEW_NAME)
        if view_name is not None:
            dependency_config['view_name'] = view_name

        path = default_options.get_property(self.PATH)
        load_format = default_options.get_property(self.FORMAT)

        spark_session = processor_context.spark_session
        df = spark_session.read.load(
            path=path, format=load_format, **load_options)

        return Dependency(df, dependency_config)
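
For reference, the run above is a thin wrapper over DataFrameReader.load: path and format come from the default property group, and everything else is splatted in from LOAD_OPTIONS_GROUP. A standalone sketch of the equivalent direct call (the path and options are placeholders, not values from the project):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.read.load(path='/tmp/sample_load.csv',  # placeholder path
                     format='csv',
                     header='true',                # would come from LOAD_OPTIONS_GROUP
                     inferSchema='true')
df.show()
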
Example #4
    def run(self,
            processor_context: ProcessorContext) -> None:

        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP
        )

        write_options = processor_context.get_property_group(
            self.WRITE_OPTIONS_GROUP
        )

        path = default_options.get_property(self.PATH)
        write_format = default_options.get_property(self.FORMAT)
        mode = default_options.get_property(self.MODE)
        partition_by = default_options.get_property(self.PARTITION_BY)

        source_df = processor_context.dependencies[0].df

        source_df.write.save(path=path, format=write_format,
                             mode=mode, partitionBy=partition_by,
                             **write_options)
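
The run above reduces to a single DataFrameWriter.save call: path, format, mode and partitionBy come from the default property group, and any remaining options are splatted in from WRITE_OPTIONS_GROUP. A minimal standalone sketch with illustrative values (the output path, mode and partition column are assumptions, not project values):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('xyz', 123)], ['name', 'contact'])

df.write.save(path='/tmp/write_out',  # placeholder output directory
              format='csv',
              mode='overwrite',
              partitionBy='name',
              header='true')          # would come from WRITE_OPTIONS_GROUP
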
Example #5
def test_load_processor(spark_session: SparkSession):
    load_options = {
        'header': 'true'
    }

    default_props = PropertyGroup()
    default_props.set_property(
        LoadProcessor.PATH, f'{FIXTURE_DIR}/sample_load.csv')
    default_props.set_property(LoadProcessor.FORMAT, 'csv')

    property_groups = PropertyGroups()
    property_groups.set_property_group(
        LoadProcessor.LOAD_OPTIONS_GROUP, load_options)
    property_groups.set_property_group(
        LoadProcessor.DEFAULT_PROPS_GROUP, default_props)

    processor_context = ProcessorContext(spark_session,
                                         property_groups=property_groups)

    processor = LoadProcessor()
    output = processor.run(processor_context)
    actual = output.df.collect()
    expected_data = [{'name': 'xyz', 'contact': '123'}]
    expected = spark_session.createDataFrame(expected_data) \
        .select('name', 'contact').collect()
    assert actual == expected
Example #6
    def run(self, processor_context: ProcessorContext) -> Dependency:
        dependency_config = {}

        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP)
        load_options = processor_context.get_property_group(
            self.LOAD_OPTIONS_GROUP)

        view_name = default_options.get_property(self.VIEW_NAME)
        if view_name is not None:
            dependency_config['view_name'] = view_name

        path = default_options.get_property(self.PATH)
        load_format = default_options.get_property(self.FORMAT)
        schema = default_options.get_property(self.SCHEMA)

        struct_type = StructType.fromJson(schema)

        df = processor_context.spark_session.readStream.load(
            path=path, format=load_format, schema=struct_type, **load_options)

        return Dependency(df, dependency_config)
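
The schema handed to this processor is the JSON-style dict used in the tests above; StructType.fromJson converts it into the StructType that readStream.load requires. A small self-contained check of that conversion (field names mirror the test fixtures):

from pyspark.sql.types import StructType

schema = {
    'type': 'struct',
    'fields': [
        {'name': 'name', 'type': 'string', 'nullable': False, 'metadata': {}},
        {'name': 'contact', 'type': 'integer', 'nullable': False, 'metadata': {}},
    ]
}

struct_type = StructType.fromJson(schema)
print(struct_type.simpleString())  # struct<name:string,contact:int>
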
Example #7
    def run(self,
            processor_context: ProcessorContext) -> Dependency:

        for dependency in processor_context.dependencies:
            assert 'view_name' in dependency.config, \
                'Missing view_name in dependency.'
            view_name = dependency.config['view_name']
            df = dependency.df
            df.createOrReplaceTempView(view_name)

        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP)

        spark = processor_context.spark_session
        df = spark.sql(default_options.get_property(self.QUERY))
        dependency_config = {}
        return Dependency(df, dependency_config)
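
This processor is plain Spark SQL underneath: each upstream Dependency is registered as a temp view under its view_name, then the configured QUERY runs against those views. A standalone sketch of the same pattern (the view name and query text are placeholders):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

# Stand-in for an upstream Dependency whose config carries view_name='input'.
dep_df = spark.createDataFrame([('xyz', 123)], ['name', 'contact'])
dep_df.createOrReplaceTempView('input')

# Stand-in for the QUERY property from DEFAULT_PROPS_GROUP.
result = spark.sql('SELECT name, contact FROM input WHERE contact > 100')
result.show()
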
Example #8
    def get_dependency(cls,
                       workflow: Workflow,
                       processor_id: str,
                       graph: DiGraph,
                       spark: SparkSession) -> Union[Dependency, None]:
        processor_config = workflow.get_processor(processor_id)
        predecessors = []
        if graph.number_of_edges() > 0:
            predecessors = list(graph.predecessors(processor_id))
        dependencies = []
        # Resolve each upstream processor depth-first so its output
        # Dependency is available when this processor runs.
        for predecessor in predecessors:
            dependencies.append(
                cls.get_dependency(workflow, predecessor, graph, spark))

        processor_context = ProcessorContext(
            spark_session=spark,
            property_groups=processor_config.property_groups,
            dependencies=dependencies)
        processor = SparkProcessor.get_spark_processor(processor_config.type)

        return processor.run(processor_context)
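
get_dependency resolves the workflow DAG depth-first: for each processor it looks up its networkx predecessors, recursively runs them, and feeds their Dependency outputs into the ProcessorContext. A tiny sketch of just the graph traversal (the node ids are made up):

from networkx import DiGraph

# Hypothetical three-step workflow: load-1 -> sql-1 -> write-1.
graph = DiGraph()
graph.add_edge('load-1', 'sql-1')
graph.add_edge('sql-1', 'write-1')

# Predecessors are the processors whose outputs must be resolved
# (recursively) before a node can run.
print(list(graph.predecessors('write-1')))  # ['sql-1']
print(list(graph.predecessors('sql-1')))    # ['load-1']
print(list(graph.predecessors('load-1')))   # []
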
Example #9
def test_write_stream_processor(spark_session: SparkSession):
    write_options = {
        'header': 'true',
        'inferSchema': 'true',
        'checkpointLocation': f'{TEST_DIR}/checkpoint'
    }

    schema = {
        'type': 'struct',
        'fields': [
            {
                'name': 'name',
                'type': 'string',
                'nullable': False,
                'metadata': {}
            },
            {
                'name': 'contact',
                'type': 'integer',
                'nullable': False,
                'metadata': {}
            },
        ]
    }

    input_path = f'{FIXTURE_DIR}/sample_load.csv'
    output_path = f'{TEST_DIR}/sample_load.csv'
    default_props = PropertyGroup()
    default_props.set_property(WriteStreamProcessor.PATH, output_path)
    default_props.set_property(WriteStreamProcessor.FORMAT, 'csv')

    property_groups = PropertyGroups()
    property_groups.set_property_group(
        WriteStreamProcessor.WRITE_OPTIONS_GROUP, write_options)
    property_groups.set_property_group(
        WriteStreamProcessor.DEFAULT_PROPS_GROUP, default_props)

    dependency_df = spark_session.readStream.load(
        path=input_path,
        format='csv',
        schema=StructType.fromJson(schema),
        **write_options)
    dependency = Dependency(dependency_df, {})
    processor_context = ProcessorContext(spark_session, property_groups,
                                         [dependency])

    processor = WriteStreamProcessor()
    processor.run(processor_context)

    actual = spark_session \
        .read \
        .options(**write_options) \
        .csv(output_path) \
        .collect()

    expected = spark_session \
        .read \
        .options(**write_options) \
        .csv(input_path) \
        .collect()

    assert actual == expected