Example #1
    def execute(self):
        """Execute the link."""
        # get process manager and data store
        ds = process_manager.service(DataStore)

        # check if data frame exists in data store
        if self.read_key not in ds:
            err_msg = 'No input data found in data store with key "{}".'.format(
                self.read_key)
            if not self.fail_missing_data:
                self.logger.error(err_msg)
                return StatusCode.Success
            raise KeyError(err_msg)

        # fetch data from data store
        data = ds[self.read_key]
        if not isinstance(data, pyspark.sql.DataFrame):
            spark = process_manager.service(SparkManager).get_session()
            self.logger.debug(
                'Converting data of type "{type}" to a Spark data frame.',
                type=type(data))
            data = data_conversion.create_spark_df(spark,
                                                   data,
                                                   schema=self.schema)

        # create data-frame writer with requested number of partitions/output files
        df_writer = data.repartition(self.num_files).write

        # call data-frame writer methods
        apply_transform_funcs(df_writer, self._write_methods)

        return StatusCode.Success
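For orientation, here is a hedged sketch of what the chained writer calls produced by apply_transform_funcs boil down to in plain PySpark. The file format, options, output path and the exact shape of the link's _write_methods attribute are assumptions; the (method name, args, kwargs) tuple convention follows the create_spark_df docstring in Example #5.

# illustrative sketch only -- format, options and path are assumptions
import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()
data = spark.createDataFrame([(1, 'one'), (2, 'two')], schema=['id', 'name'])

# hypothetical (method name, args, kwargs) tuples standing in for _write_methods
write_methods = [('format', ('csv',), {}),
                 ('option', ('header', 'true'), {}),
                 ('mode', ('overwrite',), {}),
                 ('save', ('/tmp/example_output',), {})]

# applying them in order is equivalent to the chained call
# data.repartition(2).write.format('csv').option('header', 'true').mode('overwrite').save(...)
df_writer = data.repartition(2).write
for name, args, kwargs in write_methods:
    df_writer = getattr(df_writer, name)(*args, **kwargs)  # the final 'save' returns None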
Example #2
    def execute(self):
        """Execute the link."""
        # create data-frame reader
        spark = process_manager.service(SparkManager).get_session()
        data = spark.read

        # call data-frame reader methods
        data = apply_transform_funcs(data, self._read_methods)

        # store data in data store
        process_manager.service(DataStore)[self.store_key] = data

        return StatusCode.Success
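Analogously, a hedged sketch of the reader side: the format, options and input path below are assumptions, and _read_methods is taken to hold the same kind of (method name, args, kwargs) tuples.

# illustrative sketch only -- format, options and path are assumptions
import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()

# hypothetical (method name, args, kwargs) tuples standing in for _read_methods
read_methods = [('format', ('csv',), {}),
                ('option', ('header', 'true'), {}),
                ('load', ('/tmp/example_input.csv',), {})]

# applying them in order reproduces the usual chained reader calls:
# spark.read.format('csv').option('header', 'true').load(path)
reader = spark.read
for name, args, kwargs in read_methods:
    reader = getattr(reader, name)(*args, **kwargs)
data = reader  # the final 'load' returns a pyspark.sql.DataFrame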
Example #3
    def test_apply_transform_funcs(self, mock_process, mock_type):
        """Test applying transformation functions"""

        # create mock transformation functions
        funcs = ['func0', mock.Mock(name='func1')]
        trans_objs = [mock.Mock(name='transformed_obj{:d}'.format(it)) for it in range(2)]
        funcs[1].return_value = trans_objs[1]

        # create mock transformation object and its type
        obj_cls = type('obj_cls', (), {'func0': mock.Mock(name='func0'), 'no_func': type('non_callable', (), {})()})
        obj_cls.func0.return_value = trans_objs[0]
        obj = mock.Mock(name='obj')
        obj.__class__ = obj_cls
        mock_type.side_effect = lambda a: obj_cls if a is obj else type(a)

        # create mock function arguments
        func_args = dict((fun, tuple(mock.Mock(name='arg{:d}_{:d}'.format(fi, ai)) for ai in range(2)))
                         for fi, fun in enumerate(funcs))
        func_kwargs = dict((fun, dict(('key{:d}'.format(ai), mock.Mock(name='kwarg{:d}_{:d}'.format(fi, ai)))
                                      for ai in range(2))) for fi, fun in enumerate(funcs))

        # make sure functions and arguments are processed as expected
        mock_process.side_effect = lambda funcs, args, kwargs: [(f, args[f], kwargs[f]) for f in funcs]

        # test normal operation
        ret_obj = apply_transform_funcs(obj, funcs, func_args, func_kwargs)
        self.assertIs(ret_obj, trans_objs[-1], 'unexpected transformed object returned')
        obj_cls.func0.assert_called_once_with(obj, *func_args[funcs[0]], **func_kwargs[funcs[0]])
        funcs[1].assert_called_once_with(trans_objs[0], *func_args[funcs[1]], **func_kwargs[funcs[1]])

        # test with non-existing member function
        with self.assertRaises(AttributeError):
            apply_transform_funcs(obj, ['foo'], dict(foo=()), dict(foo={}))

        # test with non-callable member
        with self.assertRaises(TypeError):
            apply_transform_funcs(obj, ['no_func'], dict(no_func=()), dict(no_func={}))
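The behaviour this test pins down can be summarised with a simplified, stand-alone re-implementation (not the actual Eskapade code): string entries are looked up as member functions on the object's type, callables are applied directly, and the (possibly transformed) object is threaded through the chain.

def apply_transform_funcs_sketch(obj, funcs, func_args, func_kwargs):
    """Simplified stand-in for apply_transform_funcs, for illustration only."""
    for func in funcs:
        if isinstance(func, str):
            member = getattr(type(obj), func)  # AttributeError for missing members
            if not callable(member):
                raise TypeError('member "{}" is not callable'.format(func))
            obj = member(obj, *func_args[func], **func_kwargs[func])
        else:
            obj = func(obj, *func_args[func], **func_kwargs[func])
    return obj

def wrap(s, prefix):
    """Prepend a prefix to a string."""
    return prefix + s

result = apply_transform_funcs_sketch('spark', ['upper', wrap],
                                      func_args={'upper': (), wrap: ('>> ',)},
                                      func_kwargs={'upper': {}, wrap: {}})
assert result == '>> SPARK'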
Example #4
    def execute(self):
        """Execute SparkDfConverter"""

        # get process manager and data store
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # fetch data frame from data store
        if self.read_key not in ds:
            err_msg = 'no input data found in data store with key "{}"'.format(
                self.read_key)
            if not self.fail_missing_data:
                self.log().error(err_msg.capitalize())
                return StatusCode.Success
            raise KeyError(err_msg)
        data = ds[self.read_key]
        if not isinstance(data, pyspark.sql.DataFrame):
            raise TypeError(
                'expected a Spark data frame for "{0:s}" (got "{1:s}")'.format(
                    self.read_key, str(type(data))))

        # store data-frame schema
        ds[self.schema_key] = data.schema

        # convert data frame
        if self.output_format == 'rdd':
            # convert to RDD
            data = data.rdd
            if not self.preserve_col_names:
                # convert rows to tuples, which removes column names
                data = data.map(tuple)
        elif self.output_format == 'list':
            # convert to a list
            data = data.collect()
            if not self.preserve_col_names:
                # convert rows to tuples, which removes column names
                data = list(map(tuple, data))
        elif self.output_format == 'pd':
            # convert to Pandas data frame
            data = data.toPandas()

        # further process created dataset
        data = apply_transform_funcs(data, self._process_methods)

        # store data in data store
        ds[self.store_key] = data

        return StatusCode.Success
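For reference, the three output formats the converter distinguishes correspond to the following plain PySpark conversions (the example data frame itself is an assumption):

import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 'one'), (2, 'two')], schema=['id', 'name'])

as_rdd = df.rdd.map(tuple)             # 'rdd' with column names dropped
as_rows = df.collect()                 # 'list' of Row objects, column names kept
as_list = list(map(tuple, as_rows))    # 'list' with column names dropped
as_pandas = df.toPandas()              # 'pd': a pandas.DataFrame with column names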
Example #5
def create_spark_df(spark, data, schema=None, process_methods=None, **kwargs):
    """Create a Spark data frame from data in a different format.

    A Spark data frame is created with either a specified schema or a schema
    inferred from the input data.  The schema can be specified with the
    keyword argument "schema".

    Functions to transform the data frame after creation can be specified by
    the keyword argument "process_methods".  The value of this argument is
    an iterable of (function, arguments, keyword arguments) tuples to apply.

    The data frame is created with the createDataFrame function of the
    SparkSession.  Remaining keyword arguments are passed to this function.

    >>> spark = pyspark.sql.SparkSession.builder.getOrCreate()
    >>> df = create_spark_df(spark,
    ...                      [[1, 1.1, 'one'], [2, 2.2, 'two']],
    ...                      schema=['int', 'float', 'str'],
    ...                      process_methods=[('repartition', (), {'numPartitions': 6})])
    >>> df.show()
    +---+-----+---+
    |int|float|str|
    +---+-----+---+
    |  2|  2.2|two|
    |  1|  1.1|one|
    +---+-----+---+

    :param pyspark.sql.SparkSession spark: SparkSession instance
    :param data: input dataset
    :param schema: schema of created data frame
    :param iterable process_methods: methods to apply on the data frame after creation
    :returns: created data frame
    :rtype: pyspark.sql.DataFrame
    """
    # check how the data-frame schema was specified: an integer selects a row to infer from, a dict gives (name, type) pairs
    if isinstance(schema, int):
        # infer schema from a single row (prevents Spark >= 1.6.1 from checking schema of all rows)
        def get_row(data, ind):
            """Get row."""
            try:
                return data.iloc[ind].tolist()
            except AttributeError:
                pass
            try:
                row = data.first()
                if ind > 0:
                    logger.warning('Inferring data-frame schema from first row, instead of row with index {i:d}', i=ind)
                return row
            except AttributeError:
                pass
            try:
                return data[ind]
            except TypeError:
                raise TypeError('Unable to get row from data of type "{!s}" to infer schema.'.format(type(data)))

        row = get_row(data, schema)

        def to_python_type(var):
            """Get item."""
            try:
                return var.item()
            except AttributeError:
                return var

        schema = pyspark.sql.types._infer_schema(tuple(to_python_type(it) for it in row))
        try:
            for t, n in zip(schema.fields, data.columns):
                t.name = str(n)
        except AttributeError:
            pass
    elif isinstance(schema, dict):
        # create schema from dictionary of (name, data type) pairs
        schema = df_schema(schema)
    kwargs['schema'] = schema

    # check if input is a data frame
    if isinstance(data, pyspark.sql.DataFrame):
        if not kwargs['schema']:
            kwargs['schema'] = data.schema
        data = data.rdd

    # create and transform data frame
    df = spark.createDataFrame(data, **kwargs)
    if process_methods:
        df = apply_transform_funcs(df, process_methods)
    return df
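A hedged usage sketch of the integer-schema branch, assuming create_spark_df above is importable and pandas is installed: an integer schema argument is interpreted as the index of the row to infer the field types from, after which the column names are copied from the input.

import pandas as pd
import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()
pdf = pd.DataFrame({'int': [1, 2], 'float': [1.1, 2.2], 'str': ['one', 'two']})

# schema=0: infer the field types from the row at index 0 and copy the
# column names from pdf.columns onto the inferred schema
df = create_spark_df(spark, pdf, schema=0)
df.printSchema()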