def execute(self): """Execute the link.""" # get process manager and data store ds = process_manager.service(DataStore) # check if data frame exists in data store if self.read_key not in ds: err_msg = 'No input data found in data store with key "{}".'.format( self.read_key) if not self.fail_missing_data: self.logger.error(err_msg.capitalize()) return StatusCode.Success raise KeyError(err_msg) # fetch data from data store data = ds[self.read_key] if not isinstance(data, pyspark.sql.DataFrame): spark = process_manager.service(SparkManager).get_session() self.logger.debug( 'Converting data of type "{type}" to a Spark data frame.', type=type(data)) data = data_conversion.create_spark_df(spark, data, schema=self.schema) # create data-frame writer with requested number of partitions/output files df_writer = data.repartition(self.num_files).write # call data-frame writer methods apply_transform_funcs(df_writer, self._write_methods) return StatusCode.Success
def execute(self): """Execute the link.""" # create data-frame reader spark = process_manager.service(SparkManager).get_session() data = spark.read # call data-frame reader methods data = apply_transform_funcs(data, self._read_methods) # store data in data store process_manager.service(DataStore)[self.store_key] = data return StatusCode.Success
def test_apply_transform_funcs(self, mock_process, mock_type):
    """Test applying transformation functions."""
    # create mock transformation functions
    funcs = ['func0', mock.Mock(name='func1')]
    trans_objs = [mock.Mock(name='transformed_obj{:d}'.format(it)) for it in range(2)]
    funcs[1].return_value = trans_objs[1]

    # create mock transformation object and its type
    obj_cls = type('obj_cls', (), {'func0': mock.Mock(name='func0'),
                                   'no_func': type('non_callable', (), {})()})
    obj_cls.func0.return_value = trans_objs[0]
    obj = mock.Mock(name='obj')
    obj.__class__ = obj_cls
    mock_type.side_effect = lambda a: obj_cls if a is obj else type(a)

    # create mock function arguments
    func_args = dict((fun, tuple(mock.Mock(name='arg{:d}_{:d}'.format(fi, ai)) for ai in range(2)))
                     for fi, fun in enumerate(funcs))
    func_kwargs = dict((fun, dict(('key{:d}'.format(ai), mock.Mock(name='kwarg{:d}_{:d}'.format(fi, ai)))
                                  for ai in range(2)))
                       for fi, fun in enumerate(funcs))

    # make sure functions and arguments are processed as expected
    mock_process.side_effect = lambda funcs, args, kwargs: [(f, args[f], kwargs[f]) for f in funcs]

    # test normal operation
    ret_obj = apply_transform_funcs(obj, funcs, func_args, func_kwargs)
    self.assertIs(ret_obj, trans_objs[-1], 'unexpected transformed object returned')
    obj_cls.func0.assert_called_once_with(obj, *func_args[funcs[0]], **func_kwargs[funcs[0]])
    funcs[1].assert_called_once_with(trans_objs[0], *func_args[funcs[1]], **func_kwargs[funcs[1]])

    # test with non-existing member function
    with self.assertRaises(AttributeError):
        apply_transform_funcs(obj, ['foo'], dict(foo=()), dict(foo={}))

    # test with non-callable member
    with self.assertRaises(TypeError):
        apply_transform_funcs(obj, ['no_func'], dict(no_func=()), dict(no_func={}))
def execute(self): """Execute SparkDfConverter""" # get process manager and data store proc_mgr = ProcessManager() ds = proc_mgr.service(DataStore) # fetch data frame from data store if self.read_key not in ds: err_msg = 'no input data found in data store with key "{}"'.format( self.read_key) if not self.fail_missing_data: self.log().error(err_msg.capitalize()) return StatusCode.Success raise KeyError(err_msg) data = ds[self.read_key] if not isinstance(data, pyspark.sql.DataFrame): raise TypeError( 'expected a Spark data frame for "{0:s}" (got "{1:s}")'.format( self.read_key, str(type(data)))) # store data-frame schema ds[self.schema_key] = data.schema # convert data frame if self.output_format == 'rdd': # convert to RDD data = data.rdd if not self.preserve_col_names: # convert rows to tuples, which removes column names data = data.map(tuple) elif self.output_format == 'list': # convert to a list data = data.collect() if not self.preserve_col_names: # convert rows to tuples, which removes column names data = list(map(tuple, data)) elif self.output_format == 'pd': # convert to Pandas data frame data = data.toPandas() # further process created dataset data = apply_transform_funcs(data, self._process_methods) # store data in data store ds[self.store_key] = data return StatusCode.Success
def create_spark_df(spark, data, schema=None, process_methods=None, **kwargs):
    """Create a Spark data frame from data in a different format.

    A Spark data frame is created with either a specified schema or a schema
    inferred from the input data.  The schema can be specified with the
    keyword argument "schema".  It can be a full data-frame schema, a list of
    column names, a dictionary of column names and data types, or the integer
    index of the row from which the schema is inferred.

    Functions to transform the data frame after creation can be specified by
    the keyword argument "process_methods".  The value of this argument is an
    iterable of (function, arguments, keyword arguments) tuples to apply.

    The data frame is created with the createDataFrame function of the
    SparkSession.  Remaining keyword arguments are passed to this function.

    >>> spark = pyspark.sql.SparkSession.builder.getOrCreate()
    >>> df = create_spark_df(spark,
    >>>                      [[1, 1.1, 'one'], [2, 2.2, 'two']],
    >>>                      schema=['int', 'float', 'str'],
    >>>                      process_methods=[('repartition', (), {'numPartitions': 6})])
    >>> df.show()
    +---+-----+---+
    |int|float|str|
    +---+-----+---+
    |  2|  2.2|two|
    |  1|  1.1|one|
    +---+-----+---+

    :param pyspark.sql.SparkSession spark: SparkSession instance
    :param data: input dataset
    :param schema: schema of created data frame
    :param iterable process_methods: methods to apply on the data frame after creation
    :returns: created data frame
    :rtype: pyspark.sql.DataFrame
    """
    # check if data-frame schema was provided
    if isinstance(schema, int):
        # infer schema from a single row (prevents Spark >= 1.6.1 from checking schema of all rows)
        def get_row(data, ind):
            """Get row."""
            try:
                # Pandas data frames and series provide "iloc"
                return data.iloc[ind].tolist()
            except AttributeError:
                pass
            try:
                # RDDs provide "first"; only the first row can be fetched cheaply
                row = data.first()
                if ind > 0:
                    logger.warning('Inferring data-frame schema from first row, instead of row with index {i:d}',
                                   i=ind)
                return row
            except AttributeError:
                pass
            try:
                # other sequences are indexed directly
                return data[ind]
            except TypeError:
                raise TypeError('Unable to get row from data of type "{!s}" to infer schema.'.format(type(data)))

        row = get_row(data, schema)

        def to_python_type(var):
            """Get item."""
            try:
                return var.item()
            except AttributeError:
                return var

        schema = pyspark.sql.types._infer_schema(tuple(to_python_type(it) for it in row))
        try:
            # set field names from the column names of the input data, if available
            for t, n in zip(schema.fields, data.columns):
                t.name = str(n)
        except AttributeError:
            pass
    elif isinstance(schema, dict):
        # create schema from dictionary of (name, data type) pairs
        schema = df_schema(schema)
    kwargs['schema'] = schema

    # check if input is a data frame
    if isinstance(data, pyspark.sql.DataFrame):
        if not kwargs['schema']:
            kwargs['schema'] = data.schema
        data = data.rdd

    # create and transform data frame
    df = spark.createDataFrame(data, **kwargs)
    if process_methods:
        df = apply_transform_funcs(df, process_methods)

    return df
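# A hedged usage sketch of create_spark_df beyond the docstring example: it also accepts a
# Pandas data frame, and an integer "schema" argument selects the row from which the schema
# is inferred.  The example data below is illustrative, not taken from the source.
import pandas as pd
import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()
pdf = pd.DataFrame({'int': [1, 2], 'float': [1.1, 2.2], 'str': ['one', 'two']})
sdf = create_spark_df(spark, pdf, schema=0)  # infer schema from the row with index 0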