import unittest

from pyspark.sql import SparkSession

# Session is the project's singleton wrapper around SparkSession;
# its exact import path depends on the repository layout.


class SparkSessionTest(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        self.session = Session()

    def tearDown(self):
        self.session = Session()
        self.session.stop()

    def test_session(self):
        spark_session = self.session.get_session()
        session2 = Session()
        # Session is a singleton, so both handles refer to the same object.
        self.assertEqual(self.session, session2)
        self.assertIsInstance(spark_session, SparkSession)
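The test asserts that constructing Session() twice yields the same object, so Session must behave as a singleton around a single SparkSession. Below is a minimal sketch of such a wrapper, consistent with the methods the test and the later code call (get_session, get_context, stop); the attribute names and builder options are assumptions, not the project's actual implementation.

from pyspark.sql import SparkSession


class Session:
    """Minimal singleton wrapper around a shared SparkSession (illustrative sketch)."""
    _instance = None

    def __new__(cls):
        # Reuse one instance so repeated Session() calls compare equal.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._session = (
                SparkSession.builder
                .appName("eva")       # assumed application name
                .master("local[*]")   # assumed local master
                .getOrCreate()
            )
        return cls._instance

    def get_session(self) -> SparkSession:
        return self._session

    def get_context(self):
        return self._session.sparkContext

    def stop(self):
        self._session.stop()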
def __init__(self, dataset_name: str, frame_metadata: FrameInfo):
    self.dataset_name = dataset_name
    self.H = frame_metadata.height
    self.W = frame_metadata.width
    self.C = frame_metadata.num_channels

    # The Unischema describes the structure of each row in the dataset.
    self.dataset_schema = Unischema(self.dataset_name, [
        UnischemaField('frame_id', np.int32, (),
                       ScalarCodec(IntegerType()), False),
        UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                       CompressedNdarrayCodec(), False),
    ])

    # Construct the output location.
    eva_dir = ConfigurationManager().get_value("core", "location")
    output_url = os.path.join(eva_dir, self.dataset_name)

    # Get the session handle.
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    # Wrap the dataset materialization step.
    rows_count = 10
    with materialize_dataset(spark, output_url, self.dataset_schema):
        rows_rdd = spark_context.parallelize(range(rows_count)) \
            .map(lambda x: row_generator(x, self.H, self.W, self.C)) \
            .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

        spark.createDataFrame(rows_rdd, self.dataset_schema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
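The materialization above relies on a row_generator helper that is not shown here; it has to return a dict whose keys match the Unischema fields (frame_id, frame_data). A minimal sketch, with random pixel data standing in for real frame contents purely for illustration:

import numpy as np


def row_generator(frame_id: int, height: int, width: int, num_channels: int) -> dict:
    """Build one row matching the Unischema fields above (illustrative sketch)."""
    return {
        'frame_id': frame_id,
        # Random pixels stand in for real frame contents in this sketch.
        'frame_data': np.random.randint(0, 255,
                                        size=(height, width, num_channels),
                                        dtype=np.uint8),
    }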
from pathlib import Path
from typing import Iterator, List

from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.predicates import in_lambda
from petastorm.unischema import dict_to_spark_row

# Project-local imports (AbstractStorageEngine, Batch, DataFrameMetadata,
# PetastormReader, Session) are omitted; their paths depend on the repository layout.


class PetastormStorageEngine(AbstractStorageEngine):

    def __init__(self):
        """
        Maintain a long-lived Spark session and context.
        """
        self._spark = Session()
        self.spark_session = self._spark.get_session()
        self.spark_context = self._spark.get_context()

    def _spark_url(self, table: DataFrameMetadata) -> str:
        """
        Generate a Spark/Petastorm URL for the given table.
        """
        return Path(table.file_url).resolve().as_uri()

    def create(self, table: DataFrameMetadata):
        """
        Create an empty dataframe in Petastorm.
        """
        empty_rdd = self.spark_context.emptyRDD()

        with materialize_dataset(self.spark_session,
                                 self._spark_url(table),
                                 table.schema.petastorm_schema):
            self.spark_session.createDataFrame(empty_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('overwrite') \
                .parquet(self._spark_url(table))

    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Write rows into the dataframe.
        Arguments:
            table: table metadata object to write into
            rows: batch to be persisted in the storage
        """
        if rows.empty():
            return
        # ToDo: throw an error if the row schema does not match the table schema.
        with materialize_dataset(self.spark_session,
                                 self._spark_url(table),
                                 table.schema.petastorm_schema):
            records = rows.frames
            columns = records.keys()
            rows_rdd = self.spark_context.parallelize(records.values) \
                .map(lambda x: dict(zip(columns, x))) \
                .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema, x))
            self.spark_session.createDataFrame(rows_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('append') \
                .parquet(self._spark_url(table))

    def read(self, table: DataFrameMetadata,
             columns: List[str] = None,
             predicate_func=None) -> Iterator[Batch]:
        """
        Read the table and return a batch iterator over the tuples that
        pass the predicate function.
        Arguments:
            table: table metadata object to read from
            columns: list of column names passed to predicate_func
            predicate_func: custom predicate function that returns a bool
        Returns:
            Iterator over the Batches read.
        """
        predicate = None
        if predicate_func and columns:
            predicate = in_lambda(columns, predicate_func)
        # ToDo: handle the sharding logic. We might have to maintain a
        # context for deciding which shard to read.
        petastorm_reader = PetastormReader(self._spark_url(table),
                                           predicate=predicate)
        for batch in petastorm_reader.read():
            yield batch

    def _open(self, table):
        pass

    def _close(self, table):
        pass

    def _read_init(self, table):
        pass
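For context, a caller can filter rows at read time by handing read() a column list and a predicate, which the engine converts into a Petastorm in_lambda predicate. In the hypothetical sketch below, table stands for an existing DataFrameMetadata whose schema has a frame_id column; both names are placeholders, not part of the snippet above.

# Hypothetical usage sketch: `table` is an existing DataFrameMetadata whose
# schema contains a 'frame_id' column; neither is defined in this snippet.
engine = PetastormStorageEngine()

for batch in engine.read(table,
                         columns=['frame_id'],
                         predicate_func=lambda frame_id: frame_id % 2 == 0):
    # Each Batch contains only the tuples whose frame_id passed the predicate.
    print(batch.frames)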