Example #1
import unittest

from pyspark.sql import SparkSession

from src.spark.session import Session  # project-specific; import path assumed


class SparkSessionTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        self.session = Session()

    def tearDown(self):
        # Session() is a singleton, so this is the same instance created
        # in setUp(); stop it to release the underlying SparkSession.
        self.session = Session()
        self.session.stop()

    def test_session(self):
        spark_session = self.session.get_session()

        # Constructing Session again returns the same singleton instance,
        # and get_session() exposes the underlying SparkSession.
        session2 = Session()
        self.assertEqual(self.session, session2)
        self.assertIsInstance(spark_session, SparkSession)
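
Outside the test, the singleton is obtained the same way. A minimal usage sketch, assuming the project-specific import path used above; get_context() appears in Examples #2 and #3:

# Minimal usage sketch of the Session singleton (import path assumed).
from src.spark.session import Session

session = Session()              # first call creates the SparkSession
spark = session.get_session()    # underlying pyspark.sql.SparkSession
sc = session.get_context()       # underlying SparkContext

assert Session() == session      # later calls return the same instance
session.stop()                   # shut down Spark when done
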
Example #2
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # The Unischema describes one row of the dataset: a frame id plus
        # the raw frame pixels of shape (height, width, channels).
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap the write in materialize_dataset(), which also records the
        # Petastorm metadata needed to read the Parquet output back later.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
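
The snippet above calls a row_generator helper that is not shown. A minimal sketch of what it might look like, assuming each row is a dict keyed by the UnischemaField names and filled with synthetic pixel data:

import numpy as np

def row_generator(frame_id, height, width, num_channels):
    # One dict per row; the keys must match the UnischemaField names
    # ('frame_id', 'frame_data') so dict_to_spark_row can encode it.
    return {
        'frame_id': frame_id,
        'frame_data': np.random.randint(
            0, 255, size=(height, width, num_channels), dtype=np.uint8),
    }
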
Example #3
from pathlib import Path
from typing import Iterator, List

from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.predicates import in_lambda
from petastorm.unischema import dict_to_spark_row

# Project-specific imports (module paths omitted here): Session,
# AbstractStorageEngine, DataFrameMetadata, Batch, PetastormReader.


class PetastormStorageEngine(AbstractStorageEngine):
    def __init__(self):
        """
        Maintain a long-lived Spark session and context.
        """
        self._spark = Session()
        self.spark_session = self._spark.get_session()
        self.spark_context = self._spark.get_context()

    def _spark_url(self, table: DataFrameMetadata) -> str:
        """
        Generate a Spark/Petastorm URL for the given table.
        """
        return Path(table.file_url).resolve().as_uri()

    def create(self, table: DataFrameMetadata):
        """
        Create an empty dataframe in petastorm.
        """
        empty_rdd = self.spark_context.emptyRDD()

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            self.spark_session.createDataFrame(empty_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('overwrite') \
                .parquet(self._spark_url(table))

    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Write rows into the dataframe.

        Arguments:
            table: table metadata object to write into
            rows: batch of rows to be persisted in storage.
        """

        if rows.empty():
            return
        # TODO: throw an error if the row schema doesn't match the table schema.

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            # rows.frames behaves like a pandas DataFrame: keys() yields the
            # column names and .values yields one array of values per row.
            records = rows.frames
            columns = records.keys()
            rows_rdd = self.spark_context.parallelize(records.values) \
                .map(lambda x: dict(zip(columns, x))) \
                .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema,
                                                 x))
            self.spark_session.createDataFrame(rows_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('append') \
                .parquet(self._spark_url(table))

    def read(self,
             table: DataFrameMetadata,
             columns: List[str] = None,
             predicate_func=None) -> Iterator[Batch]:
        """
        Reads the table and return a batch iterator for the
        tuples that passes the predicate func.

        Argument:
            table: table metadata object to write into
            columns List[str]: A list of column names to be
                considered in predicate_func
            predicate_func: customized predicate function returns bool

        Return:
            Iterator of Batch read.
        """
        predicate = None
        if predicate_func and columns:
            predicate = in_lambda(columns, predicate_func)

        # TODO: Handle the sharding logic. We might have to maintain a
        # context for deciding which shard to read.
        petastorm_reader = PetastormReader(self._spark_url(table),
                                           predicate=predicate)
        for batch in petastorm_reader.read():
            yield batch

    def _open(self, table):
        pass

    def _close(self, table):
        pass

    def _read_init(self, table):
        pass
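
Once create() and write() have materialized a table, the output can also be read back directly with Petastorm's own make_reader (the project's PetastormReader presumably wraps similar machinery). A sketch, assuming the frame schema from Example #2 and a hypothetical dataset URL; in the engine above the URL comes from _spark_url(table):

from petastorm import make_reader

# Hypothetical URL standing in for _spark_url(table).
dataset_url = 'file:///tmp/eva_datasets/my_dataset'

with make_reader(dataset_url) as reader:
    for row in reader:
        # Each row is a namedtuple with the Unischema fields as attributes.
        print(row.frame_id, row.frame_data.shape)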