Code Example #1
    def __init__(self):
        """
        Maintain a long-lived Spark session and context.
        """
        self._spark = Session()
        self.spark_session = self._spark.get_session()
        self.spark_context = self._spark.get_context()
Code Example #2
File: test_frame_loader.py Project: JeremyHua18/eva
class FrameLoaderTest(unittest.TestCase):
    def create_dummy_frames(self, num_frames=NUM_FRAMES, filters=[]):
        if not filters:
            filters = range(num_frames)
        for i in filters:
            yield Frame(
                i,
                np.array(np.ones((2, 2, 3)) * 0.1 * float(i + 1) * 255,
                         dtype=np.uint8), FrameInfo(2, 2, 3, ColorSpace.BGR))

    def setUp(self):
        suppress_py4j_logging()
        # create_dummy_frames is a generator; materialize it so the frames exist.
        self.frames = list(self.create_dummy_frames(NUM_FRAMES))

    def tearDown(self):
        self.session = Session()
        self.session.stop()

    def test_frameinfo_information(self):

        frame_info = FrameInfo(2, 2, 3, ColorSpace.BGR)
        f = FrameLoader("appname", frame_info)

        self.assertEqual(f.H, 2)

    def test_load_images(self):

        frame_info = FrameInfo(28, 28, 1, ColorSpace.GRAY)
        f = FrameLoader("mnist", frame_info)

        f.load_images()
Code Example #3
class CatalogManagerTests(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        suppress_py4j_logging()

    def tearDown(self):
        self.session = Session()
        self.session.stop()

    @mock.patch('src.catalog.catalog_manager.init_db')
    def test_catalog_manager_singleton_pattern(self, mocked_db):
        x = CatalogManager()
        y = CatalogManager()
        self.assertEqual(x, y)
Code Example #4
File: test_session.py Project: JeremyHua18/eva
    def test_session(self):

        spark_session = self.session.get_session()

        session2 = Session()
        self.assertEqual(self.session, session2)
        self.assertIsInstance(spark_session, SparkSession)
Code Example #5
class SparkSessionTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        self.session = Session()

    def tearDown(self):
        self.session = Session()
        self.session.stop()

    def test_session(self):

        spark_session = self.session.get_session()

        session2 = Session()
        self.assertEqual(self.session, session2)
        self.assertIsInstance(spark_session, SparkSession)
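The tests above also assert that two calls to Session() compare equal, i.e. Session behaves as a singleton wrapper around SparkSession. For reference, a minimal sketch of such a wrapper is given below; the builder options, app name, and attribute names are assumptions for illustration, not the project's actual implementation.

from pyspark.sql import SparkSession


class Session:
    """Hypothetical singleton wrapper around SparkSession (sketch only)."""

    _instance = None

    def __new__(cls):
        # Reuse the single existing instance so that Session() == Session().
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._session = (SparkSession.builder
                                      .appName("eva")       # assumed app name
                                      .master("local[*]")   # assumed master
                                      .getOrCreate())
        return cls._instance

    def get_session(self):
        return self._session

    def get_context(self):
        return self._session.sparkContext

    def stop(self):
        self._session.stop()
        Session._instance = None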
Code Example #6
def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()

    # Convert a list of rows to RDD
    rows_df = spark.createDataFrame(rows,
                                    df_metadata.get_dataframe_pyspark_schema())
    rows_rdd = rows_df.rdd

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.get_dataframe_file_url(),
                             df_metadata.get_dataframe_petastorm_schema()):

        spark.createDataFrame(rows_rdd,
                              df_metadata.get_dataframe_pyspark_schema()) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.get_dataframe_file_url())
Code Example #7
File: frame_loader.py Project: swati21/eva
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # The schema defines what the dataset looks like
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap dataset materialization portion.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
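row_generator is referenced above but not defined in this snippet. A plausible sketch, assuming it simply builds a dict whose keys match the frame_id and frame_data fields of the Unischema (the dummy pixel values are an assumption):

import numpy as np


def row_generator(frame_id, height, width, num_channels):
    """Hypothetical helper: build one row dict matching the Unischema fields."""
    return {
        'frame_id': frame_id,
        # Dummy pixel data with the declared shape and dtype (np.uint8).
        'frame_data': np.random.randint(0, 255,
                                        size=(height, width, num_channels),
                                        dtype=np.uint8),
    }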
Code Example #8
def create_dataframe(df_metadata: DataFrameMetadata):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Create an empty RDD
    empty_rdd = spark_context.emptyRDD()
    print("url", df_metadata.file_url)
    # Use petastorm to create dataframe
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):

        spark.createDataFrame(empty_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(df_metadata.file_url)
Code Example #9
def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(
            rows).map(lambda x: dict_to_spark_row(
                df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
Code Example #10
class PetastormStorageEngine(AbstractStorageEngine):
    def __init__(self):
        """
        Maintain a long-lived Spark session and context.
        """
        self._spark = Session()
        self.spark_session = self._spark.get_session()
        self.spark_context = self._spark.get_context()

    def _spark_url(self, table: DataFrameMetadata) -> str:
        """
        Generate a spark/petastorm url given a table
        """
        return Path(table.file_url).resolve().as_uri()

    def create(self, table: DataFrameMetadata):
        """
        Create an empty dataframe in petastorm.
        """
        empty_rdd = self.spark_context.emptyRDD()

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            self.spark_session.createDataFrame(empty_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('overwrite') \
                .parquet(self._spark_url(table))

    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Write rows into the dataframe.

        Arguments:
            table: table metadata object to write into
            rows: batch to be persisted in storage.
        """

        if rows.empty():
            return
        # ToDo
        # Throw an error if the row schema doesn't match the table schema

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            records = rows.frames
            columns = records.keys()
            rows_rdd = self.spark_context.parallelize(records.values) \
                .map(lambda x: dict(zip(columns, x))) \
                .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema,
                                                 x))
            self.spark_session.createDataFrame(rows_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('append') \
                .parquet(self._spark_url(table))

    def read(self,
             table: DataFrameMetadata,
             columns: List[str] = None,
             predicate_func=None) -> Iterator[Batch]:
        """
        Read the table and return a batch iterator for the
        tuples that pass the predicate function.

        Arguments:
            table: table metadata object to read from
            columns List[str]: a list of column names to be
                considered in predicate_func
            predicate_func: custom predicate function that returns bool

        Returns:
            Iterator of Batch objects read.
        """
        predicate = None
        if predicate_func and columns:
            predicate = in_lambda(columns, predicate_func)

        # ToDo: Handle the sharding logic. We might have to maintain a
        # context for deciding which shard to read
        petastorm_reader = PetastormReader(self._spark_url(table),
                                           predicate=predicate)
        for batch in petastorm_reader.read():
            yield batch

    def _open(self, table):
        pass

    def _close(self, table):
        pass

    def _read_init(self, table):
        pass
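Putting the engine together, a typical call sequence might look like the sketch below. How table (a DataFrameMetadata) and rows (a Batch) are constructed is project-specific and omitted here; process() is a placeholder for downstream handling.

# Sketch only: `table` and `rows` are assumed to be a DataFrameMetadata and a
# Batch produced by the catalog/loader layers.
engine = PetastormStorageEngine()

engine.create(table)        # materialize an empty parquet dataset
engine.write(table, rows)   # append a batch of rows

for batch in engine.read(table):
    process(batch)          # placeholder for whatever consumes the batches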
Code Example #11
def load_dataframe(dataframe_url: str):

    spark = Session().get_session()
    dataframe = spark.read.load(dataframe_url)

    return dataframe
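A minimal usage sketch, assuming dataframe_url points at a parquet dataset written by one of the routines above (the URL here is hypothetical):

# Load the persisted dataframe and inspect a few rows.
df = load_dataframe("file:///tmp/eva/mnist")   # hypothetical location
df.printSchema()
df.show(5)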
Code Example #12
    def tearDown(self):
        self.session = Session()
        self.session.stop()
Code Example #13
File: test_session.py Project: JeremyHua18/eva
    def setUp(self):
        suppress_py4j_logging()
        self.session = Session()
Code Example #14
    def setUp(self):
        self.session = Session()