def create_sample_table(self):
    """Build a sample 'dataset' table with an integer id and a 2x2x3 ndarray column."""
    metadata = DataFrameMetadata("dataset", 'dataset')
    id_col = DataFrameColumn("id", ColumnType.INTEGER, False)
    data_col = DataFrameColumn("data", ColumnType.NDARRAY, False, [2, 2, 3])
    metadata.schema = [id_col, data_col]
    return metadata
def test_df_metadata(self):
    """Accessors should reflect the constructor args and the assigned schema."""
    columns = [
        DataFrameColumn("frame_id", ColumnType.INTEGER, False),
        DataFrameColumn("frame_label", ColumnType.INTEGER, False),
    ]
    expected_schema = DataFrameSchema('name', columns)
    df_metadata = DataFrameMetadata('name', 'eva_dataset')
    df_metadata.schema = columns
    self.assertEqual(df_metadata.name, 'name')
    self.assertEqual(df_metadata.file_url, 'eva_dataset')
    self.assertEqual(df_metadata.id, None)
    self.assertEqual(df_metadata.identifier_column, 'id')
    self.assertEqual(df_metadata.schema, expected_schema)
def test_should_return_batches_equivalent_to_number_of_frames(self):
    """Loading the dummy video should yield one batch per frame."""
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'))
    batches = list(loader.load())
    expected_frames = list(self.create_dummy_frames())
    self.assertEqual(len(batches), NUM_FRAMES)
    self.assertEqual(expected_frames, [b.frames[0] for b in batches])
def test_should_call_petastorm_make_reader_with_correct_params(self, mock):
    """PetastormLoader must forward the file url and shard settings to make_reader."""
    metadata = DataFrameMetadata("dataset_1", 'dummy.avi')
    loader = PetastormLoader(metadata, curr_shard=3, total_shards=3)
    list(loader._load_frames())
    mock.assert_called_once_with('dummy.avi', shard_count=3, cur_shard=3)
def test_should_return_the_new_path_after_execution(self, mock_class):
    """SeqScan over storage output should keep only the rows passing the predicate."""
    loader_instance = mock_class.return_value
    # Predicate stub: keeps rows 0 and 2 of every 3-row batch.
    dummy_expr = type('dummy_expr', (),
                      {"evaluate": lambda x=None: [True, False, True]})

    # Build plan tree
    video = DataFrameMetadata("dataset", "dummy.avi")
    batch_1 = Batch(pd.DataFrame({'data': [1, 2, 3]}))
    batch_2 = Batch(pd.DataFrame({'data': [4, 5, 6]}))
    loader_instance.load.return_value = map(lambda x: x, [batch_1, batch_2])
    storage_plan = StoragePlan(video)
    seq_scan = SeqScanPlan(predicate=dummy_expr, column_ids=[])
    seq_scan.append_child(storage_plan)

    # Execute the plan
    actual = PlanExecutor(seq_scan).execute_plan()
    expected = batch_1[::2] + batch_2[::2]
    mock_class.assert_called_once()
    self.assertEqual(expected, actual)
def test_should_return_batches_equivalent_to_number_of_frames(self):
    """Loading the dummy video should yield one single-row batch per frame."""
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'))
    batches = list(loader.load())
    expected = list(self.create_dummy_frames())
    self.assertEqual(len(batches), NUM_FRAMES)
    actual = [b.frames.to_dict('records')[0] for b in batches]
    self.assertTrue(custom_list_of_dicts_equal(expected, actual))
def test_should_skip_first_two_frames_with_offset_two(self):
    """With offset=2 the loader starts at frame 2, yielding NUM_FRAMES - 2 batches."""
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'), offset=2)
    expected_frames = list(
        self.create_dummy_frames(filters=list(range(2, NUM_FRAMES))))
    batches = list(loader.load())
    self.assertEqual(NUM_FRAMES - 2, len(batches))
    self.assertEqual(expected_frames, [b.frames[0] for b in batches])
def test_should_return_half_then_number_of_batches_with_skip_of_two(self):
    """With skip_frames=2 the loader yields every other frame.

    Fix: the batch count was compared against ``NUM_FRAMES / 2`` (a float),
    which is inconsistent with the integer ``NUM_FRAMES // 2`` used to build
    the expected frames below. Use integer division in both places.
    """
    video_info = DataFrameMetadata("dataset_1", 'dummy.avi')
    video_loader = VideoLoader(video_info, skip_frames=2)
    batches = list(video_loader.load())
    dummy_frames = list(
        self.create_dummy_frames(
            filters=[i * 2 for i in range(NUM_FRAMES // 2)]))
    self.assertEqual(len(batches), NUM_FRAMES // 2)
    self.assertEqual(dummy_frames, [batch.frames[0] for batch in batches])
def test_should_return_single_batch_if_batch_size_equal_to_no_of_frames(
        self):
    """batch_size == NUM_FRAMES should collapse the whole video into one batch."""
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'),
                         batch_size=NUM_FRAMES)
    expected_frames = list(
        self.create_dummy_frames(filters=list(range(NUM_FRAMES))))
    batches = list(loader.load())
    self.assertEqual(1, len(batches))
    self.assertEqual(expected_frames, list(batches[0].frames))
def test_should_return_only_few_frames_when_limit_is_specified(self):
    """limit=4 caps the number of yielded batches at four."""
    limit = 4
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'),
                         limit=limit)
    expected_frames = list(
        self.create_dummy_frames(filters=list(range(limit))))
    batches = list(loader.load())
    self.assertEqual(limit, len(batches))
    self.assertEqual(expected_frames, [b.frames[0] for b in batches])
def test_should_return_only_few_frames_when_limit_is_specified(self):
    """limit=4 caps the number of yielded single-row batches at four."""
    limit = 4
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'),
                         limit=limit)
    expected = list(self.create_dummy_frames(filters=list(range(limit))))
    batches = list(loader.load())
    self.assertEqual(limit, len(batches))
    actual = [b.frames.to_dict('records')[0] for b in batches]
    self.assertTrue(custom_list_of_dicts_equal(expected, actual))
def test_should_skip_first_two_frames_with_offset_two(self):
    """With offset=2 the loader starts at frame 2, yielding NUM_FRAMES - 2 batches."""
    loader = VideoLoader(DataFrameMetadata("dataset_1", 'dummy.avi'), offset=2)
    expected = list(
        self.create_dummy_frames(filters=list(range(2, NUM_FRAMES))))
    batches = list(loader.load())
    self.assertEqual(NUM_FRAMES - 2, len(batches))
    actual = [b.frames.to_dict('records')[0] for b in batches]
    self.assertTrue(custom_list_of_dicts_equal(expected, actual))
def test_should_return_half_then_number_of_batches_with_skip_of_two(self):
    """With skip_frames=2 the loader yields every other frame.

    Fix: the batch count was compared against ``NUM_FRAMES / 2`` (a float),
    which is inconsistent with the integer ``NUM_FRAMES // 2`` used to build
    the expected frames below. Use integer division in both places.
    """
    video_info = DataFrameMetadata("dataset_1", 'dummy.avi')
    video_loader = VideoLoader(video_info, skip_frames=2)
    batches = list(video_loader.load())
    dummy_frames = list(
        self.create_dummy_frames(
            filters=[i * 2 for i in range(NUM_FRAMES // 2)]))
    self.assertEqual(len(batches), NUM_FRAMES // 2)
    expected = [batch.frames.to_dict('records')[0] for batch in batches]
    self.assertTrue(custom_list_of_dicts_equal(dummy_frames, expected))
def test_df_metadata_equality(self):
    """Metadata equality: identical object is equal; differing name or schema is not."""
    def make_columns():
        # Fresh column objects for each metadata under test.
        return [
            DataFrameColumn("frame_id", ColumnType.INTEGER, False),
            DataFrameColumn("frame_label", ColumnType.INTEGER, False),
        ]

    df_metadata = DataFrameMetadata('name', 'eva_dataset')
    df_metadata.schema = make_columns()
    self.assertEqual(df_metadata, df_metadata)

    df_metadata1 = DataFrameMetadata('name2', 'eva_dataset')
    col_list = make_columns()
    df_metadata1.schema = col_list
    self.assertNotEqual(df_metadata, df_metadata1)

    # Same name as df_metadata1 but a truncated schema -> still unequal.
    df_metadata2 = DataFrameMetadata('name2', 'eva_dataset')
    df_metadata2.schema = col_list[1:]
    self.assertNotEqual(df_metadata1, df_metadata2)
def test_load_frame_load_frames_using_petastorm(self, mock):
    """Frames loaded via petastorm should mirror the reader's rows as dicts.

    Fix: ``dummy_values`` was a ``map()`` iterator. The loader consumed it
    while iterating the reader, so the later ``expected`` comprehension ran
    over an exhausted iterator and silently produced ``[]``, making the
    comparison meaningless. Materialize the rows as a list so both the
    reader and the expected-value computation see the same data.
    """
    dummy_values = [self.DummyRow(i, np.ones((2, 2, 3)) * i)
                    for i in range(3)]
    mock.return_value = self.DummyReader(iter(dummy_values))
    video_info = DataFrameMetadata("dataset_1", 'dummy.avi')
    video_loader = PetastormLoader(video_info,
                                   curr_shard=3,
                                   total_shards=3)
    actual = list(video_loader._load_frames())
    expected = [value._asdict() for value in dummy_values]
    self.assertTrue(custom_list_of_dicts_equal(expected, actual))
def get_table_bindings(self, database_name: str, table_name: str,
                       column_names: List[str]) -> Tuple[int, List[int]]:
    """Resolve a table name and its column names to catalog ids.

    :param database_name: currently not in use
    :param table_name: the table that is being referred to
    :param column_names: the column names of the table for which
        bindings are required
    :return: the metadata id of the table and a list of column ids
    """
    metadata_id = DataFrameMetadata.get_id_from_name(table_name)
    if column_names is None:
        return metadata_id, []
    column_ids = DataFrameColumn.get_id_from_metadata_id_and_name_in(
        metadata_id, column_names)
    return metadata_id, column_ids
def get_metadata(self, metadata_id: int,
                 col_id_list: List[int] = None) -> DataFrameMetadata:
    """Fetch the metadata object for a table by its id.

    Used by the executor; the storage engine later uses the returned
    metadata to retrieve the dataframe.

    :param metadata_id: metadata id of the table
    :param col_id_list: optional column ids; when given, the returned
        metadata's schema is restricted to those columns
    :return: the DataFrameMetadata for ``metadata_id``
    """
    metadata = DataFrameMetadata.get(metadata_id)
    if col_id_list is None:
        return metadata
    df_columns = DataFrameColumn.get_by_metadata_id_and_id_in(
        col_id_list, metadata_id)
    metadata.set_schema(
        DataFrameSchema(metadata.get_name(), df_columns))
    return metadata
def test_load_frame_load_frames_using_petastorm(self, mock):
    """Petastorm rows should be converted into Frame objects with 2x2x3 BGR info."""
    rows = (self.DummyRow(i, np.ones((2, 2, 3)) * i) for i in range(3))
    mock.return_value = self.DummyReader(rows)
    loader = PetastormLoader(DataFrameMetadata("dataset_1", 'dummy.avi'),
                             curr_shard=3,
                             total_shards=3)
    actual = list(loader._load_frames())
    expected = [Frame(i, np.ones((2, 2, 3)) * i,
                      FrameInfo(2, 2, 3, ColorSpace.BGR))
                for i in range(3)]
    self.assertEqual(expected, actual)
def test_calling_storage_executor_should_return_batches(self, mock_class):
    """DiskStorageExecutor should build the loader from the plan and stream load()."""
    loader_instance = mock_class.return_value
    video_info = DataFrameMetadata('dataset', 'dummy.avi')
    plan = StoragePlan(video_info)
    executor = DiskStorageExecutor(plan)
    loader_instance.load.return_value = range(5)
    actual = list(executor.exec())
    mock_class.assert_called_once_with(video_info,
                                       batch_size=plan.batch_size,
                                       limit=plan.limit,
                                       offset=plan.offset,
                                       skip_frames=plan.skip_frames,
                                       total_shards=0,
                                       curr_shard=0)
    loader_instance.load.assert_called_once()
    self.assertEqual(list(range(5)), actual)