def test_should_load_and_select_in_table(self):
    """Load the dummy video, then check id-only, data-only and id+data selects."""
    perform_query("""LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")

    # Projection of the id column only.
    id_batch = perform_query("SELECT id FROM MyVideo;")
    id_rows = [{"id": frame_id} for frame_id in range(NUM_FRAMES)]
    expected_ids = Batch(frames=pd.DataFrame(id_rows))
    self.assertEqual(id_batch, expected_ids)

    # Projection of the data column only.
    data_batch = perform_query("SELECT data FROM MyVideo;")
    data_rows = []
    for frame_id in range(NUM_FRAMES):
        pixels = np.ones((2, 2, 3)) * 0.1 * float(frame_id + 1) * 255
        data_rows.append({"data": np.array(pixels, dtype=np.uint8)})
    expected_data = Batch(frames=pd.DataFrame(data_rows))
    self.assertEqual(data_batch, expected_data)

    # select * is not supported
    combined_batches = [perform_query("SELECT id,data FROM MyVideo;")]
    expected_combined = list(create_dummy_batches())
    self.assertEqual(combined_batches, expected_combined)
def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
    """Two `create_dataframe()` batches added together must equal a
    batch built from `create_dataframe_same(2)`."""
    left = Batch(frames=create_dataframe())
    right = Batch(frames=create_dataframe())
    expected = Batch(frames=create_dataframe_same(2))

    combined = left + right

    self.assertEqual(expected, combined)
def test_merge_column_wise_batch_frame(self):
    """Merging an 'id' batch and a 'data' batch yields one batch with both columns."""
    id_batch = Batch(frames=pd.DataFrame([{'id': 0}]))
    data_batch = Batch(frames=pd.DataFrame([{'data': 1}]))

    merged = Batch.merge_column_wise([id_batch, data_batch])

    expected = Batch(frames=pd.DataFrame([{'id': 0, 'data': 1}]))
    self.assertEqual(merged, expected)
def test_should_return_smaller_num_rows(self):
    """Sampling every 3rd row must match manually picking every 3rd
    row of the concatenated input."""
    frames = []
    for _ in range(4):
        values = np.random.randint(0, 100, size=(100, 4))
        frames.append(pd.DataFrame(values, columns=list('ABCD')))
    batches = [Batch(frames=frame) for frame in frames]

    sample_value = 3
    sample_plan = SamplePlan(ConstantValueExpression(sample_value))
    sample_executor = SampleExecutor(sample_plan)
    sample_executor.append_child(DummyExecutor(batches))
    reduced_batches = list(sample_executor.exec())

    # Build the expectation by hand: keep rows 0, 3, 6, ... of the full input.
    full_batch = Batch.concat(batches)
    kept_indices = range(0, len(full_batch), sample_value)
    picked = full_batch._get_frames_from_indices(kept_indices)
    expected = Batch.concat([picked])

    reduced = Batch.concat(reduced_batches)
    self.assertEqual(len(expected), len(reduced))
    self.assertEqual(expected, reduced)
def test_slicing_on_batched_should_return_new_batch_frame(self):
    """A full slice equals the batch; `[:-1]` drops the last frame and outcome."""
    source = Batch(frames=create_dataframe(2),
                   outcomes={'test': [[None], [None]]})
    without_last = Batch(frames=create_dataframe(),
                         outcomes={'test': [[None]]})

    full_slice = source[:]
    self.assertEqual(source, full_slice)

    head_slice = source[:-1]
    self.assertEqual(without_last, head_slice)
def test_set_outcomes_method_should_set_temp_outcome_when_bool_is_true(self):
    """`set_outcomes(..., is_temp=True)` must match a batch constructed
    with the same `temp_outcomes`."""
    actual = Batch(frames=create_dataframe())
    actual.set_outcomes('test', [1], is_temp=True)

    expected = Batch(frames=create_dataframe(),
                     temp_outcomes={'test': [1]})

    self.assertEqual(expected, actual)
def load(self) -> Iterator[Batch]:
    """
    Generator that loads the frames of a video in batches.

    Records come from `self._load_frames()`. A record is skipped when
    `self.skip_frames` is positive and the record's identifier is not a
    multiple of it. Loading stops at the first record whose identifier
    reaches `self.limit` (when a limit is set).

    Yields:
        :obj: `Batch`: a batch of up to `self.batch_size` records; the
        final batch may be smaller.
    """
    frames = []
    for record in self._load_frames():
        record_id = record.get(self.identifier_column, 0)
        if self.skip_frames > 0 and record_id % self.skip_frames != 0:
            continue
        if self.limit and record_id >= self.limit:
            # Stop reading; frames buffered so far are flushed below.
            break
        frames.append(record)
        if len(frames) % self.batch_size == 0:
            yield Batch(pd.DataFrame(frames),
                        identifier_column=self.identifier_column)
            frames = []

    # The original used `return Batch(...)` here and at the limit check.
    # Inside a generator a returned value is discarded by iteration, so
    # the trailing partial batch was silently lost. It must be yielded.
    if frames:
        yield Batch(pd.DataFrame(frames),
                    identifier_column=self.identifier_column)
def test_should_return_the_new_path_after_execution(self, mock_class):
    """Run a storage -> seq-scan plan against a mocked loader and check
    that every other row of each batch survives the predicate."""
    loader = mock_class.return_value
    dummy_expr = type('dummy_expr', (),
                      {"evaluate": lambda x=None: [True, False, True]})

    # Build plan tree
    video = DataFrameMetadata("dataset", "dummy.avi")
    batch_1 = Batch(pd.DataFrame({'data': [1, 2, 3]}))
    batch_2 = Batch(pd.DataFrame({'data': [4, 5, 6]}))
    # One-shot iterator over the two batches, as a loader would produce.
    loader.load.return_value = iter([batch_1, batch_2])

    storage_plan = StoragePlan(video)
    seq_scan = SeqScanPlan(predicate=dummy_expr, column_ids=[])
    seq_scan.append_child(storage_plan)

    # Execute the plan
    actual = PlanExecutor(seq_scan).execute_plan()

    expected = batch_1[::2] + batch_2[::2]
    mock_class.assert_called_once()
    self.assertEqual(expected, actual)
def test_when_function_executor_with_a_child_should_allow_chaining(self):
    """The parent expression receives the child's output (input + 1)."""
    parent = FunctionExpression(lambda x: pd.DataFrame(x))
    increment = FunctionExpression(lambda x: x + 1)
    parent.append_child(increment)

    source = Batch(pd.DataFrame([1, 2, 3]))
    actual = parent.evaluate(source)

    expected = Batch(pd.DataFrame([2, 3, 4]))
    self.assertEqual(expected, actual)
def test_fetching_frames_by_index_should_also_return_temp_outcomes(self):
    """Indexing with `[[0]]` keeps the first entry of both outcomes
    and temp outcomes."""
    source = Batch(
        frames=create_dataframe_same(2),
        outcomes={'test': [[1], [2]]},
        temp_outcomes={'test2': [[3], [4]]},
    )
    expected = Batch(
        frames=create_dataframe(),
        outcomes={'test': [[1]]},
        temp_outcomes={'test2': [[3]]},
    )

    first_only = source[[0]]

    self.assertEqual(expected, first_only)
def test_should_return_sorted_frames(self):
    """
    Order three batches by A ascending, then B descending.

    data (3 batches):
        'A' 'B' 'C'
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]
    """
    input_rows = [
        [[1, 1, 1]],
        [[1, 5, 6], [4, 7, 10]],
        [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
    ]
    input_frames = [
        pd.DataFrame(np.array(rows), columns=['A', 'B', 'C'])
        for rows in input_rows
    ]
    batches = [Batch(frames=frame) for frame in input_frames]

    # query: .... ORDER BY A ASC, B DESC
    sort_keys = [
        (TupleValueExpression('A'), ParserOrderBySortType.ASC),
        (TupleValueExpression('B'), ParserOrderBySortType.DESC),
    ]
    orderby_executor = OrderByExecutor(OrderByPlan(sort_keys))
    orderby_executor.append_child(DummyExecutor(batches))
    sorted_batches = list(orderby_executor.exec())

    # Expected overall order, split back into the input batch sizes:
    #    A  B   C
    # 0  1  5   6
    # 1  1  1   1
    # 2  2  9   7
    # 3  4  7  10
    # 4  4  2   4
    # 5  4  1   2
    expected_rows = [
        [[1, 5, 6]],
        [[1, 1, 1], [2, 9, 7]],
        [[4, 7, 10], [4, 2, 4], [4, 1, 2]],
    ]
    expected_frames = [
        pd.DataFrame(np.array(rows), columns=['A', 'B', 'C'])
        for rows in expected_rows
    ]
    expected_batches = [Batch(frames=frame) for frame in expected_frames]

    for position, expected in enumerate(expected_batches):
        self.assertEqual(expected, sorted_batches[position])
def test_should_return_top_frames_after_sorting(self):
    """
    Checks that limit returns the top 2 rows of the data after sorting.

    data (3 batches):
        'A' 'B' 'C'
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]
    """
    input_rows = [
        [[1, 1, 1]],
        [[1, 5, 6], [4, 7, 10]],
        [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
    ]
    input_frames = [
        pd.DataFrame(np.array(rows), columns=['A', 'B', 'C'])
        for rows in input_rows
    ]
    batches = [Batch(frames=frame) for frame in input_frames]

    # query: .... ORDER BY A ASC, B DESC limit 2
    sort_keys = [
        (TupleValueExpression('A'), ParserOrderBySortType.ASC),
        (TupleValueExpression('B'), ParserOrderBySortType.DESC),
    ]
    orderby_executor = OrderByExecutor(OrderByPlan(sort_keys))
    orderby_executor.append_child(DummyExecutor(batches))
    sorted_batches = list(orderby_executor.exec())

    limit_value = 2
    limit_executor = LimitExecutor(
        LimitPlan(ConstantValueExpression(limit_value)))
    limit_executor.append_child(DummyExecutor(sorted_batches))
    reduced_batches = list(limit_executor.exec())

    # merge everything into one batch
    aggregated_batch = Batch.concat(reduced_batches, copy=False)

    # Expected result:
    #    A  B  C
    # 0  1  5  6
    # 1  1  1  1
    expected_frame = pd.DataFrame(np.array([[1, 5, 6], [1, 1, 1]]),
                                  columns=['A', 'B', 'C'])
    expected_batch = Batch(frames=expected_frame)

    self.assertEqual(expected_batch, aggregated_batch)
def test_should_update_the_batch_with_outcomes_in_exec_mode(self):
    """In EXEC mode, evaluating stores the function result on the
    input batch under the expression's name."""
    produced = [1, 2, 3]
    expression = FunctionExpression(
        lambda x: produced,
        mode=ExecutionMode.EXEC,
        name="test",
    )
    expected_batch = Batch(frames=pd.DataFrame(),
                           outcomes={"test": [1, 2, 3]})

    input_batch = Batch(frames=pd.DataFrame())
    expression.evaluate(input_batch)

    self.assertEqual(expected_batch, input_batch)
def test_should_load_video_in_table(self):
    """Load the dummy video and check that the stored frames, read back
    and sorted, equal the dummy batches."""
    query = """LOAD DATA INFILE 'dummy.avi' INTO MyVideo;"""
    execute_query_fetch_all(query)

    metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
    # The earlier `actual_batch = Batch(pd.DataFrame())` placeholder was
    # dead code: it was overwritten by this concat without being read.
    actual_batch = Batch.concat(StorageEngine.read(metadata), copy=False)
    actual_batch.sort()

    expected_batch = list(create_dummy_batches())
    self.assertEqual([actual_batch], expected_batch)
def test_should_update_temp_outcomes_when_is_temp_set_exec_mode(self):
    """In EXEC mode with `is_temp=True`, the function result is stored
    in the input batch's temp outcomes."""
    produced = [1, 2, 3]
    expression = FunctionExpression(
        lambda x: produced,
        mode=ExecutionMode.EXEC,
        name="test",
        is_temp=True,
    )
    expected_batch = Batch(frames=pd.DataFrame(),
                           temp_outcomes={"test": [1, 2, 3]})

    input_batch = Batch(frames=pd.DataFrame())
    expression.evaluate(input_batch)

    self.assertEqual(expected_batch, input_batch)
def test_should_load_video_in_table(self):
    """Load the dummy video, accumulate the stored batches, and compare
    the sorted result with the dummy batches."""
    perform_query("""LOAD DATA INFILE 'dummy.avi' INTO MyVideo;""")

    metadata = CatalogManager().get_dataset_metadata("", "MyVideo")
    # Start from an empty batch and fold every stored chunk into it.
    stored = Batch(pd.DataFrame())
    for chunk in StorageEngine.read(metadata):
        stored += chunk
    stored.sort()

    expected = list(create_dummy_batches())
    self.assertEqual([stored], expected)
def evaluate(self, *args, **kwargs):
    """Evaluate the first child and aggregate its frames.

    The pandas aggregation is chosen from `self.etype` (sum, count, mean,
    min or max). Returns a new `Batch` wrapping the aggregated frame, or
    `None` when `self.etype` is not one of the aggregation types below.
    """
    child_batch = self.get_child(0).evaluate(*args, **kwargs)

    # Checked in order; the first matching expression type wins.
    aggregations = (
        (ExpressionType.AGGREGATION_SUM, 'sum'),
        (ExpressionType.AGGREGATION_COUNT, 'count'),
        (ExpressionType.AGGREGATION_AVG, 'mean'),
        (ExpressionType.AGGREGATION_MIN, 'min'),
        (ExpressionType.AGGREGATION_MAX, 'max'),
    )
    for expression_type, pandas_name in aggregations:
        if self.etype == expression_type:
            return Batch(frames=child_batch.frames.agg([pandas_name]))
    return None
def evaluate(self, *args, **kwargs):
    """Evaluate both children and combine their frames arithmetically.

    Returns a `Batch` wrapping the element-wise sum, difference, product
    or quotient selected by `self.etype`, or `None` when `self.etype` is
    not one of the arithmetic types below.
    """
    left_frames = self.get_child(0).evaluate(*args, **kwargs).frames
    right_frames = self.get_child(1).evaluate(*args, **kwargs).frames

    if self.etype == ExpressionType.ARITHMETIC_ADD:
        combined = left_frames + right_frames
    elif self.etype == ExpressionType.ARITHMETIC_SUBTRACT:
        combined = left_frames - right_frames
    elif self.etype == ExpressionType.ARITHMETIC_MULTIPLY:
        combined = left_frames * right_frames
    elif self.etype == ExpressionType.ARITHMETIC_DIVIDE:
        combined = left_frames / right_frames
    else:
        return None

    return Batch(pd.DataFrame(combined))
def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
    """Adding two batches concatenates their outcomes and temp outcomes."""
    first = Batch(frames=create_dataframe(),
                  outcomes={'1': [1]},
                  temp_outcomes={'2': [1]})
    second = Batch(frames=create_dataframe(),
                   outcomes={'1': [2]},
                   temp_outcomes={'2': [2]})
    expected = Batch(frames=create_dataframe_same(2),
                     outcomes={'1': [1, 2]},
                     temp_outcomes={'2': [1, 2]})

    combined = first + second

    self.assertEqual(expected, combined)
def evaluate(self, batch: Batch):
    """Apply `self.function` to the batch, or to the first child's result.

    When the expression has children, the function receives the value
    produced by evaluating the first child on `batch`; otherwise it
    receives `batch` itself. In EXEC mode the result is also recorded on
    `batch` via `set_outcomes` under `self.name`.

    Returns:
        Whatever `self.function` returned.
    """
    if self.get_children_count() > 0:
        function_input = self.get_child(0).evaluate(batch)
    else:
        function_input = batch

    outcome = self.function(function_input)

    if self.mode == ExecutionMode.EXEC:
        batch.set_outcomes(self.name, outcome, is_temp=self.is_temp)
    return outcome
def evaluate(self, *args):
    """Evaluate a logical AND / OR (two children) or NOT (otherwise).

    Returns a `Batch` wrapping the combined frames, or `None` when
    `self.etype` does not match the operator expected for the number of
    children.
    """
    if self.get_children_count() != 2:
        # Unary case: only negation is handled.
        operand = self.get_child(0).evaluate(*args).frames
        if self.etype == ExpressionType.LOGICAL_NOT:
            return Batch(pd.DataFrame(~operand))
        return None

    lhs = self.get_child(0).evaluate(*args).frames
    rhs = self.get_child(1).evaluate(*args).frames
    if self.etype == ExpressionType.LOGICAL_AND:
        return Batch(pd.DataFrame(lhs & rhs))
    if self.etype == ExpressionType.LOGICAL_OR:
        return Batch(pd.DataFrame(lhs | rhs))
    return None
def evaluate(self, batch: Batch):
    """Run the function on the column-wise merge of the children's results.

    Every child is evaluated on `batch`. If there are any children, their
    results are merged column-wise and passed to the function; otherwise
    the function sees `batch` directly. The function output is wrapped in
    a `Batch` and, when `self._output` is set, projected to that column.
    """
    child_results = [child.evaluate(batch) for child in self.children]
    function_input = (
        Batch.merge_column_wise(child_results) if child_results else batch
    )

    func = self._gpu_enabled_function()
    outcomes = Batch(pd.DataFrame(func(function_input.frames)))

    if not self._output:
        return outcomes
    return outcomes.project([self._output])
def test_has_outcomes_returns_true_if_the_given_name_is_in_outcomes(self):
    """`has_outcome` reports both regular and temp outcomes."""
    batch = Batch(frames=create_dataframe())
    batch.set_outcomes('test_temp', [1], is_temp=True)
    batch.set_outcomes('test', [1])

    for outcome_name in ('test', 'test_temp'):
        self.assertTrue(batch.has_outcome(outcome_name))
def test_should_return_only_frames_satisfy_predicate(self):
    """Only the row whose predicate value is True (index 2) is returned,
    with its index reset."""
    batch = Batch(frames=create_dataframe(3))

    # Stub predicate: always selects only the third row.
    expression = type(
        "AbstractExpression", (),
        {"evaluate": lambda x: Batch(pd.DataFrame([False, False, True]))},
    )
    plan = type("ScanPlan", (), {"predicate": expression, "columns": None})

    predicate_executor = SequentialScanExecutor(plan)
    predicate_executor.append_child(DummyExecutor([batch]))

    matching_frames = batch[[2]].frames.reset_index(drop=True)
    expected = Batch(matching_frames)
    filtered = list(predicate_executor.exec())[0]

    self.assertEqual(expected, filtered)
def test_execute_plan_for_pp_scan_plan(self, mock_clean, mock_build):
    """`execute_plan` builds the tree once, runs it once, cleans up once
    and returns the executor's batches as a single batch."""
    # PPExecutor
    tree = MagicMock(node=PPScanPlan(None))
    tree.exec.return_value = [
        Batch(pd.DataFrame([value])) for value in (1, 2, 3)
    ]
    mock_build.return_value = tree

    actual = PlanExecutor(None).execute_plan()

    mock_build.assert_called_once_with(None)
    mock_clean.assert_called_once()
    tree.exec.assert_called_once()
    expected = Batch(pd.DataFrame([[1], [2], [3]]))
    self.assertEqual(actual, expected)
def exec(self) -> Iterator[Batch]:
    """Sort all child batches together and re-emit them in the input sizes.

    Every batch from the first child is collected and concatenated, the
    combined batch is sorted by the plan's columns and sort types, and it
    is then sliced back into batches whose sizes follow
    `self.batch_sizes`.

    Yields:
        :obj: `Batch`: consecutive slices of the sorted data, each with
        its index reset.
    """
    # NOTE(review): self.batch_sizes is appended to here and never
    # cleared in this method; presumably it starts empty per executor —
    # confirm against __init__.
    collected = []
    for child_batch in self.children[0].exec():
        self.batch_sizes.append(child_batch.batch_size)
        collected.append(child_batch)
    merged = Batch.concat(collected, copy=False)

    try:
        merged.sort_orderby(by=self.extract_column_names(),
                            sort_type=self.extract_sort_types())
    except KeyError:
        # pass for now
        pass

    # Cut the merged batch back into pieces of the recorded sizes.
    start = 0
    for size in self.batch_sizes:
        stop = start + size
        piece = merged[start:stop]
        piece.reset_index()
        start = stop
        yield piece
def test_should_return_limit_greater_than_size(self):
    """
    A limit larger than the available data must leave the total number
    of rows unchanged. This will also leave a warning.
    """
    frames = []
    for _ in range(4):
        values = np.random.randint(0, 100, size=(100, 4))
        frames.append(pd.DataFrame(values, columns=list('ABCD')))
    batches = [Batch(frames=frame) for frame in frames]

    previous_total_size = sum(batch.batch_size for batch in batches)

    limit_value = 500
    limit_executor = LimitExecutor(
        LimitPlan(ConstantValueExpression(limit_value)))
    limit_executor.append_child(DummyExecutor(batches))
    reduced_batches = list(limit_executor.exec())

    after_total_size = sum(batch.batch_size for batch in reduced_batches)

    self.assertEqual(previous_total_size, after_total_size)
def write(self, table: DataFrameMetadata, rows: Batch):
    """
    Append the rows of a batch to the table's parquet dataset.

    Does nothing when `rows` is empty.

    Arguments:
        table: table metadata object to write into
        rows : batch to be persisted in the storage.
    """
    if rows.empty():
        return

    # ToDo
    # Throw an error if the row schema doesn't match the table schema
    with materialize_dataset(self.spark_session,
                             self._spark_url(table),
                             table.schema.petastorm_schema):
        records = rows.frames
        columns = records.keys()

        # Turn each row of values into a {column: value} dict, then into
        # a Spark row that follows the table's petastorm schema.
        value_rows = self.spark_context.parallelize(records.values)
        dict_rows = value_rows.map(lambda values: dict(zip(columns, values)))
        rows_rdd = dict_rows.map(
            lambda row: dict_to_spark_row(table.schema.petastorm_schema, row))

        spark_frame = self.spark_session.createDataFrame(
            rows_rdd, table.schema.pyspark_schema)
        (spark_frame
            .coalesce(1)
            .write
            .mode('append')
            .parquet(self._spark_url(table)))
def test_select_and_where_video_in_table(self):
    """Check WHERE filtering for equality, lower-bound and range predicates."""
    # Equality predicate, both columns.
    actual_batch = execute_query_fetch_all(
        "SELECT id,data FROM MyVideo WHERE id = 5;")
    expected_batch = list(create_dummy_batches(filters=[5]))[0]
    self.assertEqual(actual_batch, expected_batch)

    # Equality predicate, data column only.
    actual_batch = execute_query_fetch_all(
        "SELECT data FROM MyVideo WHERE id = 5;")
    pixels = np.ones((2, 2, 3)) * float(5 + 1) * 25
    expected_rows = [{"data": np.array(pixels, dtype=np.uint8)}]
    expected_batch = Batch(frames=pd.DataFrame(expected_rows))
    self.assertEqual(actual_batch, expected_batch)

    # Lower-bound predicate.
    actual_batch = execute_query_fetch_all(
        "SELECT id, data FROM MyVideo WHERE id >= 2;")
    actual_batch.sort()
    remaining_ids = range(2, NUM_FRAMES)
    expected_batch = list(create_dummy_batches(filters=remaining_ids))[0]
    self.assertEqual(actual_batch, expected_batch)

    # Range predicate combined with AND.
    actual_batch = execute_query_fetch_all(
        "SELECT id, data FROM MyVideo WHERE id >= 2 AND id < 5;")
    actual_batch.sort()
    expected_batch = list(create_dummy_batches(filters=range(2, 5)))[0]
    self.assertEqual(actual_batch, expected_batch)
def test_should_load_and_sort_in_table(self):
    """ORDER BY id returns frames in ascending order; DESC returns the
    same rows reversed."""
    ascending = execute_query_fetch_all(
        "SELECT data, id FROM MyVideo ORDER BY id;")
    expected_rows = []
    for frame_id in range(NUM_FRAMES):
        pixels = np.ones((2, 2, 3)) * float(frame_id + 1) * 25
        expected_rows.append({
            'id': frame_id,
            'data': np.array(pixels, dtype=np.uint8),
        })
    expected_batch = Batch(frames=pd.DataFrame(expected_rows))
    self.assertEqual(ascending, expected_batch)

    descending = execute_query_fetch_all(
        "SELECT data, id FROM MyVideo ORDER BY id DESC;")
    # Reuse the ascending expectation, reversed in place.
    expected_batch.reverse()
    self.assertEqual(descending, expected_batch)