def test_hash_join_with_multiple_tables(self):
    """Check a three-way join with a WHERE filter against pandas.

    The expected frame is built by inner-merging ``self.table1`` with
    ``self.table2`` on ``a0``, merging that result with ``self.table3``
    on ``a1``, and keeping only the rows whose ``table1.a2`` exceeds 50.
    """
    select_query = """SELECT * FROM table1 JOIN table2
                      ON table1.a0 = table2.a0 JOIN table3
                      ON table3.a1 = table1.a1 WHERE table1.a2 > 50;"""
    actual_batch = execute_query_fetch_all(select_query)

    tmp = pd.merge(
        self.table1,
        self.table2,
        left_on=["table1.a0"],
        right_on=["table2.a0"],
        how="inner",
    )
    expected = pd.merge(
        tmp,
        self.table3,
        left_on=["table1.a1"],
        right_on=["table3.a1"],
        how="inner",
    )
    # Boolean indexing actually removes the rejected rows.  DataFrame.where
    # keeps the original shape and overwrites rejected rows with NaN, which
    # is not what the SQL WHERE clause produces.
    expected = expected[expected["table1.a2"] > 50].reset_index(drop=True)

    if len(expected):
        expected_batch = Batch(expected)
        # NOTE(review): sort_orderby is assumed to sort in place (its return
        # value is ignored where the order-by executor calls it) -- confirm.
        # Comparing its return values would then compare None with None and
        # always pass, so sort first and compare the batches themselves.
        # NOTE(review): ties on table1.a0 may be ordered differently on each
        # side; add tie-breaking sort columns if this proves flaky.
        expected_batch.sort_orderby(["table1.a0"])
        actual_batch.sort_orderby(["table1.a0"])
        self.assertEqual(expected_batch, actual_batch)
def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
    """``batch + batch`` equals the batch built from create_dataframe_same(2)."""
    left = Batch(frames=create_dataframe())
    right = Batch(frames=create_dataframe())
    expected = Batch(frames=create_dataframe_same(2))

    combined = left + right

    self.assertEqual(expected, combined)
def test_merge_column_wise_batch_frame(self):
    """Merging an ``id`` batch and a ``data`` batch yields both columns."""
    id_batch = Batch(frames=pd.DataFrame([{'id': 0}]))
    data_batch = Batch(frames=pd.DataFrame([{'data': 1}]))

    merged = Batch.merge_column_wise([id_batch, data_batch])

    expected = Batch(frames=pd.DataFrame([{'id': 0, 'data': 1}]))
    self.assertEqual(merged, expected)
def test_should_return_smaller_num_rows(self):
    """Sampling every 3rd row matches picking indices 0, 3, 6, ... by hand."""
    batches = []
    for _ in range(4):
        frame = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                             columns=list('ABCD'))
        batches.append(Batch(frames=frame))

    sample_value = 3
    sample_executor = SampleExecutor(
        SamplePlan(ConstantValueExpression(sample_value)))
    sample_executor.append_child(DummyExecutor(batches))
    reduced_batches = list(sample_executor.exec())

    # Build the reference result directly from the concatenated input.
    original = Batch.concat(batches)
    sampled_indices = range(0, len(original), sample_value)
    original = original._get_frames_from_indices(sampled_indices)
    original = Batch.concat([original])

    reduced = Batch.concat(reduced_batches)
    self.assertEqual(len(original), len(reduced))
    self.assertEqual(original, reduced)
def test_should_load_and_select_using_udf_video(self):
    """UDF predicates on MyVideo work with ``=``, ``<@`` and ``@>``."""
    # Equality test
    select_query = "SELECT id,DummyObjectDetector(data) FROM MyVideo \
        WHERE DummyObjectDetector(data).label = ['person'] ORDER BY id;"
    actual_batch = execute_query_fetch_all(select_query)
    person_rows = []
    for i in range(NUM_FRAMES // 2):
        person_rows.append({
            'myvideo.id': i * 2,
            'dummyobjectdetector.label': ['person']
        })
    expected_batch = Batch(frames=pd.DataFrame(person_rows))
    self.assertEqual(actual_batch, expected_batch)

    # Contain test
    select_query = "SELECT id, DummyObjectDetector(data) FROM MyVideo \
        WHERE DummyObjectDetector(data).label <@ ['person'] ORDER BY id;"
    actual_batch = execute_query_fetch_all(select_query)
    # Same expectation as the equality query above.
    self.assertEqual(actual_batch, expected_batch)

    select_query = "SELECT id FROM MyVideo WHERE \
        DummyMultiObjectDetector(data).labels @> ['person'] ORDER BY id;"
    actual_batch = execute_query_fetch_all(select_query)
    every_third = [{'myvideo.id': i} for i in range(0, NUM_FRAMES, 3)]
    expected_batch = Batch(frames=pd.DataFrame(every_third))
    self.assertEqual(actual_batch, expected_batch)
def test_should_return_sorted_frames(self):
    """ORDER BY A ASC, B DESC sorts across batches and keeps batch sizes.

    Input data (3 batches), columns 'A' 'B' 'C':
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]
    """
    columns = ['A', 'B', 'C']
    input_rows = [
        [[1, 1, 1]],
        [[1, 5, 6], [4, 7, 10]],
        [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
    ]
    batches = [
        Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
        for rows in input_rows
    ]

    # query: .... ORDER BY A ASC, B DESC
    plan = OrderByPlan([
        (TupleValueExpression(col_alias='A'), ParserOrderBySortType.ASC),
        (TupleValueExpression(col_alias='B'), ParserOrderBySortType.DESC)
    ])
    orderby_executor = OrderByExecutor(plan)
    orderby_executor.append_child(DummyExecutor(batches))
    sorted_batches = list(orderby_executor.exec())

    # Fully sorted rows, re-chunked into batches of size 1, 2 and 3:
    #      A  B   C
    #   0  1  5   6
    #   1  1  1   1
    #   2  2  9   7
    #   3  4  7  10
    #   4  4  2   4
    #   5  4  1   2
    expected_rows = [
        [[1, 5, 6]],
        [[1, 1, 1], [2, 9, 7]],
        [[4, 7, 10], [4, 2, 4], [4, 1, 2]],
    ]
    expected_batches = [
        Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
        for rows in expected_rows
    ]

    self.assertEqual(expected_batches[0], sorted_batches[0])
    self.assertEqual(expected_batches[1], sorted_batches[1])
    self.assertEqual(expected_batches[2], sorted_batches[2])
def test_should_return_top_frames_after_sorting(self):
    """LIMIT 2 on top of ORDER BY A ASC, B DESC returns the first two rows.

    Input data (3 batches), columns 'A' 'B' 'C':
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]
    """
    columns = ['A', 'B', 'C']
    input_rows = [
        [[1, 1, 1]],
        [[1, 5, 6], [4, 7, 10]],
        [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
    ]
    batches = [
        Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
        for rows in input_rows
    ]

    # query: .... ORDER BY A ASC, B DESC limit 2
    plan = OrderByPlan([
        (TupleValueExpression(col_alias='A'), ParserOrderBySortType.ASC),
        (TupleValueExpression(col_alias='B'), ParserOrderBySortType.DESC)
    ])
    orderby_executor = OrderByExecutor(plan)
    orderby_executor.append_child(DummyExecutor(batches))
    sorted_batches = list(orderby_executor.exec())

    limit_value = 2
    limit_executor = LimitExecutor(
        LimitPlan(ConstantValueExpression(limit_value)))
    limit_executor.append_child(DummyExecutor(sorted_batches))
    reduced_batches = list(limit_executor.exec())

    # merge everything into one batch
    aggregated_batch = Batch.concat(reduced_batches, copy=False)

    # Expected top rows:
    #      A  B  C
    #   0  1  5  6
    #   1  1  1  1
    expected_frame = pd.DataFrame(np.array([[1, 5, 6], [1, 1, 1]]),
                                  columns=columns)
    expected_batch = Batch(frames=expected_frame)

    self.assertEqual(expected_batch, aggregated_batch)
def test_simple_function_scan(self):
    """A function scan applies ``x + 1`` to the lateral input batch."""
    expression = FunctionExpression(lambda x: x + 1,
                                    name='test',
                                    alias='test')
    expression.output_col_aliases = ['test.a']
    # Minimal stand-in plan: only ``func_expr`` is supplied.
    plan = type("FunctionScanPlan", (), {"func_expr": expression})
    function_scan_executor = FunctionScanExecutor(plan)

    values = Batch(pd.DataFrame([1, 2, 3], columns=['a']))
    produced = list(function_scan_executor.exec(lateral_input=values))
    actual = produced[0]

    expected = Batch(pd.DataFrame([2, 3, 4], columns=['test.a']))
    self.assertEqual(expected, actual)
def evaluate(self, *args, **kwargs):
    """Evaluate the child and aggregate its frames per this node's type.

    The first child is evaluated with the given arguments, then the
    pandas aggregation matching ``self.etype`` (sum, count, mean, min or
    max) is applied to its frames and wrapped in a new ``Batch``.

    Returns:
        The aggregated ``Batch``, or ``None`` when ``self.etype`` is not
        one of the supported aggregation types.
    """
    batch = self.get_child(0).evaluate(*args, **kwargs)

    # Checked in order with ``==`` so matching is the same as an
    # if/elif chain.
    aggregations = (
        (ExpressionType.AGGREGATION_SUM, 'sum'),
        (ExpressionType.AGGREGATION_COUNT, 'count'),
        (ExpressionType.AGGREGATION_AVG, 'mean'),
        (ExpressionType.AGGREGATION_MIN, 'min'),
        (ExpressionType.AGGREGATION_MAX, 'max'),
    )
    for expression_type, pandas_name in aggregations:
        if self.etype == expression_type:
            return Batch(frames=batch.frames.agg([pandas_name]))
    return None
def evaluate(self, *args, **kwargs):
    """Evaluate both children and combine their frames arithmetically.

    Returns:
        A ``Batch`` wrapping the element-wise sum, difference, product or
        quotient of the two children's frames, depending on
        ``self.etype``; ``None`` for any other expression type.
    """
    left = self.get_child(0).evaluate(*args, **kwargs).frames
    right = self.get_child(1).evaluate(*args, **kwargs).frames

    if self.etype == ExpressionType.ARITHMETIC_ADD:
        combined = left + right
    elif self.etype == ExpressionType.ARITHMETIC_SUBTRACT:
        combined = left - right
    elif self.etype == ExpressionType.ARITHMETIC_MULTIPLY:
        combined = left * right
    elif self.etype == ExpressionType.ARITHMETIC_DIVIDE:
        combined = left / right
    else:
        # Unsupported operator: nothing is computed.
        return None
    return Batch(pd.DataFrame(combined))
def test_should_load_and_select_real_video_in_table(self):
    """Frames selected from UADETRAC match what OpenCVReader reads."""
    query = """LOAD DATA INFILE 'data/ua_detrac/ua_detrac.mp4'
               INTO UADETRAC;"""
    execute_query_fetch_all(query)

    actual_batch = execute_query_fetch_all("SELECT * FROM UADETRAC;")
    actual_batch.sort()

    # Build the expectation by reading the same file directly.
    video_reader = OpenCVReader("data/ua_detrac/ua_detrac.mp4",
                                batch_mem_size=30000000)
    expected_batch = Batch(frames=pd.DataFrame())
    for frame_batch in video_reader.read():
        expected_batch += frame_batch
    expected_batch.modify_column_alias("uadetrac")

    self.assertEqual(actual_batch, expected_batch)
def test_short_circuiting_or_complete(self):
    """OR skips the right-hand side entirely when the left is all True."""
    left_column = TupleValueExpression(col_name=0)
    left_column.col_alias = 0
    right_column = TupleValueExpression(col_name=1)
    right_column.col_alias = 1

    comp_exp_l = ComparisonExpression(
        ExpressionType.COMPARE_EQUAL,
        left_column,
        right_column
    )
    comp_exp_r = Mock(spec=ComparisonExpression)
    logical_exp = LogicalExpression(
        ExpressionType.LOGICAL_OR,
        comp_exp_l,
        comp_exp_r
    )

    # Both columns are identical, so the left comparison is True everywhere.
    tuples = Batch(pd.DataFrame({0: [1, 2, 3], 1: [1, 2, 3]}))
    result = logical_exp.evaluate(tuples).frames[0].tolist()

    self.assertEqual([True, True, True], result)
    comp_exp_r.evaluate.assert_not_called()
def test_short_circuiting_or_partial(self):
    """OR evaluates the right-hand side only for rows where left is False."""
    left_column = TupleValueExpression(col_name=0)
    left_column.col_alias = 0
    right_column = TupleValueExpression(col_name=1)
    right_column.col_alias = 1

    comp_exp_l = ComparisonExpression(
        ExpressionType.COMPARE_EQUAL,
        left_column,
        right_column
    )
    # The mocked right side answers for the two rows the left rejects.
    right_result = Mock(frames=[[True], [False]])
    comp_exp_r = Mock(spec=ComparisonExpression)
    comp_exp_r.evaluate = Mock(return_value=right_result)
    logical_exp = LogicalExpression(
        ExpressionType.LOGICAL_OR,
        comp_exp_l,
        comp_exp_r
    )

    # Rows 0 and 1 differ between the columns; rows 2 and 3 match.
    tuples = Batch(pd.DataFrame({0: [1, 2, 3, 4], 1: [5, 6, 3, 4]}))
    result = logical_exp.evaluate(tuples).frames[0].tolist()

    self.assertEqual([True, False, True, True], result)
    comp_exp_r.evaluate.assert_called_once_with(tuples, mask=[0, 1])
def test_array_count(self):
    """Array_Count filters frames by how often a label occurs."""
    # Two 'person' labels from the multi-object detector: every 3rd frame.
    select_query = """SELECT id FROM MyVideo WHERE
        Array_Count(DummyMultiObjectDetector(data).labels, 'person') = 2
        ORDER BY id;"""
    actual_batch = execute_query_fetch_all(select_query)
    rows = [{'myvideo.id': i} for i in range(0, NUM_FRAMES, 3)]
    self.assertEqual(actual_batch, Batch(frames=pd.DataFrame(rows)))

    # One 'bicycle' label from the single-object detector: odd frames.
    select_query = """SELECT id FROM MyVideo
        WHERE Array_Count(DummyObjectDetector(data).label, 'bicycle') = 1
        ORDER BY id;"""
    actual_batch = execute_query_fetch_all(select_query)
    rows = [{'myvideo.id': i} for i in range(1, NUM_FRAMES, 2)]
    self.assertEqual(actual_batch, Batch(frames=pd.DataFrame(rows)))
def test_should_call_opencv_reader_and_storage_engine(self, create_mock):
    """Loading an existing video calls create once and reports its path."""
    file_path = 'video'
    table_metainfo = 'info'
    batch_mem_size = 3000
    file_options = {'file_format': FileFormatType.VIDEO}
    # Minimal stand-in plan exposing the attributes set below.
    plan_attributes = {
        'table_metainfo': table_metainfo,
        'file_path': file_path,
        'batch_mem_size': batch_mem_size,
        'file_options': file_options
    }
    plan = type("LoadDataPlan", (), plan_attributes)
    load_executor = LoadDataExecutor(plan)

    # Pretend the file exists at the given path.
    with patch.object(Path, 'exists') as mock_exists:
        mock_exists.return_value = True
        batch = next(load_executor.exec())

        create_mock.assert_called_once_with(table_metainfo, file_path)
        expected = Batch(
            pd.DataFrame([{
                'Video successfully added at location: ': file_path
            }]))
        self.assertEqual(batch, expected)
def exec(self) -> Iterator[Batch]:
    """Yield the child's rows in sorted order, chunked like the input.

    All batches produced by the first child are concatenated into one
    batch, sorted in place with the columns and sort types extracted from
    the plan, and then sliced back into batches whose sizes match the
    input batches, in the same order.

    Yields:
        Batch: consecutive slices of the sorted data, each re-indexed.
    """
    child_executor = self.children[0]

    # Collect sizes in a fresh local list so a second call to exec() does
    # not replay chunk boundaries recorded by an earlier run.
    batch_sizes = []
    aggregated_batch_list = []
    for batch in child_executor.exec():
        batch_sizes.append(batch.batch_size)
        aggregated_batch_list.append(batch)
    # Keep the attribute in sync for any code that reads it afterwards.
    self.batch_sizes = batch_sizes

    # aggregates the batches into one large batch
    aggregated_batch = Batch.concat(aggregated_batch_list, copy=False)

    # sorts the batch
    try:
        aggregated_batch.sort_orderby(by=self.extract_column_names(),
                                      sort_type=self.extract_sort_types())
    except KeyError:
        # Deliberately best-effort for now: if a sort column is missing
        # the data is passed through in its original order.
        pass

    # split the aggregated batch into smaller ones based on the sizes of
    # the input batches
    index = 0
    for size in batch_sizes:
        batch = aggregated_batch[index:index + size]
        batch.reset_index()
        index += size
        yield batch
def test_should_load_and_sort_in_table(self):
    """ORDER BY id returns frames ascending, and reversed with DESC."""
    expected_rows = []
    for i in range(NUM_FRAMES):
        frame = np.array(np.ones((2, 2, 3)) * float(i + 1) * 25,
                         dtype=np.uint8)
        expected_rows.append({"myvideo.id": i, "myvideo.data": frame})
    expected_batch = Batch(frames=pd.DataFrame(expected_rows))

    select_query = "SELECT data, id FROM MyVideo ORDER BY id;"
    actual_batch = execute_query_fetch_all(select_query)
    self.assertEqual(actual_batch, expected_batch)

    select_query = "SELECT data, id FROM MyVideo ORDER BY id DESC;"
    actual_batch = execute_query_fetch_all(select_query)
    # Reuse the ascending expectation, flipped.
    expected_batch.reverse()
    self.assertEqual(actual_batch, expected_batch)
def test_should_search_in_upload_directory(self, create_mock):
    """A video missing at the given path is looked up under the prefix."""
    path_prefix = ConfigurationManager().get_value('storage', 'path_prefix')
    self.upload_path = Path(path_prefix)
    file_path = 'video'
    table_metainfo = 'info'
    batch_mem_size = 3000
    file_options = {'file_format': FileFormatType.VIDEO}
    # Minimal stand-in plan exposing the attributes set below.
    plan_attributes = {
        'table_metainfo': table_metainfo,
        'file_path': file_path,
        'batch_mem_size': batch_mem_size,
        'file_options': file_options
    }
    plan = type("LoadDataPlan", (), plan_attributes)
    load_executor = LoadDataExecutor(plan)

    with patch.object(Path, 'exists') as mock_exists:
        # First existence check fails, the second one succeeds.
        mock_exists.side_effect = [False, True]
        batch = next(load_executor.exec())

        create_mock.assert_called_once_with(table_metainfo,
                                            self.upload_path / file_path)
        expected = Batch(
            pd.DataFrame([{
                'Video successfully added at location: ': file_path
            }]))
        self.assertEqual(batch, expected)
def exec(self):
    """Load a CSV file into storage and yield a one-row summary batch.

    The file at ``self.node.file_path`` (resolved against
    ``self.path_prefix``) is read in batches by ``CSVReader``; every batch
    is written through ``StorageEngine`` to the table described by
    ``self.node.table_metainfo``.

    Yields:
        Batch: a single row with the CSV path and the number of loaded
        frames.
    """
    csv_path = os.path.join(self.path_prefix, self.node.file_path)
    csv_reader = CSVReader(csv_path,
                           column_list=self.node.column_list,
                           batch_mem_size=self.node.batch_mem_size)

    # Persist batch by batch, counting rows as we go.
    num_loaded_frames = 0
    for batch in csv_reader.read():
        StorageEngine.write(self.node.table_metainfo, batch)
        num_loaded_frames += len(batch)

    summary = pd.DataFrame(
        {
            'CSV': str(self.node.file_path),
            'Number of loaded frames': num_loaded_frames
        },
        index=[0])
    yield Batch(summary)
def test_select_and_where_video_in_table(self):
    """WHERE on id supports equality, lower bounds and AND-ed ranges."""
    # Single row, both columns.
    select_query = "SELECT id,data FROM MyVideo WHERE id = 5;"
    actual_batch = execute_query_fetch_all(select_query)
    expected_batch = list(create_dummy_batches(filters=[5]))[0]
    self.assertEqual(actual_batch, expected_batch)

    # Single row, data column only.
    select_query = "SELECT data FROM MyVideo WHERE id = 5;"
    actual_batch = execute_query_fetch_all(select_query)
    frame_five = np.array(np.ones((2, 2, 3)) * float(5 + 1) * 25,
                          dtype=np.uint8)
    expected_batch = Batch(
        frames=pd.DataFrame([{"myvideo.data": frame_five}]))
    self.assertEqual(actual_batch, expected_batch)

    # Open-ended range.
    select_query = "SELECT id, data FROM MyVideo WHERE id >= 2;"
    actual_batch = execute_query_fetch_all(select_query)
    actual_batch.sort()
    expected_batch = list(
        create_dummy_batches(filters=range(2, NUM_FRAMES)))[0]
    self.assertEqual(actual_batch, expected_batch)

    # Closed range via AND.
    select_query = "SELECT id, data FROM MyVideo WHERE id >= 2 AND id < 5;"
    actual_batch = execute_query_fetch_all(select_query)
    actual_batch.sort()
    expected_batch = list(create_dummy_batches(filters=range(2, 5)))[0]
    self.assertEqual(actual_batch, expected_batch)
def test_should_mat_view_to_the_same_table(self):
    """Creating ``dummy_view2`` twice leaves only the first five rows.

    The test issues two ``CREATE MATERIALIZED VIEW IF NOT EXISTS``
    statements for the same view (``id < 5`` and then ``id >= 5``) and
    expects the view to contain ids 0..4 only.
    """
    materialized_query = """CREATE MATERIALIZED VIEW IF NOT EXISTS
        dummy_view2 (id, label)
        AS SELECT id, DummyObjectDetector(data).label FROM MyVideo
        WHERE id < 5;
    """
    execute_query_fetch_all(materialized_query)

    materialized_query = """CREATE MATERIALIZED VIEW IF NOT EXISTS
        dummy_view2 (id, label)
        AS SELECT id, DummyObjectDetector(data).label FROM MyVideo
        WHERE id >= 5;
    """
    execute_query_fetch_all(materialized_query)

    actual_batch = execute_query_fetch_all(
        'SELECT id, label FROM dummy_view2;')
    actual_batch.sort()

    labels = DummyObjectDetector().labels
    expected_rows = []
    for i in range(5):
        expected_rows.append({
            'dummy_view2.id': i,
            'dummy_view2.label': labels[1 + i % 2]
        })
    expected_batch = Batch(frames=pd.DataFrame(expected_rows))
    self.assertEqual(actual_batch, expected_batch)
def test_should_return_limit_greater_than_size(self):
    """A limit larger than the data keeps the total row count unchanged.

    Four batches of 100 rows are pushed through a LIMIT 500; the summed
    batch sizes before and after must match.
    """
    batches = []
    for _ in range(4):
        frame = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                             columns=list('ABCD'))
        batches.append(Batch(frames=frame))
    previous_total_size = sum(batch.batch_size for batch in batches)

    limit_value = 500
    limit_executor = LimitExecutor(
        LimitPlan(ConstantValueExpression(limit_value)))
    limit_executor.append_child(DummyExecutor(batches))
    reduced_batches = list(limit_executor.exec())

    after_total_size = sum(batch.batch_size for batch in reduced_batches)
    self.assertEqual(previous_total_size, after_total_size)
def create_dummy_batches(num_frames=NUM_FRAMES,
                         filters=None,
                         batch_size=10,
                         start_id=0):
    """Yield batches of synthetic ``myvideo`` rows.

    Each row holds ``myvideo.id`` (the frame index plus ``start_id``) and
    ``myvideo.data``, a 2x2x3 ``uint8`` array filled with
    ``(index + 1) * 25``.

    Args:
        num_frames: number of frames to generate when ``filters`` is
            empty or omitted.
        filters: iterable of frame indices to generate.  ``None`` or any
            empty iterable means all of ``range(num_frames)``.
        batch_size: maximum number of rows per yielded batch.
        start_id: offset added to every index to form ``myvideo.id``.

    Yields:
        Batch: rows grouped into chunks of ``batch_size``; the last chunk
        may be smaller.
    """
    # ``None`` replaces the former mutable ``[]`` default, which is shared
    # between calls; every falsy value still selects all frames.
    if not filters:
        filters = range(num_frames)
    data = []
    for i in filters:
        data.append({
            'myvideo.id': i + start_id,
            'myvideo.data': np.array(
                np.ones((2, 2, 3)) * float(i + 1) * 25, dtype=np.uint8)
        })
        if len(data) % batch_size == 0:
            yield Batch(pd.DataFrame(data))
            data = []
    # Flush the final, partially filled chunk.
    if data:
        yield Batch(pd.DataFrame(data))
def exec(self):
    """Drop the table referenced by the plan node and yield a status batch.

    Only the first entry of ``self.node.table_refs`` is dropped; an error
    is logged when more than one table is given.  The table's storage is
    dropped through ``VideoStorageEngine`` or ``StorageEngine`` depending
    on ``table_obj.is_video``, then its catalog metadata is removed.

    Yields:
        Batch: a one-cell frame with the success message, or with the
        "does not exist" message when the table is missing and
        ``if_exists`` is set (in which case nothing is dropped).

    Raises:
        RuntimeError: if the table does not exist and ``if_exists`` is not
            set, or if the catalog fails to drop the table metadata.
    """
    catalog_manager = CatalogManager()
    if len(self.node.table_refs) > 1:
        # logger.exception is meant for use inside an ``except`` block;
        # there is no active exception here, so log a plain error.
        # NOTE(review): assumes ``logger`` is a standard logging.Logger.
        logger.error('Drop supports only single table')
    table_ref = self.node.table_refs[0]
    database_name = table_ref.table.database_name
    table_name = table_ref.table.table_name

    if not catalog_manager.check_table_exists(database_name, table_name):
        err_msg = "Table: {} does not exist".format(table_ref)
        if self.node.if_exists:
            # DROP ... IF EXISTS on a missing table is a no-op: report it
            # and stop instead of trying to drop storage that is not there.
            logger.warning(err_msg)
            yield Batch(pd.DataFrame([err_msg], index=[0]))
            return
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    table_obj = table_ref.table.table_obj
    if table_obj.is_video:
        VideoStorageEngine.drop(table=table_obj)
    else:
        StorageEngine.drop(table=table_obj)

    success = catalog_manager.drop_dataset_metadata(database_name,
                                                    table_name)
    if not success:
        # Do not report success when the catalog update failed.
        err_msg = "Failed to drop {}".format(table_ref)
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    # A one-element list builds the same single-cell frame (column 0) that
    # the previous one-element set literal produced.
    yield Batch(
        pd.DataFrame(
            ["Table Successfully dropped: {}".format(table_name)],
            index=[0],
        ))
def test_should_use_the_same_function_if_not_gpu_compatible(self):
    """Evaluating the expression invokes the wrapped function itself."""
    mock_function = MagicMock(return_value=pd.DataFrame())
    expression = FunctionExpression(mock_function, name="test")

    expression.evaluate(Batch(frames=pd.DataFrame()))

    mock_function.assert_called()
def test_hash_join_with_one_on(self):
    """Check a two-table join on ``a1`` against an inner pandas merge."""
    select_query = """SELECT * FROM table1 JOIN
                      table2 ON table1.a1 = table2.a1;"""
    actual_batch = execute_query_fetch_all(select_query)

    expected = pd.merge(
        self.table1,
        self.table2,
        left_on=["table1.a1"],
        right_on=["table2.a1"],
        how="inner",
    )

    if len(expected):
        expected_batch = Batch(expected)
        # NOTE(review): sort_orderby is assumed to sort in place (its return
        # value is ignored where the order-by executor calls it) -- confirm.
        # Comparing its return values would then compare None with None and
        # always pass, so sort first and compare the batches themselves.
        # NOTE(review): ties on table1.a2 may be ordered differently on each
        # side; add tie-breaking sort columns if this proves flaky.
        expected_batch.sort_orderby(["table1.a2"])
        actual_batch.sort_orderby(["table1.a2"])
        self.assertEqual(expected_batch, actual_batch)
def execute_query_fetch_all(query) -> Optional[Batch]:
    """Run ``query`` and merge everything it produces into one ``Batch``.

    Returns:
        The concatenation of all batches yielded by ``execute_query``, or
        ``None`` when ``execute_query`` returns a falsy value.
    """
    output = execute_query(query)
    if not output:
        return None
    # Drain the result before concatenating.
    return Batch.concat(list(output), copy=False)
def evaluate(self, *args, **kwargs):
    """Evaluate a logical AND / OR / NOT with short-circuiting.

    With two children, the left child is evaluated first.  If its result
    already decides the outcome for every row (all False for AND, all
    True for OR) it is returned as-is and the right child is never
    evaluated.  Otherwise the right child is evaluated with a ``mask``
    keyword listing the undecided row indices, and its result overwrites
    those rows of the left result.

    With any other child count, the first child is evaluated and negated
    for ``LOGICAL_NOT``.

    Returns:
        A ``Batch`` with the boolean result, or ``None`` for a
        single-child expression whose type is not ``LOGICAL_NOT``.
    """
    if self.get_children_count() != 2:
        operand = self.get_child(0).evaluate(*args, **kwargs).frames
        if self.etype == ExpressionType.LOGICAL_NOT:
            return Batch(pd.DataFrame(~operand))
        return None

    left_values = self.get_child(0).evaluate(*args, **kwargs).frames

    # NOTE(review): ``Series.bool()`` is deprecated in recent pandas
    # releases; kept as-is here to preserve behaviour.
    if self.etype == ExpressionType.LOGICAL_AND:
        # Every row is already False: AND cannot turn any of them True.
        if (~left_values).all().bool():
            return Batch(left_values)
        # Only rows that are True so far still depend on the right side.
        undecided = left_values[left_values[0]]
        kwargs["mask"] = undecided.index.tolist()
    elif self.etype == ExpressionType.LOGICAL_OR:
        # Every row is already True: OR cannot turn any of them False.
        if left_values.all().bool():
            return Batch(left_values)
        # Only rows that are False so far still depend on the right side.
        undecided = left_values[~left_values[0]]
        kwargs["mask"] = undecided.index.tolist()

    right_values = self.get_child(1).evaluate(*args, **kwargs).frames
    # Undecided rows take the right-hand result.
    left_values.iloc[kwargs["mask"]] = right_values
    return Batch(pd.DataFrame(left_values))
def test_should_return_only_frames_satisfy_predicate(self):
    """Only the row for which the predicate is True survives the scan."""
    batch = Batch(frames=create_dataframe(3))

    # Stand-in predicate that accepts only the last of the three rows.
    predicate_result = [False, False, True]
    expression = type(
        "AbstractExpression", (),
        {"evaluate": lambda x: Batch(pd.DataFrame(predicate_result))})
    plan_attributes = {
        "predicate": expression,
        "columns": None,
        "alias": None
    }
    plan = type("ScanPlan", (), plan_attributes)

    predicate_executor = SequentialScanExecutor(plan)
    predicate_executor.append_child(DummyExecutor([batch]))
    filtered = list(predicate_executor.exec())[0]

    surviving_rows = batch[[2]].frames.reset_index(drop=True)
    expected = Batch(surviving_rows)
    self.assertEqual(expected, filtered)
def read(self) -> Iterator[Batch]:
    """Yield the rows from the subclass ``_read`` grouped into batches.

    The size of the first row (as reported by ``get_size``) is taken as
    the size of every row.  Rows are buffered until the estimated buffer
    size reaches ``self.batch_mem_size``, at which point they are emitted
    as one ``Batch``; any remaining rows are emitted at the end.
    """
    pending_rows = []
    row_size = None
    for row in self._read():
        # Measure once, on the first row only.
        if row_size is None:
            row_size = get_size(row)
        pending_rows.append(row)
        if len(pending_rows) * row_size >= self.batch_mem_size:
            yield Batch(pd.DataFrame(pending_rows))
            pending_rows = []
    # Flush whatever did not fill a complete batch.
    if pending_rows:
        yield Batch(pd.DataFrame(pending_rows))