Esempio n. 1
0
 def test_hash_join_with_multiple_tables(self):
     """Check a three-way join with a WHERE filter against a pandas result.

     The expected frame is built by merging table1 with table2 on ``a0``,
     merging that result with table3 on ``a1``, and keeping only the rows
     whose ``table1.a2`` is greater than 50.
     """
     select_query = """SELECT * FROM table1 JOIN table2
                       ON table1.a0 = table2.a0 JOIN table3
                       ON table3.a1 = table1.a1 WHERE table1.a2 > 50;"""
     actual_batch = execute_query_fetch_all(select_query)
     tmp = pd.merge(
         self.table1,
         self.table2,
         left_on=["table1.a0"],
         right_on=["table2.a0"],
         how="inner",
     )
     expected = pd.merge(
         tmp,
         self.table3,
         left_on=["table1.a1"],
         right_on=["table3.a1"],
         how="inner",
     )
     # Boolean indexing drops the non-matching rows. DataFrame.where would
     # keep them as all-NaN rows, which never mirrors a SQL WHERE clause and
     # also makes the emptiness check below meaningless.
     expected = expected[expected["table1.a2"] > 50].reset_index(drop=True)
     if len(expected):
         expected_batch = Batch(expected)
         # NOTE(review): this compares the return values of sort_orderby;
         # confirm it returns the sorted batch rather than sorting in place.
         self.assertEqual(
             expected_batch.sort_orderby(["table1.a0"]),
             actual_batch.sort_orderby(["table1.a0"]),
         )
Esempio n. 2
0
    def test_adding_batch_frame_with_outcomes_returns_new_batch_frame(self):
        """Adding two fixture batches equals the ``create_dataframe_same(2)`` batch."""
        first = Batch(frames=create_dataframe())
        second = Batch(frames=create_dataframe())
        expected_sum = Batch(frames=create_dataframe_same(2))

        combined = first + second
        self.assertEqual(expected_sum, combined)
Esempio n. 3
0
    def test_merge_column_wise_batch_frame(self):
        """Merging an 'id' batch and a 'data' batch column-wise gives one row."""
        id_batch = Batch(frames=pd.DataFrame([{'id': 0}]))
        data_batch = Batch(frames=pd.DataFrame([{'data': 1}]))

        merged = Batch.merge_column_wise([id_batch, data_batch])

        combined_row = {'id': 0, 'data': 1}
        expected = Batch(frames=pd.DataFrame([combined_row]))
        self.assertEqual(merged, expected)
Esempio n. 4
0
    def test_should_return_smaller_num_rows(self):
        """The sample executor output equals a manual every-third-row pick."""
        frames = []
        for _ in range(4):
            values = np.random.randint(0, 100, size=(100, 4))
            frames.append(pd.DataFrame(values, columns=list('ABCD')))
        input_batches = [Batch(frames=frame) for frame in frames]

        stride = 3
        sample_plan = SamplePlan(ConstantValueExpression(stride))
        executor = SampleExecutor(sample_plan)
        executor.append_child(DummyExecutor(input_batches))
        sampled_batches = list(executor.exec())

        # Build the reference result by hand: rows 0, stride, 2 * stride, ...
        everything = Batch.concat(input_batches)
        kept_rows = range(0, len(everything), stride)
        expected = everything._get_frames_from_indices(kept_rows)
        expected = Batch.concat([expected])

        sampled = Batch.concat(sampled_batches)

        self.assertEqual(len(expected), len(sampled))
        self.assertEqual(expected, sampled)
Esempio n. 5
0
    def test_should_load_and_select_using_udf_video(self):
        """UDF label predicates using =, <@ and @> select the expected ids."""
        # Equality: the detector label must be exactly ['person'].
        equality_query = "SELECT id,DummyObjectDetector(data) FROM MyVideo \
            WHERE DummyObjectDetector(data).label = ['person'] ORDER BY id;"

        equality_result = execute_query_fetch_all(equality_query)
        person_rows = []
        for i in range(NUM_FRAMES // 2):
            person_rows.append({
                'myvideo.id': i * 2,
                'dummyobjectdetector.label': ['person']
            })
        person_batch = Batch(frames=pd.DataFrame(person_rows))
        self.assertEqual(equality_result, person_batch)

        # Contained-by: expected to match the same rows as the equality case.
        contained_query = "SELECT id, DummyObjectDetector(data) FROM MyVideo \
            WHERE DummyObjectDetector(data).label <@ ['person'] ORDER BY id;"

        contained_result = execute_query_fetch_all(contained_query)
        self.assertEqual(contained_result, person_batch)

        # Contains: multi-object detector labels that include 'person'.
        contains_query = "SELECT id FROM MyVideo WHERE \
            DummyMultiObjectDetector(data).labels @> ['person'] ORDER BY id;"

        contains_result = execute_query_fetch_all(contains_query)
        every_third = [{'myvideo.id': i} for i in range(0, NUM_FRAMES, 3)]
        every_third_batch = Batch(frames=pd.DataFrame(every_third))
        self.assertEqual(contains_result, every_third_batch)
Esempio n. 6
0
    def test_should_return_sorted_frames(self):
        """
        ORDER BY A ASC, B DESC sorts the rows across all input batches.

        Input data (3 batches), columns 'A' 'B' 'C':
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]

        The expected output holds the sorted rows in batches of 1, 2 and 3
        rows, i.e. the same sizes as the input batches.
        """
        columns = ['A', 'B', 'C']
        input_rows = [
            [[1, 1, 1]],
            [[1, 5, 6], [4, 7, 10]],
            [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
        ]
        batches = [
            Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
            for rows in input_rows
        ]

        # query: .... ORDER BY A ASC, B DESC
        sort_keys = [
            (TupleValueExpression(col_alias='A'), ParserOrderBySortType.ASC),
            (TupleValueExpression(col_alias='B'), ParserOrderBySortType.DESC)
        ]
        executor = OrderByExecutor(OrderByPlan(sort_keys))
        executor.append_child(DummyExecutor(batches))

        sorted_batches = list(executor.exec())

        # Fully sorted rows:
        #    A  B   C
        # 0  1  5   6
        # 1  1  1   1
        # 2  2  9   7
        # 3  4  7  10
        # 4  4  2   4
        # 5  4  1   2
        expected_rows = [
            [[1, 5, 6]],
            [[1, 1, 1], [2, 9, 7]],
            [[4, 7, 10], [4, 2, 4], [4, 1, 2]],
        ]
        expected_batches = [
            Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
            for rows in expected_rows
        ]

        for position in range(3):
            self.assertEqual(expected_batches[position],
                             sorted_batches[position])
Esempio n. 7
0
    def test_should_return_top_frames_after_sorting(self):
        """
        A limit of 2 applied after ORDER BY A ASC, B DESC keeps the top 2 rows.

        Input data (3 batches), columns 'A' 'B' 'C':
        [1, 1, 1]
        ----------
        [1, 5, 6]
        [4, 7, 10]
        ----------
        [2, 9, 7]
        [4, 1, 2]
        [4, 2, 4]
        """
        columns = ['A', 'B', 'C']
        input_rows = [
            [[1, 1, 1]],
            [[1, 5, 6], [4, 7, 10]],
            [[2, 9, 7], [4, 1, 2], [4, 2, 4]],
        ]
        batches = [
            Batch(frames=pd.DataFrame(np.array(rows), columns=columns))
            for rows in input_rows
        ]

        # query: .... ORDER BY A ASC, B DESC limit 2
        sort_keys = [
            (TupleValueExpression(col_alias='A'), ParserOrderBySortType.ASC),
            (TupleValueExpression(col_alias='B'), ParserOrderBySortType.DESC)
        ]
        orderby_executor = OrderByExecutor(OrderByPlan(sort_keys))
        orderby_executor.append_child(DummyExecutor(batches))
        sorted_batches = list(orderby_executor.exec())

        # Feed the sorted batches through a limit of two rows.
        row_limit = 2
        limit_executor = LimitExecutor(
            LimitPlan(ConstantValueExpression(row_limit)))
        limit_executor.append_child(DummyExecutor(sorted_batches))
        limited_batches = list(limit_executor.exec())

        # Merge everything into one batch for the comparison.
        merged = Batch.concat(limited_batches, copy=False)

        # Expected rows:
        #    A  B   C
        # 0  1  5   6
        # 1  1  1   1
        top_two = pd.DataFrame(np.array([[1, 5, 6], [1, 1, 1]]),
                               columns=columns)

        self.assertEqual(Batch(frames=top_two), merged)
Esempio n. 8
0
 def test_simple_function_scan(self):
     """A function scan applies the expression to the lateral input batch."""
     lateral_batch = Batch(pd.DataFrame([1, 2, 3], columns=['a']))
     increment = FunctionExpression(lambda x: x + 1,
                                    name='test',
                                    alias='test')
     increment.output_col_aliases = ['test.a']
     # A throwaway class stands in for the plan; only func_expr is supplied.
     fake_plan = type("FunctionScanPlan", (), {"func_expr": increment})
     executor = FunctionScanExecutor(fake_plan)
     produced = list(executor.exec(lateral_input=lateral_batch))
     first_batch = produced[0]
     wanted = Batch(pd.DataFrame([2, 3, 4], columns=['test.a']))
     self.assertEqual(wanted, first_batch)
Esempio n. 9
0
 def evaluate(self, *args, **kwargs):
     """Evaluate the first child and aggregate its frames.

     The aggregate applied depends on ``self.etype``: SUM, COUNT, AVG
     (pandas 'mean'), MIN or MAX. Returns a Batch wrapping the aggregated
     frames, or None when ``self.etype`` is none of those types.
     """
     child_batch = self.get_child(0).evaluate(*args, **kwargs)
     # Expression type -> name of the pandas aggregation to apply.
     reducers = {
         ExpressionType.AGGREGATION_SUM: 'sum',
         ExpressionType.AGGREGATION_COUNT: 'count',
         ExpressionType.AGGREGATION_AVG: 'mean',
         ExpressionType.AGGREGATION_MIN: 'min',
         ExpressionType.AGGREGATION_MAX: 'max',
     }
     reducer = reducers.get(self.etype)
     if reducer is None:
         return None
     return Batch(frames=child_batch.frames.agg([reducer]))
Esempio n. 10
0
    def evaluate(self, *args, **kwargs):
        """Evaluate both children and combine their frames arithmetically.

        ``self.etype`` selects addition, subtraction, multiplication or
        division. Returns a Batch wrapping the result, or None for any other
        expression type.
        """
        left = self.get_child(0).evaluate(*args, **kwargs).frames
        right = self.get_child(1).evaluate(*args, **kwargs).frames
        kind = self.etype

        if kind == ExpressionType.ARITHMETIC_ADD:
            outcome = left + right
        elif kind == ExpressionType.ARITHMETIC_SUBTRACT:
            outcome = left - right
        elif kind == ExpressionType.ARITHMETIC_MULTIPLY:
            outcome = left * right
        elif kind == ExpressionType.ARITHMETIC_DIVIDE:
            outcome = left / right
        else:
            return None
        return Batch(pd.DataFrame(outcome))
Esempio n. 11
0
    def test_should_load_and_select_real_video_in_table(self):
        """A loaded video selected back equals the frames read by OpenCVReader."""
        load_query = """LOAD DATA INFILE 'data/ua_detrac/ua_detrac.mp4'
                   INTO UADETRAC;"""
        execute_query_fetch_all(load_query)

        actual_batch = execute_query_fetch_all("SELECT * FROM UADETRAC;")
        actual_batch.sort()

        # Rebuild the expected batch by reading the same file directly.
        reader = OpenCVReader("data/ua_detrac/ua_detrac.mp4",
                              batch_mem_size=30000000)
        expected_batch = Batch(frames=pd.DataFrame())
        for chunk in reader.read():
            expected_batch += chunk
        expected_batch.modify_column_alias("uadetrac")

        self.assertEqual(actual_batch, expected_batch)
Esempio n. 12
0
    def test_short_circuiting_or_complete(self):
        """OR never evaluates the right operand when the left is all True."""
        left_col = TupleValueExpression(col_name=0)
        left_col.col_alias = 0
        right_col = TupleValueExpression(col_name=1)
        right_col.col_alias = 1

        always_equal = ComparisonExpression(
            ExpressionType.COMPARE_EQUAL, left_col, right_col)
        untouched_rhs = Mock(spec=ComparisonExpression)
        or_expr = LogicalExpression(
            ExpressionType.LOGICAL_OR, always_equal, untouched_rhs)

        # Both columns are identical, so the comparison holds for every row.
        rows = Batch(pd.DataFrame({0: [1, 2, 3], 1: [1, 2, 3]}))
        outcome = or_expr.evaluate(rows).frames[0].tolist()

        self.assertEqual([True, True, True], outcome)
        untouched_rhs.evaluate.assert_not_called()
Esempio n. 13
0
    def test_short_circuiting_or_partial(self):
        """OR evaluates the right operand only for rows the left left False."""
        left_col = TupleValueExpression(col_name=0)
        left_col.col_alias = 0
        right_col = TupleValueExpression(col_name=1)
        right_col.col_alias = 1

        partly_equal = ComparisonExpression(
            ExpressionType.COMPARE_EQUAL, left_col, right_col)
        # The mocked right operand answers True/False for the two masked rows.
        mocked_rhs = Mock(spec=ComparisonExpression)
        mocked_rhs.evaluate = Mock(
            return_value=Mock(frames=[[True], [False]]))
        or_expr = LogicalExpression(
            ExpressionType.LOGICAL_OR, partly_equal, mocked_rhs)

        # Rows 0 and 1 differ between the columns; rows 2 and 3 are equal.
        rows = Batch(pd.DataFrame({0: [1, 2, 3, 4], 1: [5, 6, 3, 4]}))
        outcome = or_expr.evaluate(rows).frames[0].tolist()

        self.assertEqual([True, False, True, True], outcome)
        mocked_rhs.evaluate.assert_called_once_with(rows, mask=[0, 1])
Esempio n. 14
0
    def test_array_count(self):
        """Array_Count predicates over detector labels select the expected ids."""
        # Frames whose multi-object labels contain 'person' exactly twice.
        person_query = """SELECT id FROM MyVideo WHERE
            Array_Count(DummyMultiObjectDetector(data).labels, 'person') = 2
            ORDER BY id;"""
        person_result = execute_query_fetch_all(person_query)
        person_ids = [{'myvideo.id': i} for i in range(0, NUM_FRAMES, 3)]
        self.assertEqual(person_result,
                         Batch(frames=pd.DataFrame(person_ids)))

        # Frames whose single label list contains 'bicycle' exactly once.
        bicycle_query = """SELECT id FROM MyVideo
            WHERE Array_Count(DummyObjectDetector(data).label, 'bicycle') = 1
            ORDER BY id;"""
        bicycle_result = execute_query_fetch_all(bicycle_query)
        bicycle_ids = [{'myvideo.id': i} for i in range(1, NUM_FRAMES, 2)]
        self.assertEqual(bicycle_result,
                         Batch(frames=pd.DataFrame(bicycle_ids)))
Esempio n. 15
0
    def test_should_call_opencv_reader_and_storage_engine(self, create_mock):
        """With an existing path, create is called once with that path."""
        file_path = 'video'
        table_metainfo = 'info'
        plan_attributes = {
            'table_metainfo': table_metainfo,
            'file_path': file_path,
            'batch_mem_size': 3000,
            'file_options': {'file_format': FileFormatType.VIDEO}
        }
        # A throwaway class stands in for the real plan object.
        plan = type("LoadDataPlan", (), plan_attributes)

        load_executor = LoadDataExecutor(plan)
        # Every Path.exists() call reports True while the executor runs.
        with patch.object(Path, 'exists', return_value=True):
            first_batch = next(load_executor.exec())
            create_mock.assert_called_once_with(table_metainfo, file_path)
            success_frame = pd.DataFrame([{
                'Video successfully added at location: ': file_path
            }])
            self.assertEqual(first_batch, Batch(success_frame))
Esempio n. 16
0
    def exec(self) -> Iterator[Batch]:
        """Sort all child rows and yield them back in the recorded batch sizes.

        Every batch from the first child is collected and concatenated into
        one batch, which is sorted via ``sort_orderby`` using
        ``self.extract_column_names()`` and ``self.extract_sort_types()``.
        The result is then sliced into consecutive chunks whose lengths are
        the entries of ``self.batch_sizes``.
        """
        source = self.children[0]
        collected = []

        # Gather every child batch, remembering how large each one was.
        # NOTE(review): self.batch_sizes is only appended to here, never
        # cleared; confirm exec() runs at most once per executor instance.
        for child_batch in source.exec():
            self.batch_sizes.append(child_batch.batch_size)
            collected.append(child_batch)
        merged = Batch.concat(collected, copy=False)

        # Sort the merged batch; a KeyError is swallowed ("pass for now").
        try:
            merged.sort_orderby(by=self.extract_column_names(),
                                sort_type=self.extract_sort_types())
        except KeyError:
            pass

        # Hand the rows back out in chunks sized by self.batch_sizes.
        start = 0
        for size in self.batch_sizes:
            stop = start + size
            chunk = merged[start:stop]
            chunk.reset_index()
            start = stop
            yield chunk
Esempio n. 17
0
    def test_should_load_and_sort_in_table(self):
        """ORDER BY id returns frames ascending, and descending with DESC."""
        ascending_query = "SELECT data, id FROM MyVideo ORDER BY id;"
        ascending_result = execute_query_fetch_all(ascending_query)
        expected_rows = []
        for i in range(NUM_FRAMES):
            # Frame i is a 2x2x3 uint8 array built from (i + 1) * 25.
            frame = np.array(np.ones((2, 2, 3)) * float(i + 1) * 25,
                             dtype=np.uint8)
            expected_rows.append({"myvideo.id": i, "myvideo.data": frame})
        expected_batch = Batch(frames=pd.DataFrame(expected_rows))
        self.assertEqual(ascending_result, expected_batch)

        descending_query = "SELECT data, id FROM MyVideo ORDER BY id DESC;"
        descending_result = execute_query_fetch_all(descending_query)
        # Reuse the ascending expectation, reversed.
        expected_batch.reverse()
        self.assertEqual(descending_result, expected_batch)
Esempio n. 18
0
    def test_should_search_in_upload_directory(self, create_mock):
        """When the first existence check fails, the prefixed path is used.

        ``Path.exists`` is patched to answer False then True; create is then
        expected to be called with the file path joined onto the configured
        storage ``path_prefix``.
        """
        prefix = ConfigurationManager().get_value('storage', 'path_prefix')
        self.upload_path = Path(prefix)
        file_path = 'video'
        table_metainfo = 'info'
        plan_attributes = {
            'table_metainfo': table_metainfo,
            'file_path': file_path,
            'batch_mem_size': 3000,
            'file_options': {'file_format': FileFormatType.VIDEO}
        }
        # A throwaway class stands in for the real plan object.
        plan = type("LoadDataPlan", (), plan_attributes)

        load_executor = LoadDataExecutor(plan)
        with patch.object(Path, 'exists', side_effect=[False, True]):
            first_batch = next(load_executor.exec())
            create_mock.assert_called_once_with(table_metainfo,
                                                self.upload_path / file_path)
            success_frame = pd.DataFrame([{
                'Video successfully added at location: ': file_path
            }])
            self.assertEqual(first_batch, Batch(success_frame))
Esempio n. 19
0
    def exec(self):
        """
        Read a CSV file with CSVReader and write each batch via StorageEngine.

        The file is located by joining ``self.path_prefix`` with the plan's
        ``file_path``. Yields a single Batch holding a one-row summary: the
        file path and the sum of ``len(batch)`` over all written batches.
        """
        csv_path = os.path.join(self.path_prefix, self.node.file_path)
        reader = CSVReader(csv_path,
                           column_list=self.node.column_list,
                           batch_mem_size=self.node.batch_mem_size)

        # Persist batch by batch, keeping a running row count.
        loaded = 0
        for chunk in reader.read():
            StorageEngine.write(self.node.table_metainfo, chunk)
            loaded += len(chunk)

        # Report what was loaded as a single-row frame.
        summary = pd.DataFrame(
            {
                'CSV': str(self.node.file_path),
                'Number of loaded frames': loaded
            },
            index=[0])
        yield Batch(summary)
Esempio n. 20
0
    def test_select_and_where_video_in_table(self):
        """WHERE predicates on id return exactly the matching dummy frames."""
        # Single id, both columns.
        both_columns = execute_query_fetch_all(
            "SELECT id,data FROM MyVideo WHERE id = 5;")
        wanted = list(create_dummy_batches(filters=[5]))[0]
        self.assertEqual(both_columns, wanted)

        # Single id, data column only.
        data_only = execute_query_fetch_all(
            "SELECT data FROM MyVideo WHERE id = 5;")
        frame_five = np.array(np.ones((2, 2, 3)) * float(5 + 1) * 25,
                              dtype=np.uint8)
        wanted = Batch(frames=pd.DataFrame([{"myvideo.data": frame_five}]))
        self.assertEqual(data_only, wanted)

        # Open-ended range.
        from_two = execute_query_fetch_all(
            "SELECT id, data FROM MyVideo WHERE id >= 2;")
        from_two.sort()
        wanted = list(
            create_dummy_batches(filters=range(2, NUM_FRAMES)))[0]
        self.assertEqual(from_two, wanted)

        # Range bounded on both sides with AND.
        two_to_four = execute_query_fetch_all(
            "SELECT id, data FROM MyVideo WHERE id >= 2 AND id < 5;")
        two_to_four.sort()
        wanted = list(create_dummy_batches(filters=range(2, 5)))[0]
        self.assertEqual(two_to_four, wanted)
Esempio n. 21
0
    def test_should_mat_view_to_the_same_table(self):
        """A second CREATE ... IF NOT EXISTS on the same view adds no rows.

        The view is created from ids below 5 and then "created" again from
        ids of 5 and above; selecting from it is expected to return only the
        rows for ids 0 through 4.
        """
        first_view_query = """CREATE MATERIALIZED VIEW IF NOT EXISTS
            dummy_view2 (id, label)
            AS SELECT id, DummyObjectDetector(data).label FROM MyVideo
            WHERE id < 5;
        """
        execute_query_fetch_all(first_view_query)

        second_view_query = """CREATE MATERIALIZED VIEW IF NOT EXISTS
            dummy_view2 (id, label)
            AS SELECT id, DummyObjectDetector(data).label FROM MyVideo
            WHERE id >= 5;
        """
        execute_query_fetch_all(second_view_query)

        view_rows = execute_query_fetch_all(
            'SELECT id, label FROM dummy_view2;')
        view_rows.sort()

        labels = DummyObjectDetector().labels
        expected_rows = []
        for i in range(5):
            expected_rows.append({
                'dummy_view2.id': i,
                'dummy_view2.label': labels[1 + i % 2]
            })
        expected_batch = Batch(frames=pd.DataFrame(expected_rows))
        self.assertEqual(view_rows, expected_batch)
Esempio n. 22
0
    def test_should_return_limit_greater_than_size(self):
        """A limit larger than the input keeps the total row count unchanged.

        Four batches of 100 rows are limited to 500 rows; the summed batch
        sizes before and after the limit executor must be equal.
        """
        frames = []
        for _ in range(4):
            values = np.random.randint(0, 100, size=(100, 4))
            frames.append(pd.DataFrame(values, columns=list('ABCD')))
        batches = [Batch(frames=frame) for frame in frames]

        total_before = sum(batch.batch_size for batch in batches)

        oversized_limit = 500
        limit_executor = LimitExecutor(
            LimitPlan(ConstantValueExpression(oversized_limit)))
        limit_executor.append_child(DummyExecutor(batches))
        limited_batches = list(limit_executor.exec())

        total_after = sum(batch.batch_size for batch in limited_batches)

        self.assertEqual(total_before, total_after)
Esempio n. 23
0
def create_dummy_batches(num_frames=NUM_FRAMES,
                         filters=None, batch_size=10, start_id=0):
    """Yield Batches of synthetic 'myvideo' rows, ``batch_size`` at a time.

    Args:
        num_frames: number of frame indices to generate when ``filters`` is
            empty or not given.
        filters: iterable of frame indices to generate. ``None`` or any
            empty iterable means ``range(num_frames)``.
        batch_size: number of rows per yielded Batch; the final Batch may
            hold fewer rows.
        start_id: offset added to each index to form ``myvideo.id``. The
            frame data is still derived from the un-offset index.

    Yields:
        Batch objects whose rows have a ``myvideo.id`` value and a
        ``myvideo.data`` 2x2x3 uint8 array built from ``(i + 1) * 25``.
    """
    # The default is None rather than a shared mutable list; any falsy
    # value (None, [], an empty range) selects every frame.
    if not filters:
        filters = range(num_frames)
    data = []
    for i in filters:
        data.append({'myvideo.id': i + start_id,
                     'myvideo.data': np.array(
                         np.ones((2, 2, 3)) * float(i + 1) * 25,
                         dtype=np.uint8)})

        # Flush once a full batch has accumulated.
        if len(data) % batch_size == 0:
            yield Batch(pd.DataFrame(data))
            data = []
    # Emit whatever is left over as a final, shorter batch.
    if data:
        yield Batch(pd.DataFrame(data))
Esempio n. 24
0
    def exec(self):
        """Drop the plan's first table and yield a confirmation Batch.

        The table is checked against the catalog, its data is dropped through
        VideoStorageEngine or StorageEngine depending on ``is_video``, and
        its metadata is removed from the catalog. Problems along the way are
        only logged.

        NOTE(review): execution continues after each logged error, including
        the "table does not exist" case; confirm whether these paths should
        stop instead.
        """
        catalog = CatalogManager()
        table_refs = self.node.table_refs
        if len(table_refs) > 1:
            logger.exception('Drop supports only single table')
        table_ref = table_refs[0]
        table_info = table_ref.table

        exists = catalog.check_table_exists(
            table_info.database_name, table_info.table_name)
        if not exists:
            err_msg = "Table: {} does not exsits".format(table_ref)
            # With IF EXISTS a missing table is only a warning.
            report = logger.warn if self.node.if_exists else logger.exception
            report(err_msg)

        # Pick the storage engine that matches the table kind.
        table_obj = table_info.table_obj
        engine = VideoStorageEngine if table_obj.is_video else StorageEngine
        engine.drop(table=table_obj)

        dropped = catalog.drop_dataset_metadata(
            table_info.database_name, table_info.table_name)
        if not dropped:
            logger.exception("Failed to drop {}".format(table_ref))

        message = "Table Successfully dropped: {}".format(
            table_info.table_name)
        # NOTE(review): the frame is built from a set literal, not a dict;
        # confirm that is intended.
        yield Batch(pd.DataFrame({message}, index=[0]))
    def test_should_use_the_same_function_if_not_gpu_compatible(self):
        """Evaluating the expression calls the function it was built with."""
        wrapped = MagicMock(return_value=pd.DataFrame())
        expr = FunctionExpression(wrapped, name="test")

        empty_batch = Batch(frames=pd.DataFrame())
        expr.evaluate(empty_batch)

        wrapped.assert_called()
Esempio n. 26
0
 def test_hash_join_with_one_on(self):
     """Check a two-table join on ``a1`` against a pandas inner merge.

     Nothing is asserted when the pandas merge has no rows.
     """
     select_query = """SELECT * FROM table1 JOIN
                     table2 ON table1.a1 = table2.a1;"""
     actual_batch = execute_query_fetch_all(select_query)
     merged = pd.merge(
         self.table1,
         self.table2,
         left_on=["table1.a1"],
         right_on=["table2.a1"],
         how="inner",
     )
     if len(merged) == 0:
         return
     expected_batch = Batch(merged)
     sort_key = ["table1.a2"]
     self.assertEqual(
         expected_batch.sort_orderby(sort_key),
         actual_batch.sort_orderby(["table1.a2"]),
     )
Esempio n. 27
0
def execute_query_fetch_all(query) -> Optional[Batch]:
    """
    Run *query* and merge every resulting batch into one Batch object.

    Returns None when ``execute_query`` gives back a falsy result.
    """
    output = execute_query(query)
    if not output:
        return None
    return Batch.concat(list(output), copy=False)
Esempio n. 28
0
 def evaluate(self, *args, **kwargs):
     """Evaluate a logical AND / OR / NOT over the child expressions.

     With two children the left one is evaluated first. For AND, if every
     left value is False the left result is returned as is; for OR, the
     same happens when every left value is True. Otherwise the right child
     is evaluated with a ``mask`` keyword holding the index labels of the
     rows still undecided (True rows of column 0 for AND, False rows for
     OR), and its values are written into the left frame at those
     positions.

     With any other child count only NOT is handled; other expression types
     return None.
     """
     if self.get_children_count() != 2:
         operand = self.get_child(0).evaluate(*args, **kwargs).frames
         if self.etype == ExpressionType.LOGICAL_NOT:
             return Batch(pd.DataFrame(~operand))
         return None

     lhs = self.get_child(0).evaluate(*args, **kwargs).frames
     if self.etype == ExpressionType.LOGICAL_AND:
         # All False: the right side cannot change the outcome.
         if (~lhs).all().bool():
             return Batch(lhs)
         undecided = lhs[lhs[0]]
         kwargs["mask"] = undecided.index.tolist()
     elif self.etype == ExpressionType.LOGICAL_OR:
         # All True: the right side cannot change the outcome.
         if lhs.all().bool():
             return Batch(lhs)
         undecided = lhs[~lhs[0]]
         kwargs["mask"] = undecided.index.tolist()

     rhs = self.get_child(1).evaluate(*args, **kwargs).frames
     # Overwrite only the undecided rows with the right-hand results.
     lhs.iloc[kwargs["mask"]] = rhs
     return Batch(pd.DataFrame(lhs))
Esempio n. 29
0
    def test_should_return_only_frames_satisfy_predicate(self):
        """The scan keeps only the row for which the predicate is True."""
        source_batch = Batch(frames=create_dataframe(3))
        # Stub predicate: only the third row passes.
        expression = type(
            "AbstractExpression", (),
            {"evaluate": lambda x: Batch(pd.DataFrame([False, False, True]))})

        plan_attributes = {
            "predicate": expression,
            "columns": None,
            "alias": None
        }
        plan = type("ScanPlan", (), plan_attributes)
        scan_executor = SequentialScanExecutor(plan)
        scan_executor.append_child(DummyExecutor([source_batch]))

        surviving_rows = source_batch[[2]].frames.reset_index(drop=True)
        expected = Batch(surviving_rows)
        filtered = list(scan_executor.exec())[0]
        self.assertEqual(expected, filtered)
Esempio n. 30
0
    def read(self) -> Iterator[Batch]:
        """
        Group rows from the subclass ``_read`` into memory-bounded Batches.

        The size of the first row, as reported by ``get_size``, is used as
        the estimate for every row. A Batch is yielded once the estimated
        size of the pending rows reaches ``self.batch_mem_size``, and any
        leftover rows are yielded at the end.
        """
        pending = []
        row_size = None
        for row in self._read():
            # Measure once, on the first row seen.
            if row_size is None:
                row_size = get_size(row)
            pending.append(row)
            if len(pending) * row_size >= self.batch_mem_size:
                yield Batch(pd.DataFrame(pending))
                pending = []
        if pending:
            yield Batch(pd.DataFrame(pending))