Example #1
0
 def test_two_arrays_of_same_type_are_same_object(self, bq_type):
     # type: (BQScalarType) -> None
     # Type objects are immutable, and we need to be able to compare them
     # (an array of ints is an array of ints, but it's not a string or an array of floats).
     # A way to achieve this is to ensure that all types, including arrays, are singletons.
     # So we test that for each scalar type, creating two arrays of it yields the same object.
     a1 = BQArray(bq_type)
     a2 = BQArray(bq_type)
     self.assertIs(a1, a2)
Example #2
0
class TypeGrammarTest(unittest.TestCase):
    @data(
        dict(text='INTEGER', expected_type=BQScalarType.INTEGER), )
    @unpack
    def test_scalar_type(self, text, expected_type):
        # type: (str, BQScalarType) -> None
        node, leftover = scalar_type(tokenize(text))
        self.assertFalse(leftover)
        self.assertEqual(node, expected_type)

    @data(
        dict(text='ARRAY<INTEGER>',
             expected_type=BQArray(BQScalarType.INTEGER)),
        dict(text='ARRAY<STRUCT<a INTEGER, b INTEGER> >',
             expected_type=BQArray(
                 BQStructType(['a', 'b'],
                              [BQScalarType.INTEGER, BQScalarType.INTEGER]))),
        dict(text='ARRAY<STRUCT<INTEGER, INTEGER> >',
             expected_type=BQArray(
                 BQStructType([None, None],
                              [BQScalarType.INTEGER, BQScalarType.INTEGER]))),
    )
    @unpack
    def test_array_type(self, text, expected_type):
        # type: (str, BQArray) -> None
        node, leftover = array_type(tokenize(text))
        self.assertFalse(leftover)
        self.assertEqual(node, expected_type)

    @data(
        dict(text='STRUCT<a INTEGER>',
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(text='STRUCT<INTEGER>',
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        dict(text='STRUCT<a INTEGER, b STRING>',
             expected_type=BQStructType(
                 ['a', 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(text='STRUCT<INTEGER, b STRING>',
             expected_type=BQStructType(
                 [None, 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(text='STRUCT<ARRAY<FLOAT>, b STRING>',
             expected_type=BQStructType(
                 [None, 'b'],
                 [BQArray(BQScalarType.FLOAT), BQScalarType.STRING])),
    )
    @unpack
    def test_struct_type(self, text, expected_type):
        # type: (str, BQStructType) -> None
        node, leftover = struct_type(tokenize(text))
        self.assertFalse(leftover)
        self.assertEqual(node, expected_type)
Example #3
0
 def test_get_typed_series_as_list(self):
     typed_series = TypedSeries(
         pd.Series([(np.float64(1.5), np.float64(2.5), np.float64(3.0)),
                    (np.float64(2.5), np.float64(3.5), np.float64(4.0))]),
         BQArray(BQScalarType.FLOAT))
     self.assertEqual(typed_series.to_list(), [(1.5, 2.5, 3.0),
                                               (2.5, 3.5, 4.0)])
Example #4
0
    def test_convert_between_schema_field_and_bq_type(self, bq_type,
                                                      schema_field):
        # type: (BQScalarType, SchemaField) -> None

        # Test scalar
        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)
        self.assertEqual(bq_type.to_schema_field('foo'), schema_field)

        # Test array
        schema_array_field = SchemaField(name=schema_field.name,
                                         field_type=schema_field.field_type,
                                         mode='REPEATED')
        bq_array_type = BQArray(bq_type)
        self.assertEqual(BQType.from_schema_field(schema_array_field),
                         bq_array_type)
        self.assertEqual(bq_array_type.to_schema_field('foo'),
                         schema_array_field)
Example #5
0
 def test_get_typed_dataframe_schema(self):
     typed_dataframe = TypedDataFrame(
         pd.DataFrame(columns=['a', 'b']),
         [BQScalarType.BOOLEAN,
          BQArray(BQScalarType.FLOAT)])
     self.assertEqual(typed_dataframe.to_bq_schema(), [
         SchemaField(name='a', field_type='BOOLEAN'),
         SchemaField(name='b', field_type='FLOAT', mode='REPEATED')
     ])
Example #6
0
    def test_convert(self, bq_type, np_object, py_object):
        # type: (BQScalarType, NumPyType, PythonType) -> None

        # First, convert from a NumPy-typed object to a Pandas-typed object.
        # Types are mostly the same except for np.datetime64 becomes pd.Timestamp
        # We do this by creating a Pandas Series containing the single object, and then
        # converting it to a sequence and extracting its single element.
        pd_object, = pd.Series(np_object)
        self.assertEqual(bq_type.convert(pd_object), py_object)

        # Test that for any type, a NaN converts to None
        self.assertIsNone(bq_type.convert(np.nan))

        # Now test the same conversion for a list (array) of objects.
        # Construct a Series containing a single row which is a list of three objects.
        pd_array_object, = pd.Series([(pd_object, ) * 3])
        self.assertEqual(
            BQArray(bq_type).convert(pd_array_object), (py_object, ) * 3)

        # Test that for any Array type, a NaN converts to None
        self.assertIsNone(BQArray(bq_type).convert(np.nan))
Example #7
0
 def test_get_typed_dataframe_as_list_of_lists(self):
     typed_dataframe = TypedDataFrame(
         pd.DataFrame(
             [[
                 np.bool_(True),
                 (np.float64(1.5), np.float64(2.5), np.float64(3.0))
             ],
              [
                  np.bool_(False),
                  (np.float64(2.5), np.float64(3.5), np.float64(4.0))
              ]],
             columns=['a', 'b']),
         [BQScalarType.BOOLEAN,
          BQArray(BQScalarType.FLOAT)])
     self.assertEqual(typed_dataframe.to_list_of_lists(),
                      [[True, (1.5, 2.5, 3.0)], [False, (2.5, 3.5, 4.0)]])
Example #8
0
 def test_to_dtype(self, bq_type, np_type):
     # type: (BQScalarType, NumPyType) -> None
     self.assertEqual(bq_type.to_dtype(), np.dtype(np_type))
     # NumPy doesn't know from cell elements that are lists, so it just leaves it as an
     # uninterpreted Python object.
     self.assertEqual(BQArray(bq_type).to_dtype(), np.dtype('object'))
class EvaluatableNodeTest(unittest.TestCase):
    def setUp(self):
        # type: () -> None
        self.small_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1], [2]], columns=['a']),
                                   types=[BQScalarType.INTEGER])
                }
            }
        })

        self.large_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [1, 4, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })

    def test_selector(self):
        # type: () -> None
        selector = Selector(Field(('a', )), 'field_alias')
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        typed_series = selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)

        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(list(typed_series.dataframe), ['field_alias'])
        self.assertEqual(typed_series.types, [BQScalarType.INTEGER])

    def test_selector_group_by_success(self):
        # type: () -> None
        selector = Selector(Field(('c', )), EMPTY_NODE)
        selector.position = 1
        context = EvaluationContext(self.large_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)

        context.exclude_aggregation = True
        updated_selector, = context.do_group_by([selector],
                                                [Field(('my_table', 'c'))])

        typed_series = updated_selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [3])

    @data((5, BQScalarType.INTEGER), (1.23, BQScalarType.FLOAT),
          ("something", BQScalarType.STRING), (True, BQScalarType.BOOLEAN),
          (None, None))
    @unpack
    def test_value_repr(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check Value's string representation'''
        node = Value(value, type_)
        representation = 'Value(type_={}, value={})'.format(
            type_.__repr__(), value.__repr__())
        self.assertEqual(node.__repr__(), representation)

    @data((5, None), (None, BQScalarType.INTEGER))
    @unpack
    def test_invalid_value(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check that None is only allowed as both value and type_ or neither.'''
        with self.assertRaises(ValueError):
            Value(value, type_)

    def test_value_eval(self):
        # type: () -> None
        # A constant is repeated for each row in the context table.
        value = Value(12345, BQScalarType.INTEGER)
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), 'foo')
        typed_series = value.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [12345, 12345])

    def test_field(self):
        # type: () -> None
        field = Field(('a', ))
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        typed_series = field.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(typed_series.series.name, 'a')

    @data(
        dict(function_name='sum',
             args=[Field(('a', ))],
             expected_result=[3],
             is_aggregating=True),
        dict(function_name='max',
             args=[Field(('a', ))],
             expected_result=[2],
             is_aggregating=True),
        dict(function_name='min',
             args=[Field(('a', ))],
             expected_result=[1],
             is_aggregating=True),
        dict(function_name='concat',
             args=[
                 Value('foo', BQScalarType.STRING),
                 Value('bar', BQScalarType.STRING)
             ],
             expected_result=['foobar'] *
             2),  # two copies to match length of context table.
        dict(function_name='mod',
             args=[Field(('a', )),
                   Value(2, BQScalarType.INTEGER)],
             expected_result=[1, 0]),
        dict(function_name='mod',
             args=[
                 Value(1.0, BQScalarType.FLOAT),
                 Value(2, BQScalarType.INTEGER)
             ],
             expected_result=[1.0, 1.0]),
        dict(function_name='timestamp',
             args=[Value("2019-04-22", BQScalarType.STRING)],
             expected_result=[datetime.datetime(2019, 4, 22)] *
             2),  # two copies to match table len
    )
    @unpack
    def test_functions(self,
                       function_name,
                       args,
                       expected_result,
                       is_aggregating=False):
        # type: (str, List[EvaluatableNode], List[PythonType], bool) -> None
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        if is_aggregating:
            context.do_group_by((), [])
        result = FunctionCall.create(function_name, args,
                                     EMPTY_NODE).evaluate(context)
        assert isinstance(result, TypedSeries)
        self.assertEqual([result.type_.convert(elt) for elt in result.series],
                         expected_result)

    def test_current_timestamp(self):
        # type: () -> None
        node, leftover = apply_rule(
            query_expression,
            tokenize(
                'select current_timestamp(), a from unnest([struct(1 as a), struct(2), struct(3)])'
            ))
        assert isinstance(node, QueryExpression)
        self.assertFalse(leftover)
        result, _ = node.get_dataframe(DatasetTableContext({}))
        table = cast(List[List[datetime.datetime]], result.to_list_of_lists())
        self.assertEqual(len(table), 3)
        # CURRENT_TIMESTAMP() returns a very recent timestamp
        self.assertLess((datetime.datetime.now() - table[0][0]).seconds, 2)
        # All rows have the same timestamp value.
        self.assertEqual(table[0][0], table[1][0])
        self.assertEqual(table[0][0], table[2][0])

    @data(
        # These expressions are ones whose EvaluatableNode subclass constructs a
        # new pandas Series rather than computing on existing ones.  See below:
        # this runs the risk of constructing it with an incorrect index.
        dict(query='select 10, c', expected_result=[[10, 6], [10, 9]]),
        dict(query='select [a, b], c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select (a, b), c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select exists(select 1), c',
             expected_result=[[True, 6], [True, 9]]),
        dict(query='select a in (1, 4), c',
             expected_result=[[True, 6], [False, 9]]),
        dict(query='select row_number() over (), c',
             expected_result=[[1, 6], [2, 9]]),
        dict(query='select current_timestamp() > timestamp("2019-01-01"), c',
             expected_result=[[True, 6], [True, 9]]),
    )
    @unpack
    def test_constructed_column_has_correct_index(self, query,
                                                  expected_result):
        # type: (str, List[List[int]]) -> None
        '''Checks that manually constructed columns have the same index as the data.

        A manually constructed column will usually have an index 0, 1, 2, ...
        (e.g. pd.Series(['a', 'b', 'c']) has index 0, 1, 2).
        The data may not; filtering, sorting or other changes might result in an index of
        different numbers.  If one column's index doesn't match the index of other columns,
        it can't be compared or joined with them properly.
        '''
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2, -1], [4, 5, 6], [7, 8, 9]],
                                     columns=['a', 'b', 'c']),
                        types=[
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ])
                }
            }
        })

        # Skip the first row of the table, so that the index of the table that
        # the test queries operate on is [1, 2]; this makes sure that the index is
        # different from the default index you would get for a two-row column,
        # which would be [0, 1], to test that expressions are not incorrectly
        # using that default index.
        node, leftover = select_rule(
            tokenize(query + ' from (select * from my_table where c > 0)'))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(list(result.dataframe.index), [1, 2])

    def test_bad_function(self):
        # type: () -> None
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        with self.assertRaisesRegexp(NotImplementedError,
                                     'NOT_A_FUNCTION not implemented'):
            FunctionCall.create('not_a_function', [],
                                EMPTY_NODE).evaluate(context)

    @data(
        # Explore each aggregate function, along with a non-aggregate function to make sure we
        # can compute both at once.
        dict(selectors='sum(a), b+10', expected_result=[[6, 11], [5, 12]]),
        dict(selectors='sum(a), 20+10', expected_result=[[6, 30], [5, 30]]),
        dict(selectors='sum(a+1), b+10', expected_result=[[8, 11], [6, 12]]),
        dict(selectors='max(a), b+10', expected_result=[[4, 11], [5, 12]]),
        dict(selectors='min(a), b+10', expected_result=[[2, 11], [5, 12]]),
        dict(selectors='count(a), b+10', expected_result=[[2, 11], [1, 12]]),
        dict(selectors='count(*), b+10', expected_result=[[2, 11], [2, 12]]),
        dict(selectors='array_agg(a), []',
             expected_result=[[(2, 4), ()], [(5, None), ()]]),
        dict(selectors='array_agg(a), [b]',
             expected_result=[[(2, 4), (1, )], [(5, None), (2, )]]),
        dict(selectors='array_agg(a), [7, 8]',
             expected_result=[[(2, 4), (7, 8)], [(5, None), (7, 8)]]),
        dict(selectors='array_agg(a), b+10',
             expected_result=[[(2, 4), 11], [(5, None), 12]]),
    )
    @unpack
    def test_aggregate_functions_in_group_by(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[2, 1], [4, 1], [5, 2], [np.nan, 2]],
                                     columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })

        tokens = tokenize(
            'select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        dict(query='select sum(a + 1) + 2, count(*) + 3, 4 from my_table',
             expected_result=[[11, 6, 4]]), )
    @unpack
    def test_aggregate_functions_in_expressions(self, query, expected_result):
        # type: (str, List[List[int]]) -> None
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1], [2], [3]],
                                                columns=['a']),
                                   types=[BQScalarType.INTEGER])
                }
            }
        })

        node, leftover = select_rule(tokenize(query))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        # Test all variations of creating a struct (typed, typeless, tuple),
        # with and without named fields, with one field, and then with two
        # fields.
        dict(query='SELECT STRUCT<INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT<a INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1 AS a)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        # Note: no test of single-element tuple syntax, as that would just be a
        # parenthesized expression, there's no analogue to Python's trailing comma.
        dict(query='SELECT STRUCT<INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 [None, None], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 ['a', None], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 [None, 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 ['a', 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 ['a', 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 [None, 'b'], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 ['a', None], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 [None, None], [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT (1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(
                 [None, None], [BQScalarType.INTEGER, BQScalarType.STRING])),
    )
    @unpack
    def test_struct_constant_expressions(self, query, expected_result,
                                         expected_type):
        # type: (str, Tuple[Optional[int], ...], BQStructType) -> None
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        # Test all three struct syntaxes, selecting a column as one field, a
        # constant as the other.
        dict(query='SELECT (a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType([None, None],
                              [BQScalarType.INTEGER, BQScalarType.STRING])
             ]),
        dict(query='SELECT STRUCT(a as x, "a" as y) FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'],
                              [BQScalarType.INTEGER, BQScalarType.STRING])
             ]),
        dict(query='SELECT STRUCT<x INTEGER, y STRING>(a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'],
                              [BQScalarType.INTEGER, BQScalarType.STRING])
             ]),
    )
    @unpack
    def test_struct_field_and_constant(self, query, expected_result,
                                       expected_types):
        # type: (str, List[List[Tuple[Optional[int], ...]]], Sequence[BQStructType]) -> None
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(
            self.small_table_context)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(result.types, expected_types)

    @data(
        # Test combination types of arrays and structs.
        dict(query='SELECT ([1], "a")',
             expected_result=((1, ), 'a'),
             expected_type=BQStructType(
                 [None, None],
                 [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(
            query=
            'SELECT STRUCT<x ARRAY<INTEGER>, y STRING>(ARRAY<INTEGER>[1], "a")',
            expected_result=((1, ), 'a'),
            expected_type=BQStructType(
                ['x', 'y'],
                [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(query='SELECT [(1, "a")]',
             expected_result=((1, 'a'), ),
             expected_type=BQArray(
                 BQStructType([None, None],
                              [BQScalarType.INTEGER, BQScalarType.STRING]))),
        dict(query='SELECT [STRUCT<a INTEGER, b STRING>(1, "a"), (2, "b")]',
             expected_result=((1, 'a'), (2, 'b')),
             expected_type=BQArray(
                 BQStructType(['a', 'b'],
                              [BQScalarType.INTEGER, BQScalarType.STRING]))),
        # Test that an array of structs merges and coerces the types of the
        # structs.
        dict(
            query=
            'SELECT [STRUCT<a FLOAT, STRING>(1.0, "a"), STRUCT<INTEGER, b STRING>(2, "b")]',
            expected_result=((1.0, 'a'), (2.0, 'b')),
            expected_type=BQArray(
                BQStructType(['a', 'b'],
                             [BQScalarType.FLOAT, BQScalarType.STRING]))),
        dict(
            query=
            'SELECT [STRUCT<a INTEGER, b ARRAY<STRING> >(1, ["a"]), (2, ["b", "c"])]',
            expected_result=((1, ('a', )), (2, ('b', 'c'))),
            expected_type=BQArray(
                BQStructType(
                    ['a', 'b'],
                    [BQScalarType.INTEGER,
                     BQArray(BQScalarType.STRING)]))),
    )
    @unpack
    def test_complex_types(self, query, expected_result, expected_type):
        # type: (str, Tuple[Optional[int], ...], BQType) -> None
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        dict(query='SELECT ARRAY_AGG(a)', expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a RESPECT NULLS)',
             expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a RESPECT NULLS)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a IGNORE NULLS)',
             expected_result=(1, 1, 2)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a IGNORE NULLS)',
             expected_result=(1, 2)),
    )
    @unpack
    def test_array_agg_arguments(self, query, expected_result):
        # type: (str, Tuple[Optional[int], ...]) -> None
        table_context = DatasetTableContext({
            'p': {
                'd': {
                    't':
                    TypedDataFrame(pd.DataFrame([[1], [1], [2], [None]],
                                                columns=['a']),
                                   types=[BQScalarType.INTEGER])
                }
            }
        })

        node, leftover = select_rule(tokenize(query + ' FROM p.d.t'))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])

    @data(
        dict(query='SELECT [1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(
            query='SELECT STRUCT<INT64>(3.7)',
            error=
            'Struct field 1 has type .*FLOAT which does not coerce to .*INTEGER'
        ),
        dict(
            query='SELECT ARRAY<INT64>[3.7]',
            error=
            'Array specifies type .*INTEGER, incompatible with values of type .*FLOAT'
        ),
        dict(query='SELECT ARRAY<INT64>[1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT ARRAY<string>[1,2]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT [[1]]', error='Cannot create arrays of arrays'),
        dict(query='SELECT [(1, 2), (3, 4, 5)]',
             error='Cannot merge .* number of fields varies'),
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as x, 4 as b)]',
             error='Cannot merge Structs; field names .* do not match'),
        # same types in different orders can't merge.
        dict(query='SELECT [(1, "a"), ("b", 2)]',
             error='Cannot implicitly coerce the given types'),
        # same names in different orders can't merge
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as b, 4 as a)]',
             error='Cannot merge Structs; field names .* do not match'),
    )
    @unpack
    def test_complex_type_errors(self, query, error):
        # type: (str, str) -> None
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        with self.assertRaisesRegexp(ValueError, error):
            node.get_dataframe(self.small_table_context)

    @data(
        # Row number over whole dataset; order is not guaranteed
        dict(selectors='row_number() over ()',
             expected_result=[[1], [2], [3], [4]]),
        dict(selectors='row_number() over (order by a), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a asc), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a desc), a',
             expected_result=[[4, 10], [3, 20], [2, 30], [1, 30]]),
        dict(selectors='row_number() over (partition by b order by a), a',
             expected_result=[[1, 10], [2, 20], [1, 30], [2, 30]]),
        dict(selectors='sum(a) over (), a',
             expected_result=[[90, 10], [90, 20], [90, 30], [90, 30]]),
        dict(selectors='sum(a) over (partition by b), a',
             expected_result=[[30, 10], [30, 20], [60, 30], [60, 30]]),
        dict(selectors='count(*) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(a) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(*) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='count(a) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='sum(count(*)) over ()', expected_result=[[4]]),
    )
    @unpack
    def test_analytic_function(self, selectors, expected_result):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame(
                            [[20, 200], [10, 200], [30, 300], [30, 300]],
                            columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        tokens = tokenize('select {} from my_table'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    @data(
        dict(selectors='sum(count(*)) over (), count(*)',
             expected_result=[[5, 2], [5, 3]]), )
    @unpack
    def test_analytic_function_with_group_by(self, selectors, expected_result):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame(
                            [[20, 2], [10, 2], [30, 3], [31, 3], [32, 3]],
                            columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        tokens = tokenize(
            'select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    def test_non_aggregate_function_in_group_by(self):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([['one', '1'], ['two', '1'],
                                      ['three', '2'], ['four', '2']],
                                     columns=['a', 'b']),
                        types=[BQScalarType.STRING, BQScalarType.INTEGER])
                }
            }
        })

        tokens = tokenize(
            'select max(concat(b, "hi")) from my_table group by b')
        node, leftover = select_rule(tokens)
        self.assertFalse(leftover)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [['1hi'], ['2hi']])

    @data(
        dict(count='COUNT(*)', expected_result=[[2]]),
        dict(count='COUNT(c)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT c)', expected_result=[[1]]),
        dict(count='COUNT(b)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT b)', expected_result=[[2]]),
        dict(count='COUNT(a)', expected_result=[[1]]),
    )
    @unpack
    def test_count(self, count, expected_result):
        # type: (str, List[List[int]]) -> None
        count_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [None, 4, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })
        select, leftover = select_rule(
            tokenize('SELECT {} FROM my_table'.format(count)))
        self.assertFalse(leftover)
        assert isinstance(select, Select)
        dataframe, unused_table_name = select.get_dataframe(
            count_table_context)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(('IS_NULL', [True, False]), ('IS_NOT_NULL', [False, True]))
    @unpack
    def test_null_check(self, direction, result):
        # type: (str, List[bool]) -> None
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, None], [2, 3]], columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })

        context = EvaluationContext(table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        expression = Field(('b', ))
        null_check = NullCheck(expression, direction)

        typed_series = null_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data(('IN', [True, False]), ('NOT_IN', [False, True]))
    @unpack
    def test_in_check(self, direction, result):
        # type: (str, List[bool]) -> None
        expression = Field(('a', ))
        elements = (Value(1, type_=BQScalarType.INTEGER),
                    Value(3, type_=BQScalarType.INTEGER))
        in_check = InCheck(expression, direction, elements)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        typed_series = in_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data((True, 0), (False, 1))
    @unpack
    def test_if_empty_context(self, condition_bool, result):
        # type: (bool, int) -> None
        condition = Value(condition_bool, BQScalarType.BOOLEAN)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        # IF [condition] THEN 0 ELSE 1
        if_expression = If(condition, then, else_)

        typed_series = if_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result])

    def test_if(self):
        condition = BinaryExpression(Field(('a', )), '>',
                                     Value(1, BQScalarType.INTEGER))
        then = Value('yes', BQScalarType.STRING)
        else_ = Value('no', BQScalarType.STRING)
        # IF a > 1 THEN "yes" ELSE "no"
        if_expression = If(condition, then, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        typed_series = if_expression.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), ['no', 'yes'])

    def test_if_different_types(self):
        condition = Value(True, BQScalarType.BOOLEAN)
        then = Value('yes', BQScalarType.STRING)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = (r"Cannot implicitly coerce the given types: "
                 r"\(BQScalarType.STRING, BQScalarType.INTEGER\)")
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_if_error(self):
        condition = Value(5, BQScalarType.INTEGER)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = escape("IF condition isn't boolean! Found: {}".format(
            str(condition.evaluate(EMPTY_CONTEXT))))
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_not(self):
        expression = Value(True, BQScalarType.BOOLEAN)
        not_expression = Not(expression)

        typed_series = not_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [False])

    def test_not_type_error(self):
        expression = Value(5, BQScalarType.INTEGER)
        not_expression = Not(expression)

        with self.assertRaisesRegexp(ValueError, ""):
            not_expression.evaluate(EMPTY_CONTEXT)

    @data(
        (1, BQScalarType.INTEGER, -1),
        (1.0, BQScalarType.FLOAT, -1.0),
    )
    @unpack
    def test_unary_negation(self, initial_value, value_type, result_value):
        # type: (Any, BQScalarType, Any) -> None
        expression = Value(initial_value, value_type)
        negation = UnaryNegation(expression)

        typed_series = negation.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result_value])

    @data(
        ("abc", BQScalarType.STRING),
        (True, BQScalarType.BOOLEAN),
    )
    @unpack
    def test_unary_negation_error(self, value, value_type):
        # type: (Any, BQScalarType) -> None
        expression = Value(value, value_type)
        negation = UnaryNegation(expression)

        error = (
            "UnaryNegation expression supports only integers and floats, got: {}"
            .format(value_type))
        with self.assertRaisesRegexp(TypeError, error):
            negation.evaluate(EMPTY_CONTEXT)

    @data(
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER),
                     Value("one", BQScalarType.STRING)),
                    (Value(2, BQScalarType.INTEGER),
                     Value("two", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "two"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER),
                     Value("one", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "other"]),
        dict(comparand=EMPTY_NODE,
             whens=[(Value(True, BQScalarType.BOOLEAN),
                     Value("yes", BQScalarType.STRING)),
                    (Value(False, BQScalarType.BOOLEAN),
                     Value("no", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["yes", "yes"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER),
                     Value("one", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["one", None]),
    )
    @unpack
    def test_case_with_comparand(
            self,
            comparand,  # type: Union[_EmptyNode, EvaluatableNode]
            whens,  # type: List[Tuple[AbstractSyntaxTreeNode, EvaluatableNode]]
            else_,  # type: EvaluatableNode
            result  # type: List[str]
    ):
        # type: (...) -> None
        case = Case(comparand, whens, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')),
            EMPTY_NODE)
        typed_series = case.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)