def test_nullable_bool(self):
    data = [(None,), (True,), (None,), (None,), (None,), (None,)]
    cursor_descr = [("is_test", "bool", None, None, None, None, True)]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns[0]["type"], "BOOL")
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [
            {"is_test": None},
            {"is_test": True},
            {"is_test": None},
            {"is_test": None},
            {"is_test": None},
            {"is_test": None},
        ],
    )
def test_single_column_multidim_nested_types(self):
    data = [
        (
            [
                "test",
                [
                    [
                        "foo",
                        123456,
                        [
                            [["test"], 3432546, 7657658766],
                            [["fake"], 656756765, 324324324324],
                        ],
                    ]
                ],
                ["test2", 43, 765765765],
                None,
                None,
            ],
        )
    ]
    cursor_descr = [("metadata",)]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns[0]["type"], "STRING")
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [
            {
                "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'
            }
        ],
    )
def test_js_max_int() -> None:
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
    cursor_descr: DbapiDescription = [
        ("a", "int", None, None, None, None, False),
        ("b", "int", None, None, None, None, False),
        ("c", "string", None, None, None, None, False),
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()
    assert df_to_records(df) == [
        {"a": 1, "b": "1239162456494753670", "c": "c1"},
        {"a": 2, "b": 100, "c": "c2"},
    ]
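
# Why the big value above comes back as a string: JavaScript numbers are
# IEEE-754 doubles, so integers beyond 2**53 - 1 (Number.MAX_SAFE_INTEGER)
# lose precision in the browser. A minimal sketch of the kind of threshold
# check the result set applies; the helper name below is illustrative, not
# Superset's actual implementation.

JS_MAX_SAFE_INTEGER = 2**53 - 1  # 9007199254740991


def stringify_if_unsafe_for_js(value: int):
    """Return the value unchanged if JS can hold it exactly, else a string."""
    return str(value) if abs(value) > JS_MAX_SAFE_INTEGER else value


assert stringify_if_unsafe_for_js(100) == 100
assert stringify_if_unsafe_for_js(1239162456494753670) == "1239162456494753670"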
def test_nested_list_types(self):
    data = [([{"TestKey": [123456, "foo"]}],)]
    cursor_descr = [("metadata",)]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns[0]["type"], "STRING")
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df), [{"metadata": '[{"TestKey": [123456, "foo"]}]'}]
    )
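
# A flat SQL result grid cannot hold nested structures natively, so the
# result set JSON-encodes them into the STRING cell asserted above (note the
# lowercase "null" in test_single_column_multidim_nested_types, a JSON
# artifact). A standalone sketch of the encoding step; json.dumps stands in
# for whatever serializer Superset actually uses.

import json

cell = [{"TestKey": [123456, "foo"]}]
assert json.dumps(cell) == '[{"TestKey": [123456, "foo"]}]'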
def test_results_msgpack_deserialization(self):
    use_new_deserialization = True
    data = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    cursor_descr = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(data, cursor_descr, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(
        results, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }
    serialized_payload = sql_lab._serialize_payload(
        payload, use_new_deserialization
    )
    self.assertIsInstance(serialized_payload, bytes)

    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        query_mock = mock.Mock()
        query_mock.database.db_engine_spec.expand_data = expand_data
        deserialized_payload = superset.views.utils._deserialize_results_payload(
            serialized_payload, query_mock, use_new_deserialization
        )
        df = results.to_pandas_df()
        payload["data"] = dataframe.df_to_records(df)

        self.assertDictEqual(deserialized_payload, payload)
        expand_data.assert_called_once()
def test_max_pandas_timestamp(input_, expected) -> None:
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    cursor_descr: DbapiDescription = [
        ("a", "datetime", None, None, None, None, False),
        ("b", "int", None, None, None, None, False),
    ]
    results = SupersetResultSet(input_, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()
    assert df_to_records(df) == expected
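
# Context for the parametrized cases: pandas stores timestamps as int64
# nanoseconds since the epoch, so datetimes beyond pd.Timestamp.max
# (April 11, 2262) are unrepresentable and need special handling. A quick
# illustration of the boundary.

import pandas as pd

assert pd.Timestamp.max.year == 2262
assert pd.Timestamp("2262-04-11") < pd.Timestamp.max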
def test_data_as_list_of_lists(self):
    data = [[1, "a"], [2, "b"]]
    cursor_descr = [
        ("user_id", "INT", None, None, None, None, True),
        ("username", "STRING", None, None, None, None, True),
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [{"user_id": 1, "username": "a"}, {"user_id": 2, "username": "b"}],
    )
def test_nested_types(self):
    data = [
        (
            4,
            [{"table_name": "unicode_test", "database_id": 1}],
            [1, 2, 3],
            {"chart_name": "scatter"},
        ),
        (
            3,
            [{"table_name": "birth_names", "database_id": 1}],
            [4, 5, 6],
            {"chart_name": "plot"},
        ),
    ]
    cursor_descr = [("id",), ("dict_arr",), ("num_arr",), ("map_col",)]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns[0]["type"], "INT")
    self.assertEqual(results.columns[1]["type"], "STRING")
    self.assertEqual(results.columns[2]["type"], "STRING")
    self.assertEqual(results.columns[3]["type"], "STRING")
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [
            {
                "id": 4,
                "dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
                "num_arr": "[1, 2, 3]",
                "map_col": '{"chart_name": "scatter"}',
            },
            {
                "id": 3,
                "dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
                "num_arr": "[4, 5, 6]",
                "map_col": '{"chart_name": "plot"}',
            },
        ],
    )
def get_df(  # pylint: disable=too-many-locals
    self,
    sql: str,
    schema: Optional[str] = None,
    mutator: Optional[Callable[[pd.DataFrame], None]] = None,
) -> pd.DataFrame:
    sqls = self.db_engine_spec.parse_sql(sql)
    engine = self.get_sqla_engine(schema)

    def needs_conversion(df_series: pd.Series) -> bool:
        return (
            not df_series.empty
            and isinstance(df_series, pd.Series)
            and isinstance(df_series[0], (list, dict))
        )

    def _log_query(sql: str) -> None:
        if log_query:
            log_query(
                engine.url,
                sql,
                schema,
                get_username(),
                __name__,
                security_manager,
            )

    with closing(engine.raw_connection()) as conn:
        cursor = conn.cursor()
        for sql_ in sqls[:-1]:
            _log_query(sql_)
            self.db_engine_spec.execute(cursor, sql_)
            cursor.fetchall()

        _log_query(sqls[-1])
        self.db_engine_spec.execute(cursor, sqls[-1])
        data = self.db_engine_spec.fetch_data(cursor)
        result_set = SupersetResultSet(
            data, cursor.description, self.db_engine_spec
        )
        df = result_set.to_pandas_df()
        if mutator:
            df = mutator(df)

        for col, coltype in df.dtypes.to_dict().items():
            if coltype == numpy.object_ and needs_conversion(df[col]):
                df[col] = df[col].apply(utils.json_dumps_w_dates)

        return df
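
# The object-column loop at the end of get_df serializes list/dict cells to
# JSON strings so the DataFrame is safe to render. A self-contained sketch of
# the same idea, with json.dumps standing in for utils.json_dumps_w_dates
# (which, per its name, presumably also handles date/datetime values).

import json

import numpy
import pandas as pd

df = pd.DataFrame({"plain": [1, 2], "nested": [[1, 2], {"k": "v"}]})
for col, coltype in df.dtypes.to_dict().items():
    if coltype == numpy.object_ and isinstance(df[col][0], (list, dict)):
        df[col] = df[col].apply(json.dumps)

assert list(df["nested"]) == ["[1, 2]", '{"k": "v"}']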
def test_is_date(self):
    data = [("a", 1), ("a", 2)]
    cursor_descr = (("a", "string"), ("a", "string"))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.is_temporal("DATE"), True)
    self.assertEqual(results.is_temporal("DATETIME"), True)
    self.assertEqual(results.is_temporal("TIME"), True)
    self.assertEqual(results.is_temporal("TIMESTAMP"), True)
    self.assertEqual(results.is_temporal("STRING"), False)
    self.assertEqual(results.is_temporal(""), False)
    self.assertEqual(results.is_temporal(None), False)
def test_msgpack_payload_serialization(self):
    use_new_deserialization = True
    data = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    cursor_descr = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(data, cursor_descr, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(
        results, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }
    serialized = sql_lab._serialize_payload(payload, use_new_deserialization)
    self.assertIsInstance(serialized, bytes)
def test_pa_conversion_tuple(self):
    cols = ["string_col", "int_col", "list_col", "float_col"]
    data = [("Text", 111, [123], 1.0)]
    results = SupersetResultSet(data, cols, BaseEngineSpec)

    self.assertEqual(len(data), results.size)
    self.assertEqual(len(cols), len(results.columns))
def test_pa_conversion_dict(self):
    cols = ["string_col", "dict_col", "int_col"]
    data = [["a", {"c1": 1, "c2": 2, "c3": 3}, 4]]
    results = SupersetResultSet(data, cols, BaseEngineSpec)

    self.assertEqual(len(data), results.size)
    self.assertEqual(len(cols), len(results.columns))
def _serialize_and_expand_data(
    result_set: SupersetResultSet,
    db_engine_spec: BaseEngineSpec,
    use_msgpack: Optional[bool] = False,
    expand_data: bool = False,
) -> Tuple[Union[bytes, str], List[Any], List[Any], List[Any]]:
    selected_columns = result_set.columns
    all_columns: List[Any]
    expanded_columns: List[Any]

    if use_msgpack:
        with stats_timing(
            "sqllab.query.results_backend_pa_serialization", stats_logger
        ):
            data = (
                pa.default_serialization_context()
                .serialize(result_set.pa_table)
                .to_buffer()
                .to_pybytes()
            )
        # expand when loading data from results backend
        all_columns, expanded_columns = (selected_columns, [])
    else:
        df = result_set.to_pandas_df()
        data = df_to_records(df) or []

        if expand_data:
            all_columns, data, expanded_columns = db_engine_spec.expand_data(
                selected_columns, data
            )
        else:
            all_columns = selected_columns
            expanded_columns = []

    return (data, selected_columns, all_columns, expanded_columns)
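
# Round-tripping the Arrow buffer produced in the use_msgpack branch above.
# Note that pa.default_serialization_context() is PyArrow's legacy
# serialization API (deprecated and later removed), so this sketch only runs
# on the older PyArrow versions this code targets.

import pyarrow as pa

table = pa.Table.from_pydict({"a": [1, 2], "b": ["x", "y"]})
ctx = pa.default_serialization_context()
buf = ctx.serialize(table).to_buffer().to_pybytes()
assert ctx.deserialize(buf).equals(table)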
def test_pa_conversion_no_dict(self):
    cols = [["string_col", "string"], ["int_col", "int"], ["float_col", "float"]]
    data = [["a", 4, 4.0]]
    results = SupersetResultSet(data, cols, BaseEngineSpec)

    self.assertEqual(len(data), results.size)
    self.assertEqual(len(cols), len(results.columns))
def test_msgpack_payload_serialization():
    use_new_deserialization = True
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(SERIALIZATION_DATA, CURSOR_DESCR, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(
        results, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }
    serialized = sql_lab._serialize_payload(payload, use_new_deserialization)
    assert isinstance(serialized, bytes)
def test_new_data_serialization(self):
    data = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    cursor_descr = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(data, cursor_descr, db_engine_spec)

    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        (
            data,
            selected_columns,
            all_columns,
            expanded_columns,
        ) = sql_lab._serialize_and_expand_data(results, db_engine_spec, True)
        expand_data.assert_not_called()

    self.assertIsInstance(data, bytes)
def test_mssql_engine_spec_pymssql(self):
    # Test for case when tuple is returned (pymssql)
    data = [
        (1, 1, datetime.datetime(2017, 10, 19, 23, 39, 16, 660000)),
        (2, 2, datetime.datetime(2018, 10, 19, 23, 39, 16, 660000)),
    ]
    results = SupersetResultSet(
        list(data), [["col1"], ["col2"], ["col3"]], MssqlEngineSpec
    )
    df = results.to_pandas_df()
    data = dataframe.df_to_records(df)
    self.assertEqual(len(data), 2)
    self.assertEqual(
        data[0],
        {"col1": 1, "col2": 1, "col3": pd.Timestamp("2017-10-19 23:39:16.660000")},
    )
def test_column_names_as_bytes() -> None:
    """
    Test that we can handle column names as bytes.
    """
    from superset.db_engine_specs.redshift import RedshiftEngineSpec
    from superset.result_set import SupersetResultSet

    data = (
        [
            "2016-01-26",
            392.002014,
            397.765991,
            390.575012,
            392.153015,
            392.153015,
            58147000,
        ],
        [
            "2016-01-27",
            392.444,
            396.842987,
            391.782013,
            394.971985,
            394.971985,
            47424400,
        ],
    )
    description = [
        (b"date", 1043, None, None, None, None, None),
        (b"open", 701, None, None, None, None, None),
        (b"high", 701, None, None, None, None, None),
        (b"low", 701, None, None, None, None, None),
        (b"close", 701, None, None, None, None, None),
        (b"adj close", 701, None, None, None, None, None),
        (b"volume", 20, None, None, None, None, None),
    ]
    result_set = SupersetResultSet(data, description, RedshiftEngineSpec)  # type: ignore
    assert (
        result_set.to_pandas_df().to_markdown()
        == """
|    | date       |    open |    high |     low |   close |   adj close |   volume |
|---:|:-----------|--------:|--------:|--------:|--------:|------------:|---------:|
|  0 | 2016-01-26 | 392.002 | 397.766 | 390.575 | 392.153 |     392.153 | 58147000 |
|  1 | 2016-01-27 | 392.444 | 396.843 | 391.782 | 394.972 |     394.972 | 47424400 |
""".strip()
    )
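
# The behavior under test: some drivers (here Redshift's) return column names
# as bytes, which must be decoded before they can serve as DataFrame labels.
# A minimal sketch of the normalization step; the helper name is illustrative,
# not the actual Superset code.

def normalize_column_name(name) -> str:
    return name.decode("utf-8") if isinstance(name, bytes) else str(name)


assert normalize_column_name(b"adj close") == "adj close"
assert normalize_column_name("volume") == "volume"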
def test_empty_data(self):
    data = []
    cursor_descr = [
        ("emptyone", "varchar", None, None, None, None, True),
        ("emptytwo", "int", None, None, None, None, True),
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns, [])
def test_df_to_records(self):
    data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [
            {"a": "a1", "b": "b1", "c": "c1"},
            {"a": "a2", "b": "b2", "c": "c2"},
        ],
    )
def test_no_type_coercion(self):
    data = [("a", 1), ("b", 2)]
    cursor_descr = [
        ("one", "varchar", None, None, None, None, True),
        ("two", "int", None, None, None, None, True),
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(results.columns[0]["type"], "VARCHAR")
    self.assertEqual(results.columns[1]["type"], "INT")
def test_new_data_serialization():
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(SERIALIZATION_DATA, CURSOR_DESCR, db_engine_spec)

    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        data = sql_lab._serialize_and_expand_data(results, db_engine_spec, True)
        expand_data.assert_not_called()

    assert isinstance(data[0], bytes)
def test_dataframe_timezone(self):
    tz = pytz.FixedOffset(60)
    data = [
        (datetime.datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=tz),),
        (datetime.datetime(2017, 11, 18, 22, 6, 30, tzinfo=tz),),
    ]
    results = SupersetResultSet(list(data), [["data"]], BaseEngineSpec)
    df = results.to_pandas_df()
    data = dataframe.df_to_records(df)
    json_str = json.dumps(data, default=utils.pessimistic_json_iso_dttm_ser)
    self.assertDictEqual(
        data[0], {"data": pd.Timestamp("2017-11-18 21:53:00.219225+0100", tz=tz)}
    )
    self.assertDictEqual(
        data[1], {"data": pd.Timestamp("2017-11-18 22:06:30+0100", tz=tz)}
    )
    self.assertEqual(
        json_str,
        '[{"data": "2017-11-18T21:53:00.219225+01:00"}, {"data": "2017-11-18T22:06:30+01:00"}]',
    )
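
# The ISO strings asserted above keep the +01:00 offset because the
# timestamps are timezone-aware. A standalone illustration using only pandas,
# assuming (as the expected JSON suggests) that the serializer defers to
# isoformat() for aware timestamps.

import pandas as pd
import pytz

ts = pd.Timestamp("2017-11-18 22:06:30", tz=pytz.FixedOffset(60))
assert ts.isoformat() == "2017-11-18T22:06:30+01:00"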
def test_get_columns_with_int(self):
    data = [("a1", 1), ("a2", 2)]
    cursor_descr = (("a", "string"), ("b", "int"))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(
        results.columns,
        [
            {"is_date": False, "type": "STRING", "name": "a"},
            {"is_date": False, "type": "INT", "name": "b"},
        ],
    )
def test_js_max_int(self):
    data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
    cursor_descr = (("a", "int"), ("b", "int"), ("c", "string"))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()
    self.assertEqual(
        df_to_records(df),
        [
            {"a": 1, "b": "1239162456494753670", "c": "c1"},
            {"a": 2, "b": 100, "c": "c2"},
        ],
    )
def test_get_columns_basic(self):
    data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(
        results.columns,
        [
            {"is_date": False, "type": "STRING", "name": "a"},
            {"is_date": False, "type": "STRING", "name": "b"},
            {"is_date": False, "type": "STRING", "name": "c"},
        ],
    )
def test_df_to_records() -> None:
    from superset.db_engine_specs import BaseEngineSpec
    from superset.result_set import SupersetResultSet

    data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    cursor_descr: DbapiDescription = [
        (column, "string", None, None, None, None, False)
        for column in ("a", "b", "c")
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()

    assert df_to_records(df) == [
        {"a": "a1", "b": "b1", "c": "c1"},
        {"a": "a2", "b": "b2", "c": "c2"},
    ]
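
# df_to_records is essentially a records-orient dict conversion. A minimal
# equivalent using only pandas; the real helper may additionally normalize
# duplicate column names and NaN/NaT values, which this sketch ignores.

import pandas as pd

df = pd.DataFrame({"a": ["a1", "a2"], "b": ["b1", "b2"]})
assert df.to_dict(orient="records") == [
    {"a": "a1", "b": "b1"},
    {"a": "a2", "b": "b2"},
]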
def test_results_default_deserialization(self):
    use_new_deserialization = False
    data = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    cursor_descr = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    db_engine_spec = BaseEngineSpec()
    results = SupersetResultSet(data, cursor_descr, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(
        results, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }
    serialized_payload = sql_lab._serialize_payload(
        payload, use_new_deserialization
    )
    self.assertIsInstance(serialized_payload, str)

    query_mock = mock.Mock()
    deserialized_payload = superset.views.utils._deserialize_results_payload(
        serialized_payload, query_mock, use_new_deserialization
    )

    self.assertDictEqual(deserialized_payload, payload)
    query_mock.assert_not_called()
def test_get_columns_type_inference(self):
    data = [
        (1.2, 1, "foo", datetime(2018, 10, 19, 23, 39, 16, 660000), True),
        (3.14, 2, "bar", datetime(2019, 10, 19, 23, 39, 16, 660000), False),
    ]
    cursor_descr = (("a", None), ("b", None), ("c", None), ("d", None), ("e", None))
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    self.assertEqual(
        results.columns,
        [
            {"is_date": False, "type": "FLOAT", "name": "a"},
            {"is_date": False, "type": "INT", "name": "b"},
            {"is_date": False, "type": "STRING", "name": "c"},
            {"is_date": True, "type": "DATETIME", "name": "d"},
            {"is_date": False, "type": "BOOL", "name": "e"},
        ],
    )
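
# How the coarse labels above could be inferred: with no type info in the
# cursor description, the result set derives an Arrow type per column and
# maps it to a generic name. The mapping below is an illustrative sketch,
# not Superset's exact implementation.

import pyarrow as pa


def generic_type_name(arrow_type: pa.DataType) -> str:
    if pa.types.is_boolean(arrow_type):
        return "BOOL"
    if pa.types.is_integer(arrow_type):
        return "INT"
    if pa.types.is_floating(arrow_type):
        return "FLOAT"
    if pa.types.is_timestamp(arrow_type):
        return "DATETIME"
    return "STRING"


assert generic_type_name(pa.array([1.2, 3.14]).type) == "FLOAT"
assert generic_type_name(pa.array([True, False]).type) == "BOOL"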