def test_get_columns_with_int(self):
    """A string column is a dimension; an int column is a sum-aggregated metric."""
    rows = [('a1', 1), ('a2', 2)]
    description = (('a', 'string'), ('b', 'int'))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {
            'is_date': False,
            'type': 'STRING',
            'name': 'a',
            'is_dim': True,
        },
        {
            'is_date': False,
            'type': 'INT',
            'name': 'b',
            'is_dim': False,
            'agg': 'sum',
        },
    ]
    self.assertEqual(cdf.columns, expected)
def test_df_conversion_no_dict(self):
    """Plain scalar rows convert without losing rows or columns."""
    cols = [
        ["string_col", "string"],
        ["int_col", "int"],
        ["float_col", "float"],
    ]
    rows = [["a", 4, 4.0]]
    cdf = SupersetDataFrame(rows, cols, BaseEngineSpec)
    self.assertEqual(cdf.size, len(rows))
    self.assertEqual(len(cdf.columns), len(cols))
def test_df_conversion_dict(self):
    """Rows containing dict values convert without losing rows or columns."""
    cols = ["string_col", "dict_col", "int_col"]
    data = [["a", {"c1": 1, "c2": 2, "c3": 3}, 4]]
    cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(data), cdf.size)
    self.assertEqual(len(cols), len(cdf.columns))
def test_get_columns_basic(self):
    """All-string cursor columns are reported as string dimensions."""
    rows = [('a1', 'b1', 'c1'), ('a2', 'b2', 'c2')]
    description = (('a', 'string'), ('b', 'string'), ('c', 'string'))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {
            'is_date': False,
            'type': 'STRING',
            'name': name,
            'is_dim': True,
        }
        for name in ('a', 'b', 'c')
    ]
    self.assertEqual(cdf.columns, expected)
def test_df_conversion_tuple(self):
    """Tuple rows (including a list-valued cell) convert without loss."""
    cols = ["string_col", "int_col", "list_col", "float_col"]
    data = [("Text", 111, [123], 1.0)]
    cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(data), cdf.size)
    self.assertEqual(len(cols), len(cdf.columns))
def test_df_conversion_tuple(self):
    """Tuple rows (including a list-valued cell) convert without loss."""
    cols = ['string_col', 'int_col', 'list_col', 'float_col']
    # The u'' prefix is redundant on Python 3 — all str literals are unicode.
    data = [('Text', 111, [123], 1.0)]
    cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(data), cdf.size)
    self.assertEqual(len(cols), len(cdf.columns))
def test_get_columns_basic(self):
    """All-string cursor columns are reported as string dimensions."""
    rows = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    description = (("a", "string"), ("b", "string"), ("c", "string"))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {"is_date": False, "type": "STRING", "name": name, "is_dim": True}
        for name in ("a", "b", "c")
    ]
    self.assertEqual(cdf.columns, expected)
def test_df_conversion_dict(self):
    """Rows containing dict values convert without losing rows or columns."""
    cols = ['string_col', 'dict_col', 'int_col']
    data = [['a', {'c1': 1, 'c2': 2, 'c3': 3}, 4]]
    cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(data), cdf.size)
    self.assertEqual(len(cols), len(cdf.columns))
def test_msgpack_payload_serialization(self):
    """_serialize_payload yields bytes when the new deserialization path is on."""
    use_new_deserialization = True
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    spec = BaseEngineSpec()
    cdf = SupersetDataFrame(rows, description, spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(cdf, spec, use_new_deserialization)
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }
    serialized = sql_lab._serialize_payload(payload, use_new_deserialization)
    self.assertIsInstance(serialized, bytes)
def test_get_columns_type_inference(self):
    """With no cursor type info, float and int types are inferred from data."""
    rows = [(1.2, 1), (3.14, 2)]
    description = (('a', None), ('b', None))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {
            'is_date': False,
            'type': 'FLOAT',
            'name': 'a',
            'is_dim': False,
            'agg': 'sum',
        },
        {
            'is_date': False,
            'type': 'INT',
            'name': 'b',
            'is_dim': False,
            'agg': 'sum',
        },
    ]
    self.assertEqual(cdf.columns, expected)
def test_empty_data(self):
    """With zero rows, Presto dtypes still come from the cursor description."""
    description = [
        ("one", "varchar", None, None, None, None, True),
        ("two", "integer", None, None, None, None, True),
    ]
    cdf = SupersetDataFrame([], description, PrestoEngineSpec)
    dtypes = cdf.raw_df.dtypes
    self.assertEqual(np.dtype("O"), dtypes[0])
    self.assertEqual(pd.Int64Dtype(), dtypes[1])
def test_int64_with_missing_data(self):
    """Nullable bigints keep exact values only when the engine supplies a dtype."""
    rows = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
    description = [("user_id", "bigint", None, None, None, None, True)]

    # BaseEngineSpec derives no dtype from the cursor description, so the
    # missing values force float64 and the large int loses precision.
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    np.testing.assert_array_equal(
        cdf.raw_df.values.tolist(),
        [[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
    )

    # Presto is currently the only engine providing a dtype from the cursor
    # description, so the exact integer value survives there.
    cdf = SupersetDataFrame(rows, description, PrestoEngineSpec)
    np.testing.assert_array_equal(
        cdf.raw_df.values.tolist(),
        [[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
    )
def test_df_conversion_no_dict(self):
    """Plain scalar rows convert without losing rows or columns."""
    cols = [
        ['string_col', 'string'],
        ['int_col', 'int'],
        ['float_col', 'float'],
    ]
    data = [['a', 4, 4.0]]
    cdf = SupersetDataFrame(data, cols, BaseEngineSpec)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(data), cdf.size)
    self.assertEqual(len(cols), len(cdf.columns))
def test_dedup_with_data(self):
    """Duplicate column names get a __N suffix so both columns survive."""
    rows = [('a', 1), ('a', 2)]
    description = (('a', 'string'), ('a', 'string'))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    self.assertListEqual(cdf.column_names, ['a', 'a__1'])
def test_get_columns_with_int(self):
    """A string column is a dimension; an int column is a sum-aggregated metric."""
    rows = [("a1", 1), ("a2", 2)]
    description = (("a", "string"), ("b", "int"))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
        {
            "is_date": False,
            "type": "INT",
            "name": "b",
            "is_dim": False,
            "agg": "sum",
        },
    ]
    self.assertEqual(cdf.columns, expected)
def test_new_data_serialization(self):
    """The new serialization path returns bytes and never calls expand_data."""
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    spec = BaseEngineSpec()
    cdf = SupersetDataFrame(rows, description, spec)
    with mock.patch.object(
        spec, "expand_data", wraps=spec.expand_data
    ) as expand_data:
        data, selected_columns, all_columns, expanded_columns = (
            sql_lab._serialize_and_expand_data(cdf, spec, True)
        )
    expand_data.assert_not_called()
    self.assertIsInstance(data, bytes)
def test_get_columns_type_inference(self):
    """With no cursor type info, float and int types are inferred from data."""
    rows = [(1.2, 1), (3.14, 2)]
    description = (("a", None), ("b", None))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    expected = [
        {
            "is_date": False,
            "type": "FLOAT",
            "name": "a",
            "is_dim": False,
            "agg": "sum",
        },
        {
            "is_date": False,
            "type": "INT",
            "name": "b",
            "is_dim": False,
            "agg": "sum",
        },
    ]
    self.assertEqual(cdf.columns, expected)
def execute_sql_statement(sql_statement, query, user_name, session, cursor):
    """Executes a single SQL statement.

    Validates the statement against the database's DML/CTA policy, applies
    row limits and the optional query mutator, runs it through the engine
    spec, and returns the fetched results as a SupersetDataFrame.

    :param sql_statement: raw SQL text of the single statement to run
    :param query: Query model object; mutated in place (executed_sql,
        limit, tmp_table_name, select_as_cta_used) and committed via session
    :param user_name: effective username, passed to log_query and the mutator
    :param session: SQLAlchemy session used to persist query state
    :param cursor: live DB-API cursor to execute against
    :raises SqlLabSecurityException: non-read-only SQL on a no-DML database
    :raises SqlLabException: CTA requested for a non-SELECT, or any engine error
    :raises SqlLabTimeoutException: Celery soft time limit exceeded
    """
    database = query.database
    db_engine_spec = database.db_engine_spec
    parsed_query = ParsedQuery(sql_statement)
    sql = parsed_query.stripped()

    # Refuse mutating statements unless the database explicitly allows DML.
    if not parsed_query.is_readonly() and not database.allow_dml:
        raise SqlLabSecurityException(
            _("Only `SELECT` statements are allowed against this database"))
    if query.select_as_cta:
        # CREATE TABLE AS: wrap the SELECT into a CTA against a temp table.
        if not parsed_query.is_select():
            raise SqlLabException(
                _("Only `SELECT` statements can be used with the CREATE TABLE "
                  "feature."))
        if not query.tmp_table_name:
            # Derive a unique temp table name from user id and start time.
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = "tmp_{}_table_{}".format(
                query.user_id, start_dttm.strftime("%Y_%m_%d_%H_%M_%S"))
        sql = parsed_query.as_create_table(query.tmp_table_name)
        query.select_as_cta_used = True
    if parsed_query.is_select():
        # Clamp the requested limit to the configured SQL_MAX_ROW ceiling.
        if SQL_MAX_ROW and (not query.limit or query.limit > SQL_MAX_ROW):
            query.limit = SQL_MAX_ROW
        if query.limit:
            sql = database.apply_limit_to_sql(sql, query.limit)

    # Hook to allow environment-specific mutation (usually comments) to the SQL
    if SQL_QUERY_MUTATOR:
        sql = SQL_QUERY_MUTATOR(sql, user_name, security_manager, database)

    try:
        if log_query:
            log_query(
                query.database.sqlalchemy_uri,
                query.executed_sql,
                query.schema,
                user_name,
                __name__,
                security_manager,
            )
        # Persist the final (limited/mutated) SQL before executing it.
        query.executed_sql = sql
        session.commit()
        with stats_timing("sqllab.query.time_executing_query", stats_logger):
            logger.info(f"Query {query.id}: Running query: \n{sql}")
            db_engine_spec.execute(cursor, sql, async_=True)
        logger.info(f"Query {query.id}: Handling cursor")
        db_engine_spec.handle_cursor(cursor, query, session)
        with stats_timing("sqllab.query.time_fetching_results", stats_logger):
            logger.debug(
                "Query %d: Fetching data for query object: %s",
                query.id,
                str(query.to_dict()),
            )
            data = db_engine_spec.fetch_data(cursor, query.limit)
    except SoftTimeLimitExceeded as e:
        # Celery soft time limit fired mid-execution; surface as a timeout.
        logger.exception(f"Query {query.id}: {e}")
        raise SqlLabTimeoutException(
            "SQL Lab timeout. This environment's policy is to kill queries "
            "after {} seconds.".format(SQLLAB_TIMEOUT))
    except Exception as e:
        logger.exception(f"Query {query.id}: {e}")
        raise SqlLabException(db_engine_spec.extract_error_message(e))

    logger.debug(f"Query {query.id}: Fetching cursor description")
    cursor_description = cursor.description
    return SupersetDataFrame(data, cursor_description, db_engine_spec)
def test_dedup_with_data(self):
    """Duplicate column names get a __N suffix so both columns survive."""
    rows = [("a", 1), ("a", 2)]
    description = (("a", "string"), ("a", "string"))
    cdf = SupersetDataFrame(rows, description, BaseEngineSpec)
    self.assertListEqual(cdf.column_names, ["a", "a__1"])
def test_pandas_datetime64(self):
    """An all-NULL timestamp column still gets the datetime64[ns] dtype."""
    description = [("ds", "timestamp", None, None, None, None, True)]
    cdf = SupersetDataFrame([(None, )], description, PrestoEngineSpec)
    self.assertEqual(np.dtype("<M8[ns]"), cdf.raw_df.dtypes[0])