def test_is_column_name_quoted(self):
    """Check `_is_column_name_quoted` on unquoted, partially quoted and fully quoted names."""
    # (candidate name, expected result); only the last candidate both starts
    # and ends with a double quote.
    cases = (
        ("mock", False),
        ('"mock', False),
        ('"moc"k', False),
        ('"moc"k"', True),
    )
    for candidate, expected in cases:
        assert PrestoEngineSpec._is_column_name_quoted(candidate) is expected
def test_convert_dttm(self):
    """Check the SQL expressions `convert_dttm` returns for DATE and TIMESTAMP targets."""
    moment = self.get_dttm()

    date_expr = PrestoEngineSpec.convert_dttm("DATE", moment)
    self.assertEqual(date_expr, "from_iso8601_date('2019-01-02')")

    timestamp_expr = PrestoEngineSpec.convert_dttm("TIMESTAMP", moment)
    self.assertEqual(
        timestamp_expr,
        "from_iso8601_timestamp('2019-01-02T03:04:05.678900')",
    )
def test_split_data_type(self):
    """Check `_split_data_type` with space and comma delimiters.

    The last case expects a delimiter inside a double-quoted segment to be
    left alone.
    """
    # (raw type string, delimiter, expected pieces)
    cases = (
        ("value1 value2", " ", ["value1", "value2"]),
        ("value1,value2", ",", ["value1", "value2"]),
        ('"value,1",value2', ",", ['"value,1"', "value2"]),
    )
    for raw, delimiter, expected in cases:
        pieces = PrestoEngineSpec._split_data_type(raw, delimiter)
        assert pieces == expected
def test_select_star_no_presto_expand_data(self, mock_select_star):
    """Check the arguments `select_star` forwards to the mocked `select_star`.

    The column list is expected to be passed through unchanged.
    NOTE(review): `mock_select_star` is presumably injected by a patch
    decorator that is not visible here -- confirm.
    """
    database, engine = mock.Mock(), mock.Mock()
    table_name = "table_name"
    cols = [{"col1": "val1"}, {"col2": "val2"}]

    PrestoEngineSpec.select_star(database, table_name, engine, cols=cols)

    forwarded = (database, table_name, engine, None, 100, False, True, True, cols)
    mock_select_star.assert_called_once_with(*forwarded)
def test_get_sqla_column_type(self):
    """Check the SQLAlchemy types `get_sqla_column_type` returns for several type strings."""
    # String-like types: (type string, expected class, expected length).
    sized_cases = (
        ("varchar(255)", types.VARCHAR, 255),
        ("varchar", types.String, None),
        ("char(10)", types.CHAR, 10),
        ("char", types.CHAR, None),
    )
    for type_name, expected_cls, expected_length in sized_cases:
        resolved = PrestoEngineSpec.get_sqla_column_type(type_name)
        assert isinstance(resolved, expected_cls)
        if expected_length is None:
            assert resolved.length is None
        else:
            assert resolved.length == expected_length

    # Types for which only the class is checked.
    unsized_cases = (
        ("integer", types.Integer),
        ("time", types.Time),
        ("timestamp", types.TIMESTAMP),
    )
    for type_name, expected_cls in unsized_cases:
        resolved = PrestoEngineSpec.get_sqla_column_type(type_name)
        assert isinstance(resolved, expected_cls)

    # A missing type string is expected to produce no type at all.
    assert PrestoEngineSpec.get_sqla_column_type(None) is None
def test_get_sqla_column_type(self):
    """Check the SQLAlchemy type and generic type `get_column_spec` reports per type string.

    NOTE(review): despite its name, this test mostly exercises
    `get_column_spec`; only the final assertion calls
    `get_sqla_column_type`.
    """
    # String-like types: (type string, expected class, expected length).
    string_cases = (
        ("varchar(255)", types.VARCHAR, 255),
        ("varchar", types.String, None),
        ("char(10)", types.CHAR, 10),
        ("char", types.CHAR, None),
    )
    for type_name, expected_cls, expected_length in string_cases:
        spec = PrestoEngineSpec.get_column_spec(type_name)
        assert isinstance(spec.sqla_type, expected_cls)
        if expected_length is None:
            assert spec.sqla_type.length is None
        else:
            assert spec.sqla_type.length == expected_length
        self.assertEqual(spec.generic_type, GenericDataType.STRING)

    # Non-string types: (type string, expected class, expected generic type).
    other_cases = (
        ("integer", types.Integer, GenericDataType.NUMERIC),
        ("time", types.Time, GenericDataType.TEMPORAL),
        ("timestamp", types.TIMESTAMP, GenericDataType.TEMPORAL),
    )
    for type_name, expected_cls, expected_generic in other_cases:
        spec = PrestoEngineSpec.get_column_spec(type_name)
        assert isinstance(spec.sqla_type, expected_cls)
        self.assertEqual(spec.generic_type, expected_generic)

    assert PrestoEngineSpec.get_sqla_column_type(None) is None
def test_select_star_presto_expand_data(self, mock_select_star, mock_is_feature_enabled):
    """Check which columns `select_star` forwards when the feature flag is on.

    Of the six input columns, only the two whose names contain no dot are
    expected to reach the mocked `select_star`.
    """
    mock_is_feature_enabled.return_value = True
    database, engine = mock.Mock(), mock.Mock()
    table_name = "table_name"

    kept_names = ["val1", "val2<?!@#$312,/'][p098"]
    dotted_names = [".val2", "val2.", "val.2", ".val2."]
    cols = [{"name": name} for name in kept_names + dotted_names]

    PrestoEngineSpec.select_star(
        database, table_name, engine, show_cols=True, cols=cols
    )

    expected_cols = [{"name": name} for name in kept_names]
    mock_select_star.assert_called_once_with(
        database, table_name, engine, None, 100, True, True, True, expected_cols
    )
def test_presto_expand_row_data(self):
    """Check that `_expand_row_data` adds one key per child of a ROW column, in place."""
    row_column = "row_col"
    record = {"row_col": [1, "a"]}
    hierarchy = {
        "row_col": {
            "type": "ROW",
            "children": ["row_col.nested_int", "row_col.nested_str"],
        },
    }

    # The helper's return value is ignored; the record itself is inspected.
    PrestoEngineSpec._expand_row_data(record, row_column, hierarchy)

    self.assertEqual(
        record,
        {
            "row_col": [1, "a"],
            "row_col.nested_int": 1,
            "row_col.nested_str": "a",
        },
    )
def test_presto_expand_row_data(self):
    """Check that `_expand_row_data` spreads a ROW value over its child keys in place."""
    children = ["row_col.nested_int", "row_col.nested_str"]
    row_col_hierarchy = {"row_col": {"type": "ROW", "children": children}}
    datum = {"row_col": [1, "a"]}

    PrestoEngineSpec._expand_row_data(datum, "row_col", row_col_hierarchy)

    # The original entry stays, and each child receives the positional value.
    expected = {"row_col": [1, "a"]}
    expected["row_col.nested_int"] = 1
    expected["row_col.nested_str"] = "a"
    self.assertEqual(datum, expected)
def test_presto_expand_data_with_simple_structural_columns(self):
    """Check `expand_data` on one ROW column and one ARRAY column.

    Each array element is expected on its own output row; the row column
    is expected filled on the first row of each group and "" on the rest.
    """
    cols = [
        {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
        {"name": "array_column", "type": "ARRAY(BIGINT)"},
    ]
    data = [
        {"row_column": ["a"], "array_column": [1, 2, 3]},
        {"row_column": ["b"], "array_column": [4, 5, 6]},
    ]

    expanded = PrestoEngineSpec.expand_data(cols, data)
    actual_cols, actual_data, actual_expanded_cols = expanded

    expected_cols = [
        {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
        {"name": "row_column.nested_obj", "type": "VARCHAR"},
        {"name": "array_column", "type": "ARRAY(BIGINT)"},
    ]
    # (array element, row value, nested value) for each expected output row.
    expected_rows = (
        (1, ["a"], "a"),
        (2, "", ""),
        (3, "", ""),
        (4, ["b"], "b"),
        (5, "", ""),
        (6, "", ""),
    )
    expected_data = [
        {"array_column": element, "row_column": row, "row_column.nested_obj": nested}
        for element, row, nested in expected_rows
    ]
    expected_expanded_cols = [{"name": "row_column.nested_obj", "type": "VARCHAR"}]

    self.assertEqual(actual_cols, expected_cols)
    self.assertEqual(actual_data, expected_data)
    self.assertEqual(actual_expanded_cols, expected_expanded_cols)
def test_presto_get_fields(self):
    """Check the quoted element name and label `_get_fields` produces per column.

    Plain, dotted and already-quoted nested column names are covered.
    """
    cols = [
        {"name": "column"},
        {"name": "column.nested_obj"},
        {"name": 'column."quoted.nested obj"'},
    ]
    actual_results = PrestoEngineSpec._get_fields(cols)
    expected_results = [
        {"name": '"column"', "label": "column"},
        {"name": '"column"."nested_obj"', "label": "column.nested_obj"},
        {
            "name": '"column"."quoted.nested obj"',
            "label": 'column."quoted.nested obj"',
        },
    ]
    # zip() stops at the shorter input, so without this check the loop below
    # would pass vacuously if fewer (or no) fields were returned.
    self.assertEqual(len(actual_results), len(expected_results))
    for actual_result, expected_result in zip(actual_results, expected_results):
        self.assertEqual(actual_result.element.name, expected_result["name"])
        self.assertEqual(actual_result.name, expected_result["label"])
def test_get_full_name(self):
    """Check that `_get_full_name` joins the first element of each pair with a dot."""
    pairs = [("part1", "part2"), ("part11", "part22")]
    full_name = PrestoEngineSpec._get_full_name(pairs)
    assert full_name == "part1.part11"
def test_presto_process_array_data(self):
    """Check the per-row expansion `_process_array_data` returns for an array column.

    The result is keyed by the index of the input row. Each value holds one
    dict per array element, with "" placeholders on the continuation rows.
    """
    all_columns = [
        {"name": "array_column", "type": "ARRAY"},
        {"name": "array_column.nested_row", "type": "BIGINT"},
        {"name": "int_column", "type": "BIGINT"},
    ]
    hierarchy = {
        "array_column": {"type": "ARRAY", "children": ["array_column.nested_row"]}
    }
    rows = [
        {"array_column": [[1], [2]], "int_column": 3},
        {"array_column": [[11], [22]], "int_column": 33},
    ]

    processed = PrestoEngineSpec._process_array_data(rows, all_columns, hierarchy)

    expected = {
        0: [
            {"array_column": [[1], [2]], "array_column.nested_row": 1},
            {"array_column": "", "array_column.nested_row": 2, "int_column": ""},
        ],
        1: [
            {"array_column": [[11], [22]], "array_column.nested_row": 11},
            {"array_column": "", "array_column.nested_row": 22, "int_column": ""},
        ],
    }
    self.assertEqual(processed, expected)
def test_presto_expand_data_with_simple_structural_columns(self):
    """Check `expand_data` on one ROW column and one ARRAY column.

    This variant expects the returned column types reduced to the bare
    "ROW" / "ARRAY" keywords.
    """
    cols = [
        {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
        {"name": "array_column", "type": "ARRAY(BIGINT)"},
    ]
    data = [
        {"row_column": ["a"], "array_column": [1, 2, 3]},
        {"row_column": ["b"], "array_column": [4, 5, 6]},
    ]

    outcome = PrestoEngineSpec.expand_data(cols, data)
    actual_cols, actual_data, actual_expanded_cols = outcome

    expected_cols = [
        {"name": name, "type": type_name}
        for name, type_name in (
            ("row_column", "ROW"),
            ("row_column.nested_obj", "VARCHAR"),
            ("array_column", "ARRAY"),
        )
    ]
    # (row value, nested value, array element) for each expected output row.
    expected_rows = (
        (["a"], "a", 1),
        ("", "", 2),
        ("", "", 3),
        (["b"], "b", 4),
        ("", "", 5),
        ("", "", 6),
    )
    expected_data = [
        {"row_column": row, "row_column.nested_obj": nested, "array_column": element}
        for row, nested, element in expected_rows
    ]
    expected_expanded_cols = [{"name": "row_column.nested_obj", "type": "VARCHAR"}]

    self.assertEqual(actual_cols, expected_cols)
    self.assertEqual(actual_data, expected_data)
    self.assertEqual(actual_expanded_cols, expected_expanded_cols)
def test_presto_expand_data_with_complex_row_columns(self):
    """Check `expand_data` on a ROW column that contains another ROW."""
    cols = [
        {
            "name": "row_column",
            "type": "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR)",
        }
    ]
    data = [{"row_column": ["a1", ["a2"]]}, {"row_column": ["b1", ["b2"]]}]

    outcome = PrestoEngineSpec.expand_data(cols, data)
    actual_cols, actual_data, actual_expanded_cols = outcome

    # (name, type) of every column produced by the expansion, in order.
    nested_specs = (
        ("row_column.nested_obj1", "VARCHAR"),
        ("row_column.nested_row", "ROW"),
        ("row_column.nested_row.nested_obj2", "VARCHAR"),
    )
    expected_cols = [{"name": "row_column", "type": "ROW"}] + [
        {"name": name, "type": type_name} for name, type_name in nested_specs
    ]
    expected_expanded_cols = [
        {"name": name, "type": type_name} for name, type_name in nested_specs
    ]
    expected_data = [
        {
            "row_column": [outer, [inner]],
            "row_column.nested_obj1": outer,
            "row_column.nested_row": [inner],
            "row_column.nested_row.nested_obj2": inner,
        }
        for outer, inner in (("a1", "a2"), ("b1", "b2"))
    ]

    self.assertEqual(actual_cols, expected_cols)
    self.assertEqual(actual_data, expected_data)
    self.assertEqual(actual_expanded_cols, expected_expanded_cols)
def test_presto_process_array_data(self):
    """Check the dict of expanded rows `_process_array_data` builds per input row."""
    data = [
        {"array_column": [[1], [2]], "int_column": 3},
        {"array_column": [[11], [22]], "int_column": 33},
    ]
    all_columns = [
        {"name": name, "type": type_name}
        for name, type_name in (
            ("array_column", "ARRAY"),
            ("array_column.nested_row", "BIGINT"),
            ("int_column", "BIGINT"),
        )
    ]
    array_column_hierarchy = {
        "array_column": {
            "type": "ARRAY",
            "children": ["array_column.nested_row"],
        },
    }

    actual_array_data = PrestoEngineSpec._process_array_data(
        data, all_columns, array_column_hierarchy
    )

    # Keyed by input row index. The first expanded row keeps the original
    # array value; the second carries "" placeholders instead.
    first_group = [
        {"array_column": [[1], [2]], "array_column.nested_row": 1},
        {"array_column": "", "array_column.nested_row": 2, "int_column": ""},
    ]
    second_group = [
        {"array_column": [[11], [22]], "array_column.nested_row": 11},
        {"array_column": "", "array_column.nested_row": 22, "int_column": ""},
    ]
    self.assertEqual(actual_array_data, {0: first_group, 1: second_group})
def test_get_full_name_empty_tuple(self):
    """Check that `_get_full_name` leaves out pairs whose first element is empty."""
    pairs = [
        ("part1", "part2"),
        ("", "part3"),
        ("part4", "part5"),
        ("", "part6"),
    ]
    full_name = PrestoEngineSpec._get_full_name(pairs)
    assert full_name == "part1.part4"
def test_get_table_names_split_views_from_tables_no_tables(
    self, mock_get_view_names, mock_get_table_names, mock_is_feature_enabled
):
    """Check that `get_table_names` returns [] when there are no tables and no views."""
    mock_is_feature_enabled.return_value = True
    mock_get_view_names.return_value = []
    mock_get_table_names.return_value = []

    database, inspector = mock.Mock(), mock.Mock()
    found = PrestoEngineSpec.get_table_names(database, inspector, None)

    assert found == []
def test_get_table_names_split_views_from_tables(
    self, mock_get_view_names, mock_get_table_names, mock_is_feature_enabled
):
    """Check `get_table_names` when the mocked table listing also contains the views.

    The result is compared after sorting, so ordering is not part of the
    assertion.
    """
    mock_is_feature_enabled.return_value = True
    mock_get_view_names.return_value = ["view1", "view2"]
    listed_names = ["table1", "table2", "view1", "view2"]
    mock_get_table_names.return_value = listed_names

    database, inspector = mock.Mock(), mock.Mock()
    found = PrestoEngineSpec.get_table_names(database, inspector, None)

    assert sorted(found) == sorted(listed_names)
def test_presto_extra_table_metadata(self):
    """Check that `extra_table_metadata` reports the latest partition values.

    The database is mocked to expose a ("ds", "hour") index and a
    single-row frame; that row is expected under
    `result["partitions"]["latest"]`.
    """
    db = mock.Mock()
    db.get_indexes = mock.Mock(return_value=[{"column_names": ["ds", "hour"]}])
    db.get_extra = mock.Mock(return_value={})
    df = pd.DataFrame({"ds": ["01-01-19"], "hour": [1]})
    db.get_df = mock.Mock(return_value=df)

    # Stub `get_create_view` only for the duration of the call. Assigning
    # the mock directly on the class would never be undone and would leak
    # into every test that runs afterwards.
    with mock.patch.object(
        PrestoEngineSpec, "get_create_view", new=mock.Mock(return_value=None)
    ):
        result = PrestoEngineSpec.extra_table_metadata(
            db, "test_table", "test_schema"
        )

    self.assertEqual({"ds": "01-01-19", "hour": 1}, result["partitions"]["latest"])
def test_extract_error_message_orig(self):
    """Check the message `_extract_error_message` builds from `exception.orig`.

    `orig` is a stand-in object whose first element is the error dict.
    """
    error_dict = {"errorName": "name", "errorLocation": "location", "message": "msg"}
    FakeDatabaseError = namedtuple("DatabaseError", ["error_dict"])

    wrapped = Exception()
    wrapped.orig = FakeDatabaseError(error_dict)

    message = PrestoEngineSpec._extract_error_message(wrapped)
    assert message == "name at location: msg"
def test_estimate_statement_cost(self):
    """Check that `estimate_statement_cost` returns the decoded JSON from the cursor's first row."""
    cursor = mock.MagicMock()
    # The cursor yields one row whose only cell is a JSON document.
    cursor.fetchone.return_value = ['{"a": "b"}']

    cost = PrestoEngineSpec.estimate_statement_cost(
        "SELECT * FROM brth_names", cursor
    )

    assert cost == {"a": "b"}
def test_presto_create_row_and_array_hierarchy(self):
    """Check the hierarchies and expanded columns built for nested ROW and ARRAY types."""
    cols = [
        {
            "name": "row_column",
            "type": "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR)",
        },
        {
            "name": "array_column",
            "type": "ARRAY(ROW(NESTED_ARRAY ARRAY(ROW(NESTED_OBJ VARCHAR))))",
        },
    ]

    hierarchies = PrestoEngineSpec._create_row_and_array_hierarchy(cols)
    actual_row_hierarchy, actual_array_hierarchy, actual_expanded_cols = hierarchies

    expected_row_hierarchy = {
        "row_column": {
            "type": "ROW",
            "children": ["row_column.nested_obj1", "row_column.nested_row"],
        },
        "row_column.nested_row": {
            "type": "ROW",
            "children": ["row_column.nested_row.nested_obj2"],
        },
    }
    expected_array_hierarchy = {
        "array_column": {
            "type": "ARRAY",
            "children": ["array_column.nested_array"],
        },
        "array_column.nested_array": {
            "type": "ARRAY",
            "children": ["array_column.nested_array.nested_obj"],
        },
    }
    # (name, type) of each expanded column: row-derived first, then array-derived.
    expected_expanded_cols = [
        {"name": name, "type": type_name}
        for name, type_name in (
            ("row_column.nested_obj1", "VARCHAR"),
            ("row_column.nested_row", "ROW"),
            ("row_column.nested_row.nested_obj2", "VARCHAR"),
            ("array_column.nested_array", "ARRAY"),
            ("array_column.nested_array.nested_obj", "VARCHAR"),
        )
    ]

    self.assertEqual(actual_row_hierarchy, expected_row_hierarchy)
    self.assertEqual(actual_array_hierarchy, expected_array_hierarchy)
    self.assertEqual(actual_expanded_cols, expected_expanded_cols)
def test_show_columns(self):
    """Check that `_show_columns` runs SHOW COLUMNS on the quoted table name and returns the rows."""

    def quote(identifier):
        # Stand-in for the dialect's identifier quoting.
        return f'"{identifier}"'

    inspector = mock.MagicMock()
    inspector.engine.dialect.identifier_preparer.quote_identifier = quote
    execute = mock.MagicMock(return_value=["a", "b"])
    inspector.bind.execute = execute

    table_name = "table_name"
    rows = PrestoEngineSpec._show_columns(inspector, table_name, None)

    assert rows == ["a", "b"]
    execute.assert_called_once_with(f'SHOW COLUMNS FROM "{table_name}"')
def test_presto_filter_out_array_nested_cols(self):
    """Check the two lists `_filter_out_array_nested_cols` returns.

    The first is expected to hold only the top-level ARRAY column, the
    second to equal the full input.
    """
    cols = [
        {"name": "column", "type": "ARRAY"},
        {"name": "column.nested_obj", "type": "FLOAT"},
    ]

    outcome = PrestoEngineSpec._filter_out_array_nested_cols(cols)
    filtered_cols, array_cols = outcome

    self.assertEqual(filtered_cols, [{"name": "column", "type": "ARRAY"}])
    self.assertEqual(array_cols, cols)
def test_presto_where_latest_partition(self):
    """Check the WHERE clause `where_latest_partition` adds to an empty select.

    The mocked database reports a ("ds", "hour") index and a single-row
    frame; both values are expected as literal equality filters.
    """
    partition_frame = pd.DataFrame({"ds": ["01-01-19"], "hour": [1]})
    db = mock.Mock()
    db.get_indexes = mock.Mock(return_value=[{"column_names": ["ds", "hour"]}])
    db.get_extra = mock.Mock(return_value={})
    db.get_df = mock.Mock(return_value=partition_frame)
    columns = [{"name": "ds"}, {"name": "hour"}]

    query = PrestoEngineSpec.where_latest_partition(
        "test_table", "test_schema", db, select(), columns
    )

    # Render bound parameters inline so the values show up in the SQL text.
    compiled = query.compile(compile_kwargs={"literal_binds": True})
    self.assertEqual("SELECT \nWHERE ds = '01-01-19' AND hour = 1", str(compiled))
def test_query_cost_formatter(self):
    """Check the human-readable summary `query_cost_formatter` builds from a raw estimate.

    The expected "Network cost" (354 G) matches the top-level estimate
    rather than the per-table one, whose network cost is 0.0.
    """
    partition_bound = {"value": "2019-07-10", "bound": "EXACTLY"}
    ds_constraint = {
        "columnName": "ds",
        "typeSignature": "varchar",
        "domain": {
            "nullsAllowed": False,
            # Separate copies, so "low" and "high" stay distinct objects.
            "ranges": [{"low": dict(partition_bound), "high": dict(partition_bound)}],
        },
    }
    table_estimate = {
        "outputRowCount": 9.04969899e8,
        "outputSizeInBytes": 3.54143678301e11,
        "cpuCost": 3.54143678301e11,
        "maxMemory": 0.0,
        "networkCost": 0.0,
    }
    table_info = {
        "table": {
            "catalog": "hive",
            "schemaTable": {"schema": "default", "table": "fact_passenger_state"},
        },
        "columnConstraints": [ds_constraint],
        "estimate": table_estimate,
    }
    overall_estimate = {
        "outputRowCount": 9.04969899e8,
        "outputSizeInBytes": 3.54143678301e11,
        "cpuCost": 3.54143678301e11,
        "maxMemory": 0.0,
        "networkCost": 3.54143678301e11,
    }
    raw_cost = [{"inputTableColumnInfos": [table_info], "estimate": overall_estimate}]

    formatted_cost = PrestoEngineSpec.query_cost_formatter(raw_cost)

    expected = [
        {
            "Output count": "904 M rows",
            "Output size": "354 GB",
            "CPU cost": "354 G",
            "Max memory": "0 B",
            "Network cost": "354 G",
        }
    ]
    self.assertEqual(formatted_cost, expected)
def test_get_all_datasource_names(self):
    """Check that `get_all_datasource_names` turns each frame row into a `DatasourceName`."""
    frame = pd.DataFrame.from_dict(
        {"table_schema": ["schema1", "schema2"], "table_name": ["name1", "name2"]}
    )
    database = mock.MagicMock()
    database.get_df.return_value = frame

    names = PrestoEngineSpec.get_all_datasource_names(database, "table")

    expected = [
        DatasourceName(schema=schema, table=table)
        for schema, table in (("schema1", "name1"), ("schema2", "name2"))
    ]
    assert names == expected
def test_presto_remove_processed_array_columns(self):
    """Check that `_remove_processed_array_columns` prunes the hierarchy in place.

    Only the entry named in the unprocessed set is expected to remain.
    """
    hierarchy = {
        "array_column": {
            "type": "ARRAY",
            "children": ["array_column.nested_array"],
        },
        "array_column.nested_array": {
            "type": "ARRAY",
            "children": ["array_column.nested_array.nested_obj"],
        },
    }
    still_unprocessed = {"array_column.nested_array"}

    PrestoEngineSpec._remove_processed_array_columns(still_unprocessed, hierarchy)

    remaining = {
        "array_column.nested_array": {
            "type": "ARRAY",
            "children": ["array_column.nested_array.nested_obj"],
        }
    }
    self.assertEqual(hierarchy, remaining)
def test_get_create_view_database_error(self):
    """Check that `get_create_view` returns None when the cursor raises `DatabaseError`."""
    from pyhive.exc import DatabaseError

    database = mock.MagicMock()
    # Walk the mock chain to the cursor that executes the statement.
    engine = database.get_sqla_engine.return_value
    cursor = engine.raw_connection.return_value.cursor.return_value
    cursor.execute = mock.MagicMock(side_effect=DatabaseError())

    view_sql = PrestoEngineSpec.get_create_view(
        database, schema="schema", table="table"
    )

    assert view_sql is None