def test_presto_expand_data_with_simple_structural_columns(self):
    """Check expand_data output for one ROW column and one ARRAY column.

    Two input rows are passed to ``PrestoEngineSpec.expand_data`` and the
    three returned values (columns, data, expanded columns) are compared
    against fixed expectations.
    """
    input_cols = [
        {"name": "row_column", "type": "ROW(NESTED_OBJ VARCHAR)"},
        {"name": "array_column", "type": "ARRAY(BIGINT)"},
    ]
    input_rows = [
        {"row_column": ["a"], "array_column": [1, 2, 3]},
        {"row_column": ["b"], "array_column": [4, 5, 6]},
    ]

    # Expected values are independent literals; nothing is shared with the
    # inputs handed to expand_data.
    nested_obj_col = {"name": "row_column.nested_obj", "type": "VARCHAR"}
    want_cols = [
        {"name": "row_column", "type": "ROW"},
        nested_obj_col,
        {"name": "array_column", "type": "ARRAY"},
    ]
    # One tuple per expected output row:
    # (row_column, row_column.nested_obj, array_column)
    row_table = [
        (["a"], "a", 1),
        ("", "", 2),
        ("", "", 3),
        (["b"], "b", 4),
        ("", "", 5),
        ("", "", 6),
    ]
    want_rows = [
        {
            "row_column": row_value,
            "row_column.nested_obj": nested_value,
            "array_column": array_value,
        }
        for row_value, nested_value, array_value in row_table
    ]
    want_expanded = [nested_obj_col]

    got_cols, got_rows, got_expanded = PrestoEngineSpec.expand_data(
        input_cols, input_rows
    )

    self.assertEqual(got_cols, want_cols)
    self.assertEqual(got_rows, want_rows)
    self.assertEqual(got_expanded, want_expanded)
def test_presto_expand_data_with_complex_array_columns(self):
    """Check expand_data output for an ARRAY of ROWs holding a nested ARRAY.

    Two input rows (a BIGINT column plus the nested array column) are
    passed to ``PrestoEngineSpec.expand_data``; the returned columns, data
    and expanded columns are compared against fixed expectations in which
    the column types keep their full type strings.
    """
    # NOTE(review): this file contains a second definition with this exact
    # test name. If both live in the same class, the later definition
    # replaces the earlier one and only one of them runs -- confirm.
    outer_type = "ARRAY(ROW(NESTED_ARRAY ARRAY(ROW(NESTED_OBJ VARCHAR))))"
    inner_type = "ARRAY(ROW(NESTED_OBJ VARCHAR))"

    input_cols = [
        {"name": "int_column", "type": "BIGINT"},
        {"name": "array_column", "type": outer_type},
    ]
    input_rows = [
        {"int_column": 1, "array_column": [[[["a"], ["b"]]], [[["c"], ["d"]]]]},
        {"int_column": 2, "array_column": [[[["e"], ["f"]]], [[["g"], ["h"]]]]},
    ]

    nested_array_col = {"name": "array_column.nested_array", "type": inner_type}
    nested_obj_col = {
        "name": "array_column.nested_array.nested_obj",
        "type": "VARCHAR",
    }
    want_cols = [
        {"name": "int_column", "type": "BIGINT"},
        {"name": "array_column", "type": outer_type},
        nested_array_col,
        nested_obj_col,
    ]
    # One tuple per expected output row: (array_column, leaf, int_column).
    # The leaf appears wrapped in a list under "array_column.nested_array"
    # and bare under "array_column.nested_array.nested_obj".
    row_table = [
        ([[["a"], ["b"]]], "a", 1),
        ("", "b", ""),
        ([[["c"], ["d"]]], "c", ""),
        ("", "d", ""),
        ([[["e"], ["f"]]], "e", 2),
        ("", "f", ""),
        ([[["g"], ["h"]]], "g", ""),
        ("", "h", ""),
    ]
    want_rows = [
        {
            "array_column": outer_value,
            "array_column.nested_array": [leaf],
            "array_column.nested_array.nested_obj": leaf,
            "int_column": int_value,
        }
        for outer_value, leaf, int_value in row_table
    ]
    want_expanded = [nested_array_col, nested_obj_col]

    got_cols, got_rows, got_expanded = PrestoEngineSpec.expand_data(
        input_cols, input_rows
    )

    self.assertEqual(got_cols, want_cols)
    self.assertEqual(got_rows, want_rows)
    self.assertEqual(got_expanded, want_expanded)
def test_presto_expand_data_array(self):
    """Check expand_data output for a ROW column supplied as a string.

    A single input row with two scalar columns and one three-field ROW
    column is passed to ``PrestoEngineSpec.expand_data``. The returned
    columns, data and expanded columns are compared against fixed
    expectations: the ROW value decoded to a list and one extra
    ``user.<field>`` column per ROW field.
    """
    cols = [
        {"name": "event_id", "type": "VARCHAR", "is_date": False},
        {"name": "timestamp", "type": "BIGINT", "is_date": False},
        {
            "name": "user",
            "type": "ROW(ID BIGINT, FIRST_NAME VARCHAR, LAST_NAME VARCHAR)",
            "is_date": False,
        },
    ]
    data = [
        {
            "event_id": "abcdef01-2345-6789-abcd-ef0123456789",
            "timestamp": "1595895506219",
            # The ROW value is given in string form. It must be the JSON
            # encoding of the list asserted in expected_data below; the
            # previous placeholder '******' could not decode to
            # [1, "JOHN", "DOE"].
            # NOTE(review): assumes expand_data JSON-decodes string ROW
            # values, as the string inputs of the null-values test in this
            # file suggest -- confirm.
            "user": '[1, "JOHN", "DOE"]',
        }
    ]

    actual_cols, actual_data, actual_expanded_cols = PrestoEngineSpec.expand_data(
        cols, data
    )

    expected_cols = [
        {"name": "event_id", "type": "VARCHAR", "is_date": False},
        {"name": "timestamp", "type": "BIGINT", "is_date": False},
        {
            "name": "user",
            "type": "ROW(ID BIGINT, FIRST_NAME VARCHAR, LAST_NAME VARCHAR)",
            "is_date": False,
        },
        {"name": "user.id", "type": "BIGINT"},
        {"name": "user.first_name", "type": "VARCHAR"},
        {"name": "user.last_name", "type": "VARCHAR"},
    ]
    expected_data = [
        {
            "event_id": "abcdef01-2345-6789-abcd-ef0123456789",
            "timestamp": "1595895506219",
            "user": [1, "JOHN", "DOE"],
            "user.id": 1,
            "user.first_name": "JOHN",
            "user.last_name": "DOE",
        }
    ]
    expected_expanded_cols = [
        {"name": "user.id", "type": "BIGINT"},
        {"name": "user.first_name", "type": "VARCHAR"},
        {"name": "user.last_name", "type": "VARCHAR"},
    ]

    self.assertEqual(actual_cols, expected_cols)
    self.assertEqual(actual_data, expected_data)
    self.assertEqual(actual_expanded_cols, expected_expanded_cols)
def test_presto_expand_data_with_complex_row_columns(self):
    """Check expand_data output for a ROW column containing a nested ROW.

    Two input rows are passed to ``PrestoEngineSpec.expand_data``; the
    returned columns, data and expanded columns are compared against fixed
    expectations with one extra column per nested field.
    """
    row_type = "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR))"
    # (nested_obj1 value, nested_obj2 value) for each input row.
    value_pairs = (("a1", "a2"), ("b1", "b2"))

    input_cols = [{"name": "row_column", "type": row_type}]
    input_rows = [{"row_column": [first, [second]]} for first, second in value_pairs]

    # Expected values are built separately from the inputs so nothing is
    # shared with the objects handed to expand_data.
    want_expanded = [
        {"name": "row_column.nested_obj1", "type": "VARCHAR"},
        {"name": "row_column.nested_row", "type": "ROW(NESTED_OBJ2 VARCHAR)"},
        {"name": "row_column.nested_row.nested_obj2", "type": "VARCHAR"},
    ]
    want_cols = [{"name": "row_column", "type": row_type}] + want_expanded
    want_rows = [
        {
            "row_column": [first, [second]],
            "row_column.nested_obj1": first,
            "row_column.nested_row": [second],
            "row_column.nested_row.nested_obj2": second,
        }
        for first, second in value_pairs
    ]

    got_cols, got_rows, got_expanded = PrestoEngineSpec.expand_data(
        input_cols, input_rows
    )

    self.assertEqual(got_cols, want_cols)
    self.assertEqual(got_rows, want_rows)
    self.assertEqual(got_expanded, want_expanded)
def test_presto_expand_data_with_complex_row_columns_and_null_values(self):
    """Check expand_data output for nested ROW data with nulls at each depth.

    Four input rows carry the ROW value as a string, with ``null`` at the
    innermost, middle and outermost level in turn. The returned columns,
    data and expanded columns are compared against fixed expectations.
    """
    row_type = "ROW(NESTED_ROW ROW(NESTED_OBJ VARCHAR))"
    raw_values = ('[["a"]]', "[[null]]", "[null]", "null")

    input_cols = [{"name": "row_column", "type": row_type, "is_dttm": False}]
    input_rows = [{"row_column": raw} for raw in raw_values]

    nested_row_col = {
        "name": "row_column.nested_row",
        "type": "ROW(NESTED_OBJ VARCHAR)",
        "is_dttm": False,
    }
    nested_obj_col = {
        "name": "row_column.nested_row.nested_obj",
        "type": "VARCHAR",
        "is_dttm": False,
    }
    want_cols = [
        {"name": "row_column", "type": row_type, "is_dttm": False},
        nested_row_col,
        nested_obj_col,
    ]
    # One tuple per expected output row:
    # (row_column, row_column.nested_row, row_column.nested_row.nested_obj)
    row_table = [
        ([["a"]], ["a"], "a"),
        ([[None]], [None], None),
        ([None], None, ""),
        (None, "", ""),
    ]
    want_rows = [
        {
            "row_column": outer,
            "row_column.nested_row": middle,
            "row_column.nested_row.nested_obj": inner,
        }
        for outer, middle, inner in row_table
    ]
    want_expanded = [nested_row_col, nested_obj_col]

    got_cols, got_rows, got_expanded = PrestoEngineSpec.expand_data(
        input_cols, input_rows
    )

    self.assertEqual(got_cols, want_cols)
    self.assertEqual(got_rows, want_rows)
    self.assertEqual(got_expanded, want_expanded)
def test_presto_expand_data_with_complex_array_columns(self):
    """Check expand_data output for an ARRAY of ROWs holding a nested ARRAY.

    Two input rows (a BIGINT column plus the nested array column) are
    passed to ``PrestoEngineSpec.expand_data``; the returned columns, data
    and expanded columns are compared against fixed expectations in which
    the array column types are reported as plain ``ARRAY``.
    """
    # NOTE(review): this file contains another definition with this exact
    # test name. If both live in the same class, the later definition
    # replaces the earlier one and only one of them runs -- confirm.
    input_cols = [
        {"name": "int_column", "type": "BIGINT"},
        {
            "name": "array_column",
            "type": "ARRAY(ROW(NESTED_ARRAY ARRAY(ROW(NESTED_OBJ VARCHAR))))",
        },
    ]
    input_rows = [
        {"int_column": 1, "array_column": [[[["a"], ["b"]]], [[["c"], ["d"]]]]},
        {"int_column": 2, "array_column": [[[["e"], ["f"]]], [[["g"], ["h"]]]]},
    ]

    nested_array_col = {"name": "array_column.nested_array", "type": "ARRAY"}
    nested_obj_col = {
        "name": "array_column.nested_array.nested_obj",
        "type": "VARCHAR",
    }
    want_cols = [
        {"name": "int_column", "type": "BIGINT"},
        {"name": "array_column", "type": "ARRAY"},
        nested_array_col,
        nested_obj_col,
    ]
    row_keys = (
        "int_column",
        "array_column",
        "array_column.nested_array",
        "array_column.nested_array.nested_obj",
    )
    # One tuple per expected output row, in row_keys order.
    row_table = [
        (1, [[[["a"], ["b"]]], [[["c"], ["d"]]]], [["a"], ["b"]], "a"),
        ("", "", "", "b"),
        ("", "", [["c"], ["d"]], "c"),
        ("", "", "", "d"),
        (2, [[[["e"], ["f"]]], [[["g"], ["h"]]]], [["e"], ["f"]], "e"),
        ("", "", "", "f"),
        ("", "", [["g"], ["h"]], "g"),
        ("", "", "", "h"),
    ]
    want_rows = [dict(zip(row_keys, values)) for values in row_table]
    want_expanded = [nested_array_col, nested_obj_col]

    got_cols, got_rows, got_expanded = PrestoEngineSpec.expand_data(
        input_cols, input_rows
    )

    self.assertEqual(got_cols, want_cols)
    self.assertEqual(got_rows, want_rows)
    self.assertEqual(got_expanded, want_expanded)