def _cast_in(self, sql: str, result: re.match, **kwargs) -> str:
    r"""Cast the value tested by an IN list to the data type of the list items.

    The value and the items of an IN statement must have the same data type.

    Example:
        select a \nfrom cte\nwhere a in ( \n 1, 2, 3)
        --> line 4:2: IN value and list items must be the same type: bigint (1)
    Fix:
        select a \nfrom cte\nwhere cast(a AS bigint) in ( \n 1, 2, 3)

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object; its "line" and "column" groups
            hold the 1-based position reported in the error message
        **kwargs: Ignored by this method

    Raises:
        ValueError: Data types in the IN list are inconsistent. It is not
            clear which data type should be chosen.

    Returns:
        str: Fixed SQL
    """
    # Presto positions are 1-based; convert to python array notation.
    line, column = int(result["line"]) - 1, int(result["column"]) - 1
    # Should return an identifier list or identifier
    token, idx = self.ColumnCaster.get_problematic_token(sql, line, column)

    if isinstance(token, IdentifierList):
        # Keep only the meaningful items of the list (drop separators/comments).
        token_in_list = [
            t for t in token.tokens
            if t.ttype not in (Whitespace, Punctuation, Comment, Newline)
        ]
    else:
        token_in_list = [token]

    # The cast target is only well defined when every item has the same type.
    if all(t.ttype == Literal.String.Single for t in token_in_list):
        cast_to = "varchar"
    elif all(t.ttype == Literal.Number.Integer for t in token_in_list):
        cast_to = "bigint"
    elif all(t.ttype == Literal.Number.Float for t in token_in_list):
        cast_to = "double"
    else:
        raise ValueError(
            f"Inconsistent data type in the IN list! {[token_in_list]}")

    container = token.parent
    siblings = container.tokens
    parent_idx = siblings.index(token)
    # Subtract the length of all preceding siblings. Slice first, then
    # reverse: a `[parent_idx - 1::-1]` slice would wrap around to the end of
    # the list (and walk every token) when the index is 0.
    for preceding_token in reversed(siblings[:parent_idx]):
        idx -= len(preceding_token.value)

    # Move up to the grand parent token. See tests to understand why this is
    # relevant.
    grand_parent_tokens = container.parent.tokens
    grand_parent_idx = grand_parent_tokens.index(container)
    for preceding_token in reversed(grand_parent_tokens[:grand_parent_idx]):
        idx -= len(preceding_token.value)
        if (preceding_token.ttype == Keyword
                and preceding_token.value.lower() == "in"):
            # From here on the IN keyword is the token handed to the caster.
            token = preceding_token
            break

    return self.ColumnCaster.cast_non_trivial_tokens(
        sql, token, idx, cast_to, result.groupdict(), count_forward_tokens=0)
def _cast_in_subquery(self, sql: str, result: re.match, **kwargs) -> str:
    """Cast the value tested against a subquery to the subquery's data type.

    The value and the subquery results must have the same data type.

    Example:
        select 1 in (select '1')
        --> line 1:10: value and result of subquery must be of the same type for IN expression: integer vs varchar (1)
    Fix:
        select cast(1 AS varchar) in (select '1')

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object; must expose the "line",
            "column" and "f_type_0" groups (the latter is the cast target)
        **kwargs: Ignored by this method

    Returns:
        str: Fixed SQL
    """
    # Presto positions are 1-based; python indices are 0-based.
    zero_based_line = int(result["line"]) - 1
    zero_based_column = int(result["column"]) - 1

    caster = self.ColumnCaster
    # Should return an identifier list or identifier
    token, idx = caster.get_problematic_token(
        sql, zero_based_line, zero_based_column)
    logging.debug(
        f"[DEBUG] Found token {[token]}\nvalue:{token}|ttype:{token.ttype}\nparent:{token.parent}"
    )

    target_type = result["f_type_0"]
    captured_groups = result.groupdict()
    return caster.cast_non_trivial_tokens(
        sql, token, idx, target_type, captured_groups, count_forward_tokens=0)
def helper(match: re.match):
    """Insert an explicit "as " in front of the alias captured by the match.

    Intended as a regex replacement callback. ``self`` is not a parameter: it
    is read from the enclosing scope.

    Args:
        match (re.match): Regex match object with an "alias" named group. The
            alias is expected to be the trailing part of the matched text,
            since it is stripped from the end before being re-appended.

    Returns:
        str: The matched text unchanged when the captured word is a reserved
        keyword (or ASC/DESC), otherwise the matched text with "as " inserted
        before the alias.
    """
    alias = match.groupdict()["alias"]
    alias_upper = alias.upper()
    matched_text = match.group()

    # List of Presto reserved keywords, plus the ORDER BY direction words
    if alias_upper in self.reserved_keywords or alias_upper in ("ASC", "DESC"):
        # Not an alias but the continuation of the SQL logic
        return matched_text

    text_before_alias = matched_text[:-len(alias)]
    return text_before_alias + "as " + alias
def _cast_both_sides(self, sql: str, result: re.match, **kwargs) -> str:
    """Cast the operand flagged in a comparison error to varchar.

    Example:
        select 'a' =1
        --> line 1:12: '=' cannot be applied to varchar, bigint (1)
    Fix:
        select 'a' =cast(1 AS varchar)

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object; must expose the "line" and
            "column" groups
        **kwargs: Ignored by this method

    Returns:
        str: Fixed SQL
    """
    # Presto positions are 1-based; python indices are 0-based.
    zero_based_line = int(result["line"]) - 1
    zero_based_column = int(result["column"]) - 1

    caster = self.ColumnCaster
    token, idx = caster.get_problematic_token(
        sql, zero_based_line, zero_based_column)
    captured_groups = result.groupdict()
    return caster.cast_non_trivial_tokens(
        sql, token, idx, "varchar", captured_groups)
def format(self, path: Path, match: re.match):
    """Build the renamed path from a successful match.

    Only the span captured by the match is replaced; the text of the matched
    string before and after that span is kept as is.

    Args:
        path (Path): Path handed to ``self.untouched_root`` to obtain the part
            the result is joined onto.
        match (re.match): Match object; must not be None.

    Returns:
        The value of ``self.untouched_root(path)`` joined with the rewritten
        name.
    """
    assert match is not None

    # get what is before and after the capture
    searched_text = match.string
    span_start, span_end = match.span()
    before = searched_text[:span_start]
    after = searched_text[span_end:]

    # None fills positional slot 0, so the captured groups are passed from
    # positional index 1 onwards; named groups are passed as keywords.
    updated_name = file_formatter.format(
        self.renamer, None, *match.groups(), **match.groupdict())

    root = self.untouched_root(path)
    return root / Path(before + updated_name + after)
def _between(self, sql: str, result: re.match, **kwargs) -> str:
    """Fix data type mismatches in BETWEEN statements by casting to varchar.

    Example:
        select a from cte where b between c and d
        --> line 1:27: Cannot check if varchar is BETWEEN varchar and date (1)
    Fix:
        select a from cte where b between c and cast(d AS varchar)

    Args:
        sql (str): SQL to fix
        result (re.match): Regex match object; must expose the "line" and
            "column" groups
        **kwargs: Ignored by this method

    Returns:
        str: Fixed SQL
    """
    # Presto positions are 1-based; python indices are 0-based.
    zero_based_line = int(result["line"]) - 1
    zero_based_column = int(result["column"]) - 1

    caster = self.ColumnCaster
    token, idx = caster.get_problematic_token(
        sql, zero_based_line, zero_based_column)
    captured_groups = result.groupdict()
    # NOTE(review): count_forward_tokens=3 presumably moves past
    # "between <low> and" to reach the upper bound - confirm against
    # ColumnCaster.cast_non_trivial_tokens.
    return caster.cast_non_trivial_tokens(
        sql, token, idx, "varchar", captured_groups, count_forward_tokens=3)
def __init__(self, match_object: re.match):
    """Store a regex match together with a dict of its named groups.

    Args:
        match_object (re.match): Match object to wrap; kept as ``self.match``.
            Its ``groupdict()`` result is kept as ``self.dict``.
    """
    self.match = match_object
    named_groups = match_object.groupdict()
    self.dict = named_groups