def _get_column_counts( self, table_info_1: Dict, table_info_2: Dict) -> Tuple[List[Tuple[int]], List[Tuple[int]]]: """DEPRECATED: Get the column counts from two tables of interest. This function is deprecated in favor of the IOU on all rows between both tables. Args: table_info_1 (Dict): Table 1 info table_info_2 (Dict): Table 2 info Returns: Tuple[List[Tuple[int]], List[Tuple[int]]]: Counts of all columns for each table """ counts = ", ".join([ f"max(typeof({column})), count({column}), count(distinct {column})" for column in table_info_1["columns"] ]) sql = f"SELECT {counts} FROM {{table}}" if table_info_1["latest_partitions"]: partition_filter = utils.partition_builder(table_info_1, date_cast=True) sql += f" WHERE {partition_filter}" sql += " LIMIT 1" print(sql) counts_table_1 = fetch( sql.format(table=f"{self.test_database}.{table_info_1['name']}"), self.pconn)[0] counts_table_2 = fetch( sql.format(table=f"{self.test_database}.{table_info_2['name']}"), self.pconn)[0] return counts_table_1, counts_table_2
def _get_latest_partitions(self, table_name: str) -> Dict[str, str]: """Get the latest partition for a given table Args: table_name (str): Name of the table to query Raises: ValueError: Multiple partition keys are not supported ValueError: Table is partitionned but does not contain any data. Returns: Dict[str, str]: Partition key & value for the latest partition in table """ partitions = fetch(f"SHOW PARTITIONS {table_name}", self.hconn) try: assert "/" not in partitions[0][0] except AssertionError: msg = ( f"Multiple partition keys detected: {partitions[0][0]}. " "This behavior is not yet supported." ) raise ValueError(msg) except IndexError: raise ValueError(f"Table {table_name} is partitioned but does not contain any data!") partitions = [ decode_utf8(p[0]) # Remove encoded utf8 symbols for p in partitions if p not in self.partition_black_list ] partition_key, partition_value = sorted(partitions, reverse=True)[0].split("=") return {partition_key: partition_value}
def _describe_formatted(self, table_name: str) -> Dict: """Run describe formatted statement on a given table Args: table_name (str): Name of the table to query Returns: Dict: Get the formatted & cleaned up output of the describe formatted statement """ describe_formatted = fetch(f"DESCRIBE FORMATTED {table_name}", self.hconn) return parse_describe_formatted(describe_formatted)
def insert_into_presto_table(self) -> Tuple[str, float]: """Insert into the Presto validation table Returns: Tuple[str, float]: Validated SQL (likely to be very different from the source) & time it took to execute the query (in s) """ # I. Format query parameters sql, original_sql = self.tgt_sql, self.tgt_sql # Working copy + selectively edited one print(f"PARAMS:{self.evaluated_query_parameters}") sql = sql.format( **self.evaluated_query_parameters) # Case insensitive parameters # II. Format insert statement in the working copy sql = Regex.sub( self.regex_presto_insert, f"INSERT INTO {self.test_database}.{self.temp_tgt_table_properties['name']}", sql) # III. Execute query & validate that table is not empty start = time.perf_counter() validated_sql = self._presto_runner( sql, original_sql) # Returns the validated SQL - likely to be modified duration = time.perf_counter() - start print( f"Data was inserted in temp table in {duration:.3f} s with Presto") validate_table_count = fetch( f"SELECT count(*) FROM {self.test_database}.{self.temp_tgt_table_properties['name']}", self.pconn) if validate_table_count[0][0] == 0: print( colored( f"WARNING: After inserting into {self.test_database}.{self.temp_tgt_table_properties['name']} the count is still 0", "yellow")) return validated_sql, duration
def _compare_rows_between_two_tables(self, table_info_1: Dict, table_info_2: Dict) -> Dict[str, str]: """Execute the SQL comparing two tables based on their IOU (Intersection Over Union) score Args: table_info_1 (Dict): Table 1 info table_info_2 (Dict): Table 2 info Returns: Dict[str, str]: Result of the different components necessary to calculate the IOU score """ from_table_1 = f"{self.test_database}.{table_info_1['name']}" from_table_2 = f"{self.test_database}.{table_info_2['name']}" if table_info_1["latest_partitions"]: partition_filter = utils.partition_builder(table_info_1, date_cast=True) from_table_1 += f" WHERE {partition_filter}" from_table_2 += f" WHERE {partition_filter}" sql = ( "with\n" f"select_distinct_table_1 AS (SELECT distinct * FROM {from_table_1}),\n" f"select_distinct_table_2 AS (SELECT distinct * FROM {from_table_2}),\n" "table_1_minus_table_2 AS (\n" f"\tSELECT * FROM {from_table_1}\n" "\texcept\n" f"\tSELECT * FROM {from_table_2}\n" "),\n" "table_2_minus_table_1 AS (\n" f"\tSELECT * FROM {from_table_1}\n" "\texcept\n" f"\tSELECT * FROM {from_table_2}\n" "),\n" "intersection AS (\n" f"\tSELECT * FROM {from_table_1}\n" "\tintersect\n" f"\tSELECT * FROM {from_table_2}\n" ")\n" "SELECT '1_count_table_1' AS counts, count(*)\n" f"FROM {from_table_1}\n" "union\n" "SELECT '2_count_table_2' AS counts, count(*)\n" f"FROM {from_table_2}\n" "union\n" "SELECT '3_count_distinct_table_1' AS counts, count(*)\n" "FROM select_distinct_table_1\n" "union\n" "SELECT '4_count_distinct_table_2' AS counts, count(*)\n" "FROM select_distinct_table_2\n" "union\n" "SELECT '5_count_distinct_table_1_minus_table_2' AS counts, count(*)\n" "FROM table_1_minus_table_2\n" "union\n" "SELECT '6_count_distinct_table_2_minus_table_1' AS counts, count(*)\n" "FROM table_2_minus_table_1\n" "union\n" "SELECT '7_count_distinct_intersection' AS counts, count(*)\n" "FROM intersection\n" "order by counts\n") print(sql) return dict(fetch(sql, self.pconn))
def test_fetch_many(): conn = MagicMock() query_utils.fetch('select * from db.table', conn, 1) assert conn.cursor.called
def test_fetch_Exception(): conn = MagicMock() conn.cursor = helper with pytest.raises(Exception): query_utils.fetch('select * from db.table', conn)
def test_fetch() -> None: conn = MagicMock() query_utils.fetch('select * from db.table', conn) assert conn.cursor.called