Beispiel #1
0
    def _get_column_counts(
            self, table_info_1: Dict,
            table_info_2: Dict) -> Tuple[List[Tuple[int]], List[Tuple[int]]]:
        """DEPRECATED: Get the column counts from two tables of interest.
        This function is deprecated in favor of the IOU on all rows between both tables.

        Args:
            table_info_1 (Dict): Table 1 info
            table_info_2 (Dict): Table 2 info

        Returns:
            Tuple[List[Tuple[int]], List[Tuple[int]]]: Counts of all columns for each table
        """
        counts = ", ".join([
            f"max(typeof({column})), count({column}), count(distinct {column})"
            for column in table_info_1["columns"]
        ])
        sql = f"SELECT {counts} FROM {{table}}"

        if table_info_1["latest_partitions"]:
            partition_filter = utils.partition_builder(table_info_1,
                                                       date_cast=True)
            sql += f" WHERE {partition_filter}"
        sql += " LIMIT 1"
        print(sql)

        counts_table_1 = fetch(
            sql.format(table=f"{self.test_database}.{table_info_1['name']}"),
            self.pconn)[0]
        counts_table_2 = fetch(
            sql.format(table=f"{self.test_database}.{table_info_2['name']}"),
            self.pconn)[0]
        return counts_table_1, counts_table_2
Beispiel #2
0
    def _get_latest_partitions(self, table_name: str) -> Dict[str, str]:
        """Get the latest partition for a given table

        Args:
            table_name (str): Name of the table to query

        Raises:
            ValueError: Multiple partition keys are not supported
            ValueError: Table is partitionned but does not contain any data.

        Returns:
            Dict[str, str]: Partition key & value for the latest partition in table
        """
        partitions = fetch(f"SHOW PARTITIONS {table_name}", self.hconn)
        try:
            assert "/" not in partitions[0][0]
        except AssertionError:
            msg = (
                f"Multiple partition keys detected: {partitions[0][0]}. "
                "This behavior is not yet supported."
            )
            raise ValueError(msg)
        except IndexError:
            raise ValueError(f"Table {table_name} is partitioned but does not contain any data!")

        partitions = [
            decode_utf8(p[0])  # Remove encoded utf8 symbols
            for p in partitions
            if p not in self.partition_black_list
        ]
        partition_key, partition_value = sorted(partitions, reverse=True)[0].split("=")
        return {partition_key: partition_value}
Beispiel #3
0
    def _describe_formatted(self, table_name: str) -> Dict:
        """Run describe formatted statement on a given table

        Args:
            table_name (str): Name of the table to query

        Returns:
            Dict: Get the formatted & cleaned up output of the describe formatted statement
        """
        describe_formatted = fetch(f"DESCRIBE FORMATTED {table_name}", self.hconn)
        return parse_describe_formatted(describe_formatted)
Beispiel #4
0
    def insert_into_presto_table(self) -> Tuple[str, float]:
        """Insert into the Presto validation table

        Returns:
            Tuple[str, float]: Validated SQL (likely to be very different from the source) & time it took to execute the query (in s)
        """
        # I. Format query parameters
        sql, original_sql = self.tgt_sql, self.tgt_sql  # Working copy + selectively edited one
        print(f"PARAMS:{self.evaluated_query_parameters}")
        sql = sql.format(
            **self.evaluated_query_parameters)  # Case insensitive parameters

        # II. Format insert statement in the working copy
        sql = Regex.sub(
            self.regex_presto_insert,
            f"INSERT INTO {self.test_database}.{self.temp_tgt_table_properties['name']}",
            sql)

        # III. Execute query & validate that table is not empty
        start = time.perf_counter()
        validated_sql = self._presto_runner(
            sql,
            original_sql)  # Returns the validated SQL - likely to be modified
        duration = time.perf_counter() - start
        print(
            f"Data was inserted in temp table in {duration:.3f} s with Presto")

        validate_table_count = fetch(
            f"SELECT count(*) FROM {self.test_database}.{self.temp_tgt_table_properties['name']}",
            self.pconn)
        if validate_table_count[0][0] == 0:
            print(
                colored(
                    f"WARNING: After inserting into {self.test_database}.{self.temp_tgt_table_properties['name']} the count is still 0",
                    "yellow"))
        return validated_sql, duration
Beispiel #5
0
    def _compare_rows_between_two_tables(self, table_info_1: Dict,
                                         table_info_2: Dict) -> Dict[str, str]:
        """Execute the SQL comparing two tables based on their IOU (Intersection Over Union) score

        Args:
            table_info_1 (Dict): Table 1 info
            table_info_2 (Dict): Table 2 info

        Returns:
            Dict[str, str]: Result of the different components necessary to calculate the IOU score
        """
        from_table_1 = f"{self.test_database}.{table_info_1['name']}"
        from_table_2 = f"{self.test_database}.{table_info_2['name']}"
        if table_info_1["latest_partitions"]:
            partition_filter = utils.partition_builder(table_info_1,
                                                       date_cast=True)
            from_table_1 += f" WHERE {partition_filter}"
            from_table_2 += f" WHERE {partition_filter}"

        sql = (
            "with\n"
            f"select_distinct_table_1 AS (SELECT distinct * FROM {from_table_1}),\n"
            f"select_distinct_table_2 AS (SELECT distinct * FROM {from_table_2}),\n"
            "table_1_minus_table_2 AS (\n"
            f"\tSELECT * FROM {from_table_1}\n"
            "\texcept\n"
            f"\tSELECT * FROM {from_table_2}\n"
            "),\n"
            "table_2_minus_table_1 AS (\n"
            f"\tSELECT * FROM {from_table_1}\n"
            "\texcept\n"
            f"\tSELECT * FROM {from_table_2}\n"
            "),\n"
            "intersection AS (\n"
            f"\tSELECT * FROM {from_table_1}\n"
            "\tintersect\n"
            f"\tSELECT * FROM {from_table_2}\n"
            ")\n"
            "SELECT '1_count_table_1' AS counts, count(*)\n"
            f"FROM {from_table_1}\n"
            "union\n"
            "SELECT '2_count_table_2' AS counts, count(*)\n"
            f"FROM {from_table_2}\n"
            "union\n"
            "SELECT '3_count_distinct_table_1' AS counts, count(*)\n"
            "FROM select_distinct_table_1\n"
            "union\n"
            "SELECT '4_count_distinct_table_2' AS counts, count(*)\n"
            "FROM select_distinct_table_2\n"
            "union\n"
            "SELECT '5_count_distinct_table_1_minus_table_2' AS counts, count(*)\n"
            "FROM table_1_minus_table_2\n"
            "union\n"
            "SELECT '6_count_distinct_table_2_minus_table_1' AS counts, count(*)\n"
            "FROM table_2_minus_table_1\n"
            "union\n"
            "SELECT '7_count_distinct_intersection' AS counts, count(*)\n"
            "FROM intersection\n"
            "order by counts\n")

        print(sql)
        return dict(fetch(sql, self.pconn))
def test_fetch_many():
    conn = MagicMock()
    query_utils.fetch('select * from db.table', conn, 1)
    assert conn.cursor.called
def test_fetch_Exception():
    conn = MagicMock()
    conn.cursor = helper
    with pytest.raises(Exception):
        query_utils.fetch('select * from db.table', conn)
def test_fetch() -> None:
    conn = MagicMock()
    query_utils.fetch('select * from db.table', conn)
    assert conn.cursor.called