# ---- Example 1 ----
    def run_single(self, query_n):
        """Run a single query for comparison on both systems

        Parameters
        ----------
        query_n : int, query number to run on both systems

        Returns
        -------
        df_bq_result : Pandas DataFrame, query result
        df_sf_result : Pandas DataFrame, query result
        bq_qid : str, BigQuery query id for query job
        sf_qid : str, Snowflake query id for query job
        """

        sf = sf_tpc.SFTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          warehouse=self.sf_warehouse_name,
                          desc=self.desc,
                          verbose=self.verbose,
                          verbose_query=self.verbose_query)
        sf.verbose_query_n = self.verbose_query_n

        if self.verbose:
            print('Using database:', sf.database)

        sf.timestamp = self.shared_timestamp
        sf.results_dir = self.results_dir

        # BUG FIX: apply the cache setting BEFORE the Snowflake query runs.
        # Previously sf.cache was toggled after sf.close(), where it had no
        # effect on the query already executed.
        if self.cache is True:
            sf.cache = True
        else:
            sf.cache = False

        sf.connect()

        # leading-underscore names are deliberately discarded (timings, text)
        _sf_t0, _sf_t1, df_sf_result, _sf_query_text, sf_qid = sf.query_n(
            n=query_n)

        sf.close()

        bq = bq_tpc.BQTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          desc=self.desc,
                          verbose_query=self.verbose_query,
                          verbose=self.verbose)
        bq.verbose_query_n = self.verbose_query_n

        bq.timestamp = self.shared_timestamp
        bq.results_dir = self.results_dir

        # BUG FIX: bq.cache was never set (the original code mistakenly
        # toggled sf.cache in this position); mirror the handling used in
        # the run() method so both systems honor self.cache.
        if self.cache is True:
            bq.cache = True
        else:
            bq.cache = False

        _bq_t0, _bq_t1, df_bq_result, _bq_query_text, bq_qid = bq.query_n(
            n=query_n)

        return df_bq_result, df_sf_result, bq_qid, sf_qid
# ---- Example 2 ----
def bq_results(results_dir,
               t0=None,
               buffer_time="20 minutes",
               verbose: bool = False):
    """Fetch BigQuery query history for an already collected result directory

    Parameters
    ----------
    results_dir : str, directory containing the collected data
    t0 : str, optional, start time to bound query results
    buffer_time : str, time interval for Pandas.Timedelta, amount of time
        subtracted from t0 to widen the history window
    verbose : bool, print debug statements

    Returns
    -------
    df_bq_history : Pandas DataFrame, containing query history
    """

    # fall back to the experiment log's recorded start time
    if t0 is None:
        t0 = exp_log_t0(results_dir)

    # pad the window backwards so queries near t0 are not missed
    t0 = pd.to_datetime(t0) - pd.Timedelta(buffer_time)

    bq = bq_tpc.BQTPC(test="ds",
                      scale=1,
                      cid="01",
                      desc="query_history",
                      verbose=verbose,
                      verbose_query=verbose)

    # the history query's own qid is not needed here
    df_bq_history, _qid_bq = bq.query_history(t0=t0, t1=pd.Timestamp.utcnow())

    df_bq_history.to_csv(results_dir + config.sep + "query_history_bq.csv")

    return df_bq_history
# ---- Example 3 ----
    def table_metadata(self):
        """Compare table contents on both platforms
        TODO: the BQ and SF specific methods could probably be migrated to
        sf_tpc.py and bq_tpc.py

        For more details see:
        https://docs.snowflake.com/en/sql-reference/account-usage/tables.html
        https://cloud.google.com/bigquery/docs/information-schema-datasets

        Returns
        -------
        df_sf_results : Pandas DataFrame, reformatted view of query
            select * from snowflake.account_usage.tables

            TABLE_ID                                     object
            TABLE_NAME                                   object
            TABLE_SCHEMA_ID                              object
            TABLE_SCHEMA                                 object
            TABLE_CATALOG_ID                             object
            TABLE_CATALOG                                object
            TABLE_OWNER                                  object
            TABLE_TYPE                                   object
            IS_TRANSIENT                                 object
            CLUSTERING_KEY                               object
            ROW_COUNT                                    object
            BYTES                                        object
            RETENTION_TIME                               object
            SELF_REFERENCING_COLUMN_NAME                 object
            REFERENCE_GENERATION                         object
            USER_DEFINED_TYPE_CATALOG                    object
            USER_DEFINED_TYPE_SCHEMA                     object
            USER_DEFINED_TYPE_NAME                       object
            IS_INSERTABLE_INTO                           object
            IS_TYPED                                     object
            COMMIT_ACTION                                object
            CREATED                         datetime64[ns, UTC]
            LAST_ALTERED                    datetime64[ns, UTC]
            DELETED                         datetime64[ns, UTC]
            AUTO_CLUSTERING_ON                           object
            COMMENT                                      object

        df_bq_results : Pandas DataFrame, reformatted query view of
            select * from dataset.__TABLES__

            project_id            object, project id on GCP
            dataset_id            object, dataset name
            table_id              object, table name
            creation_time          int64,
            last_modified_time     int64,
            row_count              int64, number of rows in table
            size_bytes             int64, total stored size in bytes
            type                   int64,
        """
        # NOTE(review): warehouse is hard-coded here (and in compare_sum)
        # while other methods use self.sf_warehouse_name -- confirm intended
        sf = sf_tpc.SFTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          warehouse="TEST9000",
                          desc=self.desc,
                          verbose=self.verbose,
                          verbose_query=self.verbose_query)

        if self.verbose:
            print('Using database:', sf.database)

        sf.timestamp = self.shared_timestamp
        sf.results_dir = self.results_dir

        # Snowflake: one account-usage query returns all table metadata
        sf.connect()
        query_text = "select * from snowflake.account_usage.tables"
        sf_query_result = sf.sfc.query(query_text=query_text)
        df_sf_result = sf_query_result.fetch_pandas_all()
        sf.close()

        bq = bq_tpc.BQTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          desc=self.desc,
                          verbose_query=self.verbose_query,
                          verbose=self.verbose)

        bq.timestamp = self.shared_timestamp
        bq.results_dir = self.results_dir

        # BigQuery: first list all datasets in the project...
        query_text = f"SELECT * FROM `{config.gcp_project.lower()}.INFORMATION_SCHEMA.SCHEMATA`"
        bq_query_result = bq.query(query_text=query_text)
        df_bq_tables = bq_query_result.result().to_dataframe()

        # ...then query each dataset's __TABLES__ metadata table.
        # (schema_name values are dataset names, not table names --
        # the original loop variable `table_name` was misleading)
        d = []
        for dataset_name in df_bq_tables.schema_name.unique():
            query_text = f"SELECT * FROM `{config.gcp_project.lower()}.{dataset_name}.__TABLES__`"
            bq_query_result = bq.query(query_text=query_text)
            df_bq_result = bq_query_result.result().to_dataframe()
            d.append(df_bq_result)

        df_bq_result = pd.concat(d, axis=0)

        return df_sf_result, df_bq_result
# ---- Example 4 ----
    def compare_sum(self):
        """Compare per-table checksums (sum over one key column) on both
        systems, one table at a time, and save the comparison to CSV.

        Returns
        -------
        df : Pandas DataFrame, columns
            ["table", "column", "sf", "bq", "equal"] with one row per table
        """

        # TPC-DS: one representative (mostly surrogate-key) column per table
        ds_col = {
            "call_center": "cc_call_center_sk",  # integer
            "catalog_page": "cp_catalog_page_sk",
            "catalog_returns": "cr_order_number",
            "catalog_sales": "cs_order_number",
            "customer": "c_customer_sk",
            "customer_address": "ca_address_sk",
            "customer_demographics": "cd_demo_sk",
            "date_dim": "d_date_sk",  # integer
            # skip dbgen
            "household_demographics": "hd_demo_sk",
            "income_band": "ib_income_band_sk",
            "inventory": "inv_item_sk",  # integer
            "item": "i_item_sk",
            "promotion": "p_promo_sk",
            "reason": "r_reason_sk",
            "ship_mode": "sm_ship_mode_sk",
            "store": "s_store_sk",
            "store_returns": "sr_item_sk",
            "store_sales": "ss_item_sk",
            "time_dim": "t_time_sk",
            "warehouse": "w_warehouse_sk",
            "web_page": "wp_web_page_sk",
            "web_returns": "wr_item_sk",
            "web_sales": "ws_item_sk",
            "web_site": "web_site_sk"
        }

        # TPC-H: one key column per table
        h_col = {
            "customer": "c_custkey",
            "lineitem": "l_linenumber",
            "nation": "n_nationkey",
            "orders": "o_orderkey",
            "part": "p_partkey",
            "partsupp": "ps_partkey",
            "region": "r_regionkey",
            "supplier": "s_suppkey"
        }

        col_names = {"ds": ds_col, "h": h_col}[self.test]

        # NOTE(review): warehouse is hard-coded here (and in table_metadata)
        # while other methods use self.sf_warehouse_name -- confirm intended
        sf = sf_tpc.SFTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          warehouse="TEST9000",
                          desc=self.desc,
                          verbose=self.verbose,
                          verbose_query=self.verbose_query)

        if self.verbose:
            print('Using database:', sf.database)

        sf.timestamp = self.shared_timestamp
        sf.results_dir = self.results_dir
        sf.connect()

        bq = bq_tpc.BQTPC(test=self.test,
                          scale=self.scale,
                          cid=self.cid,
                          desc=self.desc,
                          verbose_query=self.verbose_query,
                          verbose=self.verbose)

        bq.timestamp = self.shared_timestamp
        bq.results_dir = self.results_dir

        d = []
        for table, column in col_names.items():
            if self.verbose_iter:
                print(f"TABLE & COLUMN: {table} >> {column}")

            query_text = f"select sum({column}) from {table}"

            sf_query_result = sf.sfc.query(query_text=query_text)
            df_sf_result = sf_query_result.fetch_pandas_all()
            df_sf_result.columns = ["r"]
            sf_r = df_sf_result.loc[0, "r"]

            bq_query_result = bq.query(query_text=query_text)
            df_bq_result = bq_query_result.result().to_dataframe()
            df_bq_result.columns = ["r"]
            bq_r = df_bq_result.loc[0, "r"]

            if self.verbose_iter:
                print("RESULT: SF | BQ")
                print("SF Type:", type(sf_r))
                print("BQ Type:", type(bq_r))
                print(sf_r, "|", bq_r)
                print("-" * 40)
                print()

            # type convert to assure numerical comparison is the only
            # comparison being done.  BUG FIX: the conversions themselves
            # are what can raise, so they belong inside the try; and
            # `equal` must be assigned on the failure path, otherwise the
            # append below raised NameError.
            try:
                sf_r_a = np.int64(sf_r)
                bq_r_a = np.int64(bq_r)
                equal = sf_r_a == bq_r_a
            except (TypeError, ValueError):
                equal = None  # comparison could not be made
                print("Error comparing query results.")
                print("SF Reply:")
                print(sf_r)
                print("-" * 30)
                print(bq_r)
                print("-" * 30)

            d.append([table, column, sf_r, bq_r, equal])
        sf.close()

        df = pd.DataFrame(d, columns=["table", "column", "sf", "bq", "equal"])

        # save the comparison next to the other collected results
        db_name = self.test + "_" + "{:02d}".format(
            self.scale) + "_" + self.cid
        rdir, rfp = tools.make_name(db="bqsf",
                                    test=self.test,
                                    cid=self.cid,
                                    kind="qc-comparison",
                                    datasource=db_name,
                                    desc=self.desc,
                                    ext=".csv",
                                    timestamp=None)
        tools.mkdir_safe(rdir)
        fp = rdir + config.sep + rfp
        df.to_csv(fp, index=False)
        return df
# ---- Example 5 ----
    def run(self, seq):
        """Run a benchmark comparison

        Runs the query sequence on Snowflake and/or BigQuery (whichever
        appear in self.systems), writing a metadata JSON snapshot before
        and after each system's run.

        Parameters
        ----------
        seq : list of int, query numbers to execute
        #systems : list of str, systems under test to collect data. Allowed values: 'sf' and 'bq'

        Returns
        -------
        None, writes multiple files to self.results_dir location
        """

        self.query_sequence = seq

        if 'sf' in self.systems:

            # snapshot metadata before the Snowflake run begins
            self.test_stage = "Snowflake start"
            metadata_fp = self.results_dir + config.sep + "metadata_sf_compare_initial.json"
            with open(metadata_fp, "w") as f:
                f.write(self.to_json(indent="  "))

            sf = sf_tpc.SFTPC(test=self.test,
                              scale=self.scale,
                              cid=self.cid,
                              warehouse=self.sf_warehouse_name,
                              desc=self.desc,
                              verbose=self.verbose,
                              verbose_query=self.verbose_query)
            sf.verbose_query_n = self.verbose_query_n

            if self.verbose:
                print('Using database:', sf.database)

            # share one timestamp and output dir across both systems
            sf.timestamp = self.shared_timestamp
            sf.results_dir = self.results_dir

            sf.connect()

            # record what the SF warehouse size is
            # (column 0 = warehouse name, column 3 = size, per SHOW WAREHOUSES)
            query_result = sf.show_warehouses()
            warehouse_size_mapper = {
                r[0]: r[3]
                for r in query_result.fetchall()
            }
            self.sf_warehouse_size = warehouse_size_mapper[
                self.sf_warehouse_name]

            # update initial metadata so warehouse size is captured
            # NOTE(review): no metadata write follows this comment -- the
            # size is only persisted by the "final" write below; confirm
            # whether a rewrite of the initial JSON was intended here.

            if self.cache is True:
                sf.cache = True
            else:
                sf.cache = False

            # execute the full query sequence on Snowflake
            self.result_sf = sf.query_seq(seq=seq,
                                          seq_n=self.stream_n,
                                          qual=self.qual,
                                          save=self.save,
                                          verbose_iter=self.verbose_iter)
            sf.close()

            self.results_sf_csv_fp = sf.results_csv_fp

            # snapshot metadata after the Snowflake run completes
            self.test_stage = "Snowflake end"
            metadata_fp = self.results_dir + config.sep + "metadata_sf_compare_final.json"
            with open(metadata_fp, "w") as f:
                f.write(self.to_json(indent="  "))

        if 'bq' in self.systems:

            bq = bq_tpc.BQTPC(test=self.test,
                              scale=self.scale,
                              cid=self.cid,
                              desc=self.desc,
                              verbose_query=self.verbose_query,
                              verbose=self.verbose)
            bq.verbose_query_n = self.verbose_query_n

            bq.timestamp = self.shared_timestamp
            bq.results_dir = self.results_dir

            if self.cache is True:
                bq.cache = True
            else:
                bq.cache = False

            # snapshot metadata before the BigQuery run begins
            self.test_stage = "BigQuery start"
            metadata_fp = self.results_dir + config.sep + "metadata_bq_compare_initial.json"
            with open(metadata_fp, "w") as f:
                f.write(self.to_json(indent="  "))

            # execute the full query sequence on BigQuery
            self.result_bq = bq.query_seq(seq,
                                          seq_n=self.stream_n,
                                          qual=self.qual,
                                          save=self.save,
                                          verbose_iter=self.verbose_iter)

            self.results_bq_csv_fp = bq.results_csv_fp

            # snapshot metadata after the BigQuery run completes
            self.test_stage = "BigQuery end"
            metadata_fp = self.results_dir + config.sep + "metadata_bq_compare_final.json"
            with open(metadata_fp, "w") as f:
                f.write(self.to_json(indent="  "))