def build_module(module, required):
    import os.path
    tools.mkdir_safe('product/modules/%s' % module)
    out = 'product/modules/%s/%s.o' % (module, module)
    srcs = []
    extra_objs = ''
    metafile = 'modules/%s/%s.meta' % (module, module)
    if os.path.exists(metafile):
        tools.copy_file('product/modules/%s/%s.meta' % (module, module),
                        metafile)
        srcs.append(metafile)
        meta = tools.load_meta(metafile)
        if 'objs' in meta:
            extra_objs = meta['objs']
    src = 'modules/%s/%s.rip' % (module, module)
    if tools.depends(out, module_deps + [src]):
        tools.pprint('MOD', src, out)
        args = ['product/ripe', conf["RFLAGS"], '-n', module,
                '-c', srcs, src, extra_objs, '-o', out]
        # Required (default) packages have already been typed, and are
        # loaded by default. Hence, they do not need to be typed.
        if required:
            args.append('--omit-typing')
        if conf["VERBOSITY"] > 1:
            args.append('-v')
        if required:
            tools.call(args)
        else:
            if not tools.try_call(args):
                failed_modules.append(module)
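# Hedged sketch of what tools.depends() above is assumed to do (the real helper
# lives in the project's tools module and is not shown here): report a target
# as stale when it is missing or older than any of its dependencies. The name
# needs_rebuild and the mtime comparison are illustrative only.
import os

def needs_rebuild(target, deps):
    """Return True if target is missing or older than any existing dependency."""
    if not os.path.exists(target):
        return True
    target_mtime = os.path.getmtime(target)
    return any(os.path.getmtime(d) > target_mtime
               for d in deps if os.path.exists(d))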
def type_module(module):
    path = 'modules/%s/%s.rip' % (module, module)
    out = 'product/modules/%s/%s.typ' % (module, module)
    if tools.depends(out, type_deps + [path]):
        sys.stdout.write(tools.color_src + module + tools.color_reset + " ")
        tools.mkdir_safe('product/modules/%s' % module)
        tools.call(['product/ripe', '-t', path, '>', out])
    return out
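# Note on the '>' element above: it only produces a real redirect if tools.call
# joins its arguments into a shell command line. A hedged, illustrative
# alternative with the standard library redirects stdout explicitly instead
# (call_redirected is a hypothetical helper, not part of this build script):
import subprocess

def call_redirected(args, out_path):
    """Run args and write the command's stdout to out_path, raising on failure."""
    with open(out_path, "w") as out_file:
        subprocess.run(args, stdout=out_file, check=True)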
def qgen_stream(p, templates_dir, dialect, scale,
                qual=None, verbose=False, verbose_out=False):
    """Generate DS query text for query stream number p

    Parameters
    ----------
    p : int, query stream number to generate BigQuery SQL for
    templates_dir : str, absolute path to directory of query templates
        to draw from
    dialect : str, SQL dialect to pass to dsqgen
    scale : int, scale factor of db being queried
    qual : bool, generate qualification queries in ascending order
    verbose : bool, print debug statements
    verbose_out : bool, print std_out and std_err output

    Returns
    -------
    query_text : str, query text generated for the stream
    """
    if config.random_seed is not None:
        r = config.random_seed
    else:
        r = None

    # make temporary query directory
    temp_dir = config.fp_ds_output + config.sep + "temp_queries"
    tools.mkdir_safe(temp_dir)

    std_out, err_out = dsqgen(directory=templates_dir,
                              dialect=dialect,
                              scale=scale,
                              # filter="Y",  # write to std_out
                              streams=p+1,
                              input=templates_dir + config.sep + "templates.lst",
                              rngseed=r,
                              qualify=qual,
                              verbose=verbose,
                              output_dir=temp_dir,
                              )

    query_fp = temp_dir + config.sep + "query_{}.sql".format(p)
    with open(query_fp, "r") as f:
        query_text = f.read()

    if verbose_out:
        print("QUERY STREAM:", p)
        print("=================")
        print()
        print("Source File")
        print("===========")
        print(query_fp)
        print()
        std_err_print(std_out, err_out)

    return query_text
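# Hypothetical usage of qgen_stream (values are illustrative, not from the
# source): generate the text of stream 0 for a 1 GB scale factor and echo the
# dsqgen output.
#
#   text = qgen_stream(p=0,
#                      templates_dir="/opt/tpcds/query_templates",
#                      dialect="netezza",
#                      scale=1,
#                      verbose_out=True)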
def write_query_text(self, query_text, query_n):
    """Write query text executed to a specific folder

    Parameters
    ----------
    query_text : str, TPC query SQL executed
    query_n : int, TPC query number
    """
    fd = self.results_dir + config.sep
    tools.mkdir_safe(fd)
    fp = fd + "query_text_bq_{0:02d}.sql".format(query_n)
    with open(fp, "w") as f:
        f.write(query_text)
def write_results_csv(self, df, query_n):
    """Write the results of a TPC query to a CSV file in a specific folder

    Parameters
    ----------
    df : Pandas DataFrame
    query_n : int, query number in TPC test
    """
    fd = self.results_dir + config.sep
    tools.mkdir_safe(fd)
    fp = fd + "query_result_bq_{0:02d}.csv".format(query_n)
    df = tools.to_consistent(df, n=config.float_precision)
    df.to_csv(fp, index=False, float_format="%.3f")
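# Hedged sketch of what tools.to_consistent() is assumed to do before the CSV
# is written: round float columns to a fixed precision so result files from
# different engines diff cleanly. to_fixed_precision is illustrative, not the
# project helper.
import pandas as pd

def to_fixed_precision(df, n=3):
    """Round every float column to n decimal places, leaving other dtypes alone."""
    float_cols = df.select_dtypes(include="float").columns
    return df.round({c: n for c in float_cols})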
def set_timestamp_dir(self):
    self.shared_timestamp = pd.Timestamp.now()  # "UTC"
    self.shared_timestamp = str(self.shared_timestamp).replace(" ", "_")
    self.data_source = self.test + "_" + str(self.scale) + "GB_" + self.cid
    self.results_dir, _ = tools.make_name(db="".join(self.systems),
                                          test=self.test,
                                          cid=self.cid,
                                          kind="results",
                                          datasource=self.data_source,
                                          desc=self.desc,
                                          ext="",
                                          timestamp=self.shared_timestamp)
    tools.mkdir_safe(self.results_dir)
    if self.verbose:
        print("Result Folder Name:")
        print(self.results_dir)
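# The shared timestamp above is simply pd.Timestamp.now() with spaces replaced
# so it can be embedded in a directory name, e.g.:
#
#   >>> str(pd.Timestamp.now()).replace(" ", "_")
#   '2021-05-01_13:45:12.123456'
#
# (illustrative value; note that the remaining colons are not legal in file
# names on some filesystems, such as Windows/NTFS).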
def write_times_csv(self, results_list, columns):
    """Write a list of results from queries to a CSV file

    Parameters
    ----------
    results_list : list, data as recorded on the local machine
    columns : list, column names for output CSV
    """
    _, fp = tools.make_name(db="bq", test=self.test, cid=self.cid,
                            kind="times", datasource=self.dataset,
                            desc=self.desc, ext=".csv",
                            timestamp=self.timestamp)
    self.results_csv_fp = self.results_dir + config.sep + fp
    df = pd.DataFrame(results_list, columns=columns)
    tools.mkdir_safe(self.results_dir)
    df.to_csv(self.results_csv_fp, index=False)
def query_seq(self, seq, seq_n=None, qual=None, save=False,
              verbose_iter=False):
    """Query BigQuery with TPC-DS or TPC-H query template number n

    Parameters
    ----------
    seq : iterable sequence of int, query numbers to execute, between
        1 and 99 for ds and 1 and 22 for h
    seq_n : int, stream sequence number for test - i.e. 0 or 4 etc
    qual : None, or True to use qualifying values (to test 1GB qualification db)
    save : bool, save data about this query sequence to disk
    verbose_iter : bool, print per iteration status statements

    Returns
    -------
    n_time_data : list, timing data for query stream, with:
        db : str, database system under test name ("sf" or "bq")
        test : str, test name ("ds" or "h")
        scale : int, TPC scale factor in GB
        source : str, source dataset/database
        cid : str, configuration id
        desc : str, description of stream test
        query_n : int, benchmark query number
        seq_n : int, benchmark query sequence/stream number
        driver_t0 : datetime, time on the driver when query was started
        driver_t1 : datetime, time on the driver when query returned
        qid : str, database system under test query id for the query run
    """
    self.test_stage = "start"
    metadata_fp = self.results_dir + config.sep + "metadata_bq_initial.json"
    tools.mkdir_safe(self.results_dir)
    with open(metadata_fp, "w") as f:
        f.write(self.to_json(indent=" "))

    if seq_n is None:
        seq_n = "sNA"
    else:
        seq_n = str(seq_n)

    n_time_data = []
    columns = ["db", "test", "scale", "source", "cid", "desc",
               "query_n", "seq_n", "driver_t0", "driver_t1", "qid"]

    t0_seq = pd.Timestamp.now("UTC")
    i_total = len(seq)
    for i, n in enumerate(seq):
        qn_label = self.dataset + "-q" + str(n) + "-" + seq_n + "-" + self.desc
        qn_label = qn_label.lower()

        if verbose_iter:
            print("=" * 40)
            print("BigQuery Start Query:", n)
            print("-" * 20)
            print("Stream Completion: {} / {}".format(i + 1, i_total))
            print("Query Label:", qn_label)
            print("-" * 20)
            print()

        self.set_query_label(qn_label)

        (t0, t1, df_result, query_text, qid) = self.query_n(n=n,
                                                            qual=qual,
                                                            std_out=False)

        _d = ["bq", self.test, self.scale, self.dataset, self.cid,
              self.desc, n, seq_n, t0, t1, qid]
        n_time_data.append(_d)

        # write results as collected by each query
        if save:
            self.write_query_text(query_text=query_text, query_n=n)
            if len(df_result) > 0:
                self.write_results_csv(df=df_result, query_n=n)
            else:
                # filler for statistics when the query returns no values
                df_result.loc[0, :] = ["filler"] * df_result.shape[1]
                if verbose_iter:
                    print("No result rows, FILLER DataFrame created.")
                self.write_results_csv(df=df_result, query_n=n)

        if verbose_iter:
            dt = t1 - t0
            print("Query ID: {}".format(qid))
            print("Total Time Elapsed: {}".format(dt))
            print("-" * 40)
            print()

        if self.verbose:
            if len(df_result) < 25:
                print("Result:")
                print("-------")
                print(df_result)
                print()
            else:
                print("Head of Result:")
                print("---------------")
                print(df_result.head())
                print()

    t1_seq = pd.Timestamp.now("UTC")

    # if self.verbose:
    dt_seq = t1_seq - t0_seq
    print()
    print("=" * 40)
    print("BigQuery Query Stream Done!")
    print("Total Time Elapsed: {}".format(dt_seq))
    print()

    # write local timing results to file
    self.write_times_csv(results_list=n_time_data, columns=columns)

    self.test_stage = "end"
    metadata_fp = self.results_dir + config.sep + "metadata_initial.json"
    with open(metadata_fp, "w") as f:
        f.write(self.to_json(indent=" "))

    return pd.DataFrame(n_time_data, columns=columns)
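# Hypothetical driver for query_seq (constructor arguments mirror the BQTPC
# call in compare_sum below; values are illustrative): run the first TPC-DS
# stream and persist query text, results and timings.
#
#   bq = bq_tpc.BQTPC(test="ds", scale=1, cid="01", desc="demo",
#                     verbose=True, verbose_query=False)
#   times = bq.query_seq(seq=range(1, 100), seq_n=0, save=True,
#                        verbose_iter=True)
#   print(times[["query_n", "driver_t0", "driver_t1"]])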
# conf["CFLAGS"].extend(["-pg", "-fno-omit-frame-pointer", "-O3", "-DNDEBUG"]) # conf["LFLAGS"].append("-pg") # conf["RFLAGS"].append("--optim-verify") #if "nostack" in sys.argv: # conf["CFLAGS"].append("-DNOSTACK") #if "nothreads" in sys.argv: # conf["CFLAGS"].append("-DNOTHREADS") #if "memlog" in sys.argv: # conf["CFLAGS"].append("-DMEMLOG") # Construct required directories required_dirs = ['bin', 'product', 'product/include', 'product/include/clib', 'product/include/vm', 'product/include/modules', 'product/include/lang', 'product/modules'] for d in required_dirs: tools.mkdir_safe(d) ############################################################################### # CLIB clib_hs = [ 'clib/clib.h' ] clib_srcs = [ 'clib/array.c', 'clib/dict.c', 'clib/hash.c', 'clib/mem.c', 'clib/path.c', 'clib/stringbuf.c', 'clib/structs.c', 'clib/tok.c', 'clib/utf8.c', 'clib/util.c' ]
def compare_sum(self):
    ds_col = {"call_center":            "cc_call_center_sk",  # integer
              "catalog_page":           "cp_catalog_page_sk",
              "catalog_returns":        "cr_order_number",
              "catalog_sales":          "cs_order_number",
              "customer":               "c_customer_sk",
              "customer_address":       "ca_address_sk",
              "customer_demographics":  "cd_demo_sk",
              "date_dim":               "d_date_sk",  # integer
              # skip dbgen
              "household_demographics": "hd_demo_sk",
              "income_band":            "ib_income_band_sk",
              "inventory":              "inv_item_sk",  # integer
              "item":                   "i_item_sk",
              "promotion":              "p_promo_sk",
              "reason":                 "r_reason_sk",
              "ship_mode":              "sm_ship_mode_sk",
              "store":                  "s_store_sk",
              "store_returns":          "sr_item_sk",
              "store_sales":            "ss_item_sk",
              "time_dim":               "t_time_sk",
              "warehouse":              "w_warehouse_sk",
              "web_page":               "wp_web_page_sk",
              "web_returns":            "wr_item_sk",
              "web_sales":              "ws_item_sk",
              "web_site":               "web_site_sk"}

    h_col = {"customer": "c_custkey",
             "lineitem": "l_linenumber",
             "nation":   "n_nationkey",
             "orders":   "o_orderkey",
             "part":     "p_partkey",
             "partsupp": "ps_partkey",
             "region":   "r_regionkey",
             "supplier": "s_suppkey"}

    col_names = {"ds": ds_col, "h": h_col}[self.test]

    sf = sf_tpc.SFTPC(test=self.test,
                      scale=self.scale,
                      cid=self.cid,
                      warehouse="TEST9000",
                      desc=self.desc,
                      verbose=self.verbose,
                      verbose_query=self.verbose_query)
    if self.verbose:
        print('Using database:', sf.database)
    sf.timestamp = self.shared_timestamp
    sf.results_dir = self.results_dir
    sf.connect()

    bq = bq_tpc.BQTPC(test=self.test,
                      scale=self.scale,
                      cid=self.cid,
                      desc=self.desc,
                      verbose_query=self.verbose_query,
                      verbose=self.verbose)
    bq.timestamp = self.shared_timestamp
    bq.results_dir = self.results_dir

    d = []
    for table, column in col_names.items():
        if self.verbose_iter:
            print(f"TABLE & COLUMN: {table} >> {column}")
        query_text = f"select sum({column}) from {table}"

        sf_query_result = sf.sfc.query(query_text=query_text)
        df_sf_result = sf_query_result.fetch_pandas_all()
        df_sf_result.columns = ["r"]
        sf_r = df_sf_result.loc[0, "r"]

        bq_query_result = bq.query(query_text=query_text)
        df_bq_result = bq_query_result.result().to_dataframe()
        df_bq_result.columns = ["r"]
        bq_r = df_bq_result.loc[0, "r"]

        if self.verbose_iter:
            print("RESULT: SF | BQ")
            print("SF Type:", type(sf_r))
            print("BQ Type:", type(bq_r))
            print(sf_r, "|", bq_r)
            print("-" * 40)
            print()

        # type convert to assure numerical comparison
        # is the only comparison being done
        sf_r_a = np.int64(sf_r)
        bq_r_a = np.int64(bq_r)

        try:
            equal = sf_r_a == bq_r_a
        except TypeError:
            # record the mismatch rather than failing the whole comparison
            equal = False
            print("Error comparing query results.")
            print("SF Reply:")
            print(sf_r)
            print("-" * 30)
            print("BQ Reply:")
            print(bq_r)
            print("-" * 30)

        d.append([table, column, sf_r, bq_r, equal])

    sf.close()

    df = pd.DataFrame(d, columns=["table", "column", "sf", "bq", "equal"])
    db_name = self.test + "_" + "{:02d}".format(self.scale) + "_" + self.cid
    rdir, rfp = tools.make_name(db="bqsf", test=self.test, cid=self.cid,
                                kind="qc-comparison", datasource=db_name,
                                desc=self.desc, ext=".csv", timestamp=None)
    tools.mkdir_safe(rdir)
    fp = rdir + config.sep + rfp
    df.to_csv(fp, index=False)
    return df
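# Hedged sketch: compare_sum() coerces both checksums to np.int64 before the
# equality test. If a column ever sums to a float or a NULL-driven NaN, a
# tolerant comparison like the one below (illustrative only, plain numpy)
# avoids spurious mismatches caused purely by type differences.
import numpy as np

def checksums_match(a, b, rel_tol=0.0):
    """Exact match by default; pass rel_tol > 0 to allow a relative tolerance."""
    try:
        return bool(np.isclose(np.float64(a), np.float64(b),
                               rtol=rel_tol, atol=0.0))
    except (TypeError, ValueError):
        return False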
# conf["RFLAGS"].append("--optim-verify") #if "nostack" in sys.argv: # conf["CFLAGS"].append("-DNOSTACK") #if "nothreads" in sys.argv: # conf["CFLAGS"].append("-DNOTHREADS") #if "memlog" in sys.argv: # conf["CFLAGS"].append("-DMEMLOG") # Construct required directories required_dirs = [ 'bin', 'product', 'product/include', 'product/include/clib', 'product/include/vm', 'product/include/modules', 'product/include/lang', 'product/modules' ] for d in required_dirs: tools.mkdir_safe(d) ############################################################################### # CLIB clib_hs = ['clib/clib.h'] clib_srcs = [ 'clib/array.c', 'clib/dict.c', 'clib/hash.c', 'clib/mem.c', 'clib/path.c', 'clib/stringbuf.c', 'clib/structs.c', 'clib/tok.c', 'clib/utf8.c', 'clib/util.c' ] clib_objs = tools.cons_objs(clib_srcs, clib_hs) ############################################################################### # LANG