def test_basic_async_query(proj_id, std_gcs_path):
    """Exercise the low-level async interface: submit a query job, wait
    for it, extract the results to GCS, wait again, then verify the file.
    """
    client = bqtools.BigQuery()
    # Kick off the query without blocking, then poll until it completes.
    qjob = client.async_query(proj_id, query, allow_large_results=False)
    client.poll_job(qjob)
    # Extract the finished query's result set to the given GCS path.
    xjob = client.async_extract_query(qjob, std_gcs_path, compression=None)
    client.poll_job(xjob)
    _download_and_check(std_gcs_path)
def test_basic_parallel_query_and_extract(proj_id, gcs_temp_dir):
    """Run several query+extract jobs in parallel and verify each output.

    We are creating three identical queries here; typically, you'd use
    different queries.
    """
    specs = [
        dict(
            proj_id=proj_id,
            query=query,
            compression="NONE",
            path=gcs_temp_dir + "temp_bqtools_{}.csv".format(n),
        )
        for n in range(3)
    ]
    client = bqtools.BigQuery()
    # Results are yielded as each parallel job finishes.
    for gcs_path in client.parallel_query_and_extract(specs):
        _download_and_check(gcs_path)
def update_base_lists():
    """Update lists derived from BigQuery.

    For each filter list, format its SQL template once per date range,
    run all the resulting queries in parallel, and copy each extracted
    result to its final per-year destination under ``asset_dir``.

    Relies on module-level configuration: ``filter_lists``, ``config``,
    ``gcs_path_template``, ``proj_id``, ``tmp_path``, ``asset_dir``,
    ``this_dir`` — assumed defined elsewhere in this module.
    """
    bigq = bqtools.BigQuery()
    queries = []
    # Maps each GCS output path back to (filter-list path, year) so we
    # can route results as they come back in arbitrary order.
    path_map = {}
    print("Building queries")
    for fl in filter_lists:
        sql_path = os.path.join(this_dir, "sql", "{}.sql".format(fl.sql))
        # Use a context manager so the file handle is closed promptly
        # (the original bare open().read() leaked the handle).
        with open(sql_path) as sql_file:
            sql = sql_file.read()
        # Create a query object for each date range so that we can
        # run all ranges in parallel to speed things up.
        for date_range in fl.date_ranges:
            start_date, end_date = date_range
            year = end_date[:4]
            query = sql.format(start_date=start_date, end_date=end_date,
                               **config)
            # len(path_map) doubles as a unique, increasing index.
            gcs_path = gcs_path_template.format(len(path_map))
            path_map[gcs_path] = (fl.path, year)
            queries.append(
                dict(proj_id=proj_id, query=query, format="CSV",
                     compression="NONE", path=gcs_path))
    # As each query finishes, copy the query to local temp dir
    # and then move all but the first line (the header) to
    # its final destination.
    print("Waiting for results:")
    for gcs_path in bigq.parallel_query_and_extract(queries):
        (fl_path, year) = path_map[gcs_path]
        bqtools.gs_mv(gcs_path, tmp_path)
        dest_path = os.path.join(asset_dir, fl_path, "{}.txt".format(year))
        copy_to_sorted_mmsi(tmp_path, dest_path)
        print(" {0}/{1} done".format(fl_path, year))
    # Remove the (reused) local temp file once all results are processed.
    os.unlink(tmp_path)
def test_basic_query_and_extract(proj_id, std_gcs_path):
    "Test the basic, small-result interface"
    client = bqtools.BigQuery()
    # One-shot convenience call: query and extract in a single step.
    client.query_and_extract(proj_id, query, std_gcs_path,
                             compression="NONE")
    _download_and_check(std_gcs_path)
def test_broken_query(proj_id, std_gcs_path):
    """A syntactically broken query must surface as a RuntimeError.

    The original used the legacy callable form
    ``assert pytest.raises(RuntimeError, fn, ...)`` — the ``assert`` is
    redundant (pytest.raises already fails the test if nothing raises),
    and the context-manager form is the documented idiom.
    """
    # "FRM_MISPELLED" is a deliberate typo to force a query failure.
    query = "SELECT COUNT(*) FRM_MISPELLED [bigquery-public-data:noaa_gsod.gsod2015]"
    bigq = bqtools.BigQuery()
    with pytest.raises(RuntimeError):
        bigq.query_and_extract(proj_id, query, std_gcs_path,
                               compression="NONE")