Example #1
0
def test_basic_async_query(proj_id, std_gcs_path):
    """Run a query asynchronously, extract it to GCS, and verify the result."""
    client = bqtools.BigQuery()
    # Kick off the query and block until BigQuery reports it finished.
    job = client.async_query(proj_id, query, allow_large_results=False)
    client.poll_job(job)
    # Export the finished query's result table to the standard GCS path,
    # then pull it down locally and check its contents.
    export = client.async_extract_query(job, std_gcs_path, compression=None)
    client.poll_job(export)
    _download_and_check(std_gcs_path)
Example #2
0
def test_basic_parallel_query_and_extract(proj_id, gcs_temp_dir):
    """Run several queries in parallel and verify each extracted file."""
    # Three identical queries stand in for what would typically be
    # three different queries.
    specs = [
        dict(
            proj_id=proj_id,
            query=query,
            compression="NONE",
            path=gcs_temp_dir + "temp_bqtools_{}.csv".format(i),
        )
        for i in range(3)
    ]
    client = bqtools.BigQuery()
    # Results are yielded as each parallel query/extract completes.
    for result_path in client.parallel_query_and_extract(specs):
        _download_and_check(result_path)
Example #3
0
def update_base_lists():
    """Update the lists derived from BigQuery.

    For each filter list, formats its SQL template once per date range,
    runs all resulting queries in parallel, and moves each extracted CSV
    (minus its header line) into the asset directory.
    """
    bigq = bqtools.BigQuery()
    queries = []
    path_map = {}  # gcs_path -> (filter-list path, year) for routing results
    print("Building queries")
    for fl in filter_lists:
        sql_path = os.path.join(this_dir, "sql", "{}.sql".format(fl.sql))
        # Use a context manager so the template file is closed promptly
        # instead of leaking until garbage collection.
        with open(sql_path) as sql_file:
            sql = sql_file.read()
        # Create a query object for each date range so that we can
        # run all ranges in parallel to speed things up
        for date_range in fl.date_ranges:
            start_date, end_date = date_range
            # assumes end_date starts with a 4-digit year (e.g. ISO dates) — TODO confirm
            year = end_date[:4]
            query = sql.format(start_date=start_date,
                               end_date=end_date,
                               **config)
            # len(path_map) doubles as a unique, monotonically increasing index.
            gcs_path = gcs_path_template.format(len(path_map))
            path_map[gcs_path] = (fl.path, year)
            queries.append(
                dict(proj_id=proj_id,
                     query=query,
                     format="CSV",
                     compression="NONE",
                     path=gcs_path))
    # As each query finishes, copy the query to local temp dir
    # and then move all but the first line (the header) to
    # its final destination.
    print("Waiting for results:")
    for gcs_path in bigq.parallel_query_and_extract(queries):
        (fl_path, year) = path_map[gcs_path]
        bqtools.gs_mv(gcs_path, tmp_path)
        dest_path = os.path.join(asset_dir, fl_path, "{}.txt".format(year))
        copy_to_sorted_mmsi(tmp_path, dest_path)
        print("    {0}/{1} done".format(fl_path, year))
    os.unlink(tmp_path)
Example #4
0
def test_basic_query_and_extract(proj_id, std_gcs_path):
    """Exercise the basic, small-result query-and-extract interface."""
    client = bqtools.BigQuery()
    # One synchronous call performs both the query and the GCS extract.
    client.query_and_extract(proj_id, query, std_gcs_path, compression="NONE")
    _download_and_check(std_gcs_path)
Example #5
0
def test_broken_query(proj_id, std_gcs_path):
    """A syntactically invalid query must surface as a RuntimeError."""
    # Deliberately misspelled FROM so BigQuery rejects the query.
    query = "SELECT COUNT(*) FRM_MISPELLED [bigquery-public-data:noaa_gsod.gsod2015]"
    bigq = bqtools.BigQuery()
    # The original wrote `assert pytest.raises(RuntimeError, fn, ...)`, but
    # pytest.raises returns a truthy ExceptionInfo, so that assert could
    # never fail. The context-manager form actually fails the test when no
    # RuntimeError is raised.
    with pytest.raises(RuntimeError):
        bigq.query_and_extract(proj_id, query, std_gcs_path, compression="NONE")