Code example #1
def test_create_table(cleanup_test_table, client):
    result = query(TOY_QUERY, table='test', dataset=DATASET)
    assert result.errors is None
    # Run the toy query directly with the client and sanity-check the rows
    job = client.query(TOY_QUERY)
    assert isinstance(job, bigquery.QueryJob)
    job_result = job.result()
    assert isinstance(job_result, bigquery.table.RowIterator)
    rows = list(job_result)
    assert len(rows) == 2
    assert list(rows[0].keys()) == ['x', 'y']
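
These tests rely on a client fixture, a cleanup_test_table fixture, and the TOY_QUERY and DATASET constants, none of which appear in the excerpt. A minimal conftest-style sketch of what they might look like follows; the dataset name and query text are assumptions, not the original code.

import pytest
from google.cloud import bigquery

DATASET = 'my_test_dataset'  # assumed; the real dataset name is not shown
TOY_QUERY = "SELECT 1 AS x, 'foo' AS y UNION ALL SELECT 2, 'bar'"  # assumed

@pytest.fixture
def client():
    # A single BigQuery client shared by the assertions in each test
    return bigquery.Client()

@pytest.fixture
def cleanup_test_table(client):
    # Drop the test table after the test so later tests can recreate it
    yield
    client.delete_table(f'{DATASET}.test', not_found_ok=True)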
Code example #2
def test_recreate_table(cleanup_test_table, client):
    """If the cleanup fixture works, creating the test table a second time won't raise NotFound.

    Keep this test below test_create_table().
    """
    # Create the test table a second time
    job_2 = query(TOY_QUERY, table='test', dataset=DATASET)
    assert job_2.state == 'DONE'
    table_2_rows = list(job_2.result())
    assert table_2_rows[0]['x'] == 1 and table_2_rows[0]['y'] == 'foo'
    table_2 = client.get_table(f'{DATASET}.test')
    # Creating the table a third time with truncate=True should replace its contents
    job_3 = query(ALT_TOY_QUERY, table='test', dataset=DATASET, truncate=True)
    assert job_3.state == 'DONE'
    table_3 = client.get_table(f'{DATASET}.test')
    # The table isn't recreated
    assert table_3.created == table_2.created
    # Its contents are replaced
    assert table_3.modified > table_2.created
    table_3_rows = list(job_3.result())
    assert table_3_rows[0]['x'] == 2 and table_3_rows[0]['y'] == 'baz'
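
The query() helper called above (and again by make_table() in code example #5) is not shown. Judging from its call sites, it is plausibly a thin wrapper around Client.query() that materializes the result into dataset.table, with truncate=True mapping to WRITE_TRUNCATE; the sketch below is an assumption about its shape, not the original implementation.

from google.cloud import bigquery

def query(sql, table=None, dataset=None, truncate=False, **kw):
    # Run `sql`, optionally writing the result to a destination table
    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig(**kw)
    if table is not None:
        job_config.destination = f'{client.project}.{dataset}.{table}'
        job_config.write_disposition = (
            bigquery.WriteDisposition.WRITE_TRUNCATE
            if truncate else bigquery.WriteDisposition.WRITE_EMPTY)
    job = client.query(sql, job_config=job_config)
    job.result()  # block until the job finishes, so job.state == 'DONE'
    return job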
Code example #3
def csv_file_updates(csv_file):
    """
    Summarize the deletes associated with a CSV file
    
    :param csv_file: path to a file where each column is a list of pids and the header is an observation_source_value
    :return: dictionary with keys file_name, observation_source_value, num_pids, num_rows, q
    """

    if not os.path.exists(csv_file):
        raise IOError('File "%s" not found' % csv_file)
    obs_count_fmt = OBS_COUNT_FMT
    obs_query_fmt = OBS_QUERY_FMT
    if DEID:
        obs_count_fmt = DEID_OBS_COUNT_FMT
        obs_query_fmt = DEID_OBS_QUERY_FMT
    file_name = os.path.basename(csv_file)
    csv_df = pd.read_csv(csv_file)
    cols = [str(col) for col in csv_df.columns]
    results = []
    for col in cols:
        person_ids = csv_df[col].dropna().apply(str).to_list()
        # Count the rows associated with this observation_source_value
        count_q = obs_count_fmt.format(PROJECT_ID=PROJECT_ID,
                                       TARGET_DATASET_ID=TARGET_DATASET_ID,
                                       COMBINED=COMBINED,
                                       OBSERVATION_SOURCE_VALUE=col,
                                       PERSON_IDS=', '.join(person_ids))
        num_rows = query(count_q).iloc[0]['n']
        # The full query is reported alongside the counts
        q = obs_query_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        result = dict(file_name=file_name,
                      observation_source_value=col,
                      q=q,
                      num_pids=len(person_ids),
                      num_rows=num_rows)
        results.append(result)
    return results
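
OBS_COUNT_FMT, OBS_QUERY_FMT, and their DEID_ variants are module-level templates that are not shown. Judging from the placeholders passed to .format() and the 'n' column read off the count result, they might look roughly like the following; the table and dataset names are assumptions, and the real templates presumably also use TARGET_DATASET_ID.

# Guessed templates; all identifiers here are assumptions
OBS_COUNT_FMT = '''
SELECT COUNT(1) AS n
FROM `{PROJECT_ID}.{COMBINED}.observation`
WHERE observation_source_value = '{OBSERVATION_SOURCE_VALUE}'
AND person_id IN ({PERSON_IDS})
'''

OBS_QUERY_FMT = '''
SELECT *
FROM `{PROJECT_ID}.{COMBINED}.observation`
WHERE observation_source_value = '{OBSERVATION_SOURCE_VALUE}'
AND person_id IN ({PERSON_IDS})
'''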
Code example #4

# Optionally load a previously pickled snapshot; note that `d` below is only
# defined if this block is enabled.
# input_file = 'data/BQ_data.plk'
# try:
#     with open(input_file, 'rb') as input:
#         d = pickle.load(input)
# except IOError:
#     print("cannot open %s" % input_file)

buf = """select * from `tommy-boy.applemusic_analytics.am_streams` 
                        where ingest_datestamp = '%s';"""

get_streaming_data = buf % date_id
res = bq.query('tommy-boy',
               'applemusic_analytics',
               get_streaming_data,
               max_results=1000000)
streams = pd.DataFrame(data=bq.coerce_rows(fields=res['schema']['fields'],
                                           rows=res['rows']),
                       columns=[
                           schema_dictionary['name']
                           for schema_dictionary in res['schema']['fields']
                       ])

# Extract the list of distinct listeners
try:
    anonymized_person_id_unique = list(set(d.anonymized_person_id))
except (NameError, AttributeError):
    # `d` only exists if the pickle block above is enabled
    print("data needs anonymized_person_id")

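The freshly queried rows land in `streams`, so the same distinct-listener extraction can be done against that DataFrame directly. A sketch, assuming the column is part of the BigQuery schema:

if 'anonymized_person_id' in streams.columns:
    # Distinct listeners from the live query results
    anonymized_person_id_unique = streams['anonymized_person_id'].dropna().unique().tolist()
else:
    print("data needs anonymized_person_id")
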
Code example #5
def make_table(table, **kw):
    """Build DATASET.<table> from its .sql file, replacing any existing contents."""
    sql = read_sql(f'../ai2_replication/{table}.sql')
    job = query(sql, table, dataset=DATASET, truncate=True, **kw)
    return job.result()
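
read_sql is not shown; presumably it just returns the text of the .sql file, along these lines. The table name in the call below is a hypothetical, purely for illustration.

def read_sql(path):
    # Return the contents of a SQL file as a single string
    with open(path) as f:
        return f.read()

rows = make_table('papers')  # runs ../ai2_replication/papers.sql into DATASET.papers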