def test_create_table(cleanup_test_table, client):
    result = query(TOY_QUERY, table='test', dataset=DATASET)
    assert result.errors is None

    # Check the result
    job = client.query(TOY_QUERY)
    assert isinstance(job, bigquery.QueryJob)
    job_result = job.result()
    assert isinstance(job_result, bigquery.table.RowIterator)
    rows = [row for row in job_result]
    assert len(rows) == 2
    assert list(rows[0].keys()) == ['x', 'y']
def test_recreate_table(cleanup_test_table, client):
    """If the cleanup fixture works, creating the test table a second time
    won't raise NotFound. Keep this test below test_create_table().
    """
    # Create the test table a second time
    job_2 = query(TOY_QUERY, table='test', dataset=DATASET)
    assert job_2.state == 'DONE'
    table_2_rows = [row for row in job_2.result()]
    assert table_2_rows[0]['x'] == 1 and table_2_rows[0]['y'] == 'foo'
    table_2 = client.get_table(f'{DATASET}.test')

    # Creating the table a third time with truncate=True should replace
    # the contents of the table
    job_3 = query(ALT_TOY_QUERY, table='test', dataset=DATASET, truncate=True)
    assert job_3.state == 'DONE'
    table_3 = client.get_table(f'{DATASET}.test')
    # The table isn't recreated
    assert table_3.created == table_2.created
    # Its contents are replaced
    assert table_3.modified > table_2.created
    table_3_rows = [row for row in job_3.result()]
    assert table_3_rows[0]['x'] == 2 and table_3_rows[0]['y'] == 'baz'
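# A minimal sketch of the ``cleanup_test_table`` fixture the tests above rely
# on. This is an assumption about the project's conftest, not its actual
# implementation: it presumes ``import pytest`` at module level, that the
# ``client`` fixture yields an authenticated ``bigquery.Client``, and that
# ``DATASET`` names the test dataset.
@pytest.fixture
def cleanup_test_table(client):
    yield
    # Drop whatever table the test created; not_found_ok avoids raising
    # NotFound if the test failed before creating it.
    client.delete_table(f'{DATASET}.test', not_found_ok=True)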
def csv_file_updates(csv_file):
    """
    Summarize the deletes associated with a CSV file

    :param csv_file: path to a file where each column is a list of pids and
        the header is an observation_source_value
    :return: list of dictionaries with keys file_name,
        observation_source_value, num_pids, num_rows, q
    """
    if not os.path.exists(csv_file):
        raise IOError('File "%s" not found' % csv_file)
    obs_count_fmt = OBS_COUNT_FMT
    obs_query_fmt = OBS_QUERY_FMT
    if DEID:
        obs_count_fmt = DEID_OBS_COUNT_FMT
        obs_query_fmt = DEID_OBS_QUERY_FMT
    file_name = os.path.basename(csv_file)
    csv_df = pd.read_csv(csv_file)
    cols = csv_df.columns.tolist()
    results = list()
    for col in cols:
        person_ids = csv_df[col].dropna().apply(str).to_list()
        # Count the rows affected for this observation_source_value
        q = obs_count_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows_result = query(q)
        # The query stored in the result selects the affected rows themselves
        q = obs_query_fmt.format(PROJECT_ID=PROJECT_ID,
                                 TARGET_DATASET_ID=TARGET_DATASET_ID,
                                 COMBINED=COMBINED,
                                 OBSERVATION_SOURCE_VALUE=col,
                                 PERSON_IDS=', '.join(person_ids))
        num_rows = num_rows_result.iloc[0]['n']
        result = dict(file_name=file_name,
                      observation_source_value=col,
                      q=q,
                      num_pids=len(person_ids),
                      num_rows=num_rows)
        results.append(result)
    return results
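# Hypothetical convenience wrapper (not in the original module) showing how the
# per-column summaries returned by csv_file_updates() might be reviewed; the
# helper name and column selection are only a sketch.
def summarize_csv_file_updates(csv_file):
    updates = csv_file_updates(csv_file)
    # One row per observation_source_value column found in the CSV
    return pd.DataFrame(updates,
                        columns=['file_name', 'observation_source_value',
                                 'num_pids', 'num_rows'])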
    for schema_dictionary in res['schema']['fields']
])

# input_file = 'data/BQ_data.plk'
# try:
#     with open(input_file, 'rb') as input:
#         d = pickle.load(input)
# except IOError:
#     print("cannot open %s" % (input_file))

buf = """select * from `tommy-boy.applemusic_analytics.am_streams` where ingest_datestamp = '%s';"""
get_streaming_data = buf % date_id
res = bq.query('tommy-boy', 'applemusic_analytics', get_streaming_data,
               max_results=1000000)
streams = pd.DataFrame(data=bq.coerce_rows(fields=res['schema']['fields'],
                                           rows=res['rows']),
                       columns=[
                           schema_dictionary['name']
                           for schema_dictionary in res['schema']['fields']
                       ])

# extract list of distinct listeners
try:
    anonymized_person_id_unique = list(set(d.anonymized_person_id))
except AttributeError:
    print("data needs anonymized_person_id")
try:
def make_table(table, **kw):
    sql = read_sql(f'../ai2_replication/{table}.sql')
    job = query(sql, table, dataset=DATASET, truncate=True, **kw)
    return job.result()
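# For reference, a minimal sketch of how a helper like the ``query()`` used by
# make_table() and the tests above can be built on google-cloud-bigquery. The
# name run_query_to_table, the signature, and the mapping of ``truncate`` to
# WRITE_TRUNCATE are assumptions inferred from the call sites, not the
# project's actual implementation.
def run_query_to_table(sql, table=None, dataset=None, truncate=False, client=None):
    client = client or bigquery.Client()
    job_config = bigquery.QueryJobConfig()
    if table is not None:
        # Write the results to the named destination table
        job_config.destination = bigquery.TableReference.from_string(
            f'{client.project}.{dataset}.{table}')
        job_config.write_disposition = (bigquery.WriteDisposition.WRITE_TRUNCATE
                                        if truncate
                                        else bigquery.WriteDisposition.WRITE_EMPTY)
    job = client.query(sql, job_config=job_config)
    job.result()  # block until the job finishes
    return job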