def drop_table(auth, project_id, dataset_id, table_id, billing_project_id=None):
  if not billing_project_id:
    billing_project_id = project_id

  query = ('DROP TABLE `' + project_id + '.' + dataset_id + '.' + table_id + '` ')

  body = {
    'kind': 'bigquery#queryRequest',
    'query': query,
    'defaultDataset': {
      'datasetId': dataset_id,
    },
    'useLegacySql': False,
  }

  job_wait(
    auth,
    API_BigQuery(auth).jobs().query(projectId=billing_project_id, body=body).execute()
  )
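
# Usage sketch (hypothetical project/dataset/table identifiers, not part of
# this module; assumes 'user' credentials are configured):
#
#   drop_table('user', 'my-project', 'my_dataset', 'old_table')
#
# The DROP TABLE statement is billed to billing_project_id when given,
# otherwise to project_id.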
def _get_min_date_from_table(auth, project_id, dataset_id, table_id, billing_project_id=None):
  if not billing_project_id:
    billing_project_id = project_id

  query = ('SELECT MIN(Report_Day) FROM `' + project_id + '.' + dataset_id + '.' + table_id + '` ')

  body = {
    'kind': 'bigquery#queryRequest',
    'query': query,
    'defaultDataset': {
      'datasetId': dataset_id,
    },
    'useLegacySql': False,
  }

  job = API_BigQuery(auth).jobs().query(projectId=billing_project_id, body=body).execute()

  return job['rows'][0]['f'][0]['v']
def query_to_table(config, auth, project_id, dataset_id, table_id, query, disposition='WRITE_TRUNCATE', legacy=True, billing_project_id=None, target_project_id=None):
  target_project_id = target_project_id or project_id

  if not billing_project_id:
    billing_project_id = project_id

  body = {
    'configuration': {
      'query': {
        'useLegacySql': legacy,
        'query': query,
        'destinationTable': {
          'projectId': target_project_id,
          'datasetId': dataset_id,
          'tableId': table_id
        },
        'createDisposition': 'CREATE_IF_NEEDED',
        'writeDisposition': disposition,
        'allowLargeResults': True
      },
    }
  }

  job_wait(
    config,
    auth,
    API_BigQuery(config, auth).jobs().insert(projectId=billing_project_id, body=body).execute()
  )
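
# Usage sketch (hypothetical config object and identifiers, not part of this
# module):
#
#   query_to_table(
#     config, 'user', 'my-project', 'my_dataset', 'daily_rollup',
#     'SELECT * FROM `my-project.my_dataset.source`',
#     disposition='WRITE_TRUNCATE',
#     legacy=False
#   )
#
# job_wait blocks until the insert job finishes, so the destination table is
# fully written when this call returns.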
def query_to_rows(auth, project_id, dataset_id, query, row_max=None, legacy=True):

  # Create the query
  body = {
    'kind': 'bigquery#queryRequest',
    'query': query,
    'timeoutMs': 10000,
    'dryRun': False,
    'useQueryCache': True,
    'useLegacySql': legacy
  }

  if row_max:
    body['maxResults'] = row_max

  if dataset_id:
    body['defaultDataset'] = {
      'projectId': project_id,
      'datasetId': dataset_id
    }

  # wait for query to complete
  response = API_BigQuery(auth).jobs().query(projectId=project_id, body=body).execute()

  while not response['jobComplete']:
    sleep(5)
    response = API_BigQuery(auth).jobs().getQueryResults(
      projectId=project_id,
      jobId=response['jobReference']['jobId']
    ).execute(iterate=False)

  # fetch query results
  row_count = 0
  while 'rows' in response:
    converters = _build_converter_array(response.get('schema', None), None, len(response['rows'][0].get('f')))

    for row in response['rows']:
      # may break if we attempt nested reads
      yield [converters[i](next(iter(r.values()))) for i, r in enumerate(row['f'])]
      row_count += 1

    if 'pageToken' in response:
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        pageToken=response['pageToken']
      ).execute(iterate=False)
    elif row_count < int(response['totalRows']):
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        startIndex=row_count
      ).execute(iterate=False)
    else:
      break
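
# Usage sketch (hypothetical identifiers): query_to_rows is a generator, so
# rows stream page by page instead of loading the whole result into memory.
#
#   for row in query_to_rows('user', 'my-project', 'my_dataset',
#                            'SELECT name, value FROM `my_dataset.stats`',
#                            legacy=False):
#     print(row)  # each row is a list of converted cell values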
def table_to_schema(auth, project_id, dataset_id, table_id):
  if project.verbose:
    print('TABLE SCHEMA:', project_id, dataset_id, table_id)

  return API_BigQuery(auth).tables().get(
    projectId=project_id,
    datasetId=dataset_id,
    tableId=table_id
  ).execute()['schema']
def table_get(auth, project_id, dataset_id, table_id):
  return API_BigQuery(auth).tables().get(
    projectId=project_id,
    datasetId=dataset_id,
    tableId=table_id
  ).execute()
def io_to_table(auth, project_id, dataset_id, table_id, data, source_format='CSV', schema=None, skip_rows=0, disposition='WRITE_TRUNCATE', wait=True):

  # if data exists, write data to table
  data.seek(0, 2)
  if data.tell() > 0:
    data.seek(0)

    media = MediaIoBaseUpload(
      BytesIO(data.read().encode('utf8')),
      mimetype='application/octet-stream',
      resumable=True,
      chunksize=BIGQUERY_CHUNKSIZE
    )

    body = {
      'configuration': {
        'load': {
          'destinationTable': {
            'projectId': project_id,
            'datasetId': dataset_id,
            'tableId': table_id,
          },
          'sourceFormat': source_format,
          'writeDisposition': disposition,
          'autodetect': True,
          'allowJaggedRows': True,
          'allowQuotedNewlines': True,
          'ignoreUnknownValues': True,
        }
      }
    }

    if schema:
      body['configuration']['load']['schema'] = {'fields': schema}
      body['configuration']['load']['autodetect'] = False

    if source_format == 'CSV':
      body['configuration']['load']['skipLeadingRows'] = skip_rows

    job = API_BigQuery(auth).jobs().insert(
      projectId=project.id,
      body=body,
      media_body=media
    ).execute(run=False)
    execution = job.execute()

    response = None
    while response is None:
      status, response = job.next_chunk()
      if project.verbose and status:
        print('Uploaded %d%%.' % int(status.progress() * 100))
    if project.verbose:
      print('Uploaded 100%')

    if wait:
      job_wait(auth, execution)
    else:
      return execution

  # if it does not exist and write, clear the table
  elif disposition == 'WRITE_TRUNCATE':
    if project.verbose:
      print('BIGQUERY: No data, clearing table.')

    body = {
      'tableReference': {
        'projectId': project_id,
        'datasetId': dataset_id,
        'tableId': table_id
      },
      'schema': {
        'fields': schema
      }
    }

    # change project_id to be project.id, better yet project.cloud_id from JSON
    API_BigQuery(auth).tables().insert(
      projectId=project.id,
      datasetId=dataset_id,
      body=body
    ).execute()
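
# Usage sketch (hypothetical identifiers and data): this variant expects a
# text stream, which it re-encodes to bytes before the resumable upload.
#
#   import io
#   buffer = io.StringIO('name,value\na,1\nb,2\n')
#   io_to_table('user', 'my-project', 'my_dataset', 'stats', buffer,
#               source_format='CSV', skip_rows=1)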
def query_to_rows(auth, project_id, dataset_id, query, row_max=None, legacy=True, as_object=False):
  if project.verbose:
    print('BIGQUERY QUERY:', project_id, dataset_id)

  # Create the query
  body = {
    'kind': 'bigquery#queryRequest',
    'query': query,
    'timeoutMs': 10000,
    'dryRun': False,
    'useQueryCache': True,
    'useLegacySql': legacy
  }

  if row_max:
    body['maxResults'] = row_max

  if dataset_id:
    body['defaultDataset'] = {
      'projectId': project_id,
      'datasetId': dataset_id
    }

  # wait for query to complete
  response = API_BigQuery(auth).jobs().query(projectId=project_id, body=body).execute()

  while not response['jobComplete']:
    sleep(5)
    response = API_BigQuery(auth).jobs().getQueryResults(
      projectId=project_id,
      jobId=response['jobReference']['jobId']
    ).execute(iterate=False)

  # fetch query results
  schema = response.get('schema', {}).get('fields', None)

  row_count = 0
  while 'rows' in response:
    for row in response['rows']:
      yield row_to_json(row, schema, as_object)
      row_count += 1

    if 'pageToken' in response:
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        pageToken=response['pageToken']
      ).execute(iterate=False)
    elif row_count < int(response['totalRows']):
      response = API_BigQuery(auth).jobs().getQueryResults(
        projectId=project_id,
        jobId=response['jobReference']['jobId'],
        startIndex=row_count
      ).execute(iterate=False)
    else:
      break
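
# Usage sketch (hypothetical identifiers): with as_object=True each row is
# passed through row_to_json with the query schema, which is assumed here to
# yield a dict keyed by field name instead of a positional list.
#
#   for row in query_to_rows('user', 'my-project', 'my_dataset',
#                            'SELECT name, value FROM `my_dataset.stats`',
#                            legacy=False, as_object=True):
#     print(row['name'], row['value'])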
def table_to_type(auth, project_id, dataset_id, table_id):
  if project.verbose:
    print('TABLE TYPE:', project_id, dataset_id, table_id)

  return API_BigQuery(auth).tables().get(
    projectId=project_id,
    datasetId=dataset_id,
    tableId=table_id
  ).execute()['type']
def io_to_table(auth, project_id, dataset_id, table_id, data_bytes, source_format='CSV', schema=None, skip_rows=0, disposition='WRITE_TRUNCATE', wait=True):

  # if data exists, write data to table
  data_bytes.seek(0, 2)
  if data_bytes.tell() > 0:
    data_bytes.seek(0)

    media = MediaIoBaseUpload(
      data_bytes,
      mimetype='application/octet-stream',
      resumable=True,
      chunksize=BIGQUERY_CHUNKSIZE
    )

    body = {
      'configuration': {
        'load': {
          'destinationTable': {
            'projectId': project_id,
            'datasetId': dataset_id,
            'tableId': table_id,
          },
          'sourceFormat': source_format,  # CSV, NEWLINE_DELIMITED_JSON
          'writeDisposition': disposition,  # WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
          'autodetect': True,
          'allowJaggedRows': True,
          'allowQuotedNewlines': True,
          'ignoreUnknownValues': True,
        }
      }
    }

    if schema:
      body['configuration']['load']['schema'] = {'fields': schema}
      body['configuration']['load']['autodetect'] = False

    if disposition == 'WRITE_APPEND':
      body['configuration']['load']['autodetect'] = False

    if source_format == 'CSV':
      body['configuration']['load']['skipLeadingRows'] = skip_rows

    job = API_BigQuery(auth).jobs().insert(
      projectId=project.id,
      body=body,
      media_body=media
    ).execute(run=False)
    execution = job.execute()

    response = None
    while response is None:
      status, response = job.next_chunk()
      if project.verbose and status:
        print('Uploaded %d%%.' % int(status.progress() * 100))
    if project.verbose:
      print('Uploaded 100%')

    if wait:
      job_wait(auth, execution)
    else:
      return execution

  # if it does not exist and write, clear the table
  elif disposition == 'WRITE_TRUNCATE':
    if project.verbose:
      print('BIGQUERY: No data, clearing table.')
    table_create(auth, project_id, dataset_id, table_id, schema)
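
# Usage sketch (hypothetical identifiers): unlike the earlier variant, this one
# expects a bytes stream (e.g. io.BytesIO) and uploads it as-is.
#
#   import io
#   buffer = io.BytesIO(b'name,value\na,1\nb,2\n')
#   io_to_table('user', 'my-project', 'my_dataset', 'stats', buffer,
#               source_format='CSV', skip_rows=1, disposition='WRITE_APPEND')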