def _bq_get_data(self):
    """Fetch table data from BigQuery in batches of ``self.batch_size`` rows.

    Generator: yields one list per batch, where each element is a row
    represented as a list of cell values (in ``selected_fields`` order).
    Returns (stops iteration) once BigQuery reports no more rows.
    """
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(self.source_project_dataset_table)
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id)

    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(
            dataset_id=table_ref.dataset_id,
            table_id=table_ref.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        # The tabledata response omits the 'rows' key once every row has
        # been returned, which is our termination signal.
        if 'rows' not in response:
            self.log.info('Job Finished')
            return
        rows = response['rows']
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        # Flatten the tabledata wire format: each row dict carries an 'f'
        # list of cells, each cell a {'v': value} mapping.
        # (Fix: dropped a dead `table_data = []` assignment that was
        # immediately overwritten by this comprehension.)
        table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]
        yield table_data
        i += 1
def _table_reference(self, provided_schema_name, provided_table_name, client_project):
    """Resolve a fully-qualified TableReference.

    Project and dataset ids may arrive via the schema name, the table name,
    or fall back to ``client_project`` / ``self.dataset_id``.  Conflicting
    ids between schema and table name raise ``ValueError``.
    """
    table_project, table_dataset, table_id = self._split_table_name(provided_table_name)

    schema_project = None
    schema_dataset = None
    if provided_schema_name is not None:
        parts = provided_schema_name.split(".")
        if len(parts) == 1:
            # A lone component names the project when the table name already
            # supplied a dataset; otherwise it names the dataset.
            if table_dataset:
                schema_project = parts[0]
            else:
                schema_dataset = parts[0]
        elif len(parts) == 2:
            schema_project, schema_dataset = parts
        else:
            raise ValueError("Did not understand schema: {}".format(
                provided_schema_name))

    if schema_dataset and table_dataset and schema_dataset != table_dataset:
        raise ValueError(
            "dataset_id specified in schema and table_name disagree: "
            "got {} in schema, and {} in table_name".format(
                schema_dataset, table_dataset))
    if schema_project and table_project and schema_project != table_project:
        raise ValueError(
            "project_id specified in schema and table_name disagree: "
            "got {} in schema, and {} in table_name".format(
                schema_project, table_project))

    project_id = schema_project or table_project or client_project
    dataset_id = schema_dataset or table_dataset or self.dataset_id
    return TableReference.from_string("{}.{}.{}".format(
        project_id, dataset_id, table_id))
def execute(self, context):
    """Submit a BigQuery extract job exporting the source table to GCS."""
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(
        self.source_project_dataset_table, hook.project_id)

    extract_config = {
        'sourceTable': table_ref.to_api_repr(),
        'compression': self.compression,
        'destinationUris': self.destination_cloud_storage_uris,
        'destinationFormat': self.export_format,
    }
    configuration: Dict[str, Any] = {'extract': extract_config}
    if self.labels:
        configuration['labels'] = self.labels

    # Only set fieldDelimiter and printHeader fields if using CSV.
    # Google does not like it if you set these fields for other export
    # formats.
    if self.export_format == 'CSV':
        extract_config['fieldDelimiter'] = self.field_delimiter
        extract_config['printHeader'] = self.print_header

    hook.insert_job(configuration=configuration)
def bq_insert(rows: List):
    """Insert rows into the ``live.om_state_latencies`` BigQuery table.

    Creates the destination table with the expected schema if it does not
    already exist, then streams the rows in.

    :param rows: list of dictionaries which are representing rows
    :return: None
    """
    from google.cloud import bigquery

    if not rows:
        logging.error("no rows to upload")
        return

    bq = bigquery.Client(project=GCP_PROJECT)
    table_ref = TableReference.from_string(
        f"{GCP_PROJECT}.live.om_state_latencies")
    schema = [
        {"name": "date", "type": "DATE"},
        {"name": "sym", "type": "STRING"},
        {"name": "from_state", "type": "STRING"},
        {"name": "to_state", "type": "STRING"},
        {"name": "count", "type": "INTEGER"},
        {"name": "average", "type": "FLOAT"},
        {"name": "percentile_10", "type": "FLOAT"},
        {"name": "percentile_50", "type": "FLOAT"},
        {"name": "percentile_90", "type": "FLOAT"},
        {"name": "percentile_99", "type": "FLOAT"},
        {"name": "percentile_99_99", "type": "FLOAT"},
    ]
    table = Table(table_ref)
    table.schema = schema
    # exists_ok makes this an idempotent "ensure table" call.
    table = bq.create_table(table, exists_ok=True)

    # Lazy %-args instead of eager .format for logging.
    logging.info("inserting %d rows", len(rows))
    res = bq.insert_rows(table, rows)
    # insert_rows returns a list of per-row error payloads; previously this
    # was logged at INFO regardless, silently hiding failed inserts.
    if res:
        logging.error("insert errors: %s", res)
    else:
        logging.info(res)