def _gs_to_bq(self, configs):
    """Load data from Google Storage into BigQuery, in batches of jobs.

    Args:
        configs: sequence of load configs; each must expose ``data_name``,
            ``schema``, ``write_disposition`` and ``delete_in_source``.

    Returns:
        int: elapsed time in seconds.

    Raises:
        ValueError: if a config references data that does not exist in gs.
    """
    self._logger.debug('Starting gs to bq...')
    start_timestamp = datetime.now()
    # Ceiling division. The previous `len(configs) // self._mcgj + 1`
    # scheduled a spurious empty trailing batch whenever len(configs)
    # was an exact multiple of the batch size (and one empty batch when
    # configs was empty).
    nb_of_batches = (len(configs) + self._mcgj - 1) // self._mcgj
    for i in range(nb_of_batches):
        jobs = []
        # Launch at most self._mcgj concurrent load jobs per batch.
        for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
            if not self.exist_in_gs(data_name=config.data_name):
                raise ValueError('There is no data named {} in gs'.format(
                    config.data_name))
            job_config = bigquery.job.LoadJobConfig()
            job_config.field_delimiter = self._separator
            job_config.schema = config.schema
            # First row of each blob is a header line.
            job_config.skip_leading_rows = 1
            job_config.write_disposition = config.write_disposition
            job = self._bq_client.load_table_from_uri(
                source_uris=self.list_blob_uris(data_name=config.data_name),
                destination=self._dataset_ref.table(
                    table_id=config.data_name),
                job_config=job_config)
            jobs.append(job)
        wait_for_jobs(jobs=jobs)
    # Source blobs are deleted only after every batch has finished.
    for config in configs:
        if config.delete_in_source:
            self.delete_in_gs(data_name=config.data_name)
    end_timestamp = datetime.now()
    # NOTE(review): timedelta.seconds wraps past 24h; assumed runs are
    # shorter — use total_seconds() if that assumption ever breaks.
    duration = (end_timestamp - start_timestamp).seconds
    self._logger.debug('Ended gs to bq [{}s]'.format(duration))
    return duration
def _bq_to_gs(self, configs):
    """Extract BigQuery tables to Google Storage, in batches of jobs.

    Pre-existing blobs for each data_name are deleted first so the
    extraction output is not mixed with stale files.

    Args:
        configs: sequence of extract configs; each must expose
            ``data_name`` and ``delete_in_source``.

    Returns:
        int: elapsed time in seconds.

    Raises:
        ValueError: if a config references a table that does not exist
            in bq.
    """
    self._logger.debug('Starting bq to gs...')
    start_timestamp = datetime.now()
    for config in configs:
        self.delete_in_gs(data_name=config.data_name)
    # Ceiling division. The previous `len(configs) // self._mcgj + 1`
    # scheduled a spurious empty trailing batch whenever len(configs)
    # was an exact multiple of the batch size (and one empty batch when
    # configs was empty).
    nb_of_batches = (len(configs) + self._mcgj - 1) // self._mcgj
    for i in range(nb_of_batches):
        jobs = []
        # Launch at most self._mcgj concurrent extract jobs per batch.
        for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
            if not self.exist_in_bq(data_name=config.data_name):
                raise ValueError('There is no data named {} in bq'.format(
                    config.data_name))
            source = self._dataset_ref.table(table_id=config.data_name)
            job_config = bigquery.job.ExtractJobConfig()
            job_config.compression = self._bq_to_gs_compression
            destination_uri = (self._gs_dir_uri + '/' + config.data_name
                               + self._bq_to_gs_ext)
            job_config.field_delimiter = self._separator
            job = self._bq_client.extract_table(
                source=source,
                destination_uris=destination_uri,
                job_config=job_config)
            jobs.append(job)
        wait_for_jobs(jobs=jobs)
    # Source tables are deleted only after every batch has finished.
    for config in configs:
        if config.delete_in_source:
            self.delete_in_bq(data_name=config.data_name)
    end_timestamp = datetime.now()
    # NOTE(review): timedelta.seconds wraps past 24h; assumed runs are
    # shorter — use total_seconds() if that assumption ever breaks.
    duration = (end_timestamp - start_timestamp).seconds
    self._logger.debug('Ended bq to gs [{}s]'.format(duration))
    return duration
def populate_dataset(table_ids=table_ids_default):
    """Run one query per table id, writing each result into its table.

    Every job selects a single row (`'data_<id>' as x`) into the
    destination table referenced by *table_ids*; all jobs are launched
    first, then awaited together.
    """
    jobs = []
    for table_id in table_ids:
        config = bigquery.QueryJobConfig()
        config.destination = dataset_ref.table(table_id)
        jobs.append(bq_client.query(
            query="select 'data_{}' as x".format(table_id),
            job_config=config))
    wait_for_jobs(jobs=jobs)
def _query_to_bq(self, configs):
    """Run queries into BigQuery destination tables, in batches of jobs.

    Args:
        configs: sequence of query configs; each must expose ``query``,
            ``data_name`` and ``write_disposition``.

    Returns:
        tuple: (duration in seconds, total cost in dollars, list of
        per-job costs in config order).
    """
    self._logger.debug('Starting query to bq...')
    start_timestamp = datetime.now()
    total_bytes_billed_list = []
    # Ceiling division. The previous `len(configs) // self._mcgj + 1`
    # scheduled a spurious empty trailing batch whenever len(configs)
    # was an exact multiple of the batch size (and one empty batch when
    # configs was empty).
    nb_of_batches = (len(configs) + self._mcgj - 1) // self._mcgj
    for i in range(nb_of_batches):
        jobs = []
        # Launch at most self._mcgj concurrent query jobs per batch.
        for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
            job_config = bigquery.job.QueryJobConfig()
            job_config.destination = self._dataset_ref.table(
                table_id=config.data_name)
            job_config.write_disposition = config.write_disposition
            job = self._bq_client.query(query=config.query,
                                        job_config=job_config)
            jobs.append(job)
        wait_for_jobs(jobs=jobs)
        total_bytes_billed_list += [j.total_bytes_billed for j in jobs]
    # $5 per TB billed (10**12 bytes), rounded to 5 decimal places.
    costs = [round(tbb / 10**12 * 5, 5) for tbb in total_bytes_billed_list]
    cost = sum(costs)
    end_timestamp = datetime.now()
    # NOTE(review): timedelta.seconds wraps past 24h; assumed runs are
    # shorter — use total_seconds() if that assumption ever breaks.
    duration = (end_timestamp - start_timestamp).seconds
    self._logger.debug('Ended query to bq [{}s, {}$]'.format(
        duration, cost))
    return duration, cost, costs
def _execute_bq_client_loads(self, atomic_configs):
    """Launch one BigQuery client load job per config, wait, return jobs."""
    launched = []
    for cfg in atomic_configs:
        launched.append(self._launch_bq_client_job(cfg))
    utils.wait_for_jobs(launched)
    return launched