Example #1
 def _gs_to_bq(self, configs):
     self._logger.debug('Starting gs to bq...')
     start_timestamp = datetime.now()
     # Launch the load jobs in batches of self._mcgj at a time.
     nb_of_batches = len(configs) // self._mcgj + 1
     for i in range(nb_of_batches):
         jobs = []
         for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
             if not self.exist_in_gs(data_name=config.data_name):
                 raise ValueError('There is no data named {} in gs'.format(
                     config.data_name))
             job_config = bigquery.job.LoadJobConfig()
             job_config.field_delimiter = self._separator
             job_config.schema = config.schema
             job_config.skip_leading_rows = 1
             job_config.write_disposition = config.write_disposition
             job = self._bq_client.load_table_from_uri(
                 source_uris=self.list_blob_uris(
                     data_name=config.data_name),
                 destination=self._dataset_ref.table(
                     table_id=config.data_name),
                 job_config=job_config)
             jobs.append(job)
         wait_for_jobs(jobs=jobs)
     # Optionally delete the now-loaded source files in Cloud Storage.
     for config in configs:
         if config.delete_in_source:
             self.delete_in_gs(data_name=config.data_name)
     end_timestamp = datetime.now()
     duration = (end_timestamp - start_timestamp).seconds
     self._logger.debug('Ended gs to bq [{}s]'.format(duration))
     return duration
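
Examples #1, #2 and #4 all block on wait_for_jobs between batches. That helper is not shown here; a minimal sketch, assuming each element is a google-cloud-bigquery job object with the standard result() method (which blocks until the job reaches a terminal state and raises if it failed):

def wait_for_jobs(jobs):
    # result() polls each job until completion and raises a
    # google.api_core exception if the job ended in an error state.
    for job in jobs:
        job.result()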
Example #2
 def _bq_to_gs(self, configs):
     self._logger.debug('Starting bq to gs...')
     start_timestamp = datetime.now()
     # Clear any previous files for these data names in Cloud Storage.
     for config in configs:
         self.delete_in_gs(data_name=config.data_name)
     # Launch the extract jobs in batches of self._mcgj at a time.
     nb_of_batches = len(configs) // self._mcgj + 1
     for i in range(nb_of_batches):
         jobs = []
         for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
             if not self.exist_in_bq(data_name=config.data_name):
                 raise ValueError('There is no data named {} in bq'.format(
                     config.data_name))
             source = self._dataset_ref.table(table_id=config.data_name)
             job_config = bigquery.job.ExtractJobConfig()
             job_config.compression = self._bq_to_gs_compression
             job_config.field_delimiter = self._separator
             destination_uri = (self._gs_dir_uri + '/' + config.data_name
                                + self._bq_to_gs_ext)
             job = self._bq_client.extract_table(
                 source=source,
                 destination_uris=destination_uri,
                 job_config=job_config)
             jobs.append(job)
         wait_for_jobs(jobs=jobs)
     for config in configs:
         if config.delete_in_source:
             self.delete_in_bq(data_name=config.data_name)
     end_timestamp = datetime.now()
     duration = (end_timestamp - start_timestamp).seconds
     self._logger.debug('Ended bq to gs [{}s]'.format(duration))
     return duration
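
For context, the same kind of extract can be run standalone. A minimal sketch, assuming google-cloud-bigquery is installed; the project, dataset, table, bucket and delimiter below are all hypothetical:

from google.cloud import bigquery

client = bigquery.Client()
dataset_ref = bigquery.DatasetReference('my-project', 'my_dataset')  # hypothetical
job_config = bigquery.job.ExtractJobConfig()
job_config.compression = bigquery.Compression.GZIP
job_config.field_delimiter = '|'
job = client.extract_table(
    source=dataset_ref.table(table_id='my_table'),
    # the wildcard lets BigQuery shard a large table across several files
    destination_uris='gs://my-bucket/my_table-*.csv.gz',
    job_config=job_config)
job.result()  # block until the extract finishes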
Example #3
def populate_dataset(table_ids=table_ids_default):
    jobs = []
    for n in table_ids:
        # One query job per table id, each writing a single dummy row.
        table_ref = dataset_ref.table(n)
        job_config = bigquery.QueryJobConfig()
        job_config.destination = table_ref
        job = bq_client.query(query="select 'data_{}' as x".format(n),
                              job_config=job_config)
        jobs.append(job)
    wait_for_jobs(jobs=jobs)
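
Example #3 relies on module-level fixtures (bq_client, dataset_ref, table_ids_default) that are not shown. A minimal sketch of that setup; the project, dataset and table ids are assumptions:

from google.cloud import bigquery

bq_client = bigquery.Client()
dataset_ref = bigquery.DatasetReference('my-project', 'test_dataset')  # hypothetical
table_ids_default = ['a{}'.format(i) for i in range(3)]  # hypothetical ids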
Example #4
 def _query_to_bq(self, configs):
     self._logger.debug('Starting query to bq...')
     start_timestamp = datetime.now()
     total_bytes_billed_list = []
     # Launch the query jobs in batches of self._mcgj at a time.
     nb_of_batches = len(configs) // self._mcgj + 1
     for i in range(nb_of_batches):
         jobs = []
         for config in configs[i * self._mcgj:(i + 1) * self._mcgj]:
             job_config = bigquery.job.QueryJobConfig()
             job_config.destination = self._dataset_ref.table(
                 table_id=config.data_name)
             job_config.write_disposition = config.write_disposition
             job = self._bq_client.query(query=config.query,
                                         job_config=job_config)
             jobs.append(job)
         wait_for_jobs(jobs=jobs)
         total_bytes_billed_list += [j.total_bytes_billed for j in jobs]
     # On-demand pricing: $5 per TB (10**12 bytes) billed.
     costs = [round(tbb / 10**12 * 5, 5) for tbb in total_bytes_billed_list]
     cost = sum(costs)
     end_timestamp = datetime.now()
     duration = (end_timestamp - start_timestamp).seconds
     self._logger.debug('Ended query to bq [{}s, {}$]'.format(
         duration, cost))
     return duration, cost, costs
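
The cost arithmetic above converts total_bytes_billed to terabytes (dividing by 10**12) and applies BigQuery's on-demand rate of $5 per TB, the list price when this code was written. A quick check for a query billed 10 GB:

total_bytes_billed = 10 * 10**9  # 10 GB billed
cost = round(total_bytes_billed / 10**12 * 5, 5)
print(cost)  # 0.05 (dollars)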
Example #5
 def _execute_bq_client_loads(self, atomic_configs):
     configs = atomic_configs
     # Launch one load job per config, then block until the whole batch is done.
     jobs = [self._launch_bq_client_job(c) for c in configs]
     utils.wait_for_jobs(jobs)
     return jobs
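
_launch_bq_client_job is not shown in Example #5. A plausible sketch, assuming each config carries a Cloud Storage source URI along with the fields used in the other examples; the helper body and the source_uri attribute are assumptions:

 def _launch_bq_client_job(self, config):
     # Start one load job without waiting for it; the caller collects
     # the jobs and waits on the whole batch at once.
     job_config = bigquery.job.LoadJobConfig()
     job_config.write_disposition = config.write_disposition
     return self._bq_client.load_table_from_uri(
         source_uris=config.source_uri,
         destination=self._dataset_ref.table(table_id=config.data_name),
         job_config=job_config)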