class DatasetBackupHandler(webapp2.RequestHandler):

    def __init__(self, request=None, response=None):
        super(DatasetBackupHandler, self).__init__(request, response)
        self.BQ = BigQuery()
        # Check whether this task is a retry of a previous one that failed for
        # some reason. If so, log when it hits the defined retry-count mark so
        # it can be caught by monitoring:
        Tasks.log_task_metadata_for(request=self.request)

    def post(self):
        project_id = self.request.get('projectId')
        dataset_id = self.request.get('datasetId')
        logging.info('Backing up dataset: ' + dataset_id)
        self.BQ.for_each_table(project_id=project_id,
                               dataset_id=dataset_id,
                               func=self.schedule_backup_task)

    # pylint: disable=R0201
    def schedule_backup_task(self, projectId, datasetId, tableId):
        logging.info("Schedule_backup_task: '%s:%s.%s'",
                     projectId, datasetId, tableId)
        task = Tasks.create(
            method='GET',
            url='/tasks/backups/table/{0}/{1}/{2}'.format(
                projectId, datasetId, tableId))
        Tasks.schedule('backup-worker', task)
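# Hedged usage sketch (assumption, not part of the source): a handler like
# DatasetBackupHandler is normally wired into a webapp2 WSGI application and
# invoked by the task queue via the '/tasks/backups/dataset' URL that
# BackupScheduler below enqueues.
app = webapp2.WSGIApplication([
    ('/tasks/backups/dataset', DatasetBackupHandler),
], debug=False)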
def test_get_dataset_cached_should_only_call_bq_once_but_response_is_cached(
        self, _):
    # given
    self._create_http.return_value = \
        self.__create_dataset_responses_with_only_one_response_for_get_dataset()

    # when
    bq = BigQuery()
    result1 = bq.get_dataset_cached('project', 'dataset')
    result2 = bq.get_dataset_cached('project', 'dataset')

    # then
    self.assertEqual(result1, result2)
class BackupScheduler(object):

    def __init__(self):
        self.big_query = BigQuery()
        self.request_correlation_id = str(uuid.uuid4())

    def iterate_over_all_datasets_and_schedule_backups(self):
        custom_project_list = configuration.backup_settings_custom_project_list
        if custom_project_list:
            project_ids = custom_project_list
            logging.info(
                'Only projects specified in the configuration will'
                ' be backed up: %s', project_ids)
        else:
            project_ids = list(self.big_query.list_project_ids())

        logging.info('Scheduling backups of %s projects', len(project_ids))
        for project_id in project_ids:
            try:
                self.__list_and_backup_datasets(project_id)
            except Exception as ex:
                error_message = 'Failed to list and backup datasets: ' + str(ex)
                ErrorReporting().report(error_message)

    def __list_and_backup_datasets(self, project_id):
        if project_id in configuration.projects_to_skip:
            logging.info('Skipping project: %s', project_id)
            return
        logging.info('Backing up project: %s, request_correlation_id: %s',
                     project_id, self.request_correlation_id)
        for dataset_id in self.big_query.list_dataset_ids(project_id):
            try:
                self.__backup_dataset(project_id, dataset_id)
            except Exception as ex:
                error_message = 'Failed to backup dataset: ' + str(ex)
                ErrorReporting().report(error_message)

    def __backup_dataset(self, project_id, dataset_id):
        logging.info('Backing up dataset: %s', dataset_id)
        task = Tasks.create(
            url='/tasks/backups/dataset',
            params={
                'projectId': project_id,
                'datasetId': dataset_id
            },
            headers={
                request_correlation_id.HEADER_NAME: self.request_correlation_id
            })
        Tasks.schedule('backup-scheduler', task)
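# Hedged usage sketch (assumption, not part of the source): the scheduler is
# typically kicked off by a cron-triggered GET request; the handler name and
# wiring below are hypothetical.
class ScheduleBackupsHandler(webapp2.RequestHandler):

    def get(self):
        BackupScheduler().iterate_over_all_datasets_and_schedule_backups()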
class TableRetention(object):

    def __init__(self, policy):
        self.big_query_service = BigQuery()
        self.policy = policy

    def perform_retention(self, table_reference, table_key):
        backups = Backup.get_all_backups_sorted(ndb.Key(urlsafe=table_key))
        logging.debug("Fetched %s backups for the table: %s", len(backups),
                      table_reference)
        if not ShouldPerformRetentionPredicate.test(backups):
            return

        logging.info("Retention policy used for table '%s': '%s'",
                     table_reference, type(self.policy).__name__)
        for backup in self.policy.get_backups_eligible_for_deletion(
                backups=backups, table_reference=table_reference):
            self.__delete_backup_in_bq_and_update_datastore(backup)

    def __delete_backup_in_bq_and_update_datastore(self, backup):
        try:
            table_reference = TableReference(configuration.backup_project_id,
                                             backup.dataset_id,
                                             backup.table_id)
            self.big_query_service.delete_table(table_reference)
            logging.debug(
                u"Table %s deleted from BigQuery. "
                u"Updating datastore. Retention policy used: '%s'",
                table_reference, type(self.policy).__name__)
            Backup.mark_backup_deleted(backup.key)
        except TableNotFoundException:
            Backup.mark_backup_deleted(backup.key)
            logging.warning(
                u"Table '%s' was not found, but the datastore entry was "
                u"updated anyway", backup.table_id)
        except HttpError as ex:
            error_message = u"Unexpected HttpError occurred while deleting " \
                            u"table '{}', error: {}: {}" \
                .format(backup.table_id, type(ex), ex)
            logging.exception(error_message)
        except Exception as ex:
            error_message = u"Could not delete backup '{}' error: {}: {}" \
                .format(backup.table_id, type(ex), ex)
            logging.exception(error_message)
def __get_table_or_partition(project_id, dataset_id, table_id, partition_id):
    table_metadata = BigQuery().get_table(
        project_id, dataset_id,
        BigQueryTableMetadata.get_table_id_with_partition_id(
            table_id, partition_id))
    return BigQueryTableMetadata(table_metadata)
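# Hedged sketch (assumption, not part of the source): BigQuery addresses a
# single partition with the '$' decorator, so a helper like
# get_table_id_with_partition_id presumably behaves along these lines:
def get_table_id_with_partition_id(table_id, partition_id):
    # e.g. ('events', '20170317') -> 'events$20170317'
    return '{}${}'.format(table_id, partition_id) if partition_id else table_id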
def test_execute_query_when_executing_long_query(self, _):
    # given
    self._create_http.return_value = self.__execute_long_query_responses()

    # when
    result = BigQuery().execute_query("SELECT * FROM tableXYZ")

    # then
    self.assertEqual(result, [
        {
            "f": [
                {"v": "a-gcp-project2"},
                {"v": "test1"}
            ]
        },
        {
            "f": [
                {"v": "a-gcp-project3"},
                {"v": "smoke_test_US"}
            ]
        }
    ])
def __init__(self):
    big_query = BigQuery()
    self.querier = SLIViewQuerier(big_query, QualityQuerySpecification())
    self.streamer = SLIResultsStreamer(table_id="SLI_backup_quality")
    self.table_newer_modification_predicate = \
        SLITableNewerModificationPredicate(big_query)
    self.table_existence_predicate = SLITableExistsPredicate(
        big_query, QualityQuerySpecification)
def test_iterating_projects(self, _):
    # given
    self._create_http.return_value = self.__create_project_list_responses()
    bq = BigQuery()

    # when
    project_ids, next_page_token = bq.list_project_ids()

    # then
    self.assertEqual(self.count(project_ids), 3)
    self.assertEqual(next_page_token, '3')

    # when (using next_page_token)
    project_ids, next_page_token = bq.list_project_ids(
        page_token=next_page_token)

    # then
    self.assertEqual(self.count(project_ids), 1)
    self.assertEqual(next_page_token, None)
def test_should_return_false_when_there_is_no_schema(self):
    # given
    sli_table = self.__create_non_partitioned_sli_table()

    # when
    exists = SLITableExistsPredicate(
        BigQuery(), LatencyQuerySpecification).exists(sli_table)

    # then
    self.assertFalse(exists)
def test_should_return_true_for_existing_partition(self):
    # given
    sli_table = self.__create_partitioned_sli_table()

    # when
    exists = SLITableExistsPredicate(
        BigQuery(), LatencyQuerySpecification).exists(sli_table)

    # then
    self.assertTrue(exists)
def test_iterating_tables(self, _):
    # given
    self._create_http.return_value = self.__create_tables_list_responses()

    # when
    tables_ids = BigQuery().list_table_ids("project1233", "dataset_id")

    # then
    self.assertEqual(self.count(tables_ids), 5)
def test_iterating_datasets(self, _):
    # given
    self._create_http.return_value = self.__create_dataset_list_responses()

    # when
    dataset_ids = BigQuery().list_dataset_ids("project123")

    # then
    self.assertEqual(self.count(dataset_ids), 3)
def test_iterating_projects(self, _):
    # given
    self._create_http.return_value = self.__create_project_list_responses()

    # when
    project_ids = BigQuery().list_project_ids()

    # then
    self.assertEqual(self.count(project_ids), 4)
def test_iterating_datasets(self, _):
    # given
    self._create_http.return_value = self.__create_dataset_list_responses()
    bq = BigQuery()

    # when
    dataset_ids, next_page_token = bq.list_dataset_ids("project123")

    # then
    self.assertEqual(self.count(dataset_ids), 2)
    self.assertEqual(next_page_token, 'FMLMpsxvgM')

    # when
    dataset_ids, next_page_token = bq.list_dataset_ids(
        "project123", page_token=next_page_token)

    # then
    self.assertEqual(self.count(dataset_ids), 1)
    self.assertEqual(next_page_token, None)
def start(table_reference):
    big_query_table_metadata = BigQueryTableMetadata.get_table_by_reference(
        table_reference)
    BackupProcess(
        table_reference=table_reference,
        big_query=BigQuery(),
        big_query_table_metadata=big_query_table_metadata,
        should_backup_predicate=OnDemandBackupPredicate()).start()
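# Hedged usage sketch (assumption, not part of the source): the on-demand
# start() entry point would be called with a TableReference built from the
# identifiers of the table to back up; the example values are hypothetical.
start(TableReference('example-project', 'example_dataset', 'example_table'))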
def test_listing_table_partitions_when_table_not_exist_should_throw_table_not_found_exception(
        self, _):
    # given
    self._create_http.return_value = \
        self.__create_table_partititions_list_responses_table_404_not_found()

    # when & then
    with self.assertRaises(TableNotFoundException):
        BigQuery().list_table_partitions("project123", "dataset123",
                                         "table123")
def test_when_dataset_not_exist_then_iterating_tables_should_not_return_any_table(
        self, _):
    # given
    self._create_http.return_value = \
        self.__create_dataset_not_found_during_tables_list_responses()

    # when
    tables_ids = BigQuery().list_table_ids("projectid", "dataset_id")

    # then
    self.assertEqual(self.count(tables_ids), 0)
def test_iterating_tables_should_retry_if_gets_http_503_response_once(
        self, func, _, _1):
    # given
    self._create_http.return_value = \
        self.__create_tables_list_responses_with_503()

    # when
    BigQuery().for_each_table("project1233", "dataset_id", func)

    # then
    self.assertEqual(5, func.call_count)
def test_iterating_tables(self, _):
    # given
    self._create_http.return_value = self.__create_tables_list_responses()
    bq = BigQuery()

    # when
    tables_ids, next_page_token = bq.list_table_ids(
        "project1233", "dataset_id")

    # then
    self.assertEqual(self.count(tables_ids), 4)
    self.assertEqual(next_page_token, 'table_id_5')

    # when
    tables_ids, next_page_token = bq.list_table_ids(
        "project1233", "dataset_id", page_token=next_page_token)

    # then
    self.assertEqual(self.count(tables_ids), 1)
    self.assertEqual(next_page_token, None)
def test_insert_job_forwarding_503_error(self, _):
    # given
    self._create_http.return_value = self.__create_503_response()

    # when
    with self.assertRaises(HttpError) as context:
        BigQuery().insert_job('project_id', {})

    # then
    self.assertEqual(context.exception.resp.status, 503)
def test_should_not_list_partitions_in_non_partitioned_table(
        self, list_table_partitions):
    # given
    sli_table = self.__create_non_partitioned_sli_table()

    # when
    exists = SLITableExistsPredicate(
        BigQuery(), LatencyQuerySpecification).exists(sli_table)

    # then
    self.assertTrue(exists)
    list_table_partitions.assert_not_called()
def __init__(self, x_days):
    self.x_days = x_days
    big_query = BigQuery()
    self.streamer = SLIResultsStreamer(
        table_id="SLI_backup_creation_latency")
    self.table_existence_predicate = SLITableExistsPredicate(
        big_query, LatencyQuerySpecification)
    self.table_recreation_predicate = SLITableRecreationPredicate(big_query)
    self.table_emptiness_predicate = SLITableEmptinessPredicate(big_query)
    self.table_has_any_backup_predicate = SLITableHasAnyBackupPredicate()
def test_should_return_false_if_backup_table_doesnt_exists(self):
    # given
    sli_table = self.__create_sli_entry_without_census_data()

    # when
    is_not_seen_by_census = SLIBackupTableNotSeenByCensusPredicate(
        BigQuery(), QualityQuerySpecification).is_not_seen_by_census(
            sli_table)

    # then
    self.assertFalse(is_not_seen_by_census)
def __init__(self, x_days):
    self.x_days = x_days
    big_query = BigQuery()
    self.querier = SLIViewQuerier(big_query,
                                  LatencyQuerySpecification(self.x_days))
    self.streamer = SLIResultsStreamer(
        table_id="SLI_backup_creation_latency")
    self.table_existence_predicate = SLITableExistsPredicate(
        big_query, LatencyQuerySpecification)
    self.table_recreation_predicate = SLITableRecreationPredicate(big_query)
def test_should_return_false_if_backup_table_havent_data_from_census_and_datastore_num_bytes_are_different_than_reality(
        self):
    # given
    sli_table = self.__create_sli_entry_without_census_data()

    # when
    is_not_seen_by_census = SLIBackupTableNotSeenByCensusPredicate(
        BigQuery(), QualityQuerySpecification).is_not_seen_by_census(
            sli_table)

    # then
    self.assertFalse(is_not_seen_by_census)
def create_the_same_empty_table(self, target_reference):
    body = {
        "tableReference": {
            "projectId": target_reference.get_project_id(),
            "datasetId": target_reference.get_dataset_id(),
            "tableId": target_reference.get_table_id(),
        },
        "timePartitioning": self.table_metadata.get("timePartitioning"),
        "schema": self.table_metadata.get("schema")
    }
    BigQuery().create_table(target_reference.get_project_id(),
                            target_reference.get_dataset_id(),
                            body)
def __schedule(source_big_query_table, target_big_query_table, job_id,
               create_disposition, write_disposition):
    logging.info("Scheduling job ID: " + job_id)
    target_project_id = target_big_query_table.get_project_id()
    job_data = {
        "jobReference": {
            "jobId": job_id,
            "projectId": target_project_id
        },
        "configuration": {
            "copy": {
                "sourceTable": {
                    "projectId": source_big_query_table.get_project_id(),
                    "datasetId": source_big_query_table.get_dataset_id(),
                    "tableId": source_big_query_table.get_table_id(),
                },
                "destinationTable": {
                    "projectId": target_project_id,
                    "datasetId": target_big_query_table.get_dataset_id(),
                    "tableId": target_big_query_table.get_table_id(),
                },
                "createDisposition": create_disposition,
                "writeDisposition": write_disposition
            }
        }
    }

    try:
        job_reference = BigQuery().insert_job(target_project_id, job_data)
        logging.info("Successfully inserted job: %s", job_reference)
        return job_reference
    except HttpError as bq_error:
        copy_job_error = BigQueryJobError(bq_error,
                                          source_big_query_table,
                                          target_big_query_table)
        if copy_job_error.is_deadline_exceeded():
            job_json = CopyJobService.__get_job(job_id, target_project_id,
                                                copy_job_error.location)
            return CopyJobService.__to_bq_job_reference(job_json)
        elif copy_job_error.should_be_retried():
            logging.warning(copy_job_error)
            return BigQueryJobReference(
                project_id=target_project_id,
                job_id=job_id,
                location=BigQueryTableMetadata.get_table_by_big_query_table(
                    source_big_query_table).get_location())
        else:
            logging.exception(copy_job_error)
            return copy_job_error
    except Exception as error:
        logging.error("%s Exception thrown during Copy Job creation: %s",
                      type(error), error)
        raise error
def test_listing_table_partitions(self, _):
    # given
    self._create_http.return_value = \
        self.__create_table_partititions_list_responses()

    # when
    partitions = BigQuery() \
        .list_table_partitions("project123", "dataset123", "table123")

    # then
    self.assertEqual(self.count(partitions), 5)
    self.assertEqual(partitions[0]['partitionId'], '20170317')
    self.assertEqual(partitions[0]['creationTime'],
                     '2017-03-17 14:32:17.755000')
    self.assertEqual(partitions[0]['lastModifiedTime'],
                     '2017-03-17 14:32:19.289000')
class OrganizationBackupScheduler(object):

    def __init__(self):
        self.big_query = BigQuery()
        self.custom_projects_list = \
            configuration.backup_settings_custom_project_list
        self.projects_to_skip = configuration.projects_to_skip

    def schedule_backup(self, page_token=None):
        if self.custom_projects_list:
            self._schedule_project_backup_scheduler_for_custom_project_list()
            return

        projects_ids_to_backup, next_page_token = \
            self.big_query.list_project_ids(page_token=page_token)

        self._schedule_project_backup_scheduler_tasks(projects_ids_to_backup)

        if next_page_token:
            logging.info(
                u'Scheduling Organisation Backup Scheduler task for '
                u'page_token: %s', next_page_token)
            Tasks.schedule(
                'backup-scheduler',
                TaskCreator.create_organisation_backup_scheduler_task(
                    page_token=next_page_token))

    def _schedule_project_backup_scheduler_tasks(self, project_ids):
        logging.info(
            u'Scheduling Project Backup Scheduler tasks for %s projects: %s',
            len(project_ids), project_ids)
        tasks = []
        for project_id in project_ids:
            if project_id not in self.projects_to_skip:
                tasks.append(
                    TaskCreator.create_project_backup_scheduler_task(
                        project_id=project_id))
            else:
                logging.info(u'Project %s is skipped.', project_id)

        Tasks.schedule('backup-scheduler', tasks)

    def _schedule_project_backup_scheduler_for_custom_project_list(self):
        logging.info(
            u'Custom project list is defined. Only projects defined in the '
            u'configuration will be scheduled for backup.')
        self._schedule_project_backup_scheduler_tasks(
            self.custom_projects_list)
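# Hedged usage sketch (assumption, not part of the source): the organisation
# scheduler pages through projects by re-enqueueing itself with
# next_page_token, so a cron entry point would only need to start the first
# page.
OrganizationBackupScheduler().schedule_backup()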