def process_hpo_copy(hpo_id):
    """
    Copies files from an HPO bucket to the DRC bucket

    :param hpo_id: HPO site from which to copy
    """
    try:
        project_id = app_identity.get_application_id()
        storage_client = StorageClient(project_id)

        hpo_bucket = storage_client.get_hpo_bucket(hpo_id)
        drc_private_bucket = storage_client.get_drc_bucket()

        source_bucket = storage_client.bucket(hpo_bucket)
        destination_bucket = storage_client.bucket(drc_private_bucket)

        bucket_items = list_bucket(hpo_bucket)

        ignored_items = 0
        filtered_bucket_items = []
        for item in bucket_items:
            item_root = item['name'].split('/')[0] + '/'
            if item_root.lower() in common.IGNORE_DIRECTORIES:
                ignored_items += 1
            else:
                filtered_bucket_items.append(item)

        logging.info(f"Ignoring {ignored_items} items in {hpo_bucket}")

        prefix = f'{hpo_id}/{hpo_bucket}/'

        for item in filtered_bucket_items:
            item_name = item['name']
            source_blob = source_bucket.get_blob(item_name)
            destination_blob_name = f'{prefix}{item_name}'
            source_bucket.copy_blob(source_blob, destination_bucket,
                                    destination_blob_name)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App Engine converts an env var that is set but left empty to the string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (
            f"Failed to copy files for hpo_id '{hpo_id}' due to the following "
            f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
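# Usage sketch (illustration only, not part of the module): process_hpo_copy
# copies every non-ignored object from the HPO site bucket into the DRC
# private bucket under the prefix '<hpo_id>/<hpo_bucket>/'. The hpo_id below
# is a made-up example, and a real call requires App Engine / GCP credentials.
#
#   process_hpo_copy('nyc_hpo')
#   # e.g. gs://<hpo_bucket>/person.csv ->
#   #      gs://<drc_private_bucket>/nyc_hpo/<hpo_bucket>/person.csv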
class RetractDataGcsTest(TestCase):

    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.hpo_id = test_util.FAKE_HPO_ID
        self.bucket = os.environ.get('BUCKET_NAME_FAKE')
        self.site_bucket = 'test_bucket'
        self.folder_1 = '2019-01-01-v1/'
        self.folder_2 = '2019-02-02-v2/'
        self.client = StorageClient(self.project_id)
        self.folder_prefix_1 = f'{self.hpo_id}/{self.site_bucket}/{self.folder_1}'
        self.folder_prefix_2 = f'{self.hpo_id}/{self.site_bucket}/{self.folder_2}'
        self.pids = [17, 20]
        self.skip_pids = [10, 25]
        self.project_id = 'project_id'
        self.sandbox_dataset_id = os.environ.get('UNIONED_DATASET_ID')
        self.pid_table_id = 'pid_table'
        self.gcs_bucket = self.client.bucket(self.bucket)
        self.client.empty_bucket(self.gcs_bucket)

    @patch('retraction.retract_data_gcs.extract_pids_from_table')
    @patch('gcs_utils.get_drc_bucket')
    @patch('gcs_utils.get_hpo_bucket')
    def test_integration_five_person_data_retraction_skip(
            self, mock_hpo_bucket, mock_bucket, mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.skip_pids
        lines_to_remove = {}
        expected_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # generate results files
            file_name = file_path.split('/')[-1]
            lines_to_remove[file_name] = 0
            with open(file_path, 'rb') as f:
                # skip header
                next(f)
                expected_lines_post[file_name] = []
                for line in f:
                    line = line.strip()
                    if line != b'':
                        expected_lines_post[file_name].append(line)

                # write file to cloud for testing
                blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')
                blob = self.gcs_bucket.blob(self.folder_prefix_2 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')

        rd.run_gcs_retraction(self.project_id,
                              self.sandbox_dataset_id,
                              self.pid_table_id,
                              self.hpo_id,
                              folder='all_folders',
                              force_flag=True,
                              bucket=self.bucket,
                              site_bucket=self.site_bucket)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
            actual_result_contents = blob.download_as_string().split(b'\n')
            # convert to list and remove header and last list item since it is a newline
            total_lines_post[file_name] = actual_result_contents[1:-1]

        for key in expected_lines_post:
            self.assertEqual(lines_to_remove[key], 0)
            self.assertListEqual(expected_lines_post[key],
                                 total_lines_post[key])

    @patch('retraction.retract_data_gcs.extract_pids_from_table')
    @patch('gcs_utils.get_drc_bucket')
    @patch('gcs_utils.get_hpo_bucket')
    def test_integration_five_person_data_retraction(self, mock_hpo_bucket,
                                                     mock_bucket,
                                                     mock_extract_pids):
        mock_hpo_bucket.return_value = self.site_bucket
        mock_bucket.return_value = self.bucket
        mock_extract_pids.return_value = self.pids
        expected_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            # generate results files
            file_name = file_path.split('/')[-1]
            table_name = file_name.split('.')[0]
            expected_lines_post[file_name] = []
            with open(file_path, 'rb') as f:
                # skip header
                next(f)
                expected_lines_post[file_name] = []
                for line in f:
                    line = line.strip()
                    if line != b'':
                        if not ((table_name in rd.PID_IN_COL1 and
                                 int(line.split(b",")[0]) in self.pids) or
                                (table_name in rd.PID_IN_COL2 and
                                 int(line.split(b",")[1]) in self.pids)):
                            expected_lines_post[file_name].append(line)

                # write file to cloud for testing
                blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')
                blob = self.gcs_bucket.blob(self.folder_prefix_2 + file_name)
                blob.upload_from_file(f, rewind=True, content_type='text/csv')

        rd.run_gcs_retraction(self.project_id,
                              self.sandbox_dataset_id,
                              self.pid_table_id,
                              self.hpo_id,
                              folder='all_folders',
                              force_flag=True,
                              bucket=self.bucket,
                              site_bucket=self.site_bucket)

        total_lines_post = {}
        for file_path in test_util.FIVE_PERSONS_FILES:
            file_name = file_path.split('/')[-1]
            blob = self.gcs_bucket.blob(self.folder_prefix_1 + file_name)
            actual_result_contents = blob.download_as_string().split(b'\n')
            # convert to list and remove header and last list item since it is a newline
            total_lines_post[file_name] = actual_result_contents[1:-1]

        for key in expected_lines_post:
            self.assertListEqual(expected_lines_post[key],
                                 total_lines_post[key])

    def tearDown(self):
        self.client.empty_bucket(self.gcs_bucket)
def _validation_done(bucket, folder):
    """
    Checks whether the validation marker file exists for a submission folder

    :param bucket: name of the bucket containing the submission
    :param folder: folder (prefix) of the submission
    :return: True if the processed marker blob exists, False otherwise
    """
    project_id = app_identity.get_application_id()
    storage_client = StorageClient(project_id)
    bucket = storage_client.bucket(bucket)
    return Blob(bucket=bucket,
                name=f'{folder}{common.PROCESSED_TXT}').exists(storage_client)
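# Usage sketch (illustration only, not part of the module): _validation_done
# reports whether the marker object '<folder><common.PROCESSED_TXT>' exists in
# the given bucket, i.e. whether that submission folder was already processed.
# The bucket and folder names below are made-up examples and a real call
# requires GCP credentials.
#
#   if _validation_done('nyc_bucket', '2019-01-01-v1/'):
#       logging.info('Submission already validated; skipping')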
class GcsClientTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.client = StorageClient(self.project_id)
        self.bucket_name: str = os.environ.get('BUCKET_NAME_FAKE')
        self.prefix: str = 'prefix'
        self.data: bytes = b'bytes'

        # NOTE: this needs to be in sorted order
        self.sub_prefixes: tuple = (f'{self.prefix}/a', f'{self.prefix}/b',
                                    f'{self.prefix}/c', f'{self.prefix}/d')

        self.client.empty_bucket(self.bucket_name)
        self._stage_bucket()

    def test_get_bucket_items_metadata(self):
        items_metadata: list = self.client.get_bucket_items_metadata(
            self.bucket_name)

        actual_metadata: list = [item['name'] for item in items_metadata]
        expected_metadata: list = [
            f'{prefix}/obj.txt' for prefix in self.sub_prefixes
        ]

        self.assertCountEqual(actual_metadata, expected_metadata)
        self.assertIsNotNone(items_metadata[0]['id'])

    def test_get_blob_metadata(self):
        bucket = self.client.get_bucket(self.bucket_name)
        blob_name: str = f'{self.sub_prefixes[0]}/obj.txt'
        blob = bucket.blob(blob_name)
        metadata: dict = self.client.get_blob_metadata(blob)

        self.assertIsNotNone(metadata['id'])
        self.assertIsNotNone(metadata['name'])
        self.assertIsNotNone(metadata['bucket'])
        self.assertIsNotNone(metadata['generation'])
        self.assertIsNotNone(metadata['metageneration'])
        self.assertIsNotNone(metadata['contentType'])
        self.assertIsNotNone(metadata['storageClass'])
        self.assertIsNotNone(metadata['size'])
        self.assertIsNotNone(metadata['md5Hash'])
        self.assertIsNotNone(metadata['crc32c'])
        self.assertIsNotNone(metadata['etag'])
        self.assertIsNotNone(metadata['updated'])
        self.assertIsNotNone(metadata['timeCreated'])

        self.assertEqual(metadata['name'], blob_name)
        self.assertEqual(metadata['size'], len(self.data))

    def test_empty_bucket(self):
        self.client.empty_bucket(self.bucket_name)
        items: list = self.client.list_blobs(self.bucket_name)

        # check that bucket is empty
        self.assertCountEqual(items, [])

    def test_list_sub_prefixes(self):
        items: list = self.client.list_sub_prefixes(self.bucket_name,
                                                    self.prefix)

        # Check same number of elements
        self.assertEqual(len(self.sub_prefixes), len(items))

        # Check same prefix
        for index, item in enumerate(items):
            self.assertEqual(item[:-1], self.sub_prefixes[index])

    def _stage_bucket(self):
        bucket = self.client.bucket(self.bucket_name)
        for sub_prefix in self.sub_prefixes:
            blob = bucket.blob(f'{sub_prefix}/obj.txt')
            blob.upload_from_string(self.data)

    def tearDown(self):
        self.client.empty_bucket(self.bucket_name)