def test_push_objects(self, mock_dataset_with_cache_dir, temp_directories):
    with aioresponses() as mocked_responses:
        sb = get_storage_backend("gigantum_object_v1")

        ds = mock_dataset_with_cache_dir[0]
        sb.set_default_configuration(ds.namespace, "abcd", '1234')

        object_dir, compressed_dir = temp_directories

        obj1_id = uuid.uuid4().hex
        obj2_id = uuid.uuid4().hex

        # Write two dummy objects to disk so there is something to push
        obj1_src_path = helper_write_object(object_dir, obj1_id, 'abcd')
        obj2_src_path = helper_write_object(object_dir, obj2_id, '1234')
        assert os.path.isfile(obj1_src_path) is True
        assert os.path.isfile(obj2_src_path) is True

        objects = [PushObject(object_path=obj1_src_path,
                              revision=ds.git.repo.head.commit.hexsha,
                              dataset_path='myfile1.txt'),
                   PushObject(object_path=obj2_src_path,
                              revision=ds.git.repo.head.commit.hexsha,
                              dataset_path='myfile2.txt')]

        # Mock the object service (which hands back a presigned URL) and the
        # subsequent S3 PUT for each object
        mocked_responses.put(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1_id}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj1_id}?params=1",
                                 "namespace": ds.namespace,
                                 "key_id": "hghghg",
                                 "obj_id": obj1_id,
                                 "dataset": ds.name
                             },
                             status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj1_id}?params=1", payload={}, status=200)

        mocked_responses.put(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2_id}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj2_id}?params=1",
                                 "namespace": ds.namespace,
                                 "key_id": "hghghg",
                                 "obj_id": obj2_id,
                                 "dataset": ds.name
                             },
                             status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj2_id}?params=1", payload={}, status=200)

        result = sb.push_objects(ds, objects, updater)

        # Both objects should have pushed successfully
        assert len(result.success) == 2
        assert len(result.failure) == 0
        assert isinstance(result, PushResult) is True
        assert isinstance(result.success[0], PushObject) is True
        assert result.success[0].object_path != result.success[1].object_path
        assert result.success[0].object_path in [obj1_src_path, obj2_src_path]
        assert result.success[1].object_path in [obj1_src_path, obj2_src_path]
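# The test above relies on two helpers defined elsewhere in this test module.
# Minimal sketches consistent with how they are used above (assumptions, not
# the real implementations):

def helper_write_object(object_dir, object_id, contents):
    """Hypothetical sketch: write a dummy object file and return its path."""
    object_path = os.path.join(object_dir, object_id)
    with open(object_path, 'wt') as f:
        f.write(contents)
    return object_path


def updater(msg) -> None:
    """Hypothetical sketch: progress callback passed through to push_objects."""
    print(msg)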
async def test_presigneds3upload_get_presigned_s3_url_skip(self, event_loop, mock_dataset_with_cache_dir):
    sb = get_storage_backend("gigantum_object_v1")
    sb.set_default_configuration("test-user", "abcd", '1234')
    ds = mock_dataset_with_cache_dir[0]

    object_id = "abcd1234"
    object_service_root = f"{sb._object_service_endpoint(ds)}/{ds.namespace}/{ds.name}"

    headers = sb._object_service_headers()
    upload_chunk_size = 40000

    object_details = PushObject(object_path=f"/tmp/{object_id}",
                                revision=ds.git.repo.head.commit.hexsha,
                                dataset_path='myfile1.txt')
    psu = PresignedS3Upload(object_service_root, headers, upload_chunk_size, object_details)

    with aioresponses() as mocked_responses:
        async with aiohttp.ClientSession() as session:
            # Object service responds with a 403, so no presigned URL should be stored
            mocked_responses.put(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{object_id}',
                                 payload={
                                     "presigned_url": "https://dummyurl.com?params=1",
                                     "key_id": "asdfasdf",
                                     "namespace": ds.namespace,
                                     "obj_id": object_id,
                                     "dataset": ds.name
                                 },
                                 status=403)

            await psu.get_presigned_s3_url(session)
            assert psu.presigned_s3_url == ""
            assert psu.skip_object is True
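# Note on the behavior asserted above: a 403 from the object service does not
# raise; `get_presigned_s3_url` leaves `presigned_s3_url` empty and flags the
# object via `skip_object`, presumably so callers can skip objects that already
# exist upstream rather than failing the whole push.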
def get_failed_objects(self) -> List[PushObject]:
    """Get the failed objects from the underlying call to `push_objects`"""
    failed_objs: List[PushObject] = list()
    if self._job_status:
        if 'failures' in self._job_status.meta:
            fail_str = self._job_status.meta['failures']
            if len(fail_str) > 0:
                # Failures are serialized as comma-separated, pipe-delimited records
                failure_data = fail_str.split(',')
                for fd in failure_data:
                    obj_path, dataset_path, revision = fd.split("|")
                    failed_objs.append(PushObject(object_path=obj_path,
                                                  dataset_path=dataset_path,
                                                  revision=revision))

    return failed_objs
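# The loop above implies `_job_status.meta['failures']` is a comma-separated
# string of pipe-delimited records in the order object_path|dataset_path|revision,
# e.g. "/objects/ab12|myfile1.txt|<hexsha>,/objects/cd34|myfile2.txt|<hexsha>".
# A minimal sketch of the matching encoder (hypothetical helper, not part of
# the source, assuming that format):

def _encode_failures(failed: List[PushObject]) -> str:
    """Hypothetical sketch: serialize failed objects into the string parsed above."""
    return ','.join(f"{o.object_path}|{o.dataset_path}|{o.revision}" for o in failed)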
def objects_to_push(self, remove_duplicates: bool = False) -> List[PushObject]:
    """Return a list of named tuples of all objects that need to be pushed

    Returns:
        List[namedtuple]
    """
    objects = list()
    if os.path.exists(self.push_dir):
        push_files = [f for f in os.listdir(self.push_dir)
                      if os.path.isfile(os.path.join(self.push_dir, f))]

        if push_files:
            object_ids: List[str] = list()
            for pf in push_files:
                if os.path.basename(pf) == '.DS_Store':
                    continue

                if not self._commit_in_branch(pf):
                    continue

                with open(os.path.join(self.push_dir, pf), 'rt') as pfh:
                    lines = pfh.readlines()

                lines = sorted(lines)
                for line in lines:
                    line = line.strip()
                    dataset_path, object_path = line.split(',')
                    _, object_id = object_path.rsplit('/', 1)

                    # Handle de-duplicating objects if the backend supports it
                    if remove_duplicates is True:
                        if object_id in object_ids:
                            continue

                        object_ids.append(object_id)

                    objects.append(PushObject(dataset_path=dataset_path,
                                              object_path=object_path,
                                              revision=pf))

    objects = natsorted(objects, key=attrgetter('dataset_path'))
    return objects
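# `objects_to_push` implies an on-disk layout where each file in `self.push_dir`
# is named for the git revision it was queued under (the filename becomes the
# PushObject `revision`) and holds one `dataset_path,object_path` pair per line.
# A minimal sketch of the writing side (hypothetical helper, not part of the
# source, assuming that layout):

def _queue_push_record(push_dir: str, revision: str, dataset_path: str, object_path: str) -> None:
    """Hypothetical sketch: append a queue record to the push file for `revision`."""
    os.makedirs(push_dir, exist_ok=True)
    with open(os.path.join(push_dir, revision), 'at') as pfh:
        pfh.write(f"{dataset_path},{object_path}\n")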