Exemple #1
0
    def test_python_nested_bundle_upload_download(self):
        """Round-trip the nested ``upload/data`` fixture through upload and download.

        Uploads the fixture directory, checks that the returned manifest lists
        the same object names that ``object_name_builder`` derives from the
        files on disk, then downloads the bundle into a temporary directory and
        checks that the same object names are found there.
        """
        bundle_path = os.path.join(TEST_DIR, "upload", "data")
        uploaded_paths = [x.path for x in iter_paths(str(bundle_path))]
        # Keep the expected names sorted once so both comparisons below are
        # order-independent.
        uploaded_files = sorted(
            object_name_builder(p, bundle_path) for p in uploaded_paths)
        client = hca.dss.DSSClient(
            swagger_url=
            "https://dss.dev.data.humancellatlas.org/v1/swagger.json")

        manifest = client.upload(src_dir=bundle_path,
                                 replica="aws",
                                 staging_bucket=self.staging_bucket)
        manifest_files = manifest['files']
        # sorted() returns the ordered list. list.sort() sorts in place and
        # returns None, which would reduce this check to None == None.
        self.assertEqual(
            sorted(file['name'] for file in manifest_files),
            uploaded_files)
        bundle_uuid = manifest['bundle_uuid']
        with self.subTest(bundle_uuid=bundle_uuid):
            with tempfile.TemporaryDirectory() as dest_dir:
                client.download(bundle_uuid=bundle_uuid,
                                replica='aws',
                                download_dir=dest_dir)
                downloaded_file_names = [x.path for x in iter_paths(dest_dir)]
                # NOTE(review): assumes download() places files directly under
                # dest_dir with the same relative layout as the source
                # directory -- confirm against the download implementation.
                downloaded_file_paths = sorted(
                    object_name_builder(p, dest_dir)
                    for p in downloaded_file_names)
                self.assertEqual(uploaded_files, downloaded_file_paths)
Exemple #2
0
 def test_python_open_file_limit_upload(self):
     """Upload the ``upload/many-files`` fixture under a lowered open-file limit.

     The soft ``RLIMIT_NOFILE`` limit is reduced to 30 for the duration of
     the upload and always restored afterwards. The manifest returned by the
     upload must list the same object names that ``object_name_builder``
     derives from the files on disk.
     """
     # Imported locally: the ``resource`` module is only available on Unix.
     import resource
     old_open_file_limits = resource.getrlimit(resource.RLIMIT_NOFILE)
     bundle_path = os.path.join(TEST_DIR, "upload", "many-files")
     uploaded_paths = [x.path for x in iter_paths(str(bundle_path))]
     uploaded_files = sorted(
         object_name_builder(p, bundle_path) for p in uploaded_paths)
     client = hca.dss.DSSClient(
         swagger_url=
         'https://dss.integration.data.humancellatlas.org/v1/swagger.json')
     # Lower only the soft limit; the hard limit is left as it was.
     resource.setrlimit(resource.RLIMIT_NOFILE,
                        (30, old_open_file_limits[1]))
     try:
         manifest = client.upload(src_dir=bundle_path,
                                  replica="aws",
                                  staging_bucket=self.staging_bucket)
     finally:
         # Restore the original limits whether or not the upload succeeded.
         resource.setrlimit(resource.RLIMIT_NOFILE, old_open_file_limits)

     # sorted() returns the ordered list. list.sort() returns None, which
     # would make this assertion pass unconditionally.
     self.assertEqual(
         sorted(file['name'] for file in manifest['files']),
         uploaded_files)
Exemple #3
0
    def upload(self,
               src_dir,
               replica,
               staging_bucket,
               timeout_seconds=1200,
               no_progress=False,
               bundle_uuid=None):
        """
        Upload a directory of files from the local filesystem and create a bundle containing the uploaded files.

        This method requires the use of a client-controlled object storage bucket to stage the data for upload.
        Every file found under `src_dir` is first copied to the staging bucket, then registered with the service,
        and finally all registered files are grouped into one bundle. A `RuntimeError` is raised if a file
        registration that the server accepted for asynchronous copy does not complete within `timeout_seconds`,
        or if the server returns an unexpected error while the registration is being polled.

        :param str src_dir: file path to a directory of files to upload to the replica.
        :param str replica: the replica to upload to. The supported replicas are: `aws` for Amazon Web Services, and
            `gcp` for Google Cloud Platform. [aws, gcp]
        :param str staging_bucket: a client controlled AWS S3 storage bucket to upload from.
        :param int timeout_seconds: the maximum time, in seconds, to wait for each file to become available after
            the server has accepted it for asynchronous copy.
        :param bool no_progress: if set, will not report upload progress. Note that even if this flag
                                 is not set, progress will not be reported if the logging level is higher
                                 than INFO or if the session is not interactive.
        :param str bundle_uuid: the UUID to assign to the new bundle. A random UUID is generated when omitted.
        :returns: a dict with the keys `bundle_uuid`, `creator_uid`, `replica`, `version` and `files`, where
            `files` is a list of dicts (`name`, `version`, `uuid`, `creator_uid`), one per registered file.
        """
        bundle_uuid = bundle_uuid if bundle_uuid else str(uuid.uuid4())
        version = datetime.utcnow().strftime("%Y-%m-%dT%H%M%S.%fZ")
        # Resolved once, outside the per-file loop, so that it is also defined
        # when src_dir contains no files (it is needed for put_bundle and the
        # returned manifest).
        creator_uid = self.config.get("creator_uid", 0)

        files_to_upload, files_uploaded = [], []
        try:
            for path_entry in iter_paths(src_dir):
                files_to_upload.append(open(path_entry.path, "rb"))

            logger.info("Uploading %i files from %s to %s",
                        len(files_to_upload), src_dir, staging_bucket)
            file_uuids, uploaded_keys, abs_file_paths = upload_to_cloud(
                files_to_upload,
                staging_bucket=staging_bucket,
                replica=replica,
                from_cloud=False,
                log_progress=not no_progress)
        finally:
            # Close every handle that was opened, even if opening a later
            # file or the staging upload itself failed.
            for file_handle in files_to_upload:
                file_handle.close()

        filenames = [object_name_builder(p, src_dir) for p in abs_file_paths]

        for filename, file_uuid, key in zip(filenames, file_uuids,
                                            uploaded_keys):
            # Normalise Windows separators and drop any leading slashes.
            filename = filename.replace('\\', '/').lstrip('/')
            logger.info("File %s: registering...", filename)

            # Generating file data
            source_url = "s3://{}/{}".format(staging_bucket, key)
            logger.info("File %s: registering from %s -> uuid %s", filename,
                        source_url, file_uuid)

            response = self.put_file._request(
                dict(uuid=file_uuid,
                     bundle_uuid=bundle_uuid,
                     version=version,
                     creator_uid=creator_uid,
                     source_url=source_url))
            files_uploaded.append(
                dict(name=filename,
                     version=version,
                     uuid=file_uuid,
                     creator_uid=creator_uid))

            if response.status_code in (requests.codes.ok,
                                        requests.codes.created):
                logger.info("File %s: Sync copy -> %s", filename, version)
            else:
                # NOTE(review): this assert is stripped under ``python -O``;
                # it is kept as-is so the exception type seen by callers does
                # not change.
                assert response.status_code == requests.codes.accepted
                logger.info("File %s: Async copy -> %s", filename, version)

                # Poll with capped exponential backoff until the file is
                # visible or the deadline passes.
                timeout = time.time() + timeout_seconds
                wait = 1.0
                while time.time() < timeout:
                    try:
                        # NOTE(review): the replica is hard-coded to "aws"
                        # here rather than using the `replica` argument --
                        # confirm this is intentional.
                        self.head_file(uuid=file_uuid,
                                       replica="aws",
                                       version=version)
                        break
                    except SwaggerAPIException as e:
                        # A "not found" response means the copy has not
                        # finished yet; anything else is a real failure.
                        if e.code != requests.codes.not_found:
                            msg = "File {}: Unexpected server response during registration"
                            req_id = 'X-AWS-REQUEST-ID: {}'.format(
                                response.headers.get("X-AWS-REQUEST-ID"))
                            raise RuntimeError(msg.format(filename), req_id)
                        time.sleep(wait)
                        wait = min(60.0, wait * self.UPLOAD_BACKOFF_FACTOR)
                else:
                    # timed out. :(
                    req_id = 'X-AWS-REQUEST-ID: {}'.format(
                        response.headers.get("X-AWS-REQUEST-ID"))
                    raise RuntimeError(
                        "File {}: registration FAILED".format(filename),
                        req_id)
                logger.debug("Successfully uploaded file")

        # Only files whose name ends in ".json" are flagged as indexed.
        file_args = [{
            'indexed': file_["name"].endswith(".json"),
            'name': file_['name'],
            'version': file_['version'],
            'uuid': file_['uuid']
        } for file_ in files_uploaded]

        logger.info("Bundle %s: Registering...", bundle_uuid)

        response = self.put_bundle(uuid=bundle_uuid,
                                   version=version,
                                   replica=replica,
                                   creator_uid=creator_uid,
                                   files=file_args)
        logger.info("Bundle %s: Registered successfully", bundle_uuid)

        return {
            "bundle_uuid": bundle_uuid,
            "creator_uid": creator_uid,
            "replica": replica,
            "version": response["version"],
            "files": files_uploaded
        }