Example #1
    def upload_bundle(self, source_file, bundle_type, worksheet_uuid):
        """
        Upload |source_file| (a stream) to |worksheet_uuid|.
        """
        # Construct info for creating the bundle.
        bundle_subclass = get_bundle_subclass(bundle_type) # program or data
        metadata = metadata_util.fill_missing_metadata(bundle_subclass, {}, initial_metadata={'name': source_file.filename, 'description': 'Upload ' + source_file.filename})
        info = {'bundle_type': bundle_type, 'metadata': metadata}

        # Upload it by creating a file handle and copying source_file to it (see RemoteBundleClient.upload_bundle in the CLI).
        remote_file_uuid = self.client.open_temp_file(metadata['name'])
        try:
            with closing(RPCFileHandle(remote_file_uuid, self.client.proxy)) as dest:
                file_util.copy(source_file.file, dest, autoflush=False, print_status='Uploading %s' % metadata['name'])
           
            pack = False  # For now, always unpack (note: do this after setting remote_file_uuid, which needs the extension)
            if not pack and zip_util.path_is_archive(metadata['name']):
                metadata['name'] = zip_util.strip_archive_ext(metadata['name'])
           
            # Then tell the client that the uploaded file handle is there.
            new_bundle_uuid = self.client.finish_upload_bundle(
                [remote_file_uuid],
                not pack,  # unpack
                info,
                worksheet_uuid,
                True)  # add_to_worksheet
        except:
            self.client.finalize_file(remote_file_uuid)
            raise
        return new_bundle_uuid
Example #2
    def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet):
        """
        See local_bundle_client.py for documentation on the usage.
        Strategy:
        1) We copy the |sources| to a temporary directory on the server
          (streaming either a tar or tar.gz depending on whether compression is
          needed).
        2) We politely ask the server to finish_upload_bundle (performs a
          LocalBundleClient.upload_bundle from the temporary directory).
        """
        # URLs can be directly passed to the local client.
        if all(path_util.path_is_url(source) for source in sources):
            return self.upload_bundle_url(sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet)

        # 1) Copy sources up to the server (temporary remote zip file)
        remote_file_uuids = []
        for source in sources:
            remote_file_uuid = self.open_temp_file(zip_util.add_packed_suffix(os.path.basename(source)))
            remote_file_uuids.append(remote_file_uuid)
            dest_handle = RPCFileHandle(remote_file_uuid, self.proxy)
            if zip_util.path_is_archive(source):
                source_handle = open(source)
            else:
                source_handle = zip_util.open_packed_path(source, follow_symlinks, exclude_patterns)
                unpack = True  # We packed it, so we have to unpack it
            status = 'Uploading %s%s to %s' % (source, ' ('+info['uuid']+')' if 'uuid' in info else '', self.address)
            # FileServer does not expose an API for forcibly flushing writes, so
            # we rely on closing the file to flush it.
            file_util.copy(source_handle, dest_handle, autoflush=False, print_status=status)
            dest_handle.close()

        # 2) Install upload (this call will be in charge of deleting the temporary file).
        result = self.finish_upload_bundle(remote_file_uuids, unpack, info, worksheet_uuid, add_to_worksheet)

        return result
Example #3
def get_blob(uuid, path=''):
    """
    API to download the contents of a bundle or a subpath within a bundle.

    For directories this method always returns a tarred and gzipped archive of
    the directory.

    For files, if the request has an Accept-Encoding header containing gzip,
    then the returned file is gzipped.
    """
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    target_info = local.download_manager.get_target_info(uuid, path, 0)
    if target_info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Figure out the file name.
    if not path and bundle.metadata.name:
        filename = bundle.metadata.name
    else:
        filename = target_info['name']

    if target_info['type'] == 'directory':
        # Always tar and gzip directories.
        filename = filename + '.tar.gz'
        fileobj = local.download_manager.stream_tarred_gzipped_directory(
            uuid, path)
    elif target_info['type'] == 'file':
        if not zip_util.path_is_archive(
                filename) and request_accepts_gzip_encoding():
            # Let's gzip to save bandwidth. The browser will transparently decode
            # the file.
            filename = filename + '.gz'
            fileobj = local.download_manager.stream_file(uuid,
                                                         path,
                                                         gzipped=True)
        else:
            fileobj = local.download_manager.stream_file(uuid,
                                                         path,
                                                         gzipped=False)
    else:
        # Symlinks.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')

    # Set headers.
    mimetype, _ = mimetypes.guess_type(filename, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(
            filename) == '.gz' and request_accepts_gzip_encoding():
        filename = zip_util.strip_archive_ext(filename)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % filename)

    return fileobj
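
The gzip branch above relies on request_accepts_gzip_encoding(), which is not shown on this page. A minimal sketch of how such a helper could look, assuming a Bottle-style request object (CodaLab's actual header parsing may differ, e.g. in how it treats quality values):

from bottle import request

def request_accepts_gzip_encoding():
    # True if the client advertised gzip support in its Accept-Encoding header.
    # Quality values such as "gzip;q=0" are ignored in this sketch.
    accept_encoding = request.headers.get('Accept-Encoding', '')
    offered = [token.strip().split(';')[0] for token in accept_encoding.split(',')]
    return 'gzip' in offered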
Example #4
def get_blob(uuid, path=''):
    """
    API to download the contents of a bundle or a subpath within a bundle.

    For directories this method always returns a tarred and gzipped archive of
    the directory.

    For files, if the request has an Accept-Encoding header containing gzip,
    then the returned file is gzipped.
    """
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    target_info = local.download_manager.get_target_info(uuid, path, 0)
    if target_info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Figure out the file name.
    if not path and bundle.metadata.name:
        filename = bundle.metadata.name
    else:
        filename = target_info['name']

    if target_info['type'] == 'directory':
        # Always tar and gzip directories.
        filename = filename + '.tar.gz'
        fileobj = local.download_manager.stream_tarred_gzipped_directory(uuid, path)
    elif target_info['type'] == 'file':
        if not zip_util.path_is_archive(filename) and request_accepts_gzip_encoding():
            # Let's gzip to save bandwidth. The browser will transparently decode
            # the file.
            filename = filename + '.gz'
            fileobj = local.download_manager.stream_file(uuid, path, gzipped=True)
        else:
            fileobj = local.download_manager.stream_file(uuid, path, gzipped=False)
    else:
        # Symlinks.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')
    
    # Set headers.
    mimetype, _ = mimetypes.guess_type(filename, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(filename) == '.gz' and request_accepts_gzip_encoding():
        filename = zip_util.strip_archive_ext(filename)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % filename)

    return fileobj
Example #5
    def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet):
        """
        See local_bundle_client.py for documentation on the usage.
        Strategy:
        1) We copy the |sources| to a temporary directory on the server
          (streaming either a tar or tar.gz depending on whether compression is
          needed).
        2) We politely ask the server to finish_upload_bundle (performs a
          LocalBundleClient.upload_bundle from the temporary directory).
        """
        # URLs can be directly passed to the local client.
        if all(path_util.path_is_url(source) for source in sources):
            return self.upload_bundle_url(sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet)

        remote_file_uuids = []
        try:
            # 1) Copy sources up to the server (temporary remote zip file)
            for source in sources:
                if zip_util.path_is_archive(source):
                    source_handle = open(source)
                    temp_file_name = os.path.basename(source)
                elif os.path.isdir(source):
                    source_handle = tar_gzip_directory(source, follow_symlinks, exclude_patterns)
                    temp_file_name = os.path.basename(source) + '.tar.gz'
                    unpack = True  # We packed it, so we have to unpack it
                else:
                    resolved_source = source
                    if follow_symlinks:
                        resolved_source = os.path.realpath(source)
                        if not os.path.exists(resolved_source):
                            raise UsageError('Broken symlink')
                    elif os.path.islink(source):
                        raise UsageError('Not following symlinks.')
                    source_handle = gzip_file(resolved_source)
                    temp_file_name = os.path.basename(source) + '.gz'
                    unpack = True  # We packed it, so we have to unpack it

                remote_file_uuid = self.open_temp_file(temp_file_name)
                remote_file_uuids.append(remote_file_uuid)
                with closing(RPCFileHandle(remote_file_uuid, self.proxy)) as dest_handle:
                    status = 'Uploading %s%s to %s' % (source, ' ('+info['uuid']+')' if 'uuid' in info else '', self.address)
                    file_util.copy(source_handle, dest_handle, autoflush=False, print_status=status)

            # 2) Install upload (this call will be in charge of deleting the temporary file).
            return self.finish_upload_bundle(remote_file_uuids, unpack, info, worksheet_uuid, add_to_worksheet)
        except:
            for remote_file_uuid in remote_file_uuids:
                self.finalize_file(remote_file_uuid)
            raise
Example #6
    def upload_to_bundle_store(self, bundle: Bundle, source: Source, git: bool,
                               unpack: bool):
        """Uploads the given source to the bundle store.
        The arguments are the same as UploadManager.upload_to_bundle_store().
        Used when uploading from the REST server."""
        try:
            # bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
            is_url, is_fileobj, filename = self._interpret_source(source)
            if is_url:
                assert isinstance(source, str)
                if git:
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=True)
                    self.write_git_repo(source, bundle_path)
                else:
                    # If downloading from a URL, convert the source to a file object.
                    is_fileobj = True
                    source = (filename, urlopen_with_retry(source))
            if is_fileobj:
                source_filename, source_fileobj = cast(Tuple[str, IO[bytes]],
                                                       source)
                source_ext = zip_util.get_archive_ext(source_filename)
                if unpack and zip_util.path_is_archive(filename):
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=source_ext in ARCHIVE_EXTS_DIR)
                    self.write_fileobj(source_ext,
                                       source_fileobj,
                                       bundle_path,
                                       unpack_archive=True)
                else:
                    bundle_path = self._update_and_get_bundle_location(
                        bundle, is_directory=False)
                    self.write_fileobj(source_ext,
                                       source_fileobj,
                                       bundle_path,
                                       unpack_archive=False)

        except UsageError:
            if FileSystems.exists(bundle_path):
                path_util.remove(bundle_path)
            raise
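
The _interpret_source helper used above is not included in this snippet; in this variant it apparently returns an (is_url, is_fileobj, filename) triple (other examples on this page use a four-value variant that also detects local paths). A rough, hypothetical sketch under the assumption that |source| is either a URL string or a (filename, file-like object) tuple:

import os

def _interpret_source(source):
    # Hypothetical sketch, not CodaLab's implementation: classify |source| as a
    # URL string or a (filename, fileobj) pair and derive a display filename.
    is_url, is_fileobj = False, False
    if isinstance(source, str) and (source.startswith('http://') or source.startswith('https://')):
        is_url = True
        filename = os.path.basename(source.rstrip('/'))
    else:
        is_fileobj = True
        filename = source[0]  # (filename, fileobj) tuple
    return is_url, is_fileobj, filename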
Example #7
    def upload_bundle(self, source_file, bundle_type, worksheet_uuid):
        """
        Upload |source_file| (a stream) to |worksheet_uuid|.
        """
        # Construct info for creating the bundle.
        bundle_subclass = get_bundle_subclass(bundle_type)  # program or data
        metadata = metadata_util.fill_missing_metadata(
            bundle_subclass, {},
            initial_metadata={
                'name': source_file.filename,
                'description': 'Upload ' + source_file.filename
            })
        info = {'bundle_type': bundle_type, 'metadata': metadata}

        # Upload it by creating a file handle and copying source_file to it (see RemoteBundleClient.upload_bundle in the CLI).
        remote_file_uuid = self.client.open_temp_file(metadata['name'])
        try:
            with closing(RPCFileHandle(remote_file_uuid,
                                       self.client.proxy)) as dest:
                file_util.copy(source_file.file,
                               dest,
                               autoflush=False,
                               print_status='Uploading %s' % metadata['name'])

            pack = False  # For now, always unpack (note: do this after setting remote_file_uuid, which needs the extension)
            if not pack and zip_util.path_is_archive(metadata['name']):
                metadata['name'] = zip_util.strip_archive_ext(metadata['name'])

            # Then tell the client that the uploaded file handle is there.
            new_bundle_uuid = self.client.finish_upload_bundle(
                [remote_file_uuid],
                not pack,  # unpack
                info,
                worksheet_uuid,
                True)  # add_to_worksheet
        except:
            self.client.finalize_file(remote_file_uuid)
            raise
        return new_bundle_uuid
Example #8
    def _can_unpack_file(self, path):
        return os.path.isfile(path) and zip_util.path_is_archive(path)
Example #9
    def upload_to_bundle_store(self, bundle, sources, follow_symlinks, exclude_patterns, remove_sources, git, unpack, simplify_archives):
        """
        Uploads contents for the given bundle to the bundle store.

        |sources|: specifies the locations of the contents to upload. Each element is
                   either a URL, a local path or a tuple (filename, file-like object).
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks,
                           but only if remove_sources is False.
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o),
                            but only if remove_sources is False.
        |remove_sources|: for local path(s), whether |sources| should be removed
        |git|: for URLs, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |simplify_archives|: whether to simplify unpacked archives so that if they
                             contain a single file, the final path is just that file,
                             not a directory containing that file.
    
        If |sources| contains one source, then the bundle contents will be that source.
        Otherwise, the bundle contents will be a directory with each of the sources.
        Exceptions:
        - If |git|, then each source is replaced with the result of running 'git clone |source|'
        - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.
        """
        bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
        try:
            path_util.make_directory(bundle_path)
            # Note that for uploads with a single source, the directory
            # structure is simplified at the end.
            for source in sources:
                is_url, is_local_path, is_fileobj, filename = self._interpret_source(source)
                source_output_path = os.path.join(bundle_path, filename)
    
                if is_url:
                    if git:
                        source_output_path = file_util.strip_git_ext(source_output_path)
                        file_util.git_clone(source, source_output_path)
                    else:
                        file_util.download_url(source, source_output_path)
                        if unpack and self._can_unpack_file(source_output_path):
                            self._unpack_file(
                                source_output_path, zip_util.strip_archive_ext(source_output_path),
                                remove_source=True, simplify_archive=simplify_archives)
                elif is_local_path:
                    source_path = path_util.normalize(source)
                    path_util.check_isvalid(source_path, 'upload')
                    
                    if unpack and self._can_unpack_file(source_path):
                        self._unpack_file(
                            source_path, zip_util.strip_archive_ext(source_output_path),
                            remove_source=remove_sources, simplify_archive=simplify_archives)
                    elif remove_sources:
                        path_util.rename(source_path, source_output_path)
                    else:
                        path_util.copy(source_path, source_output_path, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
                elif is_fileobj:
                    if unpack and zip_util.path_is_archive(filename):
                        self._unpack_fileobj(
                            source[0], source[1],
                            zip_util.strip_archive_ext(source_output_path),
                            simplify_archive=simplify_archives)
                    else:
                        with open(source_output_path, 'wb') as out:
                            shutil.copyfileobj(source[1], out)

            if len(sources) == 1:
                self._simplify_directory(bundle_path)
        except:
            if os.path.exists(bundle_path):
                path_util.remove(bundle_path)
            raise
Example #10
    def upload_to_bundle_store(
        self,
        bundle,
        sources,
        follow_symlinks,
        exclude_patterns,
        remove_sources,
        git,
        unpack,
        simplify_archives,
    ):
        """
        Uploads contents for the given bundle to the bundle store.

        |sources|: specifies the locations of the contents to upload. Each element is
                   either a URL, a local path or a tuple (filename, binary file-like object).
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks,
                           but only if remove_sources is False.
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o),
                            but only if remove_sources is False.
        |remove_sources|: for local path(s), whether |sources| should be removed
        |git|: for URLs, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |simplify_archives|: whether to simplify unpacked archives so that if they
                             contain a single file, the final path is just that file,
                             not a directory containing that file.

        If |sources| contains one source, then the bundle contents will be that source.
        Otherwise, the bundle contents will be a directory with each of the sources.
        Exceptions:
        - If |git|, then each source is replaced with the result of running 'git clone |source|'
        - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.
        """
        exclude_patterns = (self._default_exclude_patterns +
                            exclude_patterns if exclude_patterns else
                            self._default_exclude_patterns)
        bundle_link_url = getattr(bundle.metadata, "link_url", None)
        if bundle_link_url:
            # Don't do anything for linked bundles.
            return
        bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
        try:
            path_util.make_directory(bundle_path)
            # Note that for uploads with a single source, the directory
            # structure is simplified at the end.
            for source in sources:
                is_url, is_local_path, is_fileobj, filename = self._interpret_source(
                    source)
                source_output_path = os.path.join(bundle_path, filename)
                if is_url:
                    if git:
                        source_output_path = file_util.strip_git_ext(
                            source_output_path)
                        file_util.git_clone(source, source_output_path)
                    else:
                        file_util.download_url(source, source_output_path)
                        if unpack and self._can_unpack_file(
                                source_output_path):
                            self._unpack_file(
                                source_output_path,
                                zip_util.strip_archive_ext(source_output_path),
                                remove_source=True,
                                simplify_archive=simplify_archives,
                            )
                elif is_local_path:
                    source_path = path_util.normalize(source)
                    path_util.check_isvalid(source_path, 'upload')

                    if unpack and self._can_unpack_file(source_path):
                        self._unpack_file(
                            source_path,
                            zip_util.strip_archive_ext(source_output_path),
                            remove_source=remove_sources,
                            simplify_archive=simplify_archives,
                        )
                    elif remove_sources:
                        path_util.rename(source_path, source_output_path)
                    else:
                        path_util.copy(
                            source_path,
                            source_output_path,
                            follow_symlinks=follow_symlinks,
                            exclude_patterns=exclude_patterns,
                        )
                elif is_fileobj:
                    if unpack and zip_util.path_is_archive(filename):
                        self._unpack_fileobj(
                            source[0],
                            source[1],
                            zip_util.strip_archive_ext(source_output_path),
                            simplify_archive=simplify_archives,
                        )
                    else:
                        with open(source_output_path, 'wb') as out:
                            shutil.copyfileobj(source[1], out)

            if len(sources) == 1:
                self._simplify_directory(bundle_path)
        except:
            if os.path.exists(bundle_path):
                path_util.remove(bundle_path)
            raise
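
For reference, a hypothetical call to this method that uploads a single local archive and lets the upload manager unpack and simplify it (the upload_manager and bundle objects are assumed to exist already; the argument values simply mirror the docstring above):

upload_manager.upload_to_bundle_store(
    bundle,
    sources=['/tmp/experiment-results.tar.gz'],  # one local archive
    follow_symlinks=False,
    exclude_patterns=['*.pyc'],
    remove_sources=False,
    git=False,
    unpack=True,               # the archive is unpacked into the bundle
    simplify_archives=True,    # a single-file archive becomes just that file
)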
Example #11
    def _can_unpack_file(self, path):
        return os.path.isfile(path) and zip_util.path_is_archive(path)
Example #12
    def test_local_file_archive(self):
        """This local path should be categorized as an archive."""
        path = "/tmp/file.tar.gz"
        self.assertEqual(path_is_archive(path), True)
        self.assertEqual(get_archive_ext(path), ".tar.gz")
        self.assertEqual(strip_archive_ext(path), "/tmp/file")
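
The tests above exercise three zip_util helpers whose definitions are not included on this page. A plausible minimal implementation is plain suffix matching against a fixed list of archive extensions; the exact extension list and error behavior in CodaLab's zip_util may differ (the real module also distinguishes extensions that unpack to directories, e.g. the ARCHIVE_EXTS_DIR used in other examples here):

# Sketch of the helpers exercised by the tests above; assumed, not copied from zip_util.
ARCHIVE_EXTS = ['.tar.gz', '.tgz', '.tar.bz2', '.zip', '.gz']

def get_archive_ext(path):
    # Return the matching archive extension, or '' if there is none.
    for ext in ARCHIVE_EXTS:
        if path.endswith(ext):
            return ext
    return ''

def path_is_archive(path):
    return get_archive_ext(path) != ''

def strip_archive_ext(path):
    ext = get_archive_ext(path)
    return path[:-len(ext)] if ext else path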
Example #13
    def upload_to_bundle_store(
        self,
        bundle: Dict,
        packed_source: Dict,
        use_azure_blob_beta: bool,
        destination_bundle_store=None,
    ):
        """
        Bypass server upload. Upload from client directly to different blob storage (Azure, GCS, Disk storage).
        Bypass server uploading is used in following situations:
        # 1. The server set CODALAB_DEFAULT_BUNDLE_STORE_NAME
        # 2. If the user specify `--store` and blob storage is on Azure
        """

        need_bypass = True
        bundle_store_uuid = None
        # 1) Read destination store from --store if user has specified it
        if destination_bundle_store is not None and destination_bundle_store != '':
            storage_info = self._client.fetch_one(
                'bundle_stores',
                params={
                    'name': destination_bundle_store,
                    'include': ['uuid', 'storage_type', 'url'],
                },
            )
            bundle_store_uuid = storage_info['uuid']
            if storage_info['storage_type'] in (
                    StorageType.DISK_STORAGE.value, ):
                need_bypass = False  # The user specified --store to upload to disk storage

        # 2) Pack the files to be uploaded
        source_ext = zip_util.get_archive_ext(packed_source['filename'])
        if packed_source['should_unpack'] and zip_util.path_is_archive(
                packed_source['filename']):
            unpack_before_upload = True
            is_dir = source_ext in zip_util.ARCHIVE_EXTS_DIR
        else:
            unpack_before_upload = False
            is_dir = False

        # 3) Create a bundle location for the bundle
        params = {'need_bypass': need_bypass, 'is_dir': is_dir}
        data = self._client.add_bundle_location(bundle['id'],
                                                bundle_store_uuid,
                                                params)[0].get('attributes')

        # 4) If bundle location has bundle_conn_str, we should bypass the server when uploading.
        if data.get('bundle_conn_str', None) is not None:
            # Mimic the REST server behavior:
            # decide the bundle type (file/directory) and whether it needs to be unpacked
            bundle_conn_str = data.get('bundle_conn_str')
            index_conn_str = data.get('index_conn_str')
            bundle_url = data.get('bundle_url')
            bundle_read_str = data.get('bundle_read_url', bundle_url)
            try:
                progress = FileTransferProgress('Sent ', f=self.stderr)
                with closing(packed_source['fileobj']), progress:
                    self.upload_Azure_blob_storage(
                        fileobj=packed_source['fileobj'],
                        bundle_url=bundle_url,
                        bundle_conn_str=bundle_conn_str,
                        bundle_read_str=bundle_read_str,
                        index_conn_str=index_conn_str,
                        source_ext=source_ext,
                        should_unpack=unpack_before_upload,
                        progress_callback=progress.update,
                    )
            except Exception as err:
                self._client.update_bundle_state(
                    bundle['id'],
                    params={
                        'success': False,
                        'error_msg': f'Bypass server upload error. {err}',
                    },
                )
                raise err
            else:
                self._client.update_bundle_state(bundle['id'],
                                                 params={'success': True})
        else:
            # 5) Otherwise, upload the bundle directly through the server.
            progress = FileTransferProgress('Sent ',
                                            packed_source['filesize'],
                                            f=self.stderr)
            with closing(packed_source['fileobj']), progress:
                self._client.upload_contents_blob(
                    bundle['id'],
                    fileobj=packed_source['fileobj'],
                    params={
                        'filename': packed_source['filename'],
                        'unpack': packed_source['should_unpack'],
                        'state_on_success': State.READY,
                        'finalize_on_success': True,
                        'use_azure_blob_beta': use_azure_blob_beta,
                        'store': destination_bundle_store or '',
                    },
                    progress_callback=progress.update,
                )
Example #14
    def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git,
                      unpack, remove_sources, info, worksheet_uuid,
                      add_to_worksheet):
        """
        See local_bundle_client.py for documentation on the usage.
        Strategy:
        1) We copy the |sources| to a temporary directory on the server
          (streaming either a tar or tar.gz depending on whether compression is
          needed).
        2) We politely ask the server to finish_upload_bundle (performs a
          LocalBundleClient.upload_bundle from the temporary directory).
        """
        # URLs can be directly passed to the local client.
        if all(path_util.path_is_url(source) for source in sources):
            return self.upload_bundle_url(sources, follow_symlinks,
                                          exclude_patterns, git, unpack,
                                          remove_sources, info, worksheet_uuid,
                                          add_to_worksheet)

        remote_file_uuids = []
        try:
            # 1) Copy sources up to the server (temporary remote zip file)
            for source in sources:
                if zip_util.path_is_archive(source):
                    source_handle = open(source)
                    temp_file_name = os.path.basename(source)
                elif os.path.isdir(source):
                    source_handle = tar_gzip_directory(source, follow_symlinks,
                                                       exclude_patterns)
                    temp_file_name = os.path.basename(source) + '.tar.gz'
                    unpack = True  # We packed it, so we have to unpack it
                else:
                    resolved_source = source
                    if follow_symlinks:
                        resolved_source = os.path.realpath(source)
                        if not os.path.exists(resolved_source):
                            raise UsageError('Broken symlink')
                    elif os.path.islink(source):
                        raise UsageError('Not following symlinks.')
                    source_handle = gzip_file(resolved_source)
                    temp_file_name = os.path.basename(source) + '.gz'
                    unpack = True  # We packed it, so we have to unpack it

                remote_file_uuid = self.open_temp_file(temp_file_name)
                remote_file_uuids.append(remote_file_uuid)
                with closing(RPCFileHandle(remote_file_uuid,
                                           self.proxy)) as dest_handle:
                    status = 'Uploading %s%s to %s' % (
                        source, ' (' + info['uuid'] +
                        ')' if 'uuid' in info else '', self.address)
                    file_util.copy(source_handle,
                                   dest_handle,
                                   autoflush=False,
                                   print_status=status)

            # 2) Install upload (this call will be in charge of deleting the temporary file).
            return self.finish_upload_bundle(remote_file_uuids, unpack, info,
                                             worksheet_uuid, add_to_worksheet)
        except:
            for remote_file_uuid in remote_file_uuids:
                self.finalize_file(remote_file_uuid)
            raise
Example #15
def _fetch_bundle_contents_blob(uuid, path=''):
    """
    API to download the contents of a bundle or a subpath within a bundle.

    For directories this method always returns a tarred and gzipped archive of
    the directory.

    For files, if the request has an Accept-Encoding header containing gzip,
    then the returned file is gzipped.
    """
    byte_range = get_request_range()
    head_lines = query_get_type(int, 'head', default=0)
    tail_lines = query_get_type(int, 'tail', default=0)
    max_line_length = query_get_type(int, 'max_line_length', default=128)
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    target_info = local.download_manager.get_target_info(uuid, path, 0)
    if target_info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Figure out the file name.
    if not path and bundle.metadata.name:
        filename = bundle.metadata.name
    else:
        filename = target_info['name']

    if target_info['type'] == 'directory':
        if byte_range:
            abort(httplib.BAD_REQUEST,
                  'Range not supported for directory blobs.')
        if head_lines:
            abort(httplib.BAD_REQUEST,
                  'Head not supported for directory blobs.')
        # Always tar and gzip directories.
        filename = filename + '.tar.gz'
        fileobj = local.download_manager.stream_tarred_gzipped_directory(
            uuid, path)
    elif target_info['type'] == 'file':
        gzipped = False
        if not zip_util.path_is_archive(
                filename) and request_accepts_gzip_encoding():
            # Let's gzip to save bandwidth. The browser will transparently decode
            # the file.
            filename = filename + '.gz'
            gzipped = True

        if byte_range and (head_lines or tail_lines):
            abort(httplib.BAD_REQUEST,
                  'Head and range not supported on the same request.')
        elif byte_range:
            start, end = byte_range
            fileobj = local.download_manager.read_file_section(
                uuid, path, start, end - start + 1, gzipped)
        elif head_lines or tail_lines:
            fileobj = local.download_manager.summarize_file(
                uuid, path, head_lines, tail_lines, max_line_length, None,
                gzipped)
        else:
            fileobj = local.download_manager.stream_file(uuid, path, gzipped)
    else:
        # Symlinks.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')

    # Set headers.
    mimetype, _ = mimetypes.guess_type(filename, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(
            filename) == '.gz' and request_accepts_gzip_encoding():
        filename = zip_util.strip_archive_ext(filename)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % filename)

    return fileobj
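
From the client side, the gzip and byte-range behavior above maps onto ordinary HTTP headers. A hypothetical requests call against such an endpoint (the URL path and bundle uuid are placeholders and may not match the server's real route):

import requests

url = 'https://worksheets.example.org/rest/bundles/0x1234/contents/blob/stdout'  # hypothetical route

response = requests.get(
    url,
    headers={
        'Accept-Encoding': 'gzip',   # lets the server gzip plain files in transit
        'Range': 'bytes=0-1023',     # request only the first KiB
    },
    stream=True,
)
with open('stdout.partial', 'wb') as out:
    for chunk in response.iter_content(chunk_size=8192):
        out.write(chunk)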
Example #16
    def test_url_archive(self):
        """This URL should be categorized as an archive."""
        path = "https://codalab.org/file.tar.gz"
        self.assertEqual(path_is_archive(path), True)
        self.assertEqual(get_archive_ext(path), ".tar.gz")
        self.assertEqual(strip_archive_ext(path), "https://codalab.org/file")
Example #17
    def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources):
        '''
        |sources|: specifies the locations of the contents to upload.  Each element is either a URL or a local path.
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o)
        |git|: for URL, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |remove_sources|: remove |sources|.

        If |sources| contains one source, then the bundle contents will be that source.
        Otherwise, the bundle contents will be a directory with each of the sources.
        Exceptions:
        - If |git|, then each source is replaced with the result of running 'git clone |source|'
        - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.

        Install the contents of the directory at |source| into
        DATA_SUBDIRECTORY in a subdirectory named by a hash of the contents.

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.
        '''
        to_delete = []

        # Create temporary directory as a staging area and put everything there.
        temp_path = tempfile.mkdtemp('-bundle_store_upload')
        temp_subpaths = []
        for source in sources:
            # Where to save |source| to (might change this value if we unpack).
            temp_subpath = os.path.join(temp_path, os.path.basename(source))
            if remove_sources:
                to_delete.append(source)
            source_unpack = unpack and zip_util.path_is_archive(source)

            if path_util.path_is_url(source):
                # Download the URL.
                print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, temp_path))
                if git:
                    file_util.git_clone(source, temp_subpath)
                else:
                    file_util.download_url(source, temp_subpath, print_status=True)
                    if source_unpack:
                        zip_util.unpack(temp_subpath, zip_util.strip_archive_ext(temp_subpath))
                        path_util.remove(temp_subpath)
                        temp_subpath = zip_util.strip_archive_ext(temp_subpath)
                print_util.clear_line()
            else:
                # Copy the local path.
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')

                # Recursively copy the directory into a new BundleStore temp directory.
                print_util.open_line('BundleStore.upload: %s => %s' % (source_path, temp_subpath))
                if source_unpack:
                    zip_util.unpack(source_path, zip_util.strip_archive_ext(temp_subpath))
                    temp_subpath = zip_util.strip_archive_ext(temp_subpath)
                else:
                    if remove_sources:
                        path_util.rename(source_path, temp_subpath)
                    else:
                        path_util.copy(source_path, temp_subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
                print_util.clear_line()

            temp_subpaths.append(temp_subpath)

        # If exactly one source, then upload that directly.
        if len(temp_subpaths) == 1:
            to_delete.append(temp_path)
            temp_path = temp_subpaths[0]

        # Multiplex between uploading a directory and uploading a file here.
        # All other path_util calls will use these lists of directories and files.
        if os.path.isdir(temp_path):
            dirs_and_files = path_util.recursive_ls(temp_path)
        else:
            dirs_and_files = ([], [temp_path])

        # Hash the contents of the temporary directory, and then if there is no
        # data with this hash value, move this directory into the data directory.
        print_util.open_line('BundleStore.upload: hashing %s' % temp_path)
        data_hash = '0x%s' % (path_util.hash_directory(temp_path, dirs_and_files),)
        print_util.clear_line()
        print_util.open_line('BundleStore.upload: computing size of %s' % temp_path)
        data_size = path_util.get_size(temp_path, dirs_and_files)
        print_util.clear_line()
        final_path = os.path.join(self.data, data_hash)
        if os.path.exists(final_path):
            # Already exists, just delete it
            path_util.remove(temp_path)
        else:
            print >>sys.stderr, 'BundleStore.upload: moving %s to %s' % (temp_path, final_path)
            path_util.rename(temp_path, final_path)

        # Delete paths.
        for path in to_delete:
            if os.path.exists(path):
                path_util.remove(path)

        # After this operation there should always be a directory at the final path.
        assert(os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
        return (data_hash, {'data_size': data_size})
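
The dedup step above hashes the staged tree with path_util.hash_directory, which takes a precomputed directory listing and is not shown here. One way such a content hash could be computed, assuming only relative paths and file contents matter (the real helper may also fold in permissions, symlink targets, or the listing it is given):

import hashlib
import os

def hash_directory(root):
    # Sketch: hash relative paths and file contents in a deterministic order so
    # that identical trees always produce the same digest.
    digest = hashlib.sha1()
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # make traversal order deterministic
        for name in sorted(filenames):
            full = os.path.join(dirpath, name)
            digest.update(os.path.relpath(full, root).encode('utf-8'))
            with open(full, 'rb') as f:
                for block in iter(lambda: f.read(1 << 20), b''):
                    digest.update(block)
    return digest.hexdigest()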
Example #18
    def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, uuid):
        """
        |sources|: specifies the locations of the contents to upload.  Each element is either a URL or a local path.
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o)
        |git|: for URL, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |remove_sources|: remove |sources|.

        If |sources| contains one source, then the bundle contents will be that source.
        Otherwise, the bundle contents will be a directory with each of the sources.
        Exceptions:
        - If |git|, then each source is replaced with the result of running 'git clone |source|'
        - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.

        Install the contents of the directory at |source| into
        DATA_SUBDIRECTORY in a subdirectory named by a hash of the contents.

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.
        """
        to_delete = []

        # If just a single file, set the final path to be equal to that file
        single_path = len(sources) == 1

        # Determine which disk this will go on
        disk_choice = self.ring.get_node(uuid)

        final_path = os.path.join(self.partitions, disk_choice, self.DATA_SUBDIRECTORY, uuid)
        if os.path.exists(final_path):
            raise UsageError('Path %s already present in bundle store' % final_path)
        # Only make if not there
        elif not single_path:
            path_util.make_directory(final_path)

        # Paths to resources
        subpaths = []

        for source in sources:
            # Where to save |source| to (might change this value if we unpack).
            if not single_path:
                subpath = os.path.join(final_path, os.path.basename(source))
            else:
                subpath = final_path

            if remove_sources:
                to_delete.append(source)
            source_unpack = unpack and zip_util.path_is_archive(source)

            if source_unpack and single_path:
                # Load the file into the bundle store under the given path
                subpath += zip_util.get_archive_ext(source)

            if path_util.path_is_url(source):
                # Download the URL.
                print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, subpath))
                if git:
                    file_util.git_clone(source, subpath)
                else:
                    file_util.download_url(source, subpath, print_status=True)
                    if source_unpack:
                        zip_util.unpack(subpath, zip_util.strip_archive_ext(subpath))
                        path_util.remove(subpath)
                        subpath = zip_util.strip_archive_ext(subpath)
                print_util.clear_line()
            else:
                # Copy the local path.
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')

                # Recursively copy the directory into the BundleStore
                print_util.open_line('BundleStore.upload: %s => %s' % (source_path, subpath))
                if source_unpack:
                    zip_util.unpack(source_path, zip_util.strip_archive_ext(subpath))
                    subpath = zip_util.strip_archive_ext(subpath)
                else:
                    if remove_sources:
                        path_util.rename(source_path, subpath)
                    else:
                        path_util.copy(source_path, subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
                print_util.clear_line()

            subpaths.append(subpath)

        dirs_and_files = None
        if os.path.isdir(final_path):
            dirs_and_files = path_util.recursive_ls(final_path)
        else:
            dirs_and_files = [], [final_path]

        # Hash the contents of the bundle directory. Update the data_hash attribute
        # for the bundle
        print_util.open_line('BundleStore.upload: hashing %s' % final_path)
        data_hash = '0x%s' % (path_util.hash_directory(final_path, dirs_and_files))
        print_util.clear_line()
        print_util.open_line('BundleStore.upload: computing size of %s' % final_path)
        data_size = path_util.get_size(final_path, dirs_and_files)
        print_util.clear_line()

        # Delete paths.
        for path in to_delete:
            if os.path.exists(path):
                path_util.remove(path)

        # After this operation there should always be a directory at the final path.
        assert (os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
        return (data_hash, {'data_size': data_size})
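
The partition choice above comes from self.ring.get_node(uuid), a hash-ring lookup that is not part of this snippet. A minimal stand-in that maps a uuid deterministically onto one of the configured partitions (a true consistent-hash ring would place nodes on a circle so that adding a disk moves only a small fraction of bundles; this sketch reshuffles most keys instead):

import hashlib

class SimpleRing(object):
    # Deterministic uuid -> partition mapping; a stand-in, not the ring class
    # actually used by this bundle store.
    def __init__(self, nodes):
        self.nodes = sorted(nodes)

    def get_node(self, key):
        h = int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16)
        return self.nodes[h % len(self.nodes)]

ring = SimpleRing(['partition0', 'partition1', 'partition2'])
print(ring.get_node('0x8c9a3bde'))  # always the same partition for the same uuid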