Example #1
    def duplicate(self, to_wf_module):
        """Copy this stored object's S3 data and DB row over to `to_wf_module`."""
        key = _build_key(to_wf_module.workflow_id, to_wf_module.id)
        minio.copy(self.bucket, key, f'{self.bucket}/{self.key}')

        return to_wf_module.stored_objects.create(stored_at=self.stored_at,
                                                  hash=self.hash,
                                                  bucket=self.bucket,
                                                  key=key,
                                                  size=self.size)
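For context, a minimal sketch of how a caller might use this method when duplicating a step. The `from_wfm` and `to_wfm` names are assumptions for illustration, not part of the project's API; only `.duplicate(to_wf_module)` comes from the example above:

# Hypothetical caller: copy the currently-selected StoredObject from one
# WfModule to its duplicate. (This mirrors the stored_objects.get(...) call
# that appears in Example #4 below.)
stored_object = from_wfm.stored_objects.get(
    stored_at=from_wfm.stored_data_version
)
new_stored_object = stored_object.duplicate(to_wfm)
assert new_stored_object.key != stored_object.key  # each copy gets a fresh key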
Example #2
    def convert_to_uploaded_file(self, filename):
        """
        Generate an UploadedFile and delete this InProgressUpload.

        Raise FileNotFoundError if the user never finished uploading. That's
        right: we throw an exception if the _end user_ doesn't do what we want.
        The user is meant to upload the file (putObject or multipart) and
        _then_ convert it. Callers should handle the case in which the end user
        asks to convert the file before the upload is complete.
        """
        assert not self.is_completed  # this InProgressUpload should not be visible

        key = self.get_upload_key()
        suffix = PurePath(filename).suffix
        final_key = self.wf_module.uploaded_file_prefix + str(self.id) + suffix
        try:
            minio.copy(
                minio.UserFilesBucket,
                final_key,
                f"{self.Bucket}/{key}",
                ACL="private",
                MetadataDirective="REPLACE",
                ContentDisposition=minio.encode_content_disposition(filename),
                ContentType="application/octet-stream",
            )
        except minio.error.NoSuchKey:
            raise FileNotFoundError
        # Potential tricky leak here: if there's an exception, then final_key
        # is in S3 but nothing in the database refers to it. Careful coding of
        # delete_s3_data() solves this.
        size = minio.stat(minio.UserFilesBucket, final_key).size
        uploaded_file = self.wf_module.uploaded_files.create(
            name=filename,
            size=size,
            uuid=str(self.id),
            bucket=minio.UserFilesBucket,
            key=final_key,
        )
        self.is_completed = True
        self.save(update_fields=["is_completed"])
        self.delete_s3_data()
        return uploaded_file
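The docstring above asks callers to handle an unfinished upload. A minimal sketch of such a caller, where the function name and response shape are assumptions rather than the project's actual handler:

def finish_upload(in_progress_upload, filename):
    # Hypothetical caller: convert the upload, translating FileNotFoundError
    # (the end user never finished uploading) into an error the client can
    # act on, instead of letting it bubble up as a server error.
    try:
        uploaded_file = in_progress_upload.convert_to_uploaded_file(filename)
    except FileNotFoundError:
        return {'error': 'upload-incomplete'}  # assumed error shape
    return {'uuid': uploaded_file.uuid, 'size': uploaded_file.size}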
Example #3
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from server import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if '/' in old_key:
        return

    new_key = f'wf-{workflow.id}/wfm-{wf_module.id}/{old_key}'

    logger.info('Move %s/%s to %s/%s', bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f'{bucket}/{old_key}')
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move already
        #    succeeded but the DB doesn't know it. In that case, continue
        #    because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a blank
        #    file in its stead. That way the user will remark, "hey, Workbench
        #    ate my file!" instead of undefined behavior (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b'')
            uploaded_file.size = 0
            uploaded_file.save(update_fields=['size'])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=['key'])
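A sketch of how this helper might be driven from a Django data migration. `apps.get_model` is standard Django; the app label and the related-field names are assumptions:

def forward(apps, schema_editor):
    # Hypothetical migration driver: move every flat-keyed file under its
    # wf-<id>/wfm-<id>/ prefix. Files already under a prefix are skipped by
    # the '/' check inside move_uploaded_file.
    UploadedFile = apps.get_model('server', 'UploadedFile')  # assumed app label
    for uploaded_file in UploadedFile.objects.all():
        wf_module = uploaded_file.wf_module  # assumed related field
        workflow = wf_module.workflow  # assumed accessor; .workflow_id appears in Example #1
        move_uploaded_file(workflow, wf_module, uploaded_file)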
Example #4
    def duplicate(self, to_tab):
        to_workflow = to_tab.workflow

        # Initialize but don't save
        new_wfm = WfModule(
            tab=to_tab,
            module_id_name=self.module_id_name,
            fetch_error=self.fetch_error,
            stored_data_version=self.stored_data_version,
            order=self.order,
            notes=self.notes,
            is_collapsed=self.is_collapsed,
            auto_update_data=False,
            next_update=self.next_update,
            update_interval=self.update_interval,
            last_update_check=self.last_update_check,
            # to_workflow has exactly one delta, and that's the version of all
            # its modules. This is so we can cache render results. (Cached
            # render results require a delta ID.)
            last_relevant_delta_id=to_workflow.last_delta_id,
            params=self.params,
            secrets={}  # DO NOT COPY SECRETS
        )

        # Copy cached render result, if there is one.
        #
        # If we duplicate a Workflow mid-render, the cached render result might
        # not have any useful data. But that's okay: just kick off a new
        # render. The common case (all-rendered Workflow) will produce a
        # fully-rendered duplicate Workflow.
        cached_result = self.cached_render_result
        if cached_result is not None:
            # assuming file-copy succeeds, copy cached results.
            # Not using `new_wfm.cache_render_result(cached_result.result)`
            # because that would involve reading the whole thing.
            new_wfm.cached_render_result_delta_id = \
                new_wfm.last_relevant_delta_id
            for attr in ('status', 'error', 'json', 'quick_fixes', 'columns',
                         'nrows'):
                full_attr = f'cached_render_result_{attr}'
                setattr(new_wfm, full_attr, getattr(self, full_attr))

            new_wfm.save()  # so there is a new_wfm.id for parquet_key

            # Now new_wfm.cached_render_result will return a
            # CachedRenderResult, because all the DB values are set. It'll have
            # a .parquet_key ... but there won't be a file there (because we
            # never wrote it).
            parquet_key = new_wfm.cached_render_result.parquet_key

            try:
                minio.copy(
                    minio.CachedRenderResultsBucket,
                    parquet_key,
                    f'{minio.CachedRenderResultsBucket}'
                    f'/{cached_result.parquet_key}',
                )
            except minio.error.NoSuchKey:
                # DB and filesystem are out of sync. CachedRenderResult handles
                # such cases gracefully. So `new_result` will behave exactly
                # like `cached_result`.
                pass
        else:
            new_wfm.save()

        # Duplicate the current stored data only, not the history
        if self.stored_data_version is not None:
            self.stored_objects.get(stored_at=self.stored_data_version) \
                    .duplicate(new_wfm)

        # Duplicate the "selected" file, if there is one; otherwise, duplicate
        # the most-recently-uploaded file.
        #
        # We special-case the 'upload' module because it's the only one that
        # has 'file' params right now. (If that ever changes, we'll want to
        # change a few things: upload paths should include param name, and this
        # test will need to check module_version to find the param name of the
        # file.)
        if self.module_id_name == 'upload':
            uuid = self.params['file']
            uploaded_file = self.uploaded_files.filter(uuid=uuid).first()
            if uploaded_file is not None:
                new_key = uploaded_file.key.replace(
                    self.uploaded_file_prefix,
                    new_wfm.uploaded_file_prefix,
                )
                assert new_key != uploaded_file.key
                # TODO handle file does not exist
                minio.copy(minio.UserFilesBucket, new_key,
                           f'{uploaded_file.bucket}/{uploaded_file.key}')
                new_wfm.uploaded_files.create(
                    created_at=uploaded_file.created_at,
                    name=uploaded_file.name,
                    size=uploaded_file.size,
                    uuid=uploaded_file.uuid,
                    bucket=minio.UserFilesBucket,
                    key=new_key,
                )

        return new_wfm
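A minimal sketch of the calling convention, assuming a tab-level loop drives these per-step duplicates. The function and field names are assumptions based only on the signature above:

def duplicate_tab_contents(from_tab, to_tab):
    # Hypothetical driver: duplicate each step into the new tab, in order.
    # Only `wf_module.duplicate(to_tab)` comes from the example above.
    for wf_module in from_tab.wf_modules.all():
        wf_module.duplicate(to_tab)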