Example #1
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table(
                {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
                columns=columns,
            )
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id,
                            result)
        # Delete from S3 entirely, to prove we do not read it.
        minio.remove(BUCKET,
                     crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata,
                         TableMetadata(1, columns))
Example #2
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        minio.remove(so.bucket, so.key)

        def render(*args, fetch_result, **kwargs):
            self.assertIsNone(fetch_result)
            return RenderResult()

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                ))
Example #3
def _delete_from_s3_pre_delete(sender, instance, **kwargs):
    """
    Delete file from S3, pre-delete.

    Why pre-delete and not post-delete? Because our user expects the file to be
    _gone_, completely, forever -- that's what "delete" means to the user. If
    deletion fails, we need the link to remain in our database -- that's how
    the user will know it isn't deleted.
    """
    if instance.bucket and instance.key:
        minio.remove(instance.bucket, instance.key)
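
This receiver does nothing until it is connected to a model's pre_delete signal. A minimal wiring sketch, assuming StoredObject is the sender (the registration itself is not shown in the example above, and the import path is hypothetical):

from django.db.models.signals import pre_delete

from server.models import StoredObject  # hypothetical import path

# Run _delete_from_s3_pre_delete before each StoredObject row is deleted.
# dispatch_uid guards against connecting the same receiver twice.
pre_delete.connect(
    _delete_from_s3_pre_delete,
    sender=StoredObject,
    dispatch_uid="delete_stored_object_from_s3",
)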
Example #4
def delete_upload_stored_objects(apps, _):
    from cjwstate import minio

    StoredObject = apps.get_model("server", "StoredObject")

    for obj in StoredObject.objects.filter(wf_module__module_id_name="upload").all():
        try:
            minio.remove(obj.bucket, obj.key)
        except FileNotFoundError:
            # We're resuming, or the file never existed anyway. (We never
            # designed for errors, and we changed error behavior over time;
            # it's possible some uploads never had data.)
            pass
        obj.delete()
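
A function with this (apps, schema_editor) signature is meant to run as a data migration. A sketch of how it might be wired up with migrations.RunPython, with the migration dependency left as a placeholder:

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        # ("server", "...") -- the previous migration, not shown here
    ]

    operations = [
        # No reverse function: deleting S3 data cannot be undone.
        migrations.RunPython(delete_upload_stored_objects, elidable=True),
    ]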
Example #5
    def delete_s3_data(self):
        """
        Delete all data from S3 that is part of this upload.

        Call this within a Workflow.cooperative_lock().

        This always leaves S3 and the database in a consistent state.
        """
        key = self.get_upload_key()
        minio.abort_multipart_uploads_by_prefix(self.Bucket, key)
        minio.remove(self.Bucket, key)

        if not self.wf_module.uploaded_files.filter(uuid=str(self.id)).count():
            # If there's no UploadedFile even though we copied this file to where the
            # UploadedFile _should_ point, then we've leaked that copy. Delete. See
            # "tricky leak here" in convert_to_uploaded_file().
            final_key_prefix = self.wf_module.uploaded_file_prefix + str(self.id)
            # no ".xlsx"-type suffix
            minio.remove_by_prefix(minio.UserFilesBucket, final_key_prefix)
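
Per the docstring, the caller must hold the workflow's cooperative lock. A hypothetical call site, where workflow stands in for the owning Workflow and upload for an instance of this upload class:

with workflow.cooperative_lock():
    # Clean up S3 first, then the database row; the lock keeps other
    # requests from observing a half-deleted upload.
    upload.delete_s3_data()
    upload.delete()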
Example #6
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        minio.remove(minio.StoredObjectsBucket, so.key)

        def render(*args, fetch_result, **kwargs):
            self.assertIsNone(fetch_result)
            return RenderResult()

        module_zipfile = create_module_zipfile(
            "x",
            python_code=textwrap.dedent("""
                import pandas as pd
                def render(table, params, *, fetch_result, **kwargs):
                    assert fetch_result is None
                    return pd.DataFrame()
                """),
        )

        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    module_zipfile,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                ))
Example #7
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from cjwstate import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if "/" in old_key:
        return

    new_key = f"wf-{workflow.id}/wfm-{wf_module.id}/{old_key}"

    logger.info(f"Move %s/%s to %s/%s", bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f"{bucket}/{old_key}")
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move already
        #    succeeded but the DB doesn't know it. In that case, continue
        #    because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a blank
        #    file in its stead. That way the user will remark, "hey, Workbench
        #    ate my file!" instead of undefined behavior (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b"")
            uploaded_file.size = 0
            uploaded_file.save(update_fields=["size"])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=["key"])
Example #8
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    minio.remove(instance.bucket, instance.key)
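
As with the pre-delete handler in Example #3, this function only runs once it is registered. A sketch using Django's decorator form instead of a connect() call, assuming UploadedFile is the sender (the import path is hypothetical):

from django.db.models.signals import post_delete
from django.dispatch import receiver

from cjwstate import minio
from server.models import UploadedFile  # hypothetical import path


@receiver(post_delete, sender=UploadedFile)
def auto_delete_file_on_delete(sender, instance, **kwargs):
    # Delete S3 data when UploadedFile is deleted
    minio.remove(instance.bucket, instance.key)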
Example #9
def _clear() -> None:
    try:
        minio.remove(Bucket, Key)
    except minio.error.NoSuchKey:
        pass