Example #1
    def test_convert_to_uploaded_file_happy_path(self):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )
        ipu = wf_module.in_progress_uploads.create()
        minio.put_bytes(ipu.Bucket, ipu.get_upload_key(), b"1234567")
        uploaded_file = ipu.convert_to_uploaded_file("test sheet.xlsx")
        self.assertEqual(uploaded_file.uuid, str(ipu.id))
        final_key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
        # New file on S3 has the right bytes and metadata
        self.assertEqual(
            minio.get_object_with_data(minio.UserFilesBucket, final_key)["Body"],
            b"1234567",
        )
        self.assertEqual(
            minio.client.head_object(Bucket=minio.UserFilesBucket, Key=final_key)[
                "ContentDisposition"
            ],
            "attachment; filename*=UTF-8''test%20sheet.xlsx",
        )
        # InProgressUpload is completed
        self.assertEqual(ipu.is_completed, True)
        ipu.refresh_from_db()
        self.assertEqual(ipu.is_completed, True)  # also on DB
        # Uploaded file is deleted
        self.assertFalse(minio.exists(minio.UserFilesBucket, ipu.get_upload_key()))
Example #2
    def test_db_minio_syntax_error_is_runtime_error(self):
        mv = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest9",
                "name": "regtest9 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d3",
        )
        bio = io.BytesIO()
        with zipfile.ZipFile(bio, mode="w") as zf:
            zf.writestr(
                "regtest9.yaml",
                json.dumps({
                    **mv.spec, "parameters": "not an Array"
                }).encode("utf-8"),
            )
            zf.writestr("regtest9.py", b"def render(")
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest9/regtest9.b1c2d3.zip",
            bytes(bio.getbuffer()),
        )

        with self.assertRaises(RuntimeError) as cm:
            MODULE_REGISTRY.latest("regtest9")
        self.assertIsInstance(cm.exception.__cause__, SyntaxError)
Example #3
    def test_db_minio_use_cache_for_same_version(self):
        mv = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest4",
                "name": "regtest4 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d2",
        )
        bio = io.BytesIO()
        with zipfile.ZipFile(bio, mode="w") as zf:
            zf.writestr("regtest4.yaml", json.dumps(mv.spec).encode("utf-8"))
            zf.writestr("regtest4.py",
                        b"def render(table, params):\n    return table")
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest4/regtest4.b1c2d2.zip",
            bytes(bio.getbuffer()),
        )

        zf1 = MODULE_REGISTRY.latest("regtest4")
        zf2 = MODULE_REGISTRY.latest("regtest4")
        self.assertIs(zf2, zf1)
Example #4
    def test_delete_remove_leaked_stored_objects_and_uploaded_files(self):
        workflow = Workflow.create_and_init()
        # If the user deletes a workflow, all data associated with that
        # workflow should disappear. Postgres handles DB objects; but Django's
        # ORM doesn't do a great job with StoredObjects and UploadedFiles.
        #
        # This test isn't about minutiae. It's just: if the user deletes a
        # Workflow, make sure all data gets deleted.
        #
        # TODO fix all other bugs that leak data.
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )

        # "Leak" a StoredObject by writing its file to S3 but neglecting to
        # write an accompanying StoredObject record.
        stored_object_key = f"{workflow.id}/{wf_module.id}/1234.dat"
        minio.put_bytes(minio.StoredObjectsBucket, stored_object_key, b"1234")

        # Add an UploadedFile's S3 data without a DB entry. (Even if we fix all
        # the bugs that leak an S3 object after deleting a DB entry [and as of
        # 2019-06-03 there are still more], we'll still need to handle missing
        # DB entries left over from legacy code.)
        uploaded_file_key = f"{wf_module.uploaded_file_prefix}{uuid.uuid4()}.csv"
        minio.put_bytes(minio.UserFilesBucket, uploaded_file_key, b"A\nb")
        workflow.delete()
        self.assertFalse(minio.exists(minio.StoredObjectsBucket, stored_object_key))
        self.assertFalse(minio.exists(minio.UserFilesBucket, uploaded_file_key))
Example #5
    def test_db_minio_validate_code_with_kernel(self):
        mv = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest7",
                "name": "regtest7 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d3",
        )
        bio = io.BytesIO()
        with zipfile.ZipFile(bio, mode="w") as zf:
            zf.writestr("regtest7.yaml", json.dumps(mv.spec).encode("utf-8"))
            zf.writestr(
                "regtest7.py",
                b"def render(table, params):\n    return table\nfoo()")
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest7/regtest7.b1c2d3.zip",
            bytes(bio.getbuffer()),
        )

        with self.assertRaises(RuntimeError) as cm:
            MODULE_REGISTRY.latest("regtest7")
        self.assertIsInstance(cm.exception.__cause__, ModuleExitedError)
Example #6
    def test_db_minio_latest_load_deprecated_simple(self):
        mv = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest2",
                "name": "regtest2 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d2",
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest2/b1c2d2/regtest2.py",
            "def render(table, params):\n    return table",
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest2/b1c2d2/regtest2.yaml",
            json.dumps(mv.spec).encode("utf-8"),
        )

        zf = MODULE_REGISTRY.latest("regtest2")
        self.assertEqual(zf.get_spec(), ModuleSpec(**mv.spec))
        self.assertIsNone(zf.get_optional_html())
Example #7
    def test_clean_file_happy_path(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step = tab.wf_modules.create(module_id_name="uploadfile",
                                     order=0,
                                     slug="step-1")
        id = str(uuid.uuid4())
        key = f"wf-${workflow.id}/wfm-${step.id}/${id}"
        minio.put_bytes(minio.UserFilesBucket, key, b"1234")
        UploadedFile.objects.create(
            wf_module=step,
            name="x.csv.gz",
            size=4,
            uuid=id,
            bucket=minio.UserFilesBucket,
            key=key,
        )
        with ExitStack() as inner_stack:
            context = self._render_context(wf_module_id=step.id,
                                           exit_stack=inner_stack)
            result: Path = clean_value(ParamDType.File(), id, context)
            self.assertIsInstance(result, Path)
            self.assertEqual(result.read_bytes(), b"1234")
            self.assertEqual(result.suffixes, [".csv", ".gz"])

        # Assert that once `exit_stack` goes out of scope, file is deleted
        self.assertFalse(result.exists())
Example #8
    def test_clean_file_wrong_wf_module(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step = tab.wf_modules.create(
            module_id_name="uploadfile", order=0, slug="step-1"
        )
        step2 = tab.wf_modules.create(
            module_id_name="uploadfile", order=1, slug="step-2"
        )
        id = str(uuid.uuid4())
        key = f"wf-{workflow.id}/wfm-{step.id}/{id}"
        minio.put_bytes(minio.UserFilesBucket, key, b"1234")
        UploadedFile.objects.create(
            wf_module=step2,
            name="x.csv.gz",
            size=4,
            uuid=id,
            bucket=minio.UserFilesBucket,
            key=key,
        )
        context = self._render_context(wf_module_id=step.id)
        result = clean_value(ParamDType.File(), id, context)
        self.assertIsNone(result)
        # Assert that if a temporary file was created to house the download, it
        # no longer exists.
        self.assertListEqual(list(self.basedir.iterdir()), [])
Example #9
    def test_fetch_integration(self, send_update, queue_render):
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        ModuleVersion.create_or_replace_from_spec(
            {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []},
            source_version_hash="abc123",
        )
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="mod"
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "mod/abc123/code.py",
            b"import pandas as pd\ndef fetch(params): return pd.DataFrame({'A': [1]})\ndef render(table, params): return table",
        )
        cjwstate.modules.init_module_system()
        now = timezone.now()
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id, now=now)
            )
        wf_module.refresh_from_db()
        so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
        with minio.temporarily_download(so.bucket, so.key) as parquet_path:
            table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
            assert_arrow_table_equals(table, {"A": [1]})

        workflow.refresh_from_db()
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Example #10
    def test_delete_deletes_from_s3(self):
        minio.put_bytes(minio.StoredObjectsBucket, "test.dat", b"abcd")
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
        so = wf_module.stored_objects.create(size=4, key="test.dat")
        so.delete()
        self.assertFalse(minio.exists(minio.StoredObjectsBucket, "test.dat"))
Example #11
    def test_invalid_parquet_is_corrupt_cache_error(self):
        result = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
        crr = self.wf_module.cached_render_result
        minio.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
        with tempfile_context() as arrow_path:
            with self.assertRaises(CorruptCacheError):
                load_cached_render_result(crr, arrow_path)
Example #12
    def test_delete_tab_deletes_from_s3(self):
        minio.put_bytes(minio.StoredObjectsBucket, "test.dat", b"abcd")
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.create(position=1)
        wf_module = tab.wf_modules.create(order=0, slug="step-1")
        wf_module.stored_objects.create(
            size=4, bucket=minio.StoredObjectsBucket, key="test.dat"
        )
        tab.delete()
        self.assertFalse(minio.exists(minio.StoredObjectsBucket, "test.dat"))
Example #13
    def test_resume_backtrack_on_corrupt_cache_error(self):
        module_zipfile = create_module_zipfile("mod")
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh -- but CORRUPT
        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        rendercache.cache_render_result(
            workflow,
            step1,
            workflow.last_delta_id,
            RenderResult(arrow_table({"A": [1]})),
        )
        minio.put_bytes(
            # Write corrupted data -- will lead to CorruptCacheError
            rendercache.io.BUCKET,
            rendercache.io.crr_parquet_key(step1.cached_render_result),
            b"CORRUPT",
        )
        # step2: no cached result -- must re-render
        step2 = tab.wf_modules.create(order=1,
                                      slug="step-2",
                                      module_id_name="mod")

        tab_flow = TabFlow(
            tab.to_arrow(),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        with patch.object(Kernel,
                          "render",
                          side_effect=mock_render({"B": [2]})):
            with self._execute(workflow,
                               tab_flow, {},
                               expect_log_level=logging.ERROR) as result:
                expected = RenderResult(arrow_table({"B": [2]}))
                assert_render_result_equals(result, expected)

            self.assertEqual(
                # called with step1, then step2
                Kernel.render.call_count,
                2,
            )
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Example #14
    def test_delete_s3_data_leaked_file(self):
        # Delete a file with our UUID but without an UploadedFile.
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )
        ipu = wf_module.in_progress_uploads.create()
        key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
        minio.put_bytes(minio.UserFilesBucket, key, b"1234567")
        ipu.delete_s3_data()
        self.assertFalse(minio.exists(minio.UserFilesBucket, key))
Example #15
    def test_delete_remove_uploaded_data_by_prefix_in_case_model_missing(self):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(order=0, slug="step-1")
        uuid = str(uuidgen.uuid4())
        key = wf_module.uploaded_file_prefix + uuid
        minio.put_bytes(minio.UserFilesBucket, key, b"A\n1")
        # Don't create the UploadedFile. Simulates races during upload/delete
        # that could write a file on S3 but not in our database.
        # wf_module.uploaded_files.create(name='t.csv', size=3, uuid=uuid, key=key)
        wf_module.delete()  # do not crash
        self.assertFalse(minio.exists(minio.UserFilesBucket, key))
Example #16
    def test_resume_backtrack_on_corrupt_cache_error(self, fake_load_module):
        ModuleVersion.create_or_replace_from_spec(
            {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
        )
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh -- but CORRUPT
        step1 = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        rendercache.cache_render_result(
            workflow,
            step1,
            workflow.last_delta_id,
            RenderResult(arrow_table({"A": [1]})),
        )
        minio.put_bytes(
            # Write corrupted data -- will lead to CorruptCacheError
            rendercache.io.BUCKET,
            rendercache.io.crr_parquet_key(step1.cached_render_result),
            b"CORRUPT",
        )
        # step2: no cached result -- must re-render
        step2 = tab.wf_modules.create(order=1, slug="step-2", module_id_name="mod")

        tab_flow = TabFlow(
            tab.to_arrow(),
            [
                ExecuteStep(step1, ParamDType.Dict({}), {}),
                ExecuteStep(step2, ParamDType.Dict({}), {}),
            ],
        )

        expected = RenderResult(arrow_table({"B": [2]}))
        fake_load_module.return_value.render.return_value = expected
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as result:
            assert_render_result_equals(result, expected)

        self.assertEqual(
            # called with step1, then step2
            fake_load_module.return_value.render.call_count,
            2,
        )
        self.assertRegex(
            # Output is to the correct file
            fake_load_module.return_value.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
Example #17
def create_module_zipfile(
    module_id: str = "testmodule",
    *,
    version: Optional[str] = None,
    spec_kwargs: Dict[str, Any] = {},
    python_code: str = "",
    html: Optional[str] = None,
    js_module: str = "",
    extra_file_contents: Dict[str, bytes] = {},
) -> ModuleZipfile:
    """
    Create a ModuleZipfile, stored in the database and minio.

    If `version` is not supplied, generate one from the sha1 of the zipfile.
    This is usually what you want: minio reads after an overwrite are only
    _eventually_ consistent, so if you 1. write a file, 2. overwrite it, and
    3. read it, the read might return the file from step 1 or from step 2.
    A sha1-based version means an overwrite never modifies existing data,
    which sidesteps the problem.
    """
    spec = {
        "id_name": module_id,
        "name": "Test Module",
        "category": "Clean",
        "parameters": [],
        **spec_kwargs,
    }

    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr(module_id + ".yaml", json.dumps(spec))
        zf.writestr(module_id + ".py", python_code.encode("utf-8"))
        if html is not None:
            zf.writestr(module_id + ".html", html.encode("utf-8"))
        if js_module:
            zf.writestr(module_id + ".js", js_module.encode("utf-8"))
        for path, content in extra_file_contents.items():
            zf.writestr(path, content)
    data = bytes(bio.getbuffer())
    if version is None:
        sha1 = hashlib.sha1()
        sha1.update(data)
        version = sha1.hexdigest()

    minio.put_bytes(
        minio.ExternalModulesBucket,
        "%s/%s.%s.zip" % (module_id, module_id, version),
        data,
    )
    ModuleVersion.objects.create(
        id_name=module_id, source_version_hash=version, spec=spec, js_module=js_module
    )
    return MODULE_REGISTRY.latest(module_id)
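
For reference, a minimal usage sketch (hypothetical, not from the original test
suite; the "mymod" id_name and the render stub are made up): the default version
is the sha1 of the zip's bytes, and the returned ModuleZipfile reports it as
.version (compare Example #20, where .version equals the source_version_hash).

# Hedged sketch: build a trivial module, then confirm the registry serves it.
module_zipfile = create_module_zipfile(
    "mymod",  # hypothetical id_name
    python_code="def render(table, params):\n    return table",
)
assert module_zipfile.version == MODULE_REGISTRY.latest("mymod").version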
Example #18
    def test_complete_happy_path(self, queue_render, send_update):
        send_update.side_effect = async_noop
        queue_render.side_effect = async_noop
        _init_module("x")
        self.kernel.migrate_params.side_effect = lambda m, p: p
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0,
            slug="step-123",
            module_id_name="x",
            file_upload_api_token="abc123",
            params={"file": None},
        )
        upload = wf_module.in_progress_uploads.create()
        uuid = str(upload.id)
        key = upload.get_upload_key()
        minio.put_bytes(upload.Bucket, key, b"1234567")
        with self.assertLogs(level=logging.INFO):
            # Logs ChangeParametersCommand's migrate_params()
            response = self.client.post(
                f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/{upload.id}",
                {"filename": "test.csv"},
                content_type="application/json",
                HTTP_AUTHORIZATION="Bearer abc123",
            )
        self.assertEqual(response.status_code, 200)
        # Upload and its S3 data were deleted
        self.assertFalse(minio.exists(upload.Bucket, key))
        upload.refresh_from_db()
        self.assertTrue(upload.is_completed)
        # Final upload was created
        uploaded_file = wf_module.uploaded_files.first()
        self.assertEqual(
            uploaded_file.key, f"wf-{workflow.id}/wfm-{wf_module.id}/{uuid}.csv"
        )
        self.assertEqual(
            minio.get_object_with_data(minio.UserFilesBucket, uploaded_file.key)["Body"],
            b"1234567",
        )
        self.assertEqual(uploaded_file.name, "test.csv")
        # Return value includes uuid
        data = json.loads(response.content)
        self.assertEqual(data["uuid"], uuid)
        self.assertEqual(data["name"], "test.csv")
        self.assertEqual(data["size"], 7)
        # ChangeParametersCommand ran
        wf_module.refresh_from_db()
        self.assertEqual(wf_module.params, {"file": uuid})
        # Send deltas
        send_update.assert_called()
        queue_render.assert_called()
Example #19
    def test_delete_s3_data_ignore_non_leaked_file(self):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )
        ipu = wf_module.in_progress_uploads.create()
        key = wf_module.uploaded_file_prefix + str(ipu.id) + ".xlsx"
        minio.put_bytes(minio.UserFilesBucket, key, b"1234567")
        wf_module.uploaded_files.create(
            name="text.xlsx", size=7, uuid=str(self.id), key=key
        )
        ipu.delete_s3_data()
        self.assertFalse(minio.exists(minio.UserFilesBucket, key))
Example #20
    def test_db_minio_refresh_cache_for_new_version(self):
        v1 = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest5",
                "name": "regtest5 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d2",
        )
        bio = io.BytesIO()
        with zipfile.ZipFile(bio, mode="w") as zf:
            zf.writestr("regtest5.yaml", json.dumps(v1.spec).encode("utf-8"))
            zf.writestr("regtest5.py",
                        b"def render(table, params):\n    return table")
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest5/regtest5.b1c2d2.zip",
            bytes(bio.getbuffer()),
        )

        zipfile1 = MODULE_REGISTRY.latest("regtest5")

        ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest5",
                "name": "regtest5 v2",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d3",
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest5/regtest5.b1c2d3.zip",
            bytes(bio.getbuffer()),  # reuse zipfile to save lines of code
        )

        zipfile2 = MODULE_REGISTRY.latest("regtest5")

        self.assertIsNot(zipfile2, zipfile1)
        self.assertEqual(zipfile2.version, "b1c2d3")
Example #21
    def test_duplicate_bytes(self):
        key = f"{self.workflow.id}/{self.step1.id}/{uuid1()}"
        minio.put_bytes(minio.StoredObjectsBucket, key, b"12345")
        self.step2 = self.step1.tab.wf_modules.create(order=1, slug="step-2")
        so1 = self.step1.stored_objects.create(
            bucket=minio.StoredObjectsBucket, key=key, size=5)
        so2 = so1.duplicate(self.step2)

        # new StoredObject should have same time,
        # different file with same contents
        self.assertEqual(so1.stored_at, so2.stored_at)
        self.assertEqual(so1.size, so2.size)
        self.assertEqual(so1.bucket, so2.bucket)
        self.assertNotEqual(so1.key, so2.key)
        self.assertEqual(
            minio.get_object_with_data(so2.bucket, so2.key)["Body"], b"12345")
Example #22
    def test_abort_missing_upload_is_404(self):
        _init_module("x")
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-123", module_id_name="x", file_upload_api_token="abc123"
        )
        upload = wf_module.in_progress_uploads.create()
        key = upload.get_upload_key()
        minio.put_bytes(upload.Bucket, key, b"1234567")
        response = self.client.delete(
            f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/dcc00084-812d-4769-bf77-94518f18ff3d",
            HTTP_AUTHORIZATION="Bearer abc123",
        )
        self.assertEqual(response.status_code, 404)
        self.assertEqual(
            json.loads(response.content)["error"]["code"], "upload-not-found"
        )
Example #23
    def test_finish_upload_happy_path(self, send_update):
        user = User.objects.create(username="******", email="*****@*****.**")
        workflow = Workflow.create_and_init(owner=user)
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )
        in_progress_upload = wf_module.in_progress_uploads.create(
            id="147a9f5d-5b3e-41c3-a968-a84a5a9d587f"
        )
        key = in_progress_upload.get_upload_key()
        minio.put_bytes(in_progress_upload.Bucket, key, b"1234567")
        send_update.side_effect = async_noop
        response = self.run_handler(
            finish_upload,
            user=user,
            workflow=workflow,
            wfModuleId=wf_module.id,
            key=key,
            filename="test sheet.csv",
        )
        self.assertResponse(
            response, data={"uuid": "147a9f5d-5b3e-41c3-a968-a84a5a9d587f"}
        )
        # The uploaded file is deleted
        self.assertFalse(minio.exists(in_progress_upload.Bucket, key))
        # A new upload is created
        uploaded_file = wf_module.uploaded_files.first()
        self.assertEqual(uploaded_file.name, "test sheet.csv")
        self.assertEqual(uploaded_file.size, 7)
        self.assertEqual(uploaded_file.uuid, "147a9f5d-5b3e-41c3-a968-a84a5a9d587f")
        self.assertEqual(uploaded_file.bucket, in_progress_upload.Bucket)
        final_key = (
            f"wf-{workflow.id}/wfm-{wf_module.id}/147a9f5d-5b3e-41c3-a968-a84a5a9d587f.csv"
        )
        self.assertEqual(uploaded_file.key, final_key)
        # The file has the right bytes and metadata
        self.assertEqual(
            minio.get_object_with_data(minio.UserFilesBucket, final_key)["Body"],
            b"1234567",
        )
        self.assertEqual(
            minio.client.head_object(Bucket=minio.UserFilesBucket, Key=final_key)[
                "ContentDisposition"
            ],
            "attachment; filename*=UTF-8''test%20sheet.csv",
        )
        # wf_module is updated
        send_update.assert_called()
Example #24
    def test_abort_upload_happy_path_after_complete(self):
        user = User.objects.create(username="******", email="*****@*****.**")
        workflow = Workflow.create_and_init(owner=user)
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="x"
        )
        in_progress_upload = wf_module.in_progress_uploads.create(
            id="147a9f5d-5b3e-41c3-a968-a84a5a9d587f"
        )
        key = in_progress_upload.get_upload_key()
        minio.put_bytes(in_progress_upload.Bucket, key, b"1234567")
        response = self.run_handler(
            abort_upload, user=user, workflow=workflow, wfModuleId=wf_module.id, key=key
        )
        self.assertResponse(response, data=None)
        wf_module.refresh_from_db()
        self.assertFalse(minio.exists(in_progress_upload.Bucket, key))
Example #25
    def test_abort(self):
        _init_module("x")
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-123", module_id_name="x", file_upload_api_token="abc123"
        )
        upload = wf_module.in_progress_uploads.create()
        key = upload.get_upload_key()
        minio.put_bytes(upload.Bucket, key, b"1234567")
        response = self.client.delete(
            f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/{upload.id}",
            HTTP_AUTHORIZATION="Bearer abc123",
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(json.loads(response.content), {})
        self.assertFalse(minio.exists(upload.Bucket, key))  # file was deleted
        upload.refresh_from_db()
        self.assertTrue(upload.is_completed)
Example #26
    def test_complete_json_form_error(self):
        _init_module("x")
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-123", module_id_name="x", file_upload_api_token="abc123"
        )
        upload = wf_module.in_progress_uploads.create()
        key = upload.get_upload_key()
        minio.put_bytes(upload.Bucket, key, b"1234567")
        response = self.client.post(
            f"/api/v1/workflows/{workflow.id}/steps/step-123/uploads/{upload.id}",
            {"filename": None},
            content_type="application/json",
            HTTP_AUTHORIZATION="Bearer abc123",
        )
        self.assertEqual(response.status_code, 400)
        error = json.loads(response.content)["error"]
        self.assertEqual(error["code"], "body-has-errors")
        self.assertIn("filename", error["errors"])
Example #27
    def test_wf_module_duplicate_copy_uploaded_file(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(order=0,
                                          slug="step-1",
                                          module_id_name="upload")
        uuid = str(uuidgen.uuid4())
        key = f"{wf_module.uploaded_file_prefix}{uuid}.csv"
        minio.put_bytes(minio.UserFilesBucket, key, b"1234567")
        # Write the uuid to the old module -- we'll check the new module points
        # to a valid file
        wf_module.params = {"file": uuid, "has_header": True}
        wf_module.save(update_fields=["params"])
        uploaded_file = wf_module.uploaded_files.create(
            name="t.csv",
            uuid=uuid,
            bucket=minio.UserFilesBucket,
            key=key,
            size=7)

        workflow2 = Workflow.create_and_init()
        tab2 = workflow2.tabs.first()
        wf_module2 = wf_module.duplicate_into_new_workflow(tab2)

        uploaded_file2 = wf_module2.uploaded_files.first()
        self.assertIsNotNone(uploaded_file2)
        # New file gets same uuid -- because it's the same file and we don't
        # want to edit params during copy
        self.assertEqual(uploaded_file2.uuid, uuid)
        self.assertEqual(wf_module2.params["file"], uuid)
        self.assertTrue(
            # The new file should be in a different path
            uploaded_file2.key.startswith(wf_module2.uploaded_file_prefix))
        self.assertEqual(uploaded_file2.name, "t.csv")
        self.assertEqual(uploaded_file2.size, 7)
        self.assertEqual(uploaded_file2.created_at, uploaded_file.created_at)
        self.assertEqual(
            minio.get_object_with_data(uploaded_file2.bucket,
                                       uploaded_file2.key)["Body"],
            b"1234567",
        )
Example #28
    def test_db_minio_latest_order_by_last_update_time(self):
        # old version
        ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest1",
                "name": "regtest1 v1",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d3",
        )
        time.sleep(0.000002)  # guarantee new timestamp
        # new version
        v2 = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest1",
                "name": "regtest1 v2",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d2",
        )
        bio = io.BytesIO()
        with zipfile.ZipFile(bio, mode="w") as zf:
            zf.writestr("regtest1.yaml", json.dumps(v2.spec).encode("utf-8"))
            zf.writestr("regtest1.py",
                        b"def render(table, params):\n    return table")
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest1/regtest1.b1c2d2.zip",
            bytes(bio.getbuffer()),
        )

        zf = MODULE_REGISTRY.latest("regtest1")
        self.assertEqual(zf.get_spec(), ModuleSpec(**v2.spec))
Example #29
    def test_db_minio_latest_load_deprecated_html(self):
        mv = ModuleVersion.create_or_replace_from_spec(
            {
                "id_name": "regtest3",
                "name": "regtest3 v2",
                "category": "Clean",
                "parameters": [{
                    "id_name": "url",
                    "type": "string"
                }],
            },
            source_version_hash="b1c2d2",
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest3/b1c2d2/regtest3.py",
            "def render(table, params):\n    return table",
        )
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest3/b1c2d2/regtest3.yaml",
            json.dumps(mv.spec).encode("utf-8"),
        )
        html = "<!DOCTYPE html><html><head><title>Hi</title></head><body>Hello, world!</body></html>"
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "regtest3/b1c2d2/regtest3.html",
            html.encode("utf-8"),
        )

        zf = MODULE_REGISTRY.latest("regtest3")
        self.assertEqual(zf.get_optional_html(), html)
Example #30
def move_uploaded_file(workflow, wf_module, uploaded_file):
    """
    Move files from /uuid.ext to /wf-1/wfm-2/uuid.ext.

    This helps delete leaked files and find problem files.
    """
    from cjwstate import minio

    bucket = uploaded_file.bucket
    old_key = uploaded_file.key
    if "/" in old_key:
        return

    new_key = f"wf-{workflow.id}/wfm-{wf_module.id}/{old_key}"

    logger.info(f"Move %s/%s to %s/%s", bucket, old_key, bucket, new_key)
    try:
        minio.copy(bucket, new_key, f"{bucket}/{old_key}")
        minio.remove(bucket, old_key)
    except minio.error.NoSuchKey:
        # old_key is missing. Two possibilities:
        #
        # 1. We're re-running this script after it failed once with
        #    atomic=True (which used to be set, by accident); the move already
        #    succeeded but the DB doesn't know it. In that case, continue
        #    because this error actually means, "all is well."
        # 2. The file didn't exist to begin with. In that case, write a blank
        #    file in its stead. That way the user will remark, "hey, Workbench
        #    ate my file!" instead of undefined behavior (which is worse).
        #    https://www.pivotaltracker.com/story/show/163336822
        if minio.exists(bucket, new_key):
            pass  # "all is well"
        else:
            # write an empty file
            minio.put_bytes(bucket, new_key, b"")
            uploaded_file.size = 0
            uploaded_file.save(update_fields=["size"])
    uploaded_file.key = new_key
    uploaded_file.save(update_fields=["key"])