def test_db_s3_syntax_error_is_runtime_error(self):
    mv = create_or_replace_from_spec(
        {
            "id_name": "regtest9",
            "name": "regtest9 v1",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d3",
    )
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        # The spec is valid; only the Python code has a syntax error.
        zf.writestr("regtest9.yaml", json.dumps(mv.spec).encode("utf-8"))
        zf.writestr("regtest9.py", b"def render(")
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest9/regtest9.b1c2d3.zip",
        bytes(bio.getbuffer()),
    )
    with self.assertRaises(RuntimeError) as cm:
        MODULE_REGISTRY.latest("regtest9")
    self.assertIsInstance(cm.exception.__cause__, SyntaxError)
def test_db_s3_validate_code_with_kernel(self):
    mv = create_or_replace_from_spec(
        {
            "id_name": "regtest7",
            "name": "regtest7 v1",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d3",
    )
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr("regtest7.yaml", json.dumps(mv.spec).encode("utf-8"))
        zf.writestr(
            "regtest7.py", b"def render(table, params):\n return table\nfoo()"
        )
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest7/regtest7.b1c2d3.zip",
        bytes(bio.getbuffer()),
    )
    with self.assertRaises(RuntimeError) as cm:
        MODULE_REGISTRY.latest("regtest7")
    self.assertIsInstance(cm.exception.__cause__, ModuleExitedError)
def test_db_s3_use_cache_for_same_version(self):
    mv = create_or_replace_from_spec(
        {
            "id_name": "regtest4",
            "name": "regtest4 v1",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d2",
    )
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr("regtest4.yaml", json.dumps(mv.spec).encode("utf-8"))
        zf.writestr("regtest4.py", b"def render(table, params):\n return table")
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest4/regtest4.b1c2d2.zip",
        bytes(bio.getbuffer()),
    )
    zf1 = MODULE_REGISTRY.latest("regtest4")
    zf2 = MODULE_REGISTRY.latest("regtest4")
    self.assertIs(zf2, zf1)
def test_delete_deletes_from_s3(self):
    s3.put_bytes(s3.StoredObjectsBucket, "test.dat", b"abcd")
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(order=0, slug="step-1")
    so = step.stored_objects.create(size=4, key="test.dat")
    so.delete()
    self.assertFalse(s3.exists(s3.StoredObjectsBucket, "test.dat"))
def test_clean_file_safe_filename(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(module_id_name="uploadfile", order=0, slug="step-1")
    key = f"wf-${workflow.id}/wfm-${step.id}/6e00511a-8ac4-4b72-9acc-9d069992b5cf"
    s3.put_bytes(s3.UserFilesBucket, key, b"1234")
    model = UploadedFileModel.objects.create(
        step=step,
        name="/etc/passwd.$/etc/passwd",
        size=4,
        uuid="6e00511a-8ac4-4b72-9acc-9d069992b5cf",
        key=key,
    )
    with ExitStack() as inner_stack:
        result = self._call_prep_params(
            ParamSchema.Dict({"file": ParamSchema.File()}),
            {"file": "6e00511a-8ac4-4b72-9acc-9d069992b5cf"},
            step_id=step.id,
            exit_stack=inner_stack,
        )
        self.assertEqual(
            result.uploaded_files["6e00511a-8ac4-4b72-9acc-9d069992b5cf"],
            UploadedFile(
                "/etc/passwd.$/etc/passwd",
                "6e00511a-8ac4-4b72-9acc-9d069992b5cf_-etc-passwd.--etc-passwd",
                model.created_at,
            ),
        )
def test_pre_finish_no_op_when_api_token_is_off(self):
    _init_module("x")
    self.kernel.migrate_params.side_effect = lambda m, p: p
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(
        order=0,
        slug="step-123",
        module_id_name="x",
        file_upload_api_token="abc123",
        params={"file": None},
    )
    s3.put_bytes(s3.TusUploadBucket, "data", b"1234567")
    response = self.client.post(
        "/tusd-hooks",
        {
            "Upload": {
                "MetaData": {
                    "filename": "foo.csv",
                    "workflowId": str(workflow.id),
                    "stepSlug": step.slug,
                    "apiToken": "an-out-of-date-token",
                },
                "Size": 7,
                "Storage": {"Bucket": s3.TusUploadBucket, "Key": "data"},
            }
        },
        HTTP_HOOK_NAME="pre-finish",
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 403)
    self.assertEqual(
        response.json(), {"error": {"code": "authorization-bearer-token-invalid"}}
    )
    # File was not created
    self.assertEqual(0, step.uploaded_files.count())
def test_delete_remove_uploaded_data_by_prefix_in_case_model_missing(self):
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(order=0, slug="step-1")
    uuid = str(uuidgen.uuid4())
    key = step.uploaded_file_prefix + uuid
    s3.put_bytes(s3.UserFilesBucket, key, b"A\n1")
    # Don't create the UploadedFile. Simulates races during upload/delete
    # that could write a file on S3 but not in our database.
    # step.uploaded_files.create(name='t.csv', size=3, uuid=uuid, key=key)
    step.delete()  # do not crash
    self.assertFalse(s3.exists(s3.UserFilesBucket, key))
def test_resume_backtrack_on_corrupt_cache_error(self):
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh -- but CORRUPT
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", [1]))
    )
    step1.refresh_from_db()
    s3.put_bytes(
        # Write corrupted data -- will lead to CorruptCacheError
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2: no cached result -- must re-render
    step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("B", ["b"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("B", ColumnType.Text())])
            )
        self.assertEqual(
            # called with step1, then step2
            Kernel.render.call_count,
            2,
        )
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def create_module_zipfile(
    module_id: str = "testmodule",
    *,
    version: Optional[str] = None,
    spec_kwargs: Dict[str, Any] = {},
    python_code: str = "",
    html: Optional[str] = None,
    js_module: str = "",
    extra_file_contents: Dict[str, bytes] = {},
) -> ModuleZipfile:
    """Create a ModuleZipfile, stored in the database and s3.

    If `version` is not supplied, generate one using the sha1 of the zipfile.
    This is usually what you want: s3 reads after overwrites are _eventually_
    consistent, so if you 1. write a file; 2. overwrite it; and 3. read it, the
    read might return the file from step 1 or the file from step 2. A sha1
    version means an overwrite never changes existing data, solving the problem.
    """
    spec = {
        "id_name": module_id,
        "name": "Test Module",
        "category": "Clean",
        "parameters": [],
        **spec_kwargs,
    }
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr(module_id + ".yaml", json.dumps(spec))
        zf.writestr(module_id + ".py", python_code.encode("utf-8"))
        if html is not None:
            zf.writestr(module_id + ".html", html.encode("utf-8"))
        if js_module:
            zf.writestr(module_id + ".js", js_module.encode("utf-8"))
        for path, content in extra_file_contents.items():
            zf.writestr(path, content)
    data = bytes(bio.getbuffer())
    if version is None:
        sha1 = hashlib.sha1()
        sha1.update(data)
        version = sha1.hexdigest()
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "%s/%s.%s.zip" % (module_id, module_id, version),
        data,
    )
    ModuleVersion.objects.create(
        id_name=module_id,
        source_version_hash=version,
        spec=spec,
        js_module=js_module,
    )
    return MODULE_REGISTRY.latest(module_id)
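# Hedged usage sketch (hypothetical module id, params and code; assumes the same
# s3 and test-database fixtures the surrounding tests use): build a module and
# let the helper derive its version from the zipfile's sha1. Because the helper
# already returns MODULE_REGISTRY.latest(), a later lookup should hit the cache
# and return the very same object (compare test_db_s3_use_cache_for_same_version).
module_zipfile = create_module_zipfile(
    "usagesketch",
    spec_kwargs={"parameters": [{"id_name": "url", "type": "string"}]},
    python_code="def render(table, params):\n    return table",
)
assert MODULE_REGISTRY.latest("usagesketch") is module_zipfile  # served from cache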
def test_resume_backtrack_on_corrupt_cache_error(self):
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh -- but CORRUPT
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    s3.put_bytes(
        # Write corrupted data -- will lead to CorruptCacheError
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2: no cached result -- must re-render
    step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    with patch.object(Kernel, "render", side_effect=mock_render({"B": [2]})):
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as result:
            expected = RenderResult(arrow_table({"B": [2]}))
            assert_render_result_equals(result, expected)
        self.assertEqual(
            # called with step1, then step2
            Kernel.render.call_count,
            2,
        )
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def test_db_s3_refresh_cache_for_new_version(self):
    v1 = create_or_replace_from_spec(
        {
            "id_name": "regtest5",
            "name": "regtest5 v1",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d2",
    )
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr("regtest5.yaml", json.dumps(v1.spec).encode("utf-8"))
        zf.writestr("regtest5.py", b"def render(table, params):\n return table")
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest5/regtest5.b1c2d2.zip",
        bytes(bio.getbuffer()),
    )
    zipfile1 = MODULE_REGISTRY.latest("regtest5")
    create_or_replace_from_spec(
        {
            "id_name": "regtest5",
            "name": "regtest5 v2",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d3",
    )
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest5/regtest5.b1c2d3.zip",
        bytes(bio.getbuffer()),  # reuse zipfile to save lines of code
    )
    zipfile2 = MODULE_REGISTRY.latest("regtest5")
    self.assertIsNot(zipfile2, zipfile1)
    self.assertEqual(zipfile2.version, "b1c2d3")
def test_invalid_parquet_is_corrupt_cache_error(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    crr = self.step.cached_render_result
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            with open_cached_render_result(crr) as loaded:
                pass
def test_duplicate_bytes(self):
    key = f"{self.workflow.id}/{self.step1.id}/{uuid1()}"
    s3.put_bytes(s3.StoredObjectsBucket, key, b"12345")
    self.step2 = self.step1.tab.steps.create(order=1, slug="step-2")
    so1 = self.step1.stored_objects.create(key=key, size=5)
    so2 = so1.duplicate(self.step2)

    # new StoredObject should have same time,
    # different file with same contents
    self.assertEqual(so2.stored_at, so1.stored_at)
    self.assertEqual(so2.size, so1.size)
    self.assertNotEqual(so2.key, so1.key)
    self.assertEqual(
        get_s3_object_with_data(s3.StoredObjectsBucket, so2.key)["Body"],
        b"12345",
    )
def test_clean_file_happy_path(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(module_id_name="uploadfile", order=0, slug="step-1")
    key = f"wf-${workflow.id}/wfm-${step.id}/6e00511a-8ac4-4b72-9acc-9d069992b5cf"
    s3.put_bytes(s3.UserFilesBucket, key, b"1234")
    model = UploadedFileModel.objects.create(
        step=step,
        name="x.csv.gz",
        size=4,
        uuid="6e00511a-8ac4-4b72-9acc-9d069992b5cf",
        key=key,
    )
    with ExitStack() as inner_stack:
        result = self._call_prep_params(
            ParamSchema.Dict({"file": ParamSchema.File()}),
            {"file": "6e00511a-8ac4-4b72-9acc-9d069992b5cf"},
            step_id=step.id,
            exit_stack=inner_stack,
        )
        self.assertEqual(
            result,
            PrepParamsResult(
                {"file": "6e00511a-8ac4-4b72-9acc-9d069992b5cf"},
                tab_outputs={},
                uploaded_files={
                    "6e00511a-8ac4-4b72-9acc-9d069992b5cf": UploadedFile(
                        "x.csv.gz",
                        "6e00511a-8ac4-4b72-9acc-9d069992b5cf_x.csv.gz",
                        model.created_at,
                    )
                },
            ),
        )
        self.assertEqual(
            (
                self.basedir / "6e00511a-8ac4-4b72-9acc-9d069992b5cf_x.csv.gz"
            ).read_bytes(),
            b"1234",
        )
    # Assert that once `exit_stack` goes out of scope, file is deleted
    self.assertFalse(
        (self.basedir / "6e00511a-8ac4-4b72-9acc-9d069992b5cf_x.csv.gz").exists()
    )
def test_clean_file_wrong_step(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(module_id_name="uploadfile", order=0, slug="step-1")
    step2 = tab.steps.create(module_id_name="uploadfile", order=1, slug="step-2")
    id = str(uuid.uuid4())
    key = f"wf-${workflow.id}/wfm-${step.id}/${id}"
    s3.put_bytes(s3.UserFilesBucket, key, b"1234")
    UploadedFile.objects.create(
        step=step2, name="x.csv.gz", size=4, uuid=id, key=key
    )
    context = self._render_context(step_id=step.id)
    result = clean_value(ParamDType.File(), id, context)
    self.assertIsNone(result)
    # Assert that if a temporary file was created to house the download, it
    # no longer exists.
    self.assertListEqual(list(self.basedir.iterdir()), [])
def test_clean_file_happy_path(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(module_id_name="uploadfile", order=0, slug="step-1")
    id = str(uuid.uuid4())
    key = f"wf-${workflow.id}/wfm-${step.id}/${id}"
    s3.put_bytes(s3.UserFilesBucket, key, b"1234")
    UploadedFile.objects.create(
        step=step, name="x.csv.gz", size=4, uuid=id, key=key
    )
    with ExitStack() as inner_stack:
        context = self._render_context(step_id=step.id, exit_stack=inner_stack)
        result: Path = clean_value(ParamDType.File(), id, context)
        self.assertIsInstance(result, Path)
        self.assertEqual(result.read_bytes(), b"1234")
        self.assertEqual(result.suffixes, [".csv", ".gz"])
    # Assert that once `exit_stack` goes out of scope, file is deleted
    self.assertFalse(result.exists())
def test_db_s3_latest_order_by_last_update_time(self):
    # old version
    create_or_replace_from_spec(
        {
            "id_name": "regtest1",
            "name": "regtest1 v1",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d3",
    )
    time.sleep(0.000002)  # guarantee new timestamp
    # new version
    v2 = create_or_replace_from_spec(
        {
            "id_name": "regtest1",
            "name": "regtest1 v2",
            "category": "Clean",
            "parameters": [{"id_name": "url", "type": "string"}],
        },
        source_version_hash="b1c2d2",
    )
    bio = io.BytesIO()
    with zipfile.ZipFile(bio, mode="w") as zf:
        zf.writestr("regtest1.yaml", json.dumps(v2.spec).encode("utf-8"))
        zf.writestr("regtest1.py", b"def render(table, params):\n return table")
    s3.put_bytes(
        s3.ExternalModulesBucket,
        "regtest1/regtest1.b1c2d2.zip",
        bytes(bio.getbuffer()),
    )
    zf = MODULE_REGISTRY.latest("regtest1")
    self.assertEqual(zf.get_spec(), load_spec(v2.spec))
def test_step_duplicate_copy_only_selected_uploaded_file(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(order=0, slug="step-1", module_id_name="upload")
    uuid1 = str(uuidgen.uuid4())
    key1 = f"{step.uploaded_file_prefix}{uuid1}.csv"
    s3.put_bytes(s3.UserFilesBucket, key1, b"1234567")
    uuid2 = str(uuidgen.uuid4())
    key2 = f"{step.uploaded_file_prefix}{uuid2}.csv"
    s3.put_bytes(s3.UserFilesBucket, key2, b"7654321")
    uuid3 = str(uuidgen.uuid4())
    key3 = f"{step.uploaded_file_prefix}{uuid3}.csv"
    s3.put_bytes(s3.UserFilesBucket, key3, b"9999999")
    step.uploaded_files.create(name="t1.csv", uuid=uuid1, key=key1, size=7)
    step.uploaded_files.create(name="t2.csv", uuid=uuid2, key=key2, size=7)
    step.uploaded_files.create(name="t3.csv", uuid=uuid3, key=key3, size=7)
    # Write the _middle_ uuid to the old module -- proving that we aren't
    # selecting by ordering
    step.params = {"file": uuid2, "has_header": True}
    step.save(update_fields=["params"])

    workflow2 = Workflow.create_and_init()
    tab2 = workflow2.tabs.first()
    step2 = step.duplicate_into_new_workflow(tab2)

    self.assertEqual(step2.uploaded_files.count(), 1)
    new_uf = step2.uploaded_files.first()
    self.assertEqual(new_uf.uuid, uuid2)
def test_step_duplicate_copy_uploaded_file(self):
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(order=0, slug="step-1", module_id_name="upload")
    uuid = str(uuidgen.uuid4())
    key = f"{step.uploaded_file_prefix}{uuid}.csv"
    s3.put_bytes(s3.UserFilesBucket, key, b"1234567")
    # Write the uuid to the old module -- we'll check the new module points
    # to a valid file
    step.params = {"file": uuid, "has_header": True}
    step.save(update_fields=["params"])
    uploaded_file = step.uploaded_files.create(
        name="t.csv", uuid=uuid, key=key, size=7
    )

    workflow2 = Workflow.create_and_init()
    tab2 = workflow2.tabs.first()
    step2 = step.duplicate_into_new_workflow(tab2)

    uploaded_file2 = step2.uploaded_files.first()
    self.assertIsNotNone(uploaded_file2)

    # New file gets same uuid -- because it's the same file and we don't
    # want to edit params during copy
    self.assertEqual(uploaded_file2.uuid, uuid)
    self.assertEqual(step2.params["file"], uuid)
    self.assertTrue(
        # The new file should be in a different path
        uploaded_file2.key.startswith(step2.uploaded_file_prefix)
    )
    self.assertEqual(uploaded_file2.name, "t.csv")
    self.assertEqual(uploaded_file2.size, 7)
    self.assertEqual(uploaded_file2.created_at, uploaded_file.created_at)
    self.assertEqual(
        get_s3_object_with_data(s3.UserFilesBucket, uploaded_file2.key)["Body"],
        b"1234567",
    )
def test_pre_finish_enforce_storage_limits(self, send_update):
    send_update.side_effect = async_noop

    _init_module("x")
    self.kernel.migrate_params.side_effect = lambda m, p: p
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(
        order=0,
        slug="step-123",
        module_id_name="x",
        file_upload_api_token="abc123",
        params={"file": None},
    )
    s3.put_bytes(s3.UserFilesBucket, "foo/1.txt", b"1")
    step.uploaded_files.create(
        created_at=datetime.datetime(2020, 1, 1),
        name="file1.txt",
        size=1,
        uuid="df46244d-268a-0001-9b47-360502dd9b32",
        key="foo/1.txt",
    )
    s3.put_bytes(s3.UserFilesBucket, "foo/2.txt", b"22")
    step.uploaded_files.create(
        created_at=datetime.datetime(2020, 1, 2),
        name="file2.txt",
        size=2,
        uuid="df46244d-268a-0002-9b47-360502dd9b32",
        key="foo/2.txt",
    )
    s3.put_bytes(s3.UserFilesBucket, "foo/3.txt", b"333")
    step.uploaded_files.create(
        created_at=datetime.datetime(2020, 1, 3),
        name="file3.txt",
        size=3,
        uuid="df46244d-268a-0003-9b47-360502dd9b32",
        key="foo/3.txt",
    )

    # Upload the new file, "file4.txt"
    s3.put_bytes(s3.TusUploadBucket, "new-key", b"4444")
    with self.assertLogs(level=logging.INFO):
        # Logs SetStepParams's migrate_params()
        response = self.client.post(
            "/tusd-hooks",
            {
                "Upload": {
                    "MetaData": {
                        "filename": "file4.txt",
                        "workflowId": str(workflow.id),
                        "stepSlug": step.slug,
                        "apiToken": "abc123",
                    },
                    "Size": 7,
                    "Storage": {"Bucket": s3.TusUploadBucket, "Key": "new-key"},
                }
            },
            HTTP_HOOK_NAME="pre-finish",
            content_type="application/json",
        )
    self.assertEqual(response.status_code, 200)

    # Test excess uploaded files were deleted
    self.assertEqual(
        list(step.uploaded_files.order_by("id").values_list("name", flat=True)),
        ["file3.txt", "file4.txt"],
    )
    self.assertFalse(s3.exists(s3.UserFilesBucket, "foo/1.txt"))
    self.assertFalse(s3.exists(s3.UserFilesBucket, "foo/2.txt"))

    # Test delta nixes old files from clients' browsers
    send_update.assert_called()
    uploaded_file = step.uploaded_files.get(name="file4.txt")
    self.assertEqual(
        send_update.mock_calls[0][1][1].steps[step.id].files,
        [
            clientside.UploadedFile(
                name="file4.txt",
                uuid=uploaded_file.uuid,
                size=7,
                created_at=uploaded_file.created_at,
            ),
            clientside.UploadedFile(
                name="file3.txt",
                uuid="df46244d-268a-0003-9b47-360502dd9b32",
                size=3,
                created_at=datetime.datetime(2020, 1, 3),
            ),
        ],
    )
def test_pre_finish_happy_path(self, queue_render, send_update):
    send_update.side_effect = async_noop
    queue_render.side_effect = async_noop

    _init_module("x")
    self.kernel.migrate_params.side_effect = lambda m, p: p
    workflow = Workflow.create_and_init()
    step = workflow.tabs.first().steps.create(
        order=0,
        slug="step-123",
        module_id_name="x",
        file_upload_api_token="abc123",
        params={"file": None},
    )
    s3.put_bytes(s3.TusUploadBucket, "data", b"1234567")
    with self.assertLogs(level=logging.INFO):
        # Logs SetStepParams's migrate_params()
        response = self.client.post(
            "/tusd-hooks",
            {
                "Upload": {
                    "MetaData": {
                        "filename": "foo.csv",
                        "workflowId": str(workflow.id),
                        "stepSlug": step.slug,
                        "apiToken": "abc123",
                    },
                    "Size": 7,
                    "Storage": {"Bucket": s3.TusUploadBucket, "Key": "data"},
                }
            },
            HTTP_HOOK_NAME="pre-finish",
            content_type="application/json",
        )
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.json(), {})

    # File was created
    uploaded_file = step.uploaded_files.first()
    self.assertRegex(
        uploaded_file.key, f"^wf-{workflow.id}/wfm-{step.id}/[-0-9a-f]{{36}}\\.csv$"
    )
    self.assertEqual(
        get_s3_object_with_data(s3.UserFilesBucket, uploaded_file.key)["Body"],
        b"1234567",
    )
    self.assertEqual(uploaded_file.name, "foo.csv")

    # SetStepParams ran
    uuid = uploaded_file.key[-40:-4]
    step.refresh_from_db()
    self.assertEqual(step.params, {"file": uuid})

    # Send deltas
    send_update.assert_called()
    self.assertEqual(
        send_update.mock_calls[0][1][1].steps[step.id].files,
        [
            clientside.UploadedFile(
                name="foo.csv",
                uuid=uuid,
                size=7,
                created_at=uploaded_file.created_at,
            )
        ],
    )
    queue_render.assert_called()
def _put(b: bytes) -> None:
    s3.put_bytes(Bucket, Key, b)