def duplicate(self, to_step):
    """Copy this StoredObject's file in S3 and create a copy owned by `to_step`."""
    basename = self.key.split("/")[-1]
    key = f"{to_step.workflow_id}/{to_step.id}/{basename}"
    s3.copy(s3.StoredObjectsBucket, key, f"{s3.StoredObjectsBucket}/{self.key}")
    return to_step.stored_objects.create(
        stored_at=self.stored_at, hash=self.hash, key=key, size=self.size
    )
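# Workbench's s3 wrapper isn't shown in this excerpt. Below is a minimal
# sketch of the calling convention the calls above assume --
# copy(dest_bucket, dest_key, "src_bucket/src_key", **extra) -- built on
# boto3's copy_object. The name `copy_sketch`, the module-level client and
# the keyword passthrough are illustrative assumptions, not Workbench's
# actual s3 layer.
import boto3

_s3_client = boto3.client("s3")


def copy_sketch(bucket: str, key: str, copy_source: str, **kwargs) -> None:
    """Copy `copy_source` (a "src-bucket/src-key" string) into s3://bucket/key."""
    # boto3 accepts CopySource as a "bucket/key" string, matching the
    # f"{bucket}/{key}" arguments passed in the methods above.
    _s3_client.copy_object(Bucket=bucket, Key=key, CopySource=copy_source, **kwargs)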
import uuid
from pathlib import PurePath
from typing import Any, Dict


def _finish_upload(data: Dict[str, Any]) -> Dict[str, Any]:
    """Create an UploadedFile by moving data out of tusd's bucket.

    Return kwargs for SetStepParams.
    """
    # SECURITY: we expect metadata to come from Workbench itself. (On
    # production, there's no route from the Internet to tusd's POST endpoint.)
    # However, let's cast to correct types just to be safe. If a miscreant
    # comes along, that'll cause a 500 error and we'll be notified. (Better
    # than sending untrusted data to Django ORM.)
    # Raise TypeError, KeyError, ValueError.
    filename = str(data["MetaData"]["filename"])
    api_token = str(data["MetaData"]["apiToken"])
    workflow_id = int(data["MetaData"]["workflowId"])
    step_slug = str(data["MetaData"]["stepSlug"])
    size = int(data["Size"])
    bucket = str(data["Storage"]["Bucket"])
    key = str(data["Storage"]["Key"])
    if bucket != s3.TusUploadBucket:
        # security: if a hijacker manages to craft a request here, prevent its
        # creator from copying a file he/she can't see. (The creator is only
        # known to be able to see `key` of `s3.TusUploadBucket`.)
        raise RuntimeError("SECURITY: did tusd send this request?")

    suffix = PurePath(filename).suffix
    file_uuid = str(uuid.uuid4())
    final_key = None

    with upload.locked_and_loaded_step(workflow_id, step_slug) as (
        workflow,
        step,
        param_id_name,
    ):  # raise UploadError
        # Ensure the upload's API token is the same as the one we sent tusd.
        #
        # This doesn't give security: an attacker can simulate a request from
        # tusd with api_token=None and it will look like a browser-initiated
        # one.
        #
        # It's for timing: if the user resets a module's API token, we should
        # disallow all prior uploads.
        if api_token:  # empty when the React client uploads
            upload.raise_if_api_token_is_wrong(step, api_token)  # raise UploadError

        final_key = step.uploaded_file_prefix + str(file_uuid) + suffix
        # Tricky leak here: if there's an exception or crash, the transaction
        # is reverted. final_key will remain in S3 but the database won't
        # point to it.
        #
        # Not a huge deal, because `final_key` is in the Step's own directory.
        # The user can delete all leaked files by deleting the Step.
        s3.copy(
            s3.UserFilesBucket,
            final_key,
            f"{bucket}/{key}",
            MetadataDirective="REPLACE",
            ContentDisposition=s3.encode_content_disposition(filename),
            ContentType="application/octet-stream",
        )

        step.uploaded_files.create(
            name=filename, size=size, uuid=file_uuid, key=final_key
        )

        delete_old_files_to_enforce_storage_limits(step=step)

        s3.remove(bucket, key)

        return dict(
            workflow_id=workflow_id, step=step, new_values={param_id_name: file_uuid}
        )
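# For reference, a sketch of the `data` dict _finish_upload() expects,
# inferred from the fields it reads above. Every value here is illustrative
# -- it is not a real tusd payload.
EXAMPLE_TUSD_UPLOAD: Dict[str, Any] = {
    "Size": 1024,
    "MetaData": {
        "filename": "table.csv",
        "apiToken": "",  # empty string when the React client uploads
        "workflowId": "123",  # int() accepts a numeric string
        "stepSlug": "step-1",
    },
    "Storage": {
        "Bucket": "tus-upload-bucket",  # must equal s3.TusUploadBucket
        "Key": "0a1b2c3d4e5f6a7b",
    },
}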
def _duplicate_with_slug_and_delta_id(self, to_tab, slug, last_relevant_delta_id):
    # Initialize but don't save
    new_step = Step(
        tab=to_tab,
        slug=slug,
        module_id_name=self.module_id_name,
        fetch_errors=self.fetch_errors,
        stored_data_version=self.stored_data_version,
        order=self.order,
        notes=self.notes,
        is_collapsed=self.is_collapsed,
        auto_update_data=False,
        next_update=None,
        update_interval=self.update_interval,
        last_update_check=self.last_update_check,
        last_relevant_delta_id=last_relevant_delta_id,
        params=self.params,
        secrets={},  # DO NOT COPY SECRETS
    )

    # Copy the cached render result, if there is one.
    #
    # If we duplicate a Workflow mid-render, the cached render result might
    # not have any useful data. But that's okay: just kick off a new render.
    # The common case (all-rendered Workflow) will produce a fully-rendered
    # duplicate Workflow.
    #
    # We cannot copy the cached result if the destination Tab has a different
    # name than this one: tab_name is passed to render(), so even an
    # exactly-duplicated Step can have a different output.
    cached_result = self.cached_render_result
    if cached_result is not None and self.tab.name == to_tab.name:
        # Assuming the file-copy succeeds, copy cached results.
        new_step.cached_render_result_delta_id = new_step.last_relevant_delta_id
        for attr in ("status", "errors", "json", "columns", "nrows"):
            full_attr = f"cached_render_result_{attr}"
            setattr(new_step, full_attr, getattr(self, full_attr))

        new_step.save()  # so there is a new_step.id for parquet_key

        # Now new_step.cached_render_result will return a CachedRenderResult,
        # because all the DB values are set. It'll have a .parquet_key ... but
        # there won't be a file there (because we never wrote it).
        from cjwstate.rendercache.io import BUCKET, crr_parquet_key

        old_parquet_key = crr_parquet_key(cached_result)
        new_parquet_key = crr_parquet_key(new_step.cached_render_result)

        try:
            s3.copy(
                s3.CachedRenderResultsBucket,
                new_parquet_key,
                "%(Bucket)s/%(Key)s" % {"Bucket": BUCKET, "Key": old_parquet_key},
            )
        except s3.layer.error.NoSuchKey:
            # DB and filesystem are out of sync. CachedRenderResult handles
            # such cases gracefully, so `new_step.cached_render_result` will
            # behave exactly like `cached_result`.
            pass
    else:
        new_step.save()

    # Duplicate the current stored data only, not the history
    if self.stored_data_version is not None:
        self.stored_objects.get(stored_at=self.stored_data_version).duplicate(
            new_step
        )

    # For each "file" param, duplicate the "selected" uploaded_file if there
    # is one.
    #
    # We assume any UUID in `params` that points to an uploaded file _is_ a
    # file-dtype param. ([adamhooper, 2020-07-14] when the assumption does
    # not hold, will this cause DB errors? Not sure, but it's not a security
    # risk.)
    #
    # Why not check the param schema? Because we'd need to define behavior
    # for when the module doesn't exist, or its version is changed, or its
    # code breaks.... bah! These behaviors don't line up with any user
    # expectations. Users want to copy the thing they see.
    for uuid_str in self.params.values():
        if not isinstance(uuid_str, str):
            continue
        try:
            UUID(uuid_str)
        except ValueError:
            continue
        uploaded_file = self.uploaded_files.filter(uuid=uuid_str).first()
        if not uploaded_file:
            continue
        new_key = uploaded_file.key.replace(
            self.uploaded_file_prefix, new_step.uploaded_file_prefix
        )
        assert new_key != uploaded_file.key
        # TODO handle file does not exist
        s3.copy(
            s3.UserFilesBucket,
            new_key,
            f"{s3.UserFilesBucket}/{uploaded_file.key}",
        )
        new_step.uploaded_files.create(
            created_at=uploaded_file.created_at,
            name=uploaded_file.name,
            size=uploaded_file.size,
            uuid=uploaded_file.uuid,
            key=new_key,
        )

    return new_step
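# A standalone sketch of the test the loop above applies to each param value
# to decide whether it might name an uploaded file. `_looks_like_uuid` is a
# hypothetical helper written for illustration; it is not part of Step.
from uuid import UUID


def _looks_like_uuid(value) -> bool:
    """Return True if `value` is a str that parses as a UUID."""
    if not isinstance(value, str):
        return False
    try:
        UUID(value)
    except ValueError:
        return False
    return True


assert _looks_like_uuid("0f8fad5b-d9cb-469f-a165-70867728950e")
assert not _looks_like_uuid("not-a-uuid")
assert not _looks_like_uuid(42)  # non-str params are skipped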