def hydrate_input_uploads(input_, input_schema, hydrate_values=True): """Hydrate input basic:upload types with upload location. Find basic:upload fields in input. Add the upload location for relative paths. """ files = [] for field_schema, fields in iterate_fields(input_, input_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'] == 'basic:file:': files.append(value) elif field_schema['type'] == 'list:basic:file:': files.extend(value) urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]') for value in files: if 'file_temp' in value: if isinstance(value['file_temp'], six.string_types): # If file_temp not url, nor absolute path: hydrate path if not os.path.isabs(value['file_temp']) and not urlregex.search(value['file_temp']): value['file_temp'] = os.path.join(settings.FLOW_EXECUTOR['UPLOAD_DIR'], value['file_temp']) else: # Something very strange happened value['file_temp'] = 'Invalid value for file_temp in DB'
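Every snippet in this collection is built around ``iterate_fields``, which pairs each value in a field dict with its schema entry. Below is a minimal, standalone sketch of such a helper under a simplified flat-schema assumption; it is not the actual resolwe implementation.

# Simplified stand-in for iterate_fields: assumes ``schema`` is a flat list of
# dicts with "name"/"type" keys and ``fields`` holds the corresponding values.
def iterate_fields_sketch(fields, schema):
    """Yield (field_schema, fields) pairs for schema fields present in ``fields``."""
    for field_schema in schema:
        if field_schema["name"] in fields:
            yield field_schema, fields


if __name__ == "__main__":
    schema = [
        {"name": "reads", "type": "basic:file:"},
        {"name": "threads", "type": "basic:integer:"},
    ]
    values = {"reads": {"file_temp": "upload-123.fastq"}, "threads": 4}
    for field_schema, fields in iterate_fields_sketch(values, schema):
        print(field_schema["name"], "->", fields[field_schema["name"]])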
def update_dependency_kinds(apps, schema_editor):
    """Update historical dependency kinds as they may be wrong."""
    DataDependency = apps.get_model("flow", "DataDependency")

    for dependency in DataDependency.objects.all():
        # Assume dependency is of subprocess kind.
        dependency.kind = "subprocess"

        # Check child inputs to determine if this is an IO dependency.
        child = dependency.child
        parent = dependency.parent
        for field_schema, fields in iterate_fields(child.input, child.process.input_schema):
            name = field_schema["name"]
            value = fields[name]
            if field_schema.get("type", "").startswith("data:"):
                if value == parent.pk:
                    dependency.kind = "io"
                    break
            elif field_schema.get("type", "").startswith("list:data:"):
                for data in value:
                    # Compare each list element, not the whole list.
                    if data == parent.pk:
                        dependency.kind = "io"
                        break

        dependency.save()
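Functions with the ``(apps, schema_editor)`` signature like the one above are data-migration callables. A minimal sketch of how such a callable is typically registered follows; the app label and previous migration name are hypothetical placeholders.

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ("flow", "0001_initial"),  # hypothetical previous migration
    ]

    operations = [
        # RunPython invokes the callable with (apps, schema_editor) at migrate
        # time; noop makes the migration reversible without undoing the data.
        migrations.RunPython(update_dependency_kinds, migrations.RunPython.noop),
    ]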
def save_storage(self, instance, schema): """Save basic:json values to a Storage collection.""" for field_schema, fields in iterate_fields(instance, schema): name = field_schema['name'] value = fields[name] if field_schema.get('type', '').startswith('basic:json:'): if value and not self.pk: raise ValidationError( 'Data object must be `created` before creating `basic:json:` fields' ) if isinstance(value, int): # already in Storage continue if isinstance(value, six.string_types): file_path = os.path.join( settings.FLOW_EXECUTOR['DATA_DIR'], str(self.pk), value) if os.path.isfile(file_path): with open(file_path) as file_handler: value = json.load(file_handler) storage = Storage.objects.create( name='Storage for data id {}'.format(self.pk), contributor=self.contributor, data_id=self.pk, json=value, ) # `value` is copied by value, so `fields[name]` must be changed fields[name] = storage.pk
def dependency_status(data):
    """Return abstracted status of dependencies.

    STATUS_ERROR .. one dependency has error status
    STATUS_DONE .. all dependencies have done status
    None .. other

    """
    for field_schema, fields in iterate_fields(data.input, data.process.input_schema):
        if (field_schema['type'].lower().startswith('data:')
                or field_schema['type'].lower().startswith('list:data:')):
            name = field_schema['name']
            value = fields[name]

            if field_schema['type'].lower().startswith('data:'):
                value = [value]

            for uid in value:
                try:
                    _data = Data.objects.get(id=uid)
                except Data.DoesNotExist:
                    return Data.STATUS_ERROR

                if _data.status == Data.STATUS_ERROR:
                    return Data.STATUS_ERROR

                if _data.status != Data.STATUS_DONE:
                    return None

    return Data.STATUS_DONE
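The reduction above can be read independently of the ORM: an error or a not-yet-done dependency short-circuits the scan, and only a fully done set yields ``STATUS_DONE``. A standalone sketch with placeholder status strings:

STATUS_ERROR, STATUS_DONE, STATUS_PROCESSING = "ER", "OK", "PR"


def reduce_statuses(statuses):
    """Reduce dependency statuses the same way dependency_status does."""
    for status in statuses:
        if status == STATUS_ERROR:
            return STATUS_ERROR
        if status != STATUS_DONE:
            return None
    return STATUS_DONE


assert reduce_statuses([STATUS_DONE, STATUS_DONE]) == STATUS_DONE
assert reduce_statuses([STATUS_DONE, STATUS_ERROR]) == STATUS_ERROR
assert reduce_statuses([STATUS_DONE, STATUS_PROCESSING]) is None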
def purge_data_dependencies(apps, schema_editor):
    """Delete IO dependencies that are not backed by an actual data input."""
    Data = apps.get_model("flow", "Data")
    DataDependency = apps.get_model("flow", "DataDependency")

    for data in Data.objects.iterator():
        parent_pks = []
        for field_schema, fields in iterate_fields(data.input, data.process.input_schema):
            name = field_schema["name"]
            value = fields[name]
            if field_schema.get("type", "").startswith("data:"):
                parent_pks.append(value)
            elif field_schema.get("type", "").startswith("list:data:"):
                parent_pks.extend(value)

        # Parents that no longer exist are represented by ``None``.
        parent_pks = [
            pk if Data.objects.filter(pk=pk).exists() else None for pk in parent_pks
        ]

        for dependency in DataDependency.objects.filter(child=data.id, kind="io"):
            parent_pk = dependency.parent.pk if dependency.parent else None
            if parent_pk in parent_pks:
                parent_pks.remove(parent_pk)
            else:
                dependency.delete()
def update_dependency_kinds(apps, schema_editor):
    """Update historical dependency kinds as they may be wrong."""
    DataDependency = apps.get_model('flow', 'DataDependency')

    for dependency in DataDependency.objects.all():
        # Assume dependency is of subprocess kind.
        dependency.kind = 'subprocess'

        # Check child inputs to determine if this is an IO dependency.
        child = dependency.child
        parent = dependency.parent
        for field_schema, fields in iterate_fields(child.input, child.process.input_schema):
            name = field_schema['name']
            value = fields[name]
            if field_schema.get('type', '').startswith('data:'):
                if value == parent.pk:
                    dependency.kind = 'io'
                    break
            elif field_schema.get('type', '').startswith('list:data:'):
                for data in value:
                    # Compare each list element, not the whole list.
                    if data == parent.pk:
                        dependency.kind = 'io'
                        break

        dependency.save()
def hydrate_size(data):
    """Compute the size of all Data object outputs and their cumulative size.

    This is a simplified version of the original ``hydrate_size`` function,
    since we need just a subset of it.
    """

    def add_file_size(obj):
        """Add file size to the basic:file field."""
        path = os.path.join(settings.FLOW_EXECUTOR["DATA_DIR"], data.location.subpath, obj["file"])
        obj["size"] = os.path.getsize(path)
        obj["total_size"] = obj["size"]

    data_size = 0
    for field_schema, fields in iterate_fields(data.output, data.process.output_schema):
        name = field_schema["name"]
        value = fields[name]
        if "type" in field_schema:
            if field_schema["type"].startswith("basic:file:"):
                add_file_size(value)
                data_size += value.get("total_size", 0)

    data.size = data_size
def hydrate_input_uploads(input_, input_schema, hydrate_values=True): """Hydrate input basic:upload types with upload location. Find basic:upload fields in input. Add the upload location for relative paths. """ from resolwe.flow.managers import manager files = [] for field_schema, fields in iterate_fields(input_, input_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'] == 'basic:file:': files.append(value) elif field_schema['type'] == 'list:basic:file:': files.extend(value) urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]') for value in files: if 'file_temp' in value: if isinstance(value['file_temp'], str): # If file_temp not url, hydrate path. if not urlregex.search(value['file_temp']): value['file_temp'] = manager.get_executor().resolve_upload_path(value['file_temp']) else: # Something very strange happened. value['file_temp'] = 'Invalid value for file_temp in DB'
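The regular expression above decides whether ``file_temp`` is a remote URL (left untouched) or a local upload name (resolved against the upload directory). A quick standalone check with made-up values:

import re

urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]')

for candidate in ("https://example.org/reads.fastq", "ftp://host/file.gz", "upload-1234.tmp"):
    # Remote values keep their URL; anything else would be joined with the upload dir.
    print(candidate, "->", "remote" if urlregex.search(candidate) else "local upload")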
def _rewire_inputs(data, pk_mapping):
    """Rewire inputs of provided data objects.

    References to input data objects in the ``input`` field of the Data
    objects are replaced with references to their copies according to the
    provided mapping. If a primary key is not in the mapping, the original
    value is kept.

    :param list data: A list of Data objects to process.

    :param dict pk_mapping: A dict where keys are primary keys of original
        Data objects and values are primary keys of their copies.
    """
    for datum in data:
        for field_schema, fields in iterate_fields(datum.input, datum.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema["type"].startswith("data:") and value in pk_mapping:
                fields[name] = pk_mapping[value]
            elif field_schema["type"].startswith("list:data:"):
                fields[name] = [
                    pk_mapping[pk] if pk in pk_mapping else pk for pk in value
                ]
def referenced_schema_files(fields, schema):
    """Get the list of files and directories referenced by fields.

    :return: list of files and directories referenced in ``fields``
    :rtype: List[str]
    """
    refs = []
    for field_schema, fields in iterate_fields(fields, schema):
        if "type" in field_schema:
            field_type = field_schema["type"]
            field_name = field_schema["name"]

            # Add basic:file: entries
            if field_type.startswith("basic:file:"):
                refs.append(fields[field_name]["file"])
                refs += fields[field_name].get("refs", [])

            # Add list:basic:file: entries
            elif field_type.startswith("list:basic:file:"):
                for field in fields[field_name]:
                    refs.append(field["file"])
                    refs += field.get("refs", [])

            # Add basic:dir: entries
            elif field_type.startswith("basic:dir:"):
                refs.append(fields[field_name]["dir"])
                refs += fields[field_name].get("refs", [])

            # Add list:basic:dir: entries
            elif field_type.startswith("list:basic:dir:"):
                for field in fields[field_name]:
                    refs.append(field["dir"])
                    refs += field.get("refs", [])

    return refs
def get_collection_of_input_entities(data): """Get collection that contains all "entity inputs" of a given data. With "entity input", one refers to the inputs that are part of an entity. """ # Prevent circular imports: from resolwe.flow.models import Collection data_ids = set() for field_schema, fields in iterate_fields(data.input, data.process.input_schema): name = field_schema["name"] value = fields[name] if "type" not in field_schema: continue if field_schema["type"].startswith("data:"): value = [value] elif not field_schema["type"].startswith("list:data:"): continue data_ids.update([val for val in value if val is not None]) collections = Collection.objects.filter( data__in=list(data_ids), data__entity__isnull=False, ).distinct() if collections.count() != 1: raise ValueError( "Entity inputs should be part of exactly one collection. (not {})". format(len(collections))) return collections.first()
def _hydrate_values(output, output_schema, data): """Hydrate basic:file and basic:json values. Find fields with basic:file type and assign a full path to the file. Find fields with basic:json type and assign a JSON object from storage. """ def hydrate_path(file_name): """Hydrate file paths.""" from resolwe.flow.managers import manager class HydratedPath(str): """String wrapper, which also stores the original filename.""" __slots__ = ("data_id", "file_name") def __new__(cls, value=""): """Initialize hydrated path.""" hydrated = str.__new__(cls, value) hydrated.data_id = data.id hydrated.file_name = file_name return hydrated return HydratedPath(manager.get_executor().resolve_data_path(data, file_name)) def hydrate_storage(storage_id): """Hydrate storage fields.""" from .storage import LazyStorageJSON # Prevent circular import. return LazyStorageJSON(pk=storage_id) for field_schema, fields in iterate_fields(output, output_schema): name = field_schema["name"] value = fields[name] if "type" in field_schema: if field_schema["type"].startswith("basic:file:"): value["file"] = hydrate_path(value["file"]) value["refs"] = [hydrate_path(ref) for ref in value.get("refs", [])] elif field_schema["type"].startswith("list:basic:file:"): for obj in value: obj["file"] = hydrate_path(obj["file"]) obj["refs"] = [hydrate_path(ref) for ref in obj.get("refs", [])] if field_schema["type"].startswith("basic:dir:"): value["dir"] = hydrate_path(value["dir"]) value["refs"] = [hydrate_path(ref) for ref in value.get("refs", [])] elif field_schema["type"].startswith("list:basic:dir:"): for obj in value: obj["dir"] = hydrate_path(obj["dir"]) obj["refs"] = [hydrate_path(ref) for ref in obj.get("refs", [])] elif field_schema["type"].startswith("basic:json:"): fields[name] = hydrate_storage(value) elif field_schema["type"].startswith("list:basic:json:"): fields[name] = [hydrate_storage(storage_id) for storage_id in value]
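The ``HydratedPath`` pattern above, a ``str`` subclass that still carries metadata about its origin, can be demonstrated in isolation. The standalone adaptation below takes the metadata as constructor arguments instead of closure variables; all values are made up.

class HydratedPath(str):
    """String wrapper that behaves like a path but remembers where it came from."""

    __slots__ = ("data_id", "file_name")

    def __new__(cls, value="", data_id=None, file_name=None):
        hydrated = str.__new__(cls, value)
        hydrated.data_id = data_id
        hydrated.file_name = file_name
        return hydrated


path = HydratedPath("/data/42/reads.fastq", data_id=42, file_name="reads.fastq")
print(path.upper())                   # works like any other string
print(path.data_id, path.file_name)   # but still carries its origin metadata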
def hydrate_input_references(input_, input_schema, hydrate_values=True): """Hydrate ``input_`` with linked data. Find fields with complex data:<...> types in ``input_``. Assign an output of corresponding data object to those fields. """ from .data import Data # prevent circular import for field_schema, fields in iterate_fields(input_, input_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('data:'): if value is None: continue try: data = Data.objects.get(id=value) except Data.DoesNotExist: fields[name] = {} continue output = copy.deepcopy(data.output) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor output["__entity_name"] = getattr(data.entity, 'name', None) output["__output_schema"] = data.process.output_schema fields[name] = output elif field_schema['type'].startswith('list:data:'): outputs = [] for val in value: if val is None: continue try: data = Data.objects.get(id=val) except Data.DoesNotExist: outputs.append({}) continue output = copy.deepcopy(data.output) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor output["__entity_name"] = getattr(data.entity, 'name', None) output["__output_schema"] = data.process.output_schema outputs.append(output) fields[name] = outputs
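For illustration, this is roughly the shape a ``data:`` input takes after the function above runs: the referenced object's output dict plus double-underscore metadata keys (all values below are made up).

hydrated = {
    "fastq": {"file": "/data/7/reads.fastq"},
    "__id": 7,
    "__type": "data:reads:fastq:",
    "__descriptor": {},
    "__entity_name": "Sample 1",
    "__output_schema": [{"name": "fastq", "type": "basic:file:"}],
}
# Downstream code can now address the linked output directly instead of
# dereferencing the primary key, e.g.:
print(hydrated["fastq"]["file"])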
def _hydrate_values(output, output_schema, data): """Hydrate basic:file and basic:json values. Find fields with basic:file type and assign a full path to the file. Find fields with basic:json type and assign a JSON object from storage. """ def hydrate_path(file_name): """Hydrate file paths.""" from resolwe.flow.managers import manager class HydratedPath(str): """String wrapper, which also stores the original filename.""" __slots__ = ('data_id', 'file_name') def __new__(cls, value=''): """Initialize hydrated path.""" hydrated = str.__new__(cls, value) hydrated.data_id = data.id hydrated.file_name = file_name return hydrated return HydratedPath(manager.get_executor().resolve_data_path(data, file_name)) def hydrate_storage(storage_id): """Hydrate storage fields.""" from .storage import LazyStorageJSON # Prevent circular import. return LazyStorageJSON(pk=storage_id) for field_schema, fields in iterate_fields(output, output_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('basic:file:'): value['file'] = hydrate_path(value['file']) value['refs'] = [hydrate_path(ref) for ref in value.get('refs', [])] elif field_schema['type'].startswith('list:basic:file:'): for obj in value: obj['file'] = hydrate_path(obj['file']) obj['refs'] = [hydrate_path(ref) for ref in obj.get('refs', [])] if field_schema['type'].startswith('basic:dir:'): value['dir'] = hydrate_path(value['dir']) value['refs'] = [hydrate_path(ref) for ref in value.get('refs', [])] elif field_schema['type'].startswith('list:basic:dir:'): for obj in value: obj['dir'] = hydrate_path(obj['dir']) obj['refs'] = [hydrate_path(ref) for ref in obj.get('refs', [])] elif field_schema['type'].startswith('basic:json:'): fields[name] = hydrate_storage(value) elif field_schema['type'].startswith('list:basic:json:'): fields[name] = [hydrate_storage(storage_id) for storage_id in value]
def hydrate_input_references(input_, input_schema, hydrate_values=True): """Hydrate ``input_`` with linked data. Find fields with complex data:<...> types in ``input_``. Assign an output of corresponding data object to those fields. """ from .data import Data # prevent circular import for field_schema, fields in iterate_fields(input_, input_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('data:'): # if re.match('^[0-9a-fA-F]{24}$', str(value)) is None: # print "ERROR: data:<...> value in field \"{}\", type \"{}\" not ObjectId but {}.".format( # name, field_schema['type'], value) if value is None: continue data = Data.objects.get(id=value) output = copy.deepcopy(data.output) # static = Data.static.to_python(data.static) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) # _hydrate_values(static, data.static_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor fields[name] = output elif field_schema['type'].startswith('list:data:'): outputs = [] for val in value: # if re.match('^[0-9a-fA-F]{24}$', str(val)) is None: # print "ERROR: data:<...> value in {}, type \"{}\" not ObjectId but {}.".format( # name, field_schema['type'], val) if val is None: continue data = Data.objects.get(id=val) output = copy.deepcopy(data.output) # static = Data.static.to_python(data.static) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) # _hydrate_values(static, data.static_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor outputs.append(output) fields[name] = outputs
def hydrate_size(data): """Add file and dir sizes. Add sizes to ``basic:file:``, ``list:basic:file``, ``basic:dir:`` and ``list:basic:dir:`` fields. """ from .data import Data # prevent circular import def add_file_size(obj): """Add file size to the basic:file field.""" if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj: return path = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(data.pk), obj['file']) if not os.path.isfile(path): raise ValidationError("Referenced file does not exist ({})".format(path)) obj['size'] = os.path.getsize(path) def get_dir_size(path): """Get directory size.""" total_size = 0 for dirpath, _, filenames in os.walk(path): for file_name in filenames: file_path = os.path.join(dirpath, file_name) total_size += os.path.getsize(file_path) return total_size def add_dir_size(obj): """Add directory size to the basic:dir field.""" if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj: return path = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(data.pk), obj['dir']) if not os.path.isdir(path): raise ValidationError("Referenced dir does not exist ({})".format(path)) obj['size'] = get_dir_size(path) for field_schema, fields in iterate_fields(data.output, data.process.output_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('basic:file:'): add_file_size(value) elif field_schema['type'].startswith('list:basic:file:'): for obj in value: add_file_size(obj) elif field_schema['type'].startswith('basic:dir:'): add_dir_size(value) elif field_schema['type'].startswith('list:basic:dir:'): for obj in value: add_dir_size(obj)
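The ``get_dir_size`` helper above is plain ``os.walk`` accounting and can be exercised on its own; the sketch below uses a throw-away temporary directory so it can run anywhere.

import os
import tempfile


def get_dir_size(path):
    """Sum the sizes of all files under ``path``."""
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for file_name in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, file_name))
    return total_size


with tempfile.TemporaryDirectory() as tmp:
    with open(os.path.join(tmp, "a.txt"), "w") as handle:
        handle.write("12345")
    os.mkdir(os.path.join(tmp, "sub"))
    with open(os.path.join(tmp, "sub", "b.txt"), "w") as handle:
        handle.write("1234567890")
    print(get_dir_size(tmp))  # 15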
def _get_output_field(self, obj, path): """Return object's output field schema and field dict. :param obj: object with the output field :type obj: ~resolwe.flow.models.Data :param str path: path to :class:`~resolwe.flow.models.Data` object's output field """ for field_schema, field, field_path in iterate_fields( obj.output, obj.process.output_schema, ""): if path == field_path: return field_schema, field self.fail("Field not found in path {}.".format(path))
def save_storage(data): """Parse output field and create Storage objects if needed.""" for field_schema, fields, path in iterate_fields( data.output, data.process.output_schema, "" ): name = field_schema["name"] value = fields[name] if field_schema.get("type", "").startswith("basic:json:"): if value and not data.pk: raise ValidationError( "Data object must be `created` before creating `basic:json:` fields" ) if isinstance(value, int): # already in Storage continue if isinstance(value, str): file_path = data.location.get_path(filename=value) if os.path.isfile(file_path): try: with open(file_path) as file_handler: value = json.load(file_handler) except json.JSONDecodeError: with open(file_path) as file_handler: content = file_handler.read() content = content.rstrip() raise ValidationError( "Value of '{}' must be a valid JSON, current: {}".format( name, content ) ) existing_storage_pk = None with suppress(KeyError): existing_storage_pk = dict_dot(data._original_output, path) if isinstance(existing_storage_pk, int): data.storages.filter(pk=existing_storage_pk).update(json=value) fields[name] = existing_storage_pk else: storage = data.storages.create( name="Storage for data id {}".format(data.pk), contributor=data.contributor, json=value, ) fields[name] = storage.pk
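The value-loading step above (a file name on the output field, a JSON payload on disk) can be sketched without the ORM; the file name and content below are made up.

import json
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    file_path = os.path.join(tmp, "stats.json")
    with open(file_path, "w") as handle:
        handle.write('{"reads": 1000, "mapped": 950}')

    # The field value is a file name; the JSON payload gets loaded from disk
    # before being stored in a Storage row.
    with open(file_path) as handle:
        try:
            value = json.load(handle)
        except json.JSONDecodeError:
            raise ValueError("Value must be a valid JSON")

    print(value["mapped"])  # 950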
def handle_run( self, message: Message[dict], manager: "Processor" ) -> Response[List[int]]: """Handle spawning new data object. The response is the id of the created object. """ export_files_mapper = message.message_data["export_files_mapper"] manager.data.refresh_from_db() try: data = message.message_data["data"] logger.debug(__("Spawning new data object from dict: {}", data)) data["contributor"] = manager.data.contributor data["process"] = Process.objects.filter(slug=data["process"]).latest() data["tags"] = manager.data.tags data["collection"] = manager.data.collection data["subprocess_parent"] = manager.data with transaction.atomic(): for field_schema, fields in iterate_fields( data.get("input", {}), data["process"].input_schema ): type_ = field_schema["type"] name = field_schema["name"] value = fields[name] if type_ == "basic:file:": fields[name] = self.hydrate_spawned_files( export_files_mapper, value ) elif type_ == "list:basic:file:": fields[name] = [ self.hydrate_spawned_files(export_files_mapper, fn) for fn in value ] created_object = Data.objects.create(**data) except Exception: manager._log_exception( f"Error while preparing spawned Data objects for process '{manager.data.process.slug}'" ) return message.respond_error([]) else: return message.respond_ok(created_object.id)
def _hydrate_values(output, output_schema, data): """Hydrate basic:file and basic:json values. Find fields with basic:file type and assign a full path to the file. Find fields with basic:json type and assign a JSON object from storage. """ def hydrate_path(file_name): """Hydrate file paths.""" id_ = "{}/".format(data.id) # needs trailing slash if id_ in file_name: file_name = file_name[file_name.find(id_) + len(id_):] # remove id from filename return os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], id_, file_name) def hydrate_storage(storage_id): """Hydrate storage fields.""" return LazyStorageJSON(pk=storage_id) for field_schema, fields in iterate_fields(output, output_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('basic:file:'): value['file'] = hydrate_path(value['file']) elif field_schema['type'].startswith('list:basic:file:'): for obj in value: obj['file'] = hydrate_path(obj['file']) if field_schema['type'].startswith('basic:dir:'): value['dir'] = hydrate_path(value['dir']) elif field_schema['type'].startswith('list:basic:dir:'): for obj in value: obj['dir'] = hydrate_path(obj['dir']) elif field_schema['type'].startswith('basic:json:'): fields[name] = hydrate_storage(value) elif field_schema['type'].startswith('list:basic:json:'): fields[name] = [ hydrate_storage(storage_id) for storage_id in value ]
def save_dependencies(self, instance, schema): """Save data: and list:data: references as parents.""" def add_dependency(value): """Add parent Data dependency.""" try: self.parents.add(Data.objects.get(pk=value)) # pylint: disable=no-member except Data.DoesNotExist: pass for field_schema, fields in iterate_fields(instance, schema): name = field_schema['name'] value = fields[name] if field_schema.get('type', '').startswith('data:'): add_dependency(value) elif field_schema.get('type', '').startswith('list:data:'): for data in value: add_dependency(data)
def resolve_secrets(self):
    """Retrieve handles for all basic:secret: fields on input.

    The process must have the ``secrets`` resource requirement
    specified in order to access any secrets. Otherwise this method
    will raise a ``PermissionDenied`` exception.

    :return: A dictionary of secrets where key is the secret handle
        and value is the secret value.
    """
    secrets = {}
    for field_schema, fields in iterate_fields(
        self.input, self.process.input_schema
    ):
        if not field_schema.get("type", "").startswith("basic:secret:"):
            continue

        name = field_schema["name"]
        value = fields[name]
        try:
            handle = value["handle"]
        except KeyError:
            continue

        try:
            secrets[handle] = Secret.objects.get_secret(
                handle, contributor=self.contributor
            )
        except Secret.DoesNotExist:
            raise PermissionDenied(
                "Access to secret not allowed or secret does not exist"
            )

    # If the process does not have the right requirements it is not
    # allowed to access any secrets.
    allowed = self.process.requirements.get("resources", {}).get("secrets", False)
    if secrets and not allowed:
        raise PermissionDenied(
            "Process '{}' has secret inputs, but no permission to see secrets".format(
                self.process.slug
            )
        )

    return secrets
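Two pieces have to line up for ``resolve_secrets`` to return anything: a ``basic:secret:`` input carrying a handle, and the ``secrets`` resource requirement on the process. An illustration with made-up values:

# A basic:secret: input stores only an opaque handle, never the secret itself.
input_schema = [{"name": "access_token", "type": "basic:secret:"}]
input_ = {"access_token": {"handle": "a1b2c3d4"}}

# Without this requirement the method above raises PermissionDenied even
# though the handle is present on the input.
requirements = {"resources": {"secrets": True}}

print(requirements.get("resources", {}).get("secrets", False))  # True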
def rewire_inputs(data_list): """Rewire inputs of provided data objects. Input parameter is a list of original and copied data object model instances: ``[{'original': original, 'copy': copy}]``. This function finds which objects reference other objects (in the list) on the input and replaces original objects with the copies (mutates copies' inputs). """ if len(data_list) < 2: return data_list mapped_ids = { bundle["original"].id: bundle["copy"].id for bundle in data_list } for bundle in data_list: updated = False copy = bundle["copy"] for field_schema, fields in iterate_fields(copy.input, copy.process.input_schema): name = field_schema["name"] value = fields[name] if field_schema["type"].startswith( "data:") and value in mapped_ids: fields[name] = mapped_ids[value] updated = True elif field_schema["type"].startswith("list:data:") and any( [id_ in mapped_ids for id_ in value]): fields[name] = [ mapped_ids[id_] if id_ in mapped_ids else id_ for id_ in value ] updated = True if updated: copy.save() return data_list
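The rewiring itself is a plain id substitution and can be shown on bare dicts; the ids below are made up.

# Mapping from original Data ids to the ids of their copies.
mapped_ids = {10: 110, 11: 111}

inputs = {"genome": 10, "reads": [11, 12]}

# Ids found in the mapping are replaced; everything else is left alone.
inputs["genome"] = mapped_ids.get(inputs["genome"], inputs["genome"])
inputs["reads"] = [mapped_ids.get(pk, pk) for pk in inputs["reads"]]

print(inputs)  # {'genome': 110, 'reads': [111, 12]}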
def remove_total_size(apps, schema_editor): """Remove ``total_size`` field from all file/dir-type outputs.""" Data = apps.get_model('flow', 'Data') for data in Data.objects.all(): for field_schema, fields in iterate_fields(data.output, data.process.output_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('basic:file:'): del value['total_size'] elif field_schema['type'].startswith('list:basic:file:'): for obj in value: del obj['total_size'] elif field_schema['type'].startswith('basic:dir:'): del value['total_size'] elif field_schema['type'].startswith('list:basic:dir:'): for obj in value: del obj['total_size'] data.save()
def handle_get_output_files_dirs(self, message: Message[str], manager: "Processor") -> Response[dict]:
    """Get the output for file and dir fields.

    The sent dictionary has field names as its keys and tuples
    (field_type, field_value) as its values.
    """

    def is_file_or_dir(field_type: str) -> bool:
        """Is file or directory."""
        return "basic:file" in field_type or "basic:dir" in field_type

    output = dict()
    for field_schema, fields in iterate_fields(
        manager.data.output, manager.data.process.output_schema
    ):
        if is_file_or_dir(field_schema["type"]):
            name = field_schema["name"]
            output[name] = (field_schema["type"], fields[name])

    return message.respond_ok(output)
def resolve_secrets(self):
    """Retrieve handles for all basic:secret: fields on input.

    The process must have the ``secrets`` resource requirement
    specified in order to access any secrets. Otherwise this method
    will raise a ``PermissionDenied`` exception.

    :return: A dictionary of secrets where key is the secret handle
        and value is the secret value.
    """
    secrets = {}

    for field_schema, fields in iterate_fields(self.input, self.process.input_schema):  # pylint: disable=no-member
        if not field_schema.get('type', '').startswith('basic:secret:'):
            continue

        name = field_schema['name']
        value = fields[name]
        try:
            handle = value['handle']
        except KeyError:
            continue

        try:
            secrets[handle] = Secret.objects.get_secret(
                handle, contributor=self.contributor
            )
        except Secret.DoesNotExist:
            raise PermissionDenied("Access to secret not allowed or secret does not exist")

    # If the process does not have the right requirements it is not
    # allowed to access any secrets.
    allowed = self.process.requirements.get('resources', {}).get('secrets', False)  # pylint: disable=no-member
    if secrets and not allowed:
        raise PermissionDenied(
            "Process '{}' has secret inputs, but no permission to see secrets".format(
                self.process.slug  # pylint: disable=no-member
            )
        )

    return secrets
def recreate_parent_dependencies(apps, schema_editor): """Create empty dependency relation if parent has been deleted.""" Data = apps.get_model("flow", "Data") DataDependency = apps.get_model("flow", "DataDependency") def process_dependency(data, parent): if not Data.objects.filter(pk=parent).exists(): DataDependency.objects.create(child=data, parent=None, kind="io") for data in Data.objects.all(): for field_schema, fields in iterate_fields(data.input, data.process.input_schema): name = field_schema["name"] value = fields[name] if field_schema.get("type", "").startswith("data:"): process_dependency(data, value) elif field_schema.get("type", "").startswith("list:data:"): for parent in value: process_dependency(data, parent)
def remove_total_size(apps, schema_editor): """Remove ``total_size`` field from all file/dir-type outputs.""" Data = apps.get_model("flow", "Data") for data in Data.objects.all(): for field_schema, fields in iterate_fields( data.output, data.process.output_schema ): name = field_schema["name"] value = fields[name] if "type" in field_schema: if field_schema["type"].startswith("basic:file:"): del value["total_size"] elif field_schema["type"].startswith("list:basic:file:"): for obj in value: del obj["total_size"] elif field_schema["type"].startswith("basic:dir:"): del value["total_size"] elif field_schema["type"].startswith("list:basic:dir:"): for obj in value: del obj["total_size"] data.save()
def save_storage(self, instance, schema): """Save basic:json values to a Storage collection.""" for field_schema, fields in iterate_fields(instance, schema): name = field_schema["name"] value = fields[name] if field_schema.get("type", "").startswith("basic:json:"): if value and not self.pk: raise ValidationError( "Data object must be `created` before creating `basic:json:` fields" ) if isinstance(value, int): # already in Storage continue if isinstance(value, str): file_path = self.location.get_path(filename=value) if os.path.isfile(file_path): try: with open(file_path) as file_handler: value = json.load(file_handler) except json.JSONDecodeError: with open(file_path) as file_handler: content = file_handler.read() content = content.rstrip() raise ValidationError( "Value of '{}' must be a valid JSON, current: {}".format( name, content ) ) storage = self.storages.create( name="Storage for data id {}".format(self.pk), contributor=self.contributor, json=value, ) # `value` is copied by value, so `fields[name]` must be changed fields[name] = storage.pk
def save_dependencies(self, instance, schema): """Save data: and list:data: references as parents.""" def add_dependency(value): """Add parent Data dependency.""" try: DataDependency.objects.update_or_create( parent=Data.objects.get(pk=value), child=self, defaults={'kind': DataDependency.KIND_IO}, ) except Data.DoesNotExist: pass for field_schema, fields in iterate_fields(instance, schema): name = field_schema['name'] value = fields[name] if field_schema.get('type', '').startswith('data:'): add_dependency(value) elif field_schema.get('type', '').startswith('list:data:'): for data in value: add_dependency(data)
def save_dependencies(self, instance, schema): """Save data: and list:data: references as parents.""" def add_dependency(value): """Add parent Data dependency.""" try: DataDependency.objects.update_or_create( parent=Data.objects.get(pk=value), child=self, defaults={"kind": DataDependency.KIND_IO}, ) except Data.DoesNotExist: pass for field_schema, fields in iterate_fields(instance, schema): name = field_schema["name"] value = fields[name] if field_schema.get("type", "").startswith("data:"): add_dependency(value) elif field_schema.get("type", "").startswith("list:data:"): for data in value: add_dependency(data)
def recreate_parent_dependencies(apps, schema_editor): """Create empty dependency relation if parent has been deleted.""" Data = apps.get_model('flow', 'Data') DataDependency = apps.get_model('flow', 'DataDependency') def process_dependency(data, parent): if not Data.objects.filter(pk=parent).exists(): DataDependency.objects.create( child=data, parent=None, kind='io' ) for data in Data.objects.all(): for field_schema, fields in iterate_fields(data.input, data.process.input_schema): name = field_schema['name'] value = fields[name] if field_schema.get('type', '').startswith('data:'): process_dependency(data, value) elif field_schema.get('type', '').startswith('list:data:'): for parent in value: process_dependency(data, parent)
def handle_get_inputs_no_shared_storage(self, message: Message[int], manager: "Processor") -> Response:
    """Get files belonging to input data objects.

    The format of the output is as follows:

    {
        base_url_1: (connector1, [list, of, ReferencedPath, instances]),
        base_url_2: (connector2, [another, list, of, ReferencedPath, instances]),
        ...
    }
    """
    output_data = {}
    # First get ids of data objects which are inputs for the data object we
    # are processing.
    input_data_ids = []
    for schema, fields in iterate_fields(
        manager.data.input, manager.data.process.input_schema
    ):
        type_ = schema["type"]
        if type_.startswith("data:") or type_.startswith("list:data:"):
            value = fields[schema["name"]]
            if isinstance(value, int):
                input_data_ids.append(value)
            else:
                input_data_ids += value

    for input_data_id in input_data_ids:
        file_storage = FileStorage.objects.get(data=input_data_id)
        location = file_storage.default_storage_location
        output_data[location.url] = (
            location.connector_name,
            list(
                ReferencedPath.objects.filter(storage_locations=location).values()
            ),
        )
    manager._listener.communicator.suspend_heartbeat(manager.peer_identity)
    return message.respond_ok(output_data)
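For reference, the response described in the docstring would look roughly like this; URLs, connector names and path entries are made up.

output_data = {
    "data/1042": ("s3", [{"path": "reads.fastq", "size": 1024}]),
    "data/1043": ("local", [{"path": "genome.fasta", "size": 2048}]),
}
for base_url, (connector_name, paths) in output_data.items():
    print(base_url, connector_name, [entry["path"] for entry in paths])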
def save_storage(self, instance, schema): """Save basic:json values to a Storage collection.""" for field_schema, fields in iterate_fields(instance, schema): name = field_schema['name'] value = fields[name] if field_schema.get('type', '').startswith('basic:json:'): if value and not self.pk: raise ValidationError( 'Data object must be `created` before creating `basic:json:` fields') if isinstance(value, int): # already in Storage continue if isinstance(value, str): file_path = self.location.get_path(filename=value) # pylint: disable=no-member if os.path.isfile(file_path): try: with open(file_path) as file_handler: value = json.load(file_handler) except json.JSONDecodeError: with open(file_path) as file_handler: content = file_handler.read() content = content.rstrip() raise ValidationError( "Value of '{}' must be a valid JSON, current: {}".format(name, content) ) storage = self.storages.create( # pylint: disable=no-member name='Storage for data id {}'.format(self.pk), contributor=self.contributor, json=value, ) # `value` is copied by value, so `fields[name]` must be changed fields[name] = storage.pk
def handle_finish(self, obj): """Handle an incoming ``Data`` finished processing request. :param obj: The Channels message object. Command object format: .. code-block:: none { 'command': 'finish', 'data_id': [id of the :class:`~resolwe.flow.models.Data` object this command changes], 'process_rc': [exit status of the processing] 'spawn_processes': [optional; list of spawn dictionaries], 'exported_files_mapper': [if spawn_processes present] } """ data_id = obj[ExecutorProtocol.DATA_ID] logger.debug( __("Finishing Data with id {} (handle_finish).", data_id), extra={ 'data_id': data_id, 'packet': obj } ) spawning_failed = False with transaction.atomic(): # Spawn any new jobs in the request. spawned = False if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj: if is_testing(): # NOTE: This is a work-around for Django issue #10827 # (https://code.djangoproject.com/ticket/10827), same as in # TestCaseHelpers._pre_setup(). Because the listener is running # independently, it must clear the cache on its own. ContentType.objects.clear_cache() spawned = True exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES] logger.debug( __("Spawning new Data objects for Data with id {} (handle_finish).", data_id), extra={ 'data_id': data_id } ) try: # This transaction is needed because we're running # asynchronously with respect to the main Django code # here; the manager can get nudged from elsewhere. with transaction.atomic(): parent_data = Data.objects.get(pk=data_id) # Spawn processes. for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]: d['contributor'] = parent_data.contributor d['process'] = Process.objects.filter(slug=d['process']).latest() d['tags'] = parent_data.tags for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema): type_ = field_schema['type'] name = field_schema['name'] value = fields[name] if type_ == 'basic:file:': fields[name] = self.hydrate_spawned_files( exported_files_mapper, value, data_id ) elif type_ == 'list:basic:file:': fields[name] = [self.hydrate_spawned_files(exported_files_mapper, fn, data_id) for fn in value] with transaction.atomic(): d = Data.objects.create(**d) DataDependency.objects.create( parent=parent_data, child=d, kind=DataDependency.KIND_SUBPROCESS, ) # Copy permissions. copy_permissions(parent_data, d) # Entity is added to the collection only when it is # created - when it only contains 1 Data object. entities = Entity.objects.filter(data=d).annotate(num_data=Count('data')).filter( num_data=1) # Copy collections. for collection in parent_data.collection_set.all(): collection.data.add(d) # Add entities to which data belongs to the collection. for entity in entities: entity.collections.add(collection) except Exception: # pylint: disable=broad-except logger.error( __( "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}", parent_data.process.slug, traceback.format_exc() ), extra={ 'data_id': data_id } ) spawning_failed = True # Data wrap up happens last, so that any triggered signals # already see the spawned children. What the children themselves # see is guaranteed by the transaction we're in. 
if ExecutorProtocol.FINISH_PROCESS_RC in obj: process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC] try: d = Data.objects.get(pk=data_id) except Data.DoesNotExist: logger.warning( "Data object does not exist (handle_finish).", extra={ 'data_id': data_id, } ) async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR}) return changeset = { 'process_progress': 100, 'finished': now(), } if spawning_failed: changeset['status'] = Data.STATUS_ERROR changeset['process_error'] = ["Error while preparing spawned Data objects"] elif process_rc == 0 and not d.status == Data.STATUS_ERROR: changeset['status'] = Data.STATUS_DONE else: changeset['status'] = Data.STATUS_ERROR changeset['process_rc'] = process_rc obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset self.handle_update(obj, internal_call=True) if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False): # Purge worker is not running in test runner, so we should skip triggering it. if not is_testing(): channel_layer = get_channel_layer() try: async_to_sync(channel_layer.send)( CHANNEL_PURGE_WORKER, { 'type': TYPE_PURGE_RUN, 'location_id': d.location.id, 'verbosity': self._verbosity, } ) except ChannelFull: logger.warning( "Cannot trigger purge because channel is full.", extra={'data_id': data_id} ) # Notify the executor that we're done. async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK}) # Now nudge the main manager to perform final cleanup. This is # needed even if there was no spawn baggage, since the manager # may need to know when executors have finished, to keep count # of them and manage synchronization. async_to_sync(consumer.send_event)({ WorkerProtocol.COMMAND: WorkerProtocol.FINISH, WorkerProtocol.DATA_ID: data_id, WorkerProtocol.FINISH_SPAWNED: spawned, WorkerProtocol.FINISH_COMMUNICATE_EXTRA: { 'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'), }, })
def hydrate_input_references(input_, input_schema, hydrate_values=True): """Hydrate ``input_`` with linked data. Find fields with complex data:<...> types in ``input_``. Assign an output of corresponding data object to those fields. """ from .data import Data # prevent circular import for field_schema, fields in iterate_fields(input_, input_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('data:'): if value is None: continue try: data = Data.objects.get(id=value) except Data.DoesNotExist: fields[name] = {} continue output = copy.deepcopy(data.output) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor output["__entity_name"] = None output["__output_schema"] = data.process.output_schema entity = data.entity_set.values('name').first() if entity: output["__entity_name"] = entity['name'] fields[name] = output elif field_schema['type'].startswith('list:data:'): outputs = [] for val in value: if val is None: continue try: data = Data.objects.get(id=val) except Data.DoesNotExist: outputs.append({}) continue output = copy.deepcopy(data.output) if hydrate_values: _hydrate_values(output, data.process.output_schema, data) output["__id"] = data.id output["__type"] = data.process.type output["__descriptor"] = data.descriptor output["__output_schema"] = data.process.output_schema entity = data.entity_set.values('name').first() if entity: output["__entity_name"] = entity['name'] outputs.append(output) fields[name] = outputs
def hydrate_size(data, force=False): """Add file and dir sizes. Add sizes to ``basic:file:``, ``list:basic:file``, ``basic:dir:`` and ``list:basic:dir:`` fields. ``force`` parameter is used to recompute file sizes also on objects that already have these values, e.g. in migrations. """ from .data import Data # prevent circular import def get_dir_size(path): """Get directory size.""" total_size = 0 for dirpath, _, filenames in os.walk(path): for file_name in filenames: file_path = os.path.join(dirpath, file_name) if not os.path.isfile(file_path): # Skip all "not normal" files (links, ...) continue total_size += os.path.getsize(file_path) return total_size def get_refs_size(obj, obj_path): """Calculate size of all references of ``obj``. :param dict obj: Data object's output field (of type file/dir). :param str obj_path: Path to ``obj``. """ total_size = 0 for ref in obj.get('refs', []): ref_path = data.location.get_path(filename=ref) if ref_path in obj_path: # It is a common case that ``obj['file']`` is also contained in # one of obj['ref']. In that case, we need to make sure that it's # size is not counted twice: continue if os.path.isfile(ref_path): total_size += os.path.getsize(ref_path) elif os.path.isdir(ref_path): total_size += get_dir_size(ref_path) return total_size def add_file_size(obj): """Add file size to the basic:file field.""" if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj and not force: return path = data.location.get_path(filename=obj['file']) if not os.path.isfile(path): raise ValidationError("Referenced file does not exist ({})".format(path)) obj['size'] = os.path.getsize(path) obj['total_size'] = obj['size'] + get_refs_size(obj, path) def add_dir_size(obj): """Add directory size to the basic:dir field.""" if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj and not force: return path = data.location.get_path(filename=obj['dir']) if not os.path.isdir(path): raise ValidationError("Referenced dir does not exist ({})".format(path)) obj['size'] = get_dir_size(path) obj['total_size'] = obj['size'] + get_refs_size(obj, path) data_size = 0 for field_schema, fields in iterate_fields(data.output, data.process.output_schema): name = field_schema['name'] value = fields[name] if 'type' in field_schema: if field_schema['type'].startswith('basic:file:'): add_file_size(value) data_size += value.get('total_size', 0) elif field_schema['type'].startswith('list:basic:file:'): for obj in value: add_file_size(obj) data_size += obj.get('total_size', 0) elif field_schema['type'].startswith('basic:dir:'): add_dir_size(value) data_size += value.get('total_size', 0) elif field_schema['type'].startswith('list:basic:dir:'): for obj in value: add_dir_size(obj) data_size += obj.get('total_size', 0) data.size = data_size
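A worked example of the size bookkeeping above, with made-up numbers: ``size`` covers the file itself, ``total_size`` adds its references, and the Data object's ``size`` sums the per-field totals.

# An output field with one reference (an index file) alongside the main file.
bam = {"file": "aln.bam", "size": 1000000, "refs": ["aln.bam.bai"]}
bai_size = 50000
bam["total_size"] = bam["size"] + bai_size

# An output field without references: total_size equals size.
report = {"file": "report.html", "size": 20000, "refs": []}
report["total_size"] = report["size"]

data_size = bam["total_size"] + report["total_size"]
print(data_size)  # 1070000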
def validate_schema(instance, schema, test_required=True, data_location=None, skip_missing_data=False): """Check if DictField values are consistent with our data types. Perform basic JSON schema validation and our custom validations: * check that required fields are given (if `test_required` is set to ``True``) * check if ``basic:file:`` and ``list:basic:file`` fields match regex given in schema (only if ``validate_regex`` is defined in schema for coresponding fields) and exists (only if ``data_location`` is given) * check if directories referenced in ``basic:dir:`` and ``list:basic:dir``fields exist (only if ``data_location`` is given) * check that referenced ``Data`` objects (in ``data:<data_type>`` and ``list:data:<data_type>`` fields) exists and are of type ``<data_type>`` * check that referenced ``Storage`` objects (in ``basic:json`` fields) exists :param list instance: Instance to be validated :param list schema: Schema for validation :param bool test_required: Flag for testing if all required fields are present. It is usefule if validation is run before ``Data`` object is finished and there are some field stil missing (default: ``False``) :param :class:`~resolwe.flow.models.data.DataLocation` data_location: data location used for checking if files and directories exist (default: ``None``) :param bool skip_missing_data: Don't raise an error if referenced ``Data`` object does not exist :rtype: None :raises ValidationError: if ``instance`` doesn't match schema defined in ``schema`` """ from .storage import Storage # Prevent circular import. path_prefix = None if data_location: path_prefix = data_location.get_path() def validate_refs(field): """Validate reference paths.""" for ref_filename in field.get('refs', []): ref_path = os.path.join(path_prefix, ref_filename) if not os.path.exists(ref_path): raise ValidationError("Path referenced in `refs` ({}) does not exist.".format(ref_path)) if not (os.path.isfile(ref_path) or os.path.isdir(ref_path)): raise ValidationError( "Path referenced in `refs` ({}) is neither a file or directory.".format(ref_path)) def validate_file(field, regex): """Validate file name (and check that it exists).""" filename = field['file'] if regex and not re.search(regex, filename): raise ValidationError( "File name {} does not match regex {}".format(filename, regex)) if path_prefix: path = os.path.join(path_prefix, filename) if not os.path.exists(path): raise ValidationError("Referenced path ({}) does not exist.".format(path)) if not os.path.isfile(path): raise ValidationError("Referenced path ({}) is not a file.".format(path)) validate_refs(field) def validate_dir(field): """Check that dirs and referenced files exists.""" dirname = field['dir'] if path_prefix: path = os.path.join(path_prefix, dirname) if not os.path.exists(path): raise ValidationError("Referenced path ({}) does not exist.".format(path)) if not os.path.isdir(path): raise ValidationError("Referenced path ({}) is not a directory.".format(path)) validate_refs(field) def validate_data(data_pk, type_): """Check that `Data` objects exist and is of right type.""" from .data import Data # prevent circular import data_qs = Data.objects.filter(pk=data_pk).values('process__type') if not data_qs.exists(): if skip_missing_data: return raise ValidationError( "Referenced `Data` object does not exist (id:{})".format(data_pk)) data = data_qs.first() if not data['process__type'].startswith(type_): raise ValidationError( "Data object of type `{}` is required, but type `{}` is given. 
" "(id:{})".format(type_, data['process__type'], data_pk)) def validate_range(value, interval, name): """Check that given value is inside the specified range.""" if not interval: return if value < interval[0] or value > interval[1]: raise ValidationError( "Value of field '{}' is out of range. It should be between {} and {}.".format( name, interval[0], interval[1] ) ) is_dirty = False dirty_fields = [] for _schema, _fields, _ in iterate_schema(instance, schema): name = _schema['name'] is_required = _schema.get('required', True) if test_required and is_required and name not in _fields: is_dirty = True dirty_fields.append(name) if name in _fields: field = _fields[name] type_ = _schema.get('type', "") # Treat None as if the field is missing. if not is_required and field is None: continue try: jsonschema.validate([{"type": type_, "value": field}], TYPE_SCHEMA) except jsonschema.exceptions.ValidationError as ex: raise ValidationError(ex.message) choices = [choice['value'] for choice in _schema.get('choices', [])] allow_custom_choice = _schema.get('allow_custom_choice', False) if choices and not allow_custom_choice and field not in choices: raise ValidationError( "Value of field '{}' must match one of predefined choices. " "Current value: {}".format(name, field) ) if type_ == 'basic:file:': validate_file(field, _schema.get('validate_regex')) elif type_ == 'list:basic:file:': for obj in field: validate_file(obj, _schema.get('validate_regex')) elif type_ == 'basic:dir:': validate_dir(field) elif type_ == 'list:basic:dir:': for obj in field: validate_dir(obj) elif type_ == 'basic:json:' and not Storage.objects.filter(pk=field).exists(): raise ValidationError( "Referenced `Storage` object does not exist (id:{})".format(field)) elif type_.startswith('data:'): validate_data(field, type_) elif type_.startswith('list:data:'): for data_id in field: validate_data(data_id, type_[5:]) # remove `list:` from type elif type_ == 'basic:integer:' or type_ == 'basic:decimal:': validate_range(field, _schema.get('range'), name) elif type_ == 'list:basic:integer:' or type_ == 'list:basic:decimal:': for obj in field: validate_range(obj, _schema.get('range'), name) try: # Check that schema definitions exist for all fields for _, _ in iterate_fields(instance, schema): pass except KeyError as ex: raise ValidationError(str(ex)) if is_dirty: dirty_fields = ['"{}"'.format(field) for field in dirty_fields] raise DirtyError("Required fields {} not given.".format(', '.join(dirty_fields)))
def run_process(self, process_slug, input_={}, assert_status=Data.STATUS_DONE, descriptor=None, descriptor_schema=None, verbosity=0, tags=None): """Run the specified process with the given inputs. If input is a file, file path should be given relative to the ``tests/files`` directory of a Django application. If ``assert_status`` is given, check if :class:`~resolwe.flow.models.Data` object's status matches it after the process has finished. .. note:: If you need to delay calling the manager, you must put the desired code in a ``with transaction.atomic()`` block. :param str process_slug: slug of the :class:`~resolwe.flow.models.Process` to run :param dict ``input_``: :class:`~resolwe.flow.models.Process`'s input parameters .. note:: You don't have to specify parameters with defined default values. :param str ``assert_status``: desired status of the :class:`~resolwe.flow.models.Data` object :param dict descriptor: descriptor to set on the :class:`~resolwe.flow.models.Data` object :param dict descriptor_schema: descriptor schema to set on the :class:`~resolwe.flow.models.Data` object :param list tags: list of tags that will be added to the created :class:`~resolwe.flow.models.Data` object :return: object created by :class:`~resolwe.flow.models.Process` :rtype: ~resolwe.flow.models.Data """ # Copy input_, to avoid mutation that would occur in ``mock_upload`` input_ = input_.copy() # backward compatibility process_slug = slugify(process_slug.replace(':', '-')) # Enforce correct process tags. if getattr(settings, 'TEST_PROCESS_REQUIRE_TAGS', False) and not self._preparation_stage: test = getattr(self, self._testMethodName) if not has_process_tag(test, process_slug): self.fail( 'Tried to run process with slug "{0}" outside of preparation_stage\n' 'block while test is not tagged for this process. 
Either tag the\n' 'test using tag_process decorator or move this under the preparation\n' 'stage block if this process is only used to prepare upstream inputs.\n' '\n' 'To tag the test you can add the following decorator:\n' ' @tag_process(\'{0}\')\n' ''.format(process_slug) ) self._executed_processes.add(process_slug) process = Process.objects.filter(slug=process_slug).order_by('-version').first() if process is None: self.fail('No process with slug "{}"'.format(process_slug)) def mock_upload(file_path): """Mock file upload.""" def is_url(path): """Check if path is a URL.""" validate = URLValidator() try: validate(path) except (ValueError, ValidationError): return False return True if is_url(file_path): return { 'file': file_path, 'file_temp': file_path, 'is_remote': True, } else: old_path = os.path.join(self.files_path, file_path) if not os.path.isfile(old_path): raise RuntimeError('Missing file: {}'.format(old_path)) file_basename = os.path.basename(file_path) file_temp = '{}_{}'.format(file_basename, uuid.uuid4()) upload_file_path = os.path.join(self.upload_dir, file_temp) # create directories needed by new_path upload_file_dir = os.path.dirname(upload_file_path) if not os.path.exists(upload_file_dir): os.makedirs(upload_file_dir) shutil.copy2(old_path, upload_file_path) self._upload_files.append(upload_file_path) return { 'file': file_basename, 'file_temp': file_temp, } for field_schema, fields in iterate_fields(input_, process.input_schema): # copy referenced files to upload dir if field_schema['type'] == "basic:file:": fields[field_schema['name']] = mock_upload(fields[field_schema['name']]) elif field_schema['type'] == "list:basic:file:": file_list = [mock_upload(file_path) for file_path in fields[field_schema['name']]] fields[field_schema['name']] = file_list # convert primary keys to strings if field_schema['type'].startswith('data:'): fields[field_schema['name']] = fields[field_schema['name']] if field_schema['type'].startswith('list:data:'): fields[field_schema['name']] = [obj for obj in fields[field_schema['name']]] data = Data.objects.create( input=input_, contributor=self.admin, process=process, slug=get_random_string(length=6), tags=tags or [], descriptor_schema=descriptor_schema, descriptor=descriptor or {}) self.collection.data.add(data) # Fetch latest Data object from database data = Data.objects.get(pk=data.pk) if assert_status: if not transaction.get_autocommit() and assert_status == Data.STATUS_DONE: # We are in an atomic transaction block, hence the data object will not be done # until after the block. Therefore the expected status is resolving. assert_status = Data.STATUS_RESOLVING self.assertStatus(data, assert_status) # Purge is normally called in an async worker, so we have to emulate the call. if data.location: purge.location_purge(location_id=data.location.id, delete=True) return data
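A sketch of how a test typically drives ``run_process``, assuming the usual Resolwe testing conventions; the process slug, input name and file name are made up, and the import path for the test helpers is assumed.

# Import path assumed from the standard Resolwe test helpers.
from resolwe.test import ProcessTestCase, tag_process


class UploadProcessTest(ProcessTestCase):
    @tag_process("upload-fastq-single")  # hypothetical process slug
    def test_upload(self):
        reads = self.run_process(
            "upload-fastq-single",
            {"src": ["reads.fastq.gz"]},  # path resolved relative to tests/files
        )
        # run_process already asserts STATUS_DONE by default; the explicit
        # check below only shows where the returned Data object ends up.
        self.assertEqual(reads.status, reads.STATUS_DONE)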