Example #1
0
def hydrate_input_uploads(input_, input_schema, hydrate_values=True):
    """Hydrate input basic:upload types with upload location.

    Find basic:upload fields in input.
    Add the upload location for relative paths.

    """
    files = []
    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'] == 'basic:file:':
                files.append(value)

            elif field_schema['type'] == 'list:basic:file:':
                files.extend(value)

    urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]')
    for value in files:
        if 'file_temp' in value:
            if isinstance(value['file_temp'], six.string_types):
                # If file_temp is neither a URL nor an absolute path, hydrate the path
                if not os.path.isabs(value['file_temp']) and not urlregex.search(value['file_temp']):
                    value['file_temp'] = os.path.join(settings.FLOW_EXECUTOR['UPLOAD_DIR'], value['file_temp'])
            else:
                # Something very strange happened
                value['file_temp'] = 'Invalid value for file_temp in DB'
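The examples in this listing all revolve around the same ``iterate_fields`` helper. Below is a minimal, illustrative sketch of its behaviour as inferred from the calls shown here; the real helper also accepts an optional path argument (as used in Examples #19 and #20) and may handle nested group fields and other details not reproduced in this sketch.

def iterate_fields_sketch(fields, schema):
    """Yield (field_schema, fields) for each field of ``fields`` described by ``schema``.

    ``fields`` is the dict holding the actual values and ``schema`` is a list
    of field definitions. Each yielded pair carries the field definition and
    the containing dict, so callers can read or mutate values in place via
    ``fields[field_schema['name']]``.
    """
    schema_by_name = {field_schema['name']: field_schema for field_schema in schema}
    for field_name in fields:
        if field_name not in schema_by_name:
            raise KeyError("Field definition ({}) missing in schema".format(field_name))
        yield schema_by_name[field_name], fields


# Hypothetical usage:
input_ = {'src': {'file_temp': 'upload-123/reads.fastq'}}
input_schema = [{'name': 'src', 'type': 'basic:file:'}]
for field_schema, fields in iterate_fields_sketch(input_, input_schema):
    print(field_schema['type'], fields[field_schema['name']])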
Example #2
0
def update_dependency_kinds(apps, schema_editor):
    """Update historical dependency kinds as they may be wrong."""
    DataDependency = apps.get_model("flow", "DataDependency")
    for dependency in DataDependency.objects.all():
        # Assume dependency is of subprocess kind.
        dependency.kind = "subprocess"

        # Check child inputs to determine if this is an IO dependency.
        child = dependency.child
        parent = dependency.parent

        for field_schema, fields in iterate_fields(child.input,
                                                   child.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema.get("type", "").startswith("data:"):
                if value == parent.pk:
                    dependency.kind = "io"
                    break
            elif field_schema.get("type", "").startswith("list:data:"):
                for data in value:
                    if data == parent.pk:
                        dependency.kind = "io"
                        break

        dependency.save()
Example #3
0
    def save_storage(self, instance, schema):
        """Save basic:json values to a Storage collection."""
        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema['name']
            value = fields[name]
            if field_schema.get('type', '').startswith('basic:json:'):
                if value and not self.pk:
                    raise ValidationError(
                        'Data object must be `created` before creating `basic:json:` fields'
                    )

                if isinstance(value, int):
                    # already in Storage
                    continue

                if isinstance(value, six.string_types):
                    file_path = os.path.join(
                        settings.FLOW_EXECUTOR['DATA_DIR'], str(self.pk),
                        value)
                    if os.path.isfile(file_path):
                        with open(file_path) as file_handler:
                            value = json.load(file_handler)

                storage = Storage.objects.create(
                    name='Storage for data id {}'.format(self.pk),
                    contributor=self.contributor,
                    data_id=self.pk,
                    json=value,
                )

                # `value` is copied by value, so `fields[name]` must be changed
                fields[name] = storage.pk
Example #4
0
def dependency_status(data):
    """Return abstracted satus of dependencies.

    STATUS_ERROR .. one dependency has error status
    STATUS_DONE .. all dependencies have done status
    None .. other

    """
    for field_schema, fields in iterate_fields(data.input,
                                               data.process.input_schema):
        if (field_schema['type'].lower().startswith('data:')
                or field_schema['type'].lower().startswith('list:data:')):
            name = field_schema['name']
            value = fields[name]

            if field_schema['type'].lower().startswith('data:'):
                value = [value]

            for uid in value:
                try:
                    _data = Data.objects.get(id=uid)
                except Data.DoesNotExist:
                    return Data.STATUS_ERROR

                if _data.status == Data.STATUS_ERROR:
                    return Data.STATUS_ERROR

                if _data.status != Data.STATUS_DONE:
                    return None

    return Data.STATUS_DONE
Example #5
0
def purge_data_dependencies(apps, schema_editor):
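    """Remove ``io`` dependencies that do not match the child's ``data:`` inputs."""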
    Data = apps.get_model("flow", "Data")
    DataDependency = apps.get_model("flow", "DataDependency")

    for data in Data.objects.iterator():
        parent_pks = []
        for field_schema, fields in iterate_fields(data.input,
                                                   data.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema.get("type", "").startswith("data:"):
                parent_pks.append(value)

            elif field_schema.get("type", "").startswith("list:data:"):
                parent_pks.extend(value)

        parent_pks = [
            pk if Data.objects.filter(pk=pk).exists() else None
            for pk in parent_pks
        ]

        for dependency in DataDependency.objects.filter(child=data.id,
                                                        kind="io"):
            parent_pk = dependency.parent.pk if dependency.parent else None
            if parent_pk in parent_pks:
                parent_pks.remove(parent_pk)
            else:
                dependency.delete()
Example #6
0
def update_dependency_kinds(apps, schema_editor):
    """Update historical dependency kinds as they may be wrong."""
    DataDependency = apps.get_model('flow', 'DataDependency')
    for dependency in DataDependency.objects.all():
        # Assume dependency is of subprocess kind.
        dependency.kind = 'subprocess'

        # Check child inputs to determine if this is an IO dependency.
        child = dependency.child
        parent = dependency.parent

        for field_schema, fields in iterate_fields(child.input,
                                                   child.process.input_schema):
            name = field_schema['name']
            value = fields[name]

            if field_schema.get('type', '').startswith('data:'):
                if value == parent.pk:
                    dependency.kind = 'io'
                    break
            elif field_schema.get('type', '').startswith('list:data:'):
                for data in value:
                    if data == parent.pk:
                        dependency.kind = 'io'
                        break

        dependency.save()
Example #7
0
def hydrate_size(data):
    """Compute size of all Data object outputs and its cumultative size.

    This is a simplified version of original ``hydrate_size`` function,
    since we need just a subset of it.
    """
    def add_file_size(obj):
        """Add file size to the basic:file field."""
        path = os.path.join(settings.FLOW_EXECUTOR["DATA_DIR"],
                            data.location.subpath, obj["file"])

        obj["size"] = os.path.getsize(path)
        obj["total_size"] = obj["size"]

    data_size = 0
    for field_schema, fields in iterate_fields(data.output,
                                               data.process.output_schema):
        name = field_schema["name"]
        value = fields[name]
        if "type" in field_schema:
            if field_schema["type"].startswith("basic:file:"):
                add_file_size(value)
                data_size += value.get("total_size", 0)

    data.size = data_size
Example #8
0
def hydrate_input_uploads(input_, input_schema, hydrate_values=True):
    """Hydrate input basic:upload types with upload location.

    Find basic:upload fields in input.
    Add the upload location for relative paths.

    """
    from resolwe.flow.managers import manager

    files = []
    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'] == 'basic:file:':
                files.append(value)

            elif field_schema['type'] == 'list:basic:file:':
                files.extend(value)

    urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]')
    for value in files:
        if 'file_temp' in value:
            if isinstance(value['file_temp'], str):
                # If file_temp not url, hydrate path.
                if not urlregex.search(value['file_temp']):
                    value['file_temp'] = manager.get_executor().resolve_upload_path(value['file_temp'])
            else:
                # Something very strange happened.
                value['file_temp'] = 'Invalid value for file_temp in DB'
Example #9
0
def update_dependency_kinds(apps, schema_editor):
    """Update historical dependency kinds as they may be wrong."""
    DataDependency = apps.get_model('flow', 'DataDependency')
    for dependency in DataDependency.objects.all():
        # Assume dependency is of subprocess kind.
        dependency.kind = 'subprocess'

        # Check child inputs to determine if this is an IO dependency.
        child = dependency.child
        parent = dependency.parent

        for field_schema, fields in iterate_fields(child.input, child.process.input_schema):
            name = field_schema['name']
            value = fields[name]

            if field_schema.get('type', '').startswith('data:'):
                if value == parent.pk:
                    dependency.kind = 'io'
                    break
            elif field_schema.get('type', '').startswith('list:data:'):
                for data in value:
                        if data == parent.pk:
                        dependency.kind = 'io'
                        break

        dependency.save()
Example #10
0
def _rewire_inputs(data, pk_mapping):
    """Rewire inputs of provided data objects.

    References to input data objects in the ``input`` field of the Data objects
    are replaced with references to their copies according to the provided
    mapping. If a primary key is not in the mapping, the original value is kept.

    :param list data: A list of Data objects to process.

    :param dict pk_mapping: A dict where keys are primary keys of original Data
        objects and values are primary keys of their copies.

    """
    for datum in data:
        for field_schema, fields in iterate_fields(datum.input,
                                                   datum.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema["type"].startswith(
                    "data:") and value in pk_mapping:
                fields[name] = pk_mapping[value]

            elif field_schema["type"].startswith("list:data:"):
                fields[name] = [
                    pk_mapping[pk] if pk in pk_mapping else pk for pk in value
                ]
Example #11
0
def hydrate_input_uploads(input_, input_schema, hydrate_values=True):
    """Hydrate input basic:upload types with upload location.

    Find basic:upload fields in input.
    Add the upload location for relative paths.

    """
    from resolwe.flow.managers import manager

    files = []
    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'] == 'basic:file:':
                files.append(value)

            elif field_schema['type'] == 'list:basic:file:':
                files.extend(value)

    urlregex = re.compile(r'^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]')
    for value in files:
        if 'file_temp' in value:
            if isinstance(value['file_temp'], str):
                # If file_temp not url, hydrate path.
                if not urlregex.search(value['file_temp']):
                    value['file_temp'] = manager.get_executor().resolve_upload_path(value['file_temp'])
            else:
                # Something very strange happened.
                value['file_temp'] = 'Invalid value for file_temp in DB'
Example #12
0
def referenced_schema_files(fields, schema):
    """Get the list of files and directories references by fields.

    :return: tuple of lists, first list containing  files and
        directories refereced in data.output.
    :rtype: Tuple[List[str], List[str]]
    """
    refs = []
    for field_schema, fields in iterate_fields(fields, schema):
        if "type" in field_schema:
            field_type = field_schema["type"]
            field_name = field_schema["name"]

            # Add basic:file: entries
            if field_type.startswith("basic:file:"):
                refs.append(fields[field_name]["file"])
                refs += fields[field_name].get("refs", [])

            # Add list:basic:file: entries
            elif field_type.startswith("list:basic:file:"):
                for field in fields[field_name]:
                    refs.append(field["file"])
                    refs += field.get("refs", [])

            # Add basic:dir: entries
            elif field_type.startswith("basic:dir:"):
                refs.append(fields[field_name]["dir"])
                refs += fields[field_name].get("refs", [])

            # Add list:basic:dir: entries
            elif field_type.startswith("list:basic:dir:"):
                for field in fields[field_name]:
                    refs.append(field["dir"])
                    refs += field.get("refs", [])
    return refs
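A hypothetical call (field names, types and file names are made up) illustrates the flat list this function builds, assuming the surrounding Resolwe imports are available:

fields = {"reads": {"file": "reads.fastq", "refs": ["reads_fastqc.zip"]},
          "report": {"dir": "fastqc"}}
schema = [{"name": "reads", "type": "basic:file:fastq:"},
          {"name": "report", "type": "basic:dir:"}]
referenced_schema_files(fields, schema)
# expected result (order follows field iteration order):
# ['reads.fastq', 'reads_fastqc.zip', 'fastqc']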
Example #13
0
def get_collection_of_input_entities(data):
    """Get collection that contains all "entity inputs" of a given data.

    With "entity input", one refers to the inputs that are part of an entity.
    """
    # Prevent circular imports:
    from resolwe.flow.models import Collection

    data_ids = set()

    for field_schema, fields in iterate_fields(data.input,
                                               data.process.input_schema):
        name = field_schema["name"]
        value = fields[name]
        if "type" not in field_schema:
            continue

        if field_schema["type"].startswith("data:"):
            value = [value]
        elif not field_schema["type"].startswith("list:data:"):
            continue

        data_ids.update([val for val in value if val is not None])

    collections = Collection.objects.filter(
        data__in=list(data_ids),
        data__entity__isnull=False,
    ).distinct()

    if collections.count() != 1:
        raise ValueError(
            "Entity inputs should be part of exactly one collection. (not {})".
            format(len(collections)))

    return collections.first()
Example #14
0
def _hydrate_values(output, output_schema, data):
    """Hydrate basic:file and basic:json values.

    Find fields with basic:file type and assign a full path to the file.
    Find fields with basic:json type and assign a JSON object from storage.

    """

    def hydrate_path(file_name):
        """Hydrate file paths."""
        from resolwe.flow.managers import manager

        class HydratedPath(str):
            """String wrapper, which also stores the original filename."""

            __slots__ = ("data_id", "file_name")

            def __new__(cls, value=""):
                """Initialize hydrated path."""
                hydrated = str.__new__(cls, value)
                hydrated.data_id = data.id
                hydrated.file_name = file_name
                return hydrated

        return HydratedPath(manager.get_executor().resolve_data_path(data, file_name))

    def hydrate_storage(storage_id):
        """Hydrate storage fields."""
        from .storage import LazyStorageJSON  # Prevent circular import.

        return LazyStorageJSON(pk=storage_id)

    for field_schema, fields in iterate_fields(output, output_schema):
        name = field_schema["name"]
        value = fields[name]
        if "type" in field_schema:
            if field_schema["type"].startswith("basic:file:"):
                value["file"] = hydrate_path(value["file"])
                value["refs"] = [hydrate_path(ref) for ref in value.get("refs", [])]

            elif field_schema["type"].startswith("list:basic:file:"):
                for obj in value:
                    obj["file"] = hydrate_path(obj["file"])
                    obj["refs"] = [hydrate_path(ref) for ref in obj.get("refs", [])]

            if field_schema["type"].startswith("basic:dir:"):
                value["dir"] = hydrate_path(value["dir"])
                value["refs"] = [hydrate_path(ref) for ref in value.get("refs", [])]

            elif field_schema["type"].startswith("list:basic:dir:"):
                for obj in value:
                    obj["dir"] = hydrate_path(obj["dir"])
                    obj["refs"] = [hydrate_path(ref) for ref in obj.get("refs", [])]

            elif field_schema["type"].startswith("basic:json:"):
                fields[name] = hydrate_storage(value)

            elif field_schema["type"].startswith("list:basic:json:"):
                fields[name] = [hydrate_storage(storage_id) for storage_id in value]
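``HydratedPath`` above is a plain ``str`` subclass, so hydrated values behave like ordinary path strings while still carrying metadata. The pattern can be shown standalone (``TaggedPath`` is a hypothetical name, not part of Resolwe):

class TaggedPath(str):
    """String that also remembers which data object and file it came from."""

    __slots__ = ("data_id", "file_name")

    def __new__(cls, value, data_id, file_name):
        tagged = str.__new__(cls, value)
        tagged.data_id = data_id
        tagged.file_name = file_name
        return tagged


path = TaggedPath("/data/42/reads.fastq", data_id=42, file_name="reads.fastq")
assert isinstance(path, str)  # usable wherever a path string is expected
assert (path.data_id, path.file_name) == (42, "reads.fastq")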
Example #15
0
def hydrate_input_references(input_, input_schema, hydrate_values=True):
    """Hydrate ``input_`` with linked data.

    Find fields with complex data:<...> types in ``input_``.
    Assign an output of corresponding data object to those fields.

    """
    from .data import Data  # prevent circular import

    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('data:'):
                if value is None:
                    continue

                try:
                    data = Data.objects.get(id=value)
                except Data.DoesNotExist:
                    fields[name] = {}
                    continue

                output = copy.deepcopy(data.output)
                if hydrate_values:
                    _hydrate_values(output, data.process.output_schema, data)
                output["__id"] = data.id
                output["__type"] = data.process.type
                output["__descriptor"] = data.descriptor
                output["__entity_name"] = getattr(data.entity, 'name', None)
                output["__output_schema"] = data.process.output_schema

                fields[name] = output

            elif field_schema['type'].startswith('list:data:'):
                outputs = []
                for val in value:
                    if val is None:
                        continue

                    try:
                        data = Data.objects.get(id=val)
                    except Data.DoesNotExist:
                        outputs.append({})
                        continue

                    output = copy.deepcopy(data.output)
                    if hydrate_values:
                        _hydrate_values(output, data.process.output_schema, data)

                    output["__id"] = data.id
                    output["__type"] = data.process.type
                    output["__descriptor"] = data.descriptor
                    output["__entity_name"] = getattr(data.entity, 'name', None)
                    output["__output_schema"] = data.process.output_schema

                    outputs.append(output)

                fields[name] = outputs
Example #16
0
def _hydrate_values(output, output_schema, data):
    """Hydrate basic:file and basic:json values.

    Find fields with basic:file type and assign a full path to the file.
    Find fields with basic:json type and assign a JSON object from storage.

    """
    def hydrate_path(file_name):
        """Hydrate file paths."""
        from resolwe.flow.managers import manager

        class HydratedPath(str):
            """String wrapper, which also stores the original filename."""

            __slots__ = ('data_id', 'file_name')

            def __new__(cls, value=''):
                """Initialize hydrated path."""
                hydrated = str.__new__(cls, value)
                hydrated.data_id = data.id
                hydrated.file_name = file_name
                return hydrated

        return HydratedPath(manager.get_executor().resolve_data_path(data, file_name))

    def hydrate_storage(storage_id):
        """Hydrate storage fields."""
        from .storage import LazyStorageJSON  # Prevent circular import.

        return LazyStorageJSON(pk=storage_id)

    for field_schema, fields in iterate_fields(output, output_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('basic:file:'):
                value['file'] = hydrate_path(value['file'])
                value['refs'] = [hydrate_path(ref) for ref in value.get('refs', [])]

            elif field_schema['type'].startswith('list:basic:file:'):
                for obj in value:
                    obj['file'] = hydrate_path(obj['file'])
                    obj['refs'] = [hydrate_path(ref) for ref in obj.get('refs', [])]

            if field_schema['type'].startswith('basic:dir:'):
                value['dir'] = hydrate_path(value['dir'])
                value['refs'] = [hydrate_path(ref) for ref in value.get('refs', [])]

            elif field_schema['type'].startswith('list:basic:dir:'):
                for obj in value:
                    obj['dir'] = hydrate_path(obj['dir'])
                    obj['refs'] = [hydrate_path(ref) for ref in obj.get('refs', [])]

            elif field_schema['type'].startswith('basic:json:'):
                fields[name] = hydrate_storage(value)

            elif field_schema['type'].startswith('list:basic:json:'):
                fields[name] = [hydrate_storage(storage_id) for storage_id in value]
Example #17
0
def hydrate_input_references(input_, input_schema, hydrate_values=True):
    """Hydrate ``input_`` with linked data.

    Find fields with complex data:<...> types in ``input_``.
    Assign an output of corresponding data object to those fields.

    """
    from .data import Data  # prevent circular import

    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('data:'):
                # if re.match('^[0-9a-fA-F]{24}$', str(value)) is None:
                #     print "ERROR: data:<...> value in field \"{}\", type \"{}\" not ObjectId but {}.".format(
                #         name, field_schema['type'], value)
                if value is None:
                    continue

                data = Data.objects.get(id=value)
                output = copy.deepcopy(data.output)
                # static = Data.static.to_python(data.static)
                if hydrate_values:
                    _hydrate_values(output, data.process.output_schema, data)
                    # _hydrate_values(static, data.static_schema, data)
                output["__id"] = data.id
                output["__type"] = data.process.type
                output["__descriptor"] = data.descriptor
                fields[name] = output

            elif field_schema['type'].startswith('list:data:'):
                outputs = []
                for val in value:
                    # if re.match('^[0-9a-fA-F]{24}$', str(val)) is None:
                    #     print "ERROR: data:<...> value in {}, type \"{}\" not ObjectId but {}.".format(
                    #         name, field_schema['type'], val)
                    if val is None:
                        continue

                    data = Data.objects.get(id=val)
                    output = copy.deepcopy(data.output)
                    # static = Data.static.to_python(data.static)
                    if hydrate_values:
                        _hydrate_values(output, data.process.output_schema,
                                        data)
                        # _hydrate_values(static, data.static_schema, data)

                    output["__id"] = data.id
                    output["__type"] = data.process.type
                    output["__descriptor"] = data.descriptor
                    outputs.append(output)

                fields[name] = outputs
Example #18
0
def hydrate_size(data):
    """Add file and dir sizes.

    Add sizes to ``basic:file:``, ``list:basic:file:``, ``basic:dir:``
    and ``list:basic:dir:`` fields.

    """
    from .data import Data  # prevent circular import

    def add_file_size(obj):
        """Add file size to the basic:file field."""
        if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj:
            return

        path = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(data.pk), obj['file'])
        if not os.path.isfile(path):
            raise ValidationError("Referenced file does not exist ({})".format(path))

        obj['size'] = os.path.getsize(path)

    def get_dir_size(path):
        """Get directory size."""
        total_size = 0
        for dirpath, _, filenames in os.walk(path):
            for file_name in filenames:
                file_path = os.path.join(dirpath, file_name)
                total_size += os.path.getsize(file_path)
        return total_size

    def add_dir_size(obj):
        """Add directory size to the basic:dir field."""
        if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj:
            return

        path = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(data.pk), obj['dir'])
        if not os.path.isdir(path):
            raise ValidationError("Referenced dir does not exist ({})".format(path))

        obj['size'] = get_dir_size(path)

    for field_schema, fields in iterate_fields(data.output, data.process.output_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('basic:file:'):
                add_file_size(value)
            elif field_schema['type'].startswith('list:basic:file:'):
                for obj in value:
                    add_file_size(obj)
            elif field_schema['type'].startswith('basic:dir:'):
                add_dir_size(value)
            elif field_schema['type'].startswith('list:basic:dir:'):
                for obj in value:
                    add_dir_size(obj)
Example #19
0
    def _get_output_field(self, obj, path):
        """Return object's output field schema and field dict.

        :param obj: object with the output field
        :type obj: ~resolwe.flow.models.Data

        :param str path: path to :class:`~resolwe.flow.models.Data`
            object's output field

        """
        for field_schema, field, field_path in iterate_fields(
                obj.output, obj.process.output_schema, ""):
            if path == field_path:
                return field_schema, field

        self.fail("Field not found in path {}.".format(path))
Example #20
0
def save_storage(data):
    """Parse output field and create Storage objects if needed."""
    for field_schema, fields, path in iterate_fields(
        data.output, data.process.output_schema, ""
    ):
        name = field_schema["name"]
        value = fields[name]
        if field_schema.get("type", "").startswith("basic:json:"):
            if value and not data.pk:
                raise ValidationError(
                    "Data object must be `created` before creating `basic:json:` fields"
                )

            if isinstance(value, int):
                # already in Storage
                continue

            if isinstance(value, str):
                file_path = data.location.get_path(filename=value)
                if os.path.isfile(file_path):
                    try:
                        with open(file_path) as file_handler:
                            value = json.load(file_handler)
                    except json.JSONDecodeError:
                        with open(file_path) as file_handler:
                            content = file_handler.read()
                            content = content.rstrip()
                            raise ValidationError(
                                "Value of '{}' must be a valid JSON, current: {}".format(
                                    name, content
                                )
                            )

            existing_storage_pk = None
            with suppress(KeyError):
                existing_storage_pk = dict_dot(data._original_output, path)

            if isinstance(existing_storage_pk, int):
                data.storages.filter(pk=existing_storage_pk).update(json=value)
                fields[name] = existing_storage_pk
            else:
                storage = data.storages.create(
                    name="Storage for data id {}".format(data.pk),
                    contributor=data.contributor,
                    json=value,
                )
                fields[name] = storage.pk
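The ``dict_dot`` helper above looks up the value stored at a dotted path (for example ``proc.log``) inside a nested dict. A minimal stand-in clarifies just that lookup; the real helper supports more than this sketch, and ``dict_dot_sketch`` is a hypothetical name:

from functools import reduce


def dict_dot_sketch(data, path):
    """Return the value stored at dotted ``path`` inside nested dict ``data``."""
    return reduce(lambda container, key: container[key], path.split("."), data)


dict_dot_sketch({"proc": {"log": 42}}, "proc.log")  # -> 42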
Example #21
0
    def handle_run(
        self, message: Message[dict], manager: "Processor"
    ) -> Response[List[int]]:
        """Handle spawning new data object.

        The response is the id of the created object.
        """
        export_files_mapper = message.message_data["export_files_mapper"]
        manager.data.refresh_from_db()

        try:
            data = message.message_data["data"]
            logger.debug(__("Spawning new data object from dict: {}", data))

            data["contributor"] = manager.data.contributor
            data["process"] = Process.objects.filter(slug=data["process"]).latest()
            data["tags"] = manager.data.tags
            data["collection"] = manager.data.collection
            data["subprocess_parent"] = manager.data

            with transaction.atomic():
                for field_schema, fields in iterate_fields(
                    data.get("input", {}), data["process"].input_schema
                ):
                    type_ = field_schema["type"]
                    name = field_schema["name"]
                    value = fields[name]

                    if type_ == "basic:file:":
                        fields[name] = self.hydrate_spawned_files(
                            export_files_mapper, value
                        )
                    elif type_ == "list:basic:file:":
                        fields[name] = [
                            self.hydrate_spawned_files(export_files_mapper, fn)
                            for fn in value
                        ]
                created_object = Data.objects.create(**data)
        except Exception:
            manager._log_exception(
                f"Error while preparing spawned Data objects for process '{manager.data.process.slug}'"
            )
            return message.respond_error([])
        else:
            return message.respond_ok(created_object.id)
Example #22
0
def _hydrate_values(output, output_schema, data):
    """Hydrate basic:file and basic:json values.

    Find fields with basic:file type and assign a full path to the file.
    Find fields with basic:json type and assign a JSON object from storage.

    """
    def hydrate_path(file_name):
        """Hydrate file paths."""
        id_ = "{}/".format(data.id)  # needs trailing slash
        if id_ in file_name:
            file_name = file_name[file_name.find(id_) +
                                  len(id_):]  # remove id from filename

        return os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], id_, file_name)

    def hydrate_storage(storage_id):
        """Hydrate storage fields."""
        return LazyStorageJSON(pk=storage_id)

    for field_schema, fields in iterate_fields(output, output_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('basic:file:'):
                value['file'] = hydrate_path(value['file'])

            elif field_schema['type'].startswith('list:basic:file:'):
                for obj in value:
                    obj['file'] = hydrate_path(obj['file'])

            if field_schema['type'].startswith('basic:dir:'):
                value['dir'] = hydrate_path(value['dir'])

            elif field_schema['type'].startswith('list:basic:dir:'):
                for obj in value:
                    obj['dir'] = hydrate_path(obj['dir'])

            elif field_schema['type'].startswith('basic:json:'):
                fields[name] = hydrate_storage(value)

            elif field_schema['type'].startswith('list:basic:json:'):
                fields[name] = [
                    hydrate_storage(storage_id) for storage_id in value
                ]
Example #23
0
    def save_dependencies(self, instance, schema):
        """Save data: and list:data: references as parents."""
        def add_dependency(value):
            """Add parent Data dependency."""
            try:
                self.parents.add(Data.objects.get(pk=value))  # pylint: disable=no-member
            except Data.DoesNotExist:
                pass

        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema['name']
            value = fields[name]

            if field_schema.get('type', '').startswith('data:'):
                add_dependency(value)
            elif field_schema.get('type', '').startswith('list:data:'):
                for data in value:
                    add_dependency(data)
Example #24
0
    def resolve_secrets(self):
        """Retrieve handles for all basic:secret: fields on input.

        The process must have the ``secrets`` resource requirement
        specified in order to access any secrets. Otherwise this method
        will raise a ``PermissionDenied`` exception.

        :return: A dictionary of secrets where key is the secret handle
            and value is the secret value.
        """
        secrets = {}
        for field_schema, fields in iterate_fields(
            self.input, self.process.input_schema
        ):
            if not field_schema.get("type", "").startswith("basic:secret:"):
                continue

            name = field_schema["name"]
            value = fields[name]
            try:
                handle = value["handle"]
            except KeyError:
                continue

            try:
                secrets[handle] = Secret.objects.get_secret(
                    handle, contributor=self.contributor
                )
            except Secret.DoesNotExist:
                raise PermissionDenied(
                    "Access to secret not allowed or secret does not exist"
                )

        # If the process does not have the right requirements it is not
        # allowed to access any secrets.
        allowed = self.process.requirements.get("resources", {}).get("secrets", False)
        if secrets and not allowed:
            raise PermissionDenied(
                "Process '{}' has secret inputs, but no permission to see secrets".format(
                    self.process.slug
                )
            )

        return secrets
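For reference, the input shape this method expects for a ``basic:secret:`` field is a dict carrying a ``handle`` key; the handle is exchanged for the actual secret via ``Secret.objects.get_secret()``. The field name and handle value below are hypothetical:

input_ = {
    "access_token": {
        # Opaque handle stored on the Data object; the secret itself stays in Secret.
        "handle": "c7b9d2e4-6f3a-4f6e-9b1a-2d8c5e7a1f00",
    },
}
input_schema = [{"name": "access_token", "type": "basic:secret:"}]
# For this input, resolve_secrets() would return {handle: secret_value} with a single
# entry, provided the process declares the ``secrets`` resource requirement.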
Example #25
0
def rewire_inputs(data_list):
    """Rewire inputs of provided data objects.

    Input parameter is a list of original and copied data object model
    instances: ``[{'original': original, 'copy': copy}]``. This
    function finds which objects reference other objects (in the list)
    on the input and replaces original objects with the copies (mutates
    copies' inputs).

    """
    if len(data_list) < 2:
        return data_list

    mapped_ids = {
        bundle["original"].id: bundle["copy"].id
        for bundle in data_list
    }

    for bundle in data_list:
        updated = False
        copy = bundle["copy"]

        for field_schema, fields in iterate_fields(copy.input,
                                                   copy.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema["type"].startswith(
                    "data:") and value in mapped_ids:
                fields[name] = mapped_ids[value]
                updated = True

            elif field_schema["type"].startswith("list:data:") and any(
                [id_ in mapped_ids for id_ in value]):
                fields[name] = [
                    mapped_ids[id_] if id_ in mapped_ids else id_
                    for id_ in value
                ]
                updated = True

        if updated:
            copy.save()

    return data_list
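The core rewiring step for a ``list:data:`` field can be shown standalone; the conditional expression used above is equivalent to ``dict.get`` with the original id as the default (ids below are made up):

mapped_ids = {101: 201, 102: 202}
value = [101, 102, 103]  # 103 has no copy, so it is kept as-is
rewired = [mapped_ids.get(id_, id_) for id_ in value]
assert rewired == [201, 202, 103]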
Example #26
0
def remove_total_size(apps, schema_editor):
    """Remove ``total_size`` field from all file/dir-type outputs."""
    Data = apps.get_model('flow', 'Data')
    for data in Data.objects.all():
        for field_schema, fields in iterate_fields(data.output, data.process.output_schema):
            name = field_schema['name']
            value = fields[name]
            if 'type' in field_schema:
                if field_schema['type'].startswith('basic:file:'):
                    del value['total_size']
                elif field_schema['type'].startswith('list:basic:file:'):
                    for obj in value:
                        del obj['total_size']
                elif field_schema['type'].startswith('basic:dir:'):
                    del value['total_size']
                elif field_schema['type'].startswith('list:basic:dir:'):
                    for obj in value:
                        del obj['total_size']
        data.save()
Example #27
0
    def handle_get_output_files_dirs(self, message: Message[str],
                                     manager: "Processor") -> Response[dict]:
        """Get the output for file and dir fields.

        The sent dictionary has field names as its keys and (field_type,
        field_value) tuples as its values.
        """
        def is_file_or_dir(field_type: str) -> bool:
            """Is file or directory."""
            return "basic:file" in field_type or "basic:dir" in field_type

        output = dict()
        for field_schema, fields in iterate_fields(
                manager.data.output, manager.data.process.output_schema):
            if is_file_or_dir(field_schema["type"]):
                name = field_schema["name"]
                output[name] = (field_schema["type"], fields[name])

        return message.respond_ok(output)
Example #28
0
def remove_total_size(apps, schema_editor):
    """Remove ``total_size`` field from all file/dir-type outputs."""
    Data = apps.get_model('flow', 'Data')
    for data in Data.objects.all():
        for field_schema, fields in iterate_fields(data.output,
                                                   data.process.output_schema):
            name = field_schema['name']
            value = fields[name]
            if 'type' in field_schema:
                if field_schema['type'].startswith('basic:file:'):
                    del value['total_size']
                elif field_schema['type'].startswith('list:basic:file:'):
                    for obj in value:
                        del obj['total_size']
                elif field_schema['type'].startswith('basic:dir:'):
                    del value['total_size']
                elif field_schema['type'].startswith('list:basic:dir:'):
                    for obj in value:
                        del obj['total_size']
        data.save()
Example #29
0
    def resolve_secrets(self):
        """Retrieve handles for all basic:secret: fields on input.

        The process must have the ``secrets`` resource requirement
        specified in order to access any secrets. Otherwise this method
        will raise a ``PermissionDenied`` exception.

        :return: A dictionary of secrets where key is the secret handle
            and value is the secret value.
        """
        secrets = {}
        for field_schema, fields in iterate_fields(self.input, self.process.input_schema):  # pylint: disable=no-member
            if not field_schema.get('type', '').startswith('basic:secret:'):
                continue

            name = field_schema['name']
            value = fields[name]
            try:
                handle = value['handle']
            except KeyError:
                continue

            try:
                secrets[handle] = Secret.objects.get_secret(
                    handle,
                    contributor=self.contributor
                )
            except Secret.DoesNotExist:
                raise PermissionDenied("Access to secret not allowed or secret does not exist")

        # If the process does not have the right requirements it is not
        # allowed to access any secrets.
        allowed = self.process.requirements.get('resources', {}).get('secrets', False)  # pylint: disable=no-member
        if secrets and not allowed:
            raise PermissionDenied(
                "Process '{}' has secret inputs, but no permission to see secrets".format(
                    self.process.slug  # pylint: disable=no-member
                )
            )

        return secrets
Example #30
0
def recreate_parent_dependencies(apps, schema_editor):
    """Create empty dependency relation if parent has been deleted."""
    Data = apps.get_model("flow", "Data")
    DataDependency = apps.get_model("flow", "DataDependency")

    def process_dependency(data, parent):
        if not Data.objects.filter(pk=parent).exists():
            DataDependency.objects.create(child=data, parent=None, kind="io")

    for data in Data.objects.all():
        for field_schema, fields in iterate_fields(data.input,
                                                   data.process.input_schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema.get("type", "").startswith("data:"):
                process_dependency(data, value)

            elif field_schema.get("type", "").startswith("list:data:"):
                for parent in value:
                    process_dependency(data, parent)
Example #31
0
def remove_total_size(apps, schema_editor):
    """Remove ``total_size`` field from all file/dir-type outputs."""
    Data = apps.get_model("flow", "Data")
    for data in Data.objects.all():
        for field_schema, fields in iterate_fields(
            data.output, data.process.output_schema
        ):
            name = field_schema["name"]
            value = fields[name]
            if "type" in field_schema:
                if field_schema["type"].startswith("basic:file:"):
                    del value["total_size"]
                elif field_schema["type"].startswith("list:basic:file:"):
                    for obj in value:
                        del obj["total_size"]
                elif field_schema["type"].startswith("basic:dir:"):
                    del value["total_size"]
                elif field_schema["type"].startswith("list:basic:dir:"):
                    for obj in value:
                        del obj["total_size"]
        data.save()
Example #32
0
    def save_storage(self, instance, schema):
        """Save basic:json values to a Storage collection."""
        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema["name"]
            value = fields[name]
            if field_schema.get("type", "").startswith("basic:json:"):
                if value and not self.pk:
                    raise ValidationError(
                        "Data object must be `created` before creating `basic:json:` fields"
                    )

                if isinstance(value, int):
                    # already in Storage
                    continue

                if isinstance(value, str):
                    file_path = self.location.get_path(filename=value)
                    if os.path.isfile(file_path):
                        try:
                            with open(file_path) as file_handler:
                                value = json.load(file_handler)
                        except json.JSONDecodeError:
                            with open(file_path) as file_handler:
                                content = file_handler.read()
                                content = content.rstrip()
                                raise ValidationError(
                                    "Value of '{}' must be a valid JSON, current: {}".format(
                                        name, content
                                    )
                                )

                storage = self.storages.create(
                    name="Storage for data id {}".format(self.pk),
                    contributor=self.contributor,
                    json=value,
                )

                # `value` is copied by value, so `fields[name]` must be changed
                fields[name] = storage.pk
Example #33
0
    def save_dependencies(self, instance, schema):
        """Save data: and list:data: references as parents."""
        def add_dependency(value):
            """Add parent Data dependency."""
            try:
                DataDependency.objects.update_or_create(
                    parent=Data.objects.get(pk=value),
                    child=self,
                    defaults={'kind': DataDependency.KIND_IO},
                )
            except Data.DoesNotExist:
                pass

        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema['name']
            value = fields[name]

            if field_schema.get('type', '').startswith('data:'):
                add_dependency(value)
            elif field_schema.get('type', '').startswith('list:data:'):
                for data in value:
                    add_dependency(data)
Example #34
0
    def save_dependencies(self, instance, schema):
        """Save data: and list:data: references as parents."""
        def add_dependency(value):
            """Add parent Data dependency."""
            try:
                DataDependency.objects.update_or_create(
                    parent=Data.objects.get(pk=value),
                    child=self,
                    defaults={"kind": DataDependency.KIND_IO},
                )
            except Data.DoesNotExist:
                pass

        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema["name"]
            value = fields[name]

            if field_schema.get("type", "").startswith("data:"):
                add_dependency(value)
            elif field_schema.get("type", "").startswith("list:data:"):
                for data in value:
                    add_dependency(data)
Example #35
0
def recreate_parent_dependencies(apps, schema_editor):
    """Create empty dependency relation if parent has been deleted."""
    Data = apps.get_model('flow', 'Data')
    DataDependency = apps.get_model('flow', 'DataDependency')

    def process_dependency(data, parent):
        if not Data.objects.filter(pk=parent).exists():
            DataDependency.objects.create(
                child=data, parent=None, kind='io'
            )

    for data in Data.objects.all():
        for field_schema, fields in iterate_fields(data.input, data.process.input_schema):
            name = field_schema['name']
            value = fields[name]

            if field_schema.get('type', '').startswith('data:'):
                process_dependency(data, value)

            elif field_schema.get('type', '').startswith('list:data:'):
                for parent in value:
                    process_dependency(data, parent)
Example #36
0
    def handle_get_inputs_no_shared_storage(self, message: Message[int],
                                            manager: "Processor") -> Response:
        """Get a files belonging to input data objects.

        The format of the output is as follows:
        {
            base_url_1: (connector1, [list, of, ReferencedPath, instances]),
            base_url_2: (connector2, [another, list, of, ReferencedPath, instances]),
            ...
        }
        """
        output_data = {}

        # First get the ids of data objects which are inputs for the data object
        # we are processing.
        input_data_ids = []
        for schema, fields in iterate_fields(
                manager.data.input, manager.data.process.input_schema):
            type_ = schema["type"]
            if type_.startswith("data:") or type_.startswith("list:data:"):
                value = fields[schema["name"]]
                if isinstance(value, int):
                    input_data_ids.append(value)
                else:
                    input_data_ids += value

        for input_data_id in input_data_ids:
            file_storage = FileStorage.objects.get(data=input_data_id)
            location = file_storage.default_storage_location
            output_data[location.url] = (
                location.connector_name,
                list(
                    ReferencedPath.objects.filter(
                        storage_locations=location).values()),
            )

        manager._listener.communicator.suspend_heartbeat(manager.peer_identity)
        return message.respond_ok(output_data)
Example #37
0
    def save_storage(self, instance, schema):
        """Save basic:json values to a Storage collection."""
        for field_schema, fields in iterate_fields(instance, schema):
            name = field_schema['name']
            value = fields[name]
            if field_schema.get('type', '').startswith('basic:json:'):
                if value and not self.pk:
                    raise ValidationError(
                        'Data object must be `created` before creating `basic:json:` fields')

                if isinstance(value, int):
                    # already in Storage
                    continue

                if isinstance(value, str):
                    file_path = self.location.get_path(filename=value)  # pylint: disable=no-member
                    if os.path.isfile(file_path):
                        try:
                            with open(file_path) as file_handler:
                                value = json.load(file_handler)
                        except json.JSONDecodeError:
                            with open(file_path) as file_handler:
                                content = file_handler.read()
                                content = content.rstrip()
                                raise ValidationError(
                                    "Value of '{}' must be a valid JSON, current: {}".format(name, content)
                                )

                storage = self.storages.create(  # pylint: disable=no-member
                    name='Storage for data id {}'.format(self.pk),
                    contributor=self.contributor,
                    json=value,
                )

                # `value` is copied by value, so `fields[name]` must be changed
                fields[name] = storage.pk
Example #38
0
    def handle_finish(self, obj):
        """Handle an incoming ``Data`` finished processing request.

        :param obj: The Channels message object. Command object format:

            .. code-block:: none

                {
                    'command': 'finish',
                    'data_id': [id of the :class:`~resolwe.flow.models.Data` object
                               this command changes],
                    'process_rc': [exit status of the processing]
                    'spawn_processes': [optional; list of spawn dictionaries],
                    'exported_files_mapper': [if spawn_processes present]
                }
        """
        data_id = obj[ExecutorProtocol.DATA_ID]
        logger.debug(
            __("Finishing Data with id {} (handle_finish).", data_id),
            extra={
                'data_id': data_id,
                'packet': obj
            }
        )
        spawning_failed = False
        with transaction.atomic():
            # Spawn any new jobs in the request.
            spawned = False
            if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj:
                if is_testing():
                    # NOTE: This is a work-around for Django issue #10827
                    # (https://code.djangoproject.com/ticket/10827), same as in
                    # TestCaseHelpers._pre_setup(). Because the listener is running
                    # independently, it must clear the cache on its own.
                    ContentType.objects.clear_cache()

                spawned = True
                exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES]
                logger.debug(
                    __("Spawning new Data objects for Data with id {} (handle_finish).", data_id),
                    extra={
                        'data_id': data_id
                    }
                )

                try:
                    # This transaction is needed because we're running
                    # asynchronously with respect to the main Django code
                    # here; the manager can get nudged from elsewhere.
                    with transaction.atomic():
                        parent_data = Data.objects.get(pk=data_id)

                        # Spawn processes.
                        for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]:
                            d['contributor'] = parent_data.contributor
                            d['process'] = Process.objects.filter(slug=d['process']).latest()
                            d['tags'] = parent_data.tags

                            for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                                type_ = field_schema['type']
                                name = field_schema['name']
                                value = fields[name]

                                if type_ == 'basic:file:':
                                    fields[name] = self.hydrate_spawned_files(
                                        exported_files_mapper, value, data_id
                                    )
                                elif type_ == 'list:basic:file:':
                                    fields[name] = [self.hydrate_spawned_files(exported_files_mapper, fn, data_id)
                                                    for fn in value]

                            with transaction.atomic():
                                d = Data.objects.create(**d)
                                DataDependency.objects.create(
                                    parent=parent_data,
                                    child=d,
                                    kind=DataDependency.KIND_SUBPROCESS,
                                )

                                # Copy permissions.
                                copy_permissions(parent_data, d)

                                # Entity is added to the collection only when it is
                                # created - when it only contains 1 Data object.
                                entities = Entity.objects.filter(data=d).annotate(num_data=Count('data')).filter(
                                    num_data=1)

                                # Copy collections.
                                for collection in parent_data.collection_set.all():
                                    collection.data.add(d)

                                    # Add entities to which data belongs to the collection.
                                    for entity in entities:
                                        entity.collections.add(collection)

                except Exception:  # pylint: disable=broad-except
                    logger.error(
                        __(
                            "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}",
                            parent_data.process.slug,
                            traceback.format_exc()
                        ),
                        extra={
                            'data_id': data_id
                        }
                    )
                    spawning_failed = True

            # Data wrap up happens last, so that any triggered signals
            # already see the spawned children. What the children themselves
            # see is guaranteed by the transaction we're in.
            if ExecutorProtocol.FINISH_PROCESS_RC in obj:
                process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC]

                try:
                    d = Data.objects.get(pk=data_id)
                except Data.DoesNotExist:
                    logger.warning(
                        "Data object does not exist (handle_finish).",
                        extra={
                            'data_id': data_id,
                        }
                    )
                    async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR})
                    return

                changeset = {
                    'process_progress': 100,
                    'finished': now(),
                }

                if spawning_failed:
                    changeset['status'] = Data.STATUS_ERROR
                    changeset['process_error'] = ["Error while preparing spawned Data objects"]

                elif process_rc == 0 and d.status != Data.STATUS_ERROR:
                    changeset['status'] = Data.STATUS_DONE

                else:
                    changeset['status'] = Data.STATUS_ERROR
                    changeset['process_rc'] = process_rc

                obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset
                self.handle_update(obj, internal_call=True)

        if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
            # Purge worker is not running in test runner, so we should skip triggering it.
            if not is_testing():
                channel_layer = get_channel_layer()
                try:
                    async_to_sync(channel_layer.send)(
                        CHANNEL_PURGE_WORKER,
                        {
                            'type': TYPE_PURGE_RUN,
                            'location_id': d.location.id,
                            'verbosity': self._verbosity,
                        }
                    )
                except ChannelFull:
                    logger.warning(
                        "Cannot trigger purge because channel is full.",
                        extra={'data_id': data_id}
                    )

        # Notify the executor that we're done.
        async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK})

        # Now nudge the main manager to perform final cleanup. This is
        # needed even if there was no spawn baggage, since the manager
        # may need to know when executors have finished, to keep count
        # of them and manage synchronization.
        async_to_sync(consumer.send_event)({
            WorkerProtocol.COMMAND: WorkerProtocol.FINISH,
            WorkerProtocol.DATA_ID: data_id,
            WorkerProtocol.FINISH_SPAWNED: spawned,
            WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
                'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
            },
        })
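
The status decision at the end of the handler condenses to a small pure function. The sketch below is for illustration only: ``build_finish_changeset`` is a hypothetical helper, and the status constants stand in for the corresponding ``Data`` model values.

STATUS_DONE = 'OK'    # stand-in for Data.STATUS_DONE
STATUS_ERROR = 'ER'   # stand-in for Data.STATUS_ERROR

def build_finish_changeset(process_rc, current_status, spawning_failed, finished_at):
    """Sketch of the final status decision made in handle_finish."""
    changeset = {'process_progress': 100, 'finished': finished_at}
    if spawning_failed:
        changeset['status'] = STATUS_ERROR
        changeset['process_error'] = ["Error while preparing spawned Data objects"]
    elif process_rc == 0 and current_status != STATUS_ERROR:
        changeset['status'] = STATUS_DONE
    else:
        changeset['status'] = STATUS_ERROR
        changeset['process_rc'] = process_rc
    return changeset
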
Example #39
0
def hydrate_input_references(input_, input_schema, hydrate_values=True):
    """Hydrate ``input_`` with linked data.

    Find fields with complex data:<...> types in ``input_``.
    Assign an output of corresponding data object to those fields.

    """
    from .data import Data  # prevent circular import

    for field_schema, fields in iterate_fields(input_, input_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('data:'):
                if value is None:
                    continue

                try:
                    data = Data.objects.get(id=value)
                except Data.DoesNotExist:
                    fields[name] = {}
                    continue

                output = copy.deepcopy(data.output)
                if hydrate_values:
                    _hydrate_values(output, data.process.output_schema, data)
                output["__id"] = data.id
                output["__type"] = data.process.type
                output["__descriptor"] = data.descriptor
                output["__entity_name"] = None
                output["__output_schema"] = data.process.output_schema

                entity = data.entity_set.values('name').first()
                if entity:
                    output["__entity_name"] = entity['name']

                fields[name] = output

            elif field_schema['type'].startswith('list:data:'):
                outputs = []
                for val in value:
                    if val is None:
                        continue

                    try:
                        data = Data.objects.get(id=val)
                    except Data.DoesNotExist:
                        outputs.append({})
                        continue

                    output = copy.deepcopy(data.output)
                    if hydrate_values:
                        _hydrate_values(output, data.process.output_schema, data)

                    output["__id"] = data.id
                    output["__type"] = data.process.type
                    output["__descriptor"] = data.descriptor
                    output["__output_schema"] = data.process.output_schema

                    entity = data.entity_set.values('name').first()
                    if entity:
                        output["__entity_name"] = entity['name']

                    outputs.append(output)

                fields[name] = outputs
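
A minimal sketch of a call site, assuming a ``Data`` object whose inputs reference other objects by id; the primary key is made up and only attributes already used above (``input``, ``process.input_schema``) are touched.

import copy

data = Data.objects.get(pk=42)  # hypothetical id
inputs = copy.deepcopy(data.input)
hydrate_input_references(inputs, data.process.input_schema, hydrate_values=False)
# Each 'data:' field in ``inputs`` now holds a copy of the referenced object's
# output, annotated with '__id', '__type', '__descriptor', '__entity_name' and
# '__output_schema'; missing references are replaced with an empty dict.
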
Example #40
0
def hydrate_size(data, force=False):
    """Add file and dir sizes.

    Add sizes to ``basic:file:``, ``list:basic:file:``, ``basic:dir:``
    and ``list:basic:dir:`` fields.

    ``force`` parameter is used to recompute file sizes also on objects
    that already have these values, e.g. in migrations.
    """
    from .data import Data  # prevent circular import

    def get_dir_size(path):
        """Get directory size."""
        total_size = 0
        for dirpath, _, filenames in os.walk(path):
            for file_name in filenames:
                file_path = os.path.join(dirpath, file_name)
                if not os.path.isfile(file_path):  # Skip all "not normal" files (links, ...)
                    continue
                total_size += os.path.getsize(file_path)
        return total_size

    def get_refs_size(obj, obj_path):
        """Calculate size of all references of ``obj``.

        :param dict obj: Data object's output field (of type file/dir).
        :param str obj_path: Path to ``obj``.
        """
        total_size = 0
        for ref in obj.get('refs', []):
            ref_path = data.location.get_path(filename=ref)
            if ref_path in obj_path:
                # It is a common case that ``obj['file']`` is also contained in
                # one of ``obj['refs']``. In that case, make sure its size is
                # not counted twice:
                continue
            if os.path.isfile(ref_path):
                total_size += os.path.getsize(ref_path)
            elif os.path.isdir(ref_path):
                total_size += get_dir_size(ref_path)

        return total_size

    def add_file_size(obj):
        """Add file size to the basic:file field."""
        if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj and not force:
            return

        path = data.location.get_path(filename=obj['file'])
        if not os.path.isfile(path):
            raise ValidationError("Referenced file does not exist ({})".format(path))

        obj['size'] = os.path.getsize(path)
        obj['total_size'] = obj['size'] + get_refs_size(obj, path)

    def add_dir_size(obj):
        """Add directory size to the basic:dir field."""
        if data.status in [Data.STATUS_DONE, Data.STATUS_ERROR] and 'size' in obj and not force:
            return

        path = data.location.get_path(filename=obj['dir'])
        if not os.path.isdir(path):
            raise ValidationError("Referenced dir does not exist ({})".format(path))

        obj['size'] = get_dir_size(path)
        obj['total_size'] = obj['size'] + get_refs_size(obj, path)

    data_size = 0
    for field_schema, fields in iterate_fields(data.output, data.process.output_schema):
        name = field_schema['name']
        value = fields[name]
        if 'type' in field_schema:
            if field_schema['type'].startswith('basic:file:'):
                add_file_size(value)
                data_size += value.get('total_size', 0)
            elif field_schema['type'].startswith('list:basic:file:'):
                for obj in value:
                    add_file_size(obj)
                    data_size += obj.get('total_size', 0)
            elif field_schema['type'].startswith('basic:dir:'):
                add_dir_size(value)
                data_size += value.get('total_size', 0)
            elif field_schema['type'].startswith('list:basic:dir:'):
                for obj in value:
                    add_dir_size(obj)
                    data_size += obj.get('total_size', 0)

    data.size = data_size
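
A minimal sketch of how the helper might be invoked once the output files are in place; the id is made up, and persisting the result via ``save(update_fields=...)`` is an assumption about the surrounding model code.

data = Data.objects.get(pk=42)               # hypothetical id
hydrate_size(data, force=True)               # recompute even if sizes are already set
data.save(update_fields=['output', 'size'])  # assumed persistence step
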
Example #41
0
def validate_schema(instance, schema, test_required=True, data_location=None,
                    skip_missing_data=False):
    """Check if DictField values are consistent with our data types.

    Perform basic JSON schema validation and our custom validations:

      * check that required fields are given (if `test_required` is set
        to ``True``)
      * check if ``basic:file:`` and ``list:basic:file:`` fields match
        the regex given in the schema (only if ``validate_regex`` is
        defined in the schema for the corresponding fields) and exist
        (only if ``data_location`` is given)
      * check if directories referenced in ``basic:dir:`` and
        ``list:basic:dir:`` fields exist (only if ``data_location`` is
        given)
      * check that referenced ``Data`` objects (in ``data:<data_type>``
        and ``list:data:<data_type>`` fields) exist and are of type
        ``<data_type>``
      * check that referenced ``Storage`` objects (in ``basic:json:``
        fields) exist

    :param list instance: Instance to be validated
    :param list schema: Schema for validation
    :param bool test_required: Flag for testing if all required fields
        are present. It is useful if validation is run before the ``Data``
        object is finished and some fields are still missing
        (default: ``True``)
    :param :class:`~resolwe.flow.models.data.DataLocation` data_location:
        data location used for checking if files and directories exist
        (default: ``None``)
    :param bool skip_missing_data: Don't raise an error if referenced
        ``Data`` object does not exist
    :rtype: None
    :raises ValidationError: if ``instance`` doesn't match schema
        defined in ``schema``

    """
    from .storage import Storage  # Prevent circular import.

    path_prefix = None
    if data_location:
        path_prefix = data_location.get_path()

    def validate_refs(field):
        """Validate reference paths."""
        for ref_filename in field.get('refs', []):
            ref_path = os.path.join(path_prefix, ref_filename)
            if not os.path.exists(ref_path):
                raise ValidationError("Path referenced in `refs` ({}) does not exist.".format(ref_path))
            if not (os.path.isfile(ref_path) or os.path.isdir(ref_path)):
                raise ValidationError(
                    "Path referenced in `refs` ({}) is neither a file or directory.".format(ref_path))

    def validate_file(field, regex):
        """Validate file name (and check that it exists)."""
        filename = field['file']

        if regex and not re.search(regex, filename):
            raise ValidationError(
                "File name {} does not match regex {}".format(filename, regex))

        if path_prefix:
            path = os.path.join(path_prefix, filename)
            if not os.path.exists(path):
                raise ValidationError("Referenced path ({}) does not exist.".format(path))
            if not os.path.isfile(path):
                raise ValidationError("Referenced path ({}) is not a file.".format(path))

            validate_refs(field)

    def validate_dir(field):
        """Check that dirs and referenced files exists."""
        dirname = field['dir']

        if path_prefix:
            path = os.path.join(path_prefix, dirname)
            if not os.path.exists(path):
                raise ValidationError("Referenced path ({}) does not exist.".format(path))
            if not os.path.isdir(path):
                raise ValidationError("Referenced path ({}) is not a directory.".format(path))

            validate_refs(field)

    def validate_data(data_pk, type_):
        """Check that `Data` objects exist and is of right type."""
        from .data import Data  # prevent circular import

        data_qs = Data.objects.filter(pk=data_pk).values('process__type')
        if not data_qs.exists():
            if skip_missing_data:
                return

            raise ValidationError(
                "Referenced `Data` object does not exist (id:{})".format(data_pk))
        data = data_qs.first()
        if not data['process__type'].startswith(type_):
            raise ValidationError(
                "Data object of type `{}` is required, but type `{}` is given. "
                "(id:{})".format(type_, data['process__type'], data_pk))

    def validate_range(value, interval, name):
        """Check that given value is inside the specified range."""
        if not interval:
            return

        if value < interval[0] or value > interval[1]:
            raise ValidationError(
                "Value of field '{}' is out of range. It should be between {} and {}.".format(
                    name, interval[0], interval[1]
                )
            )

    is_dirty = False
    dirty_fields = []
    for _schema, _fields, _ in iterate_schema(instance, schema):
        name = _schema['name']
        is_required = _schema.get('required', True)

        if test_required and is_required and name not in _fields:
            is_dirty = True
            dirty_fields.append(name)

        if name in _fields:
            field = _fields[name]
            type_ = _schema.get('type', "")

            # Treat None as if the field is missing.
            if not is_required and field is None:
                continue

            try:
                jsonschema.validate([{"type": type_, "value": field}], TYPE_SCHEMA)
            except jsonschema.exceptions.ValidationError as ex:
                raise ValidationError(ex.message)

            choices = [choice['value'] for choice in _schema.get('choices', [])]
            allow_custom_choice = _schema.get('allow_custom_choice', False)
            if choices and not allow_custom_choice and field not in choices:
                raise ValidationError(
                    "Value of field '{}' must match one of predefined choices. "
                    "Current value: {}".format(name, field)
                )

            if type_ == 'basic:file:':
                validate_file(field, _schema.get('validate_regex'))

            elif type_ == 'list:basic:file:':
                for obj in field:
                    validate_file(obj, _schema.get('validate_regex'))

            elif type_ == 'basic:dir:':
                validate_dir(field)

            elif type_ == 'list:basic:dir:':
                for obj in field:
                    validate_dir(obj)

            elif type_ == 'basic:json:' and not Storage.objects.filter(pk=field).exists():
                raise ValidationError(
                    "Referenced `Storage` object does not exist (id:{})".format(field))

            elif type_.startswith('data:'):
                validate_data(field, type_)

            elif type_.startswith('list:data:'):
                for data_id in field:
                    validate_data(data_id, type_[5:])  # remove `list:` from type

            elif type_ == 'basic:integer:' or type_ == 'basic:decimal:':
                validate_range(field, _schema.get('range'), name)

            elif type_ == 'list:basic:integer:' or type_ == 'list:basic:decimal:':
                for obj in field:
                    validate_range(obj, _schema.get('range'), name)

    try:
        # Check that schema definitions exist for all fields
        for _, _ in iterate_fields(instance, schema):
            pass
    except KeyError as ex:
        raise ValidationError(str(ex))

    if is_dirty:
        dirty_fields = ['"{}"'.format(field) for field in dirty_fields]
        raise DirtyError("Required fields {} not given.".format(', '.join(dirty_fields)))
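
A minimal sketch of a call site, assuming a ``Process`` with a made-up slug and a made-up input value; ``Process``, ``DirtyError`` and ``ValidationError`` are the names used by the surrounding module.

process = Process.objects.get(slug='upload-fastq')  # hypothetical slug
try:
    validate_schema({'src': {'file': 'reads.fastq.gz'}}, process.input_schema)
except DirtyError as err:
    print('Missing required fields:', err)
except ValidationError as err:
    print('Invalid input:', err)
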
Example #42
0
    def run_process(self, process_slug, input_={}, assert_status=Data.STATUS_DONE,
                    descriptor=None, descriptor_schema=None, verbosity=0, tags=None):
        """Run the specified process with the given inputs.

        If an input is a file, its path should be given relative to the
        ``tests/files`` directory of a Django application.
        If ``assert_status`` is given, check if
        :class:`~resolwe.flow.models.Data` object's status matches
        it after the process has finished.

        .. note::

            If you need to delay calling the manager, you must put the
            desired code in a ``with transaction.atomic()`` block.

        :param str process_slug: slug of the
            :class:`~resolwe.flow.models.Process` to run

        :param dict ``input_``: :class:`~resolwe.flow.models.Process`'s
            input parameters

            .. note::

                You don't have to specify parameters with defined
                default values.

        :param str ``assert_status``: desired status of the
            :class:`~resolwe.flow.models.Data` object

        :param dict descriptor: descriptor to set on the
            :class:`~resolwe.flow.models.Data` object

        :param dict descriptor_schema: descriptor schema to set on the
            :class:`~resolwe.flow.models.Data` object

        :param list tags: list of tags that will be added to the created
            :class:`~resolwe.flow.models.Data` object

        :return: object created by
            :class:`~resolwe.flow.models.Process`
        :rtype: ~resolwe.flow.models.Data

        """
        # Copy input_, to avoid mutation that would occur in ``mock_upload``
        input_ = input_.copy()

        # backward compatibility
        process_slug = slugify(process_slug.replace(':', '-'))

        # Enforce correct process tags.
        if getattr(settings, 'TEST_PROCESS_REQUIRE_TAGS', False) and not self._preparation_stage:
            test = getattr(self, self._testMethodName)
            if not has_process_tag(test, process_slug):
                self.fail(
                    'Tried to run process with slug "{0}" outside of preparation_stage\n'
                    'block while test is not tagged for this process. Either tag the\n'
                    'test using tag_process decorator or move this under the preparation\n'
                    'stage block if this process is only used to prepare upstream inputs.\n'
                    '\n'
                    'To tag the test you can add the following decorator:\n'
                    '    @tag_process(\'{0}\')\n'
                    ''.format(process_slug)
                )

        self._executed_processes.add(process_slug)

        process = Process.objects.filter(slug=process_slug).order_by('-version').first()

        if process is None:
            self.fail('No process with slug "{}"'.format(process_slug))

        def mock_upload(file_path):
            """Mock file upload."""
            def is_url(path):
                """Check if path is a URL."""
                validate = URLValidator()
                try:
                    validate(path)
                except (ValueError, ValidationError):
                    return False
                return True

            if is_url(file_path):
                return {
                    'file': file_path,
                    'file_temp': file_path,
                    'is_remote': True,
                }
            else:
                old_path = os.path.join(self.files_path, file_path)
                if not os.path.isfile(old_path):
                    raise RuntimeError('Missing file: {}'.format(old_path))

                file_basename = os.path.basename(file_path)

                file_temp = '{}_{}'.format(file_basename, uuid.uuid4())
                upload_file_path = os.path.join(self.upload_dir, file_temp)
                # create directories needed by upload_file_path
                upload_file_dir = os.path.dirname(upload_file_path)
                if not os.path.exists(upload_file_dir):
                    os.makedirs(upload_file_dir)

                shutil.copy2(old_path, upload_file_path)
                self._upload_files.append(upload_file_path)
                return {
                    'file': file_basename,
                    'file_temp': file_temp,
                }

        for field_schema, fields in iterate_fields(input_, process.input_schema):
            # copy referenced files to upload dir
            if field_schema['type'] == "basic:file:":
                fields[field_schema['name']] = mock_upload(fields[field_schema['name']])
            elif field_schema['type'] == "list:basic:file:":
                file_list = [mock_upload(file_path) for file_path in fields[field_schema['name']]]
                fields[field_schema['name']] = file_list

            # data: and list:data: inputs (primary keys) are passed through as-is
            if field_schema['type'].startswith('data:'):
                fields[field_schema['name']] = fields[field_schema['name']]
            if field_schema['type'].startswith('list:data:'):
                fields[field_schema['name']] = [obj for obj in fields[field_schema['name']]]

        data = Data.objects.create(
            input=input_,
            contributor=self.admin,
            process=process,
            slug=get_random_string(length=6),
            tags=tags or [],
            descriptor_schema=descriptor_schema,
            descriptor=descriptor or {})
        self.collection.data.add(data)

        # Fetch latest Data object from database
        data = Data.objects.get(pk=data.pk)

        if assert_status:
            if not transaction.get_autocommit() and assert_status == Data.STATUS_DONE:
                # We are in an atomic transaction block, hence the data object will not be done
                # until after the block. Therefore the expected status is resolving.
                assert_status = Data.STATUS_RESOLVING
            self.assertStatus(data, assert_status)

        # Purge is normally called in an async worker, so we have to emulate the call.
        if data.location:
            purge.location_purge(location_id=data.location.id, delete=True)

        return data
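
A minimal sketch of a test built on ``run_process``; the process slug, tag and input file are made up, and the assertions rely only on what the method above guarantees.

    # A hypothetical test method on the same test-case class:
    @tag_process('upload-fastq')               # made-up slug
    def test_upload(self):
        data = self.run_process('upload-fastq', {'src': 'reads.fastq.gz'})
        # run_process already asserted the default STATUS_DONE, so further
        # checks can focus on the created object itself.
        self.assertEqual(data.process.slug, 'upload-fastq')
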