Example 1
    def handle_get_referenced_files(self, obj):
        """Get a list of files referenced by the data object.

        To get the entire output, this request must be sent after
        processing has finished.

        :param obj: The Channels message object. Command object format:

            .. code-block:: none

                {
                    'command': 'get_referenced_files',
                    'data_id': [id of the :class:`~resolwe.flow.models.Data`
                               object],
                }
        """
        try:
            data_id = obj[ExecutorProtocol.DATA_ID]
            data = Data.objects.get(pk=data_id)
        except Data.DoesNotExist:
            logger.error(
                "Data object does not exist (handle_get_referenced_files).",
                extra={"data_id": data_id},
            )
            self._abort_processing(obj)
            return

        async_to_sync(self._send_reply)(
            obj,
            {
                ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK,
                ExecutorProtocol.REFERENCED_FILES: referenced_files(data),
            },
        )
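For illustration, a sketch of the command payload this handler consumes; the key names follow the docstring above (assuming ExecutorProtocol.DATA_ID maps to 'data_id'), and the id value is made up:

    # Hypothetical command message for handle_get_referenced_files; the
    # Data id is illustrative.
    command = {
        "command": "get_referenced_files",
        "data_id": 42,
    }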
Example 2
    def _prepare_data_dir(self, data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: str
        """
        logger.debug(
            __("Preparing data directory for Data with id {}.", data.id))

        with transaction.atomic():
            file_storage = FileStorage.objects.create()
            # Create StorageLocation with default connector.
            # We must also specify status since it is Uploading by default.
            data_location = StorageLocation.objects.create(
                file_storage=file_storage,
                url=str(file_storage.id),
                status=StorageLocation.STATUS_PREPARING,
                connector_name=STORAGE_LOCAL_CONNECTOR,
            )
            file_storage.data.add(data)

            # Reference 'special' files.
            for file_ in referenced_files(data, include_descriptor=False):
                referenced_path = ReferencedPath.objects.create(path=file_)
                referenced_path.storage_locations.add(data_location)

        output_path = self._get_per_data_dir("DATA_DIR", data_location.url)
        dir_mode = self.settings_actual.get("FLOW_EXECUTOR",
                                            {}).get("DATA_DIR_MODE", 0o755)
        os.mkdir(output_path, mode=dir_mode)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        return output_path
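The directory mode and location come from Django settings; a minimal sketch of the FLOW_EXECUTOR entry the lookups above assume (keys taken from the code, values illustrative):

    # Hypothetical settings excerpt: DATA_DIR is the root resolved by
    # _get_per_data_dir, DATA_DIR_MODE the permissions applied to each
    # per-Data directory.
    FLOW_EXECUTOR = {
        "DATA_DIR": "/var/lib/resolwe/data",
        "DATA_DIR_MODE": 0o755,
    }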
Example 3
    def _prepare_data_dir(self, data: Data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        """
        logger.debug(__("Preparing data directory for Data with id {}.", data.id))
        connector_name = self._get_data_connector_name()
        with transaction.atomic():
            # Create Worker object and set its status to preparing if needed.
            if not Worker.objects.filter(data=data).exists():
                Worker.objects.get_or_create(data=data, status=Worker.STATUS_PREPARING)

            file_storage = FileStorage.objects.create()
            # Data produced by the processing container will be uploaded to the
            # created location.
            data_location = StorageLocation.objects.create(
                file_storage=file_storage,
                url=str(file_storage.id),
                status=StorageLocation.STATUS_PREPARING,
                connector_name=connector_name,
            )
            file_storage.data.add(data)

            # Reference 'special' files.
            for file_ in referenced_files(data, include_descriptor=False):
                referenced_path = ReferencedPath.objects.create(path=file_)
                referenced_path.storage_locations.add(data_location)

        dir_mode = getattr(settings, "FLOW_EXECUTOR", {}).get("DATA_DIR_MODE", 0o755)
        connectors[connector_name].prepare_url(data_location.url, dir_mode=dir_mode)
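Here directory creation is delegated to a storage connector rather than a direct os.mkdir. A minimal sketch of what a local connector's prepare_url could look like, assuming only the (url, dir_mode) signature used above; this is not the actual resolwe-storage implementation:

    import os

    class LocalConnectorSketch:
        """Illustrative local connector; the base path is an assumption."""

        base_path = "/var/lib/resolwe/data"

        def prepare_url(self, url, dir_mode=0o755):
            path = os.path.join(self.base_path, url)
            os.makedirs(path, mode=dir_mode, exist_ok=True)
            # makedirs masks the mode with the process umask, so set the
            # exact permissions explicitly.
            os.chmod(path, dir_mode)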
Example 4
    def assertFieldWorks(self, field_type, field_value, script_setup,
                         script_save, removed, not_removed):
        """
        Checks that a field is handled correctly by `get_purge_files` under a
        simulated Data object.
        """

        field_schema = {
            "name": "sample",
            "label": "Sample output",
            "type": field_type
        }

        # Test simulated operation.
        simulated_root = tempfile.mkdtemp()
        try:
            for filename in removed + not_removed:
                directory, basename = os.path.split(filename)
                if directory:
                    try:
                        os.makedirs(os.path.join(simulated_root, directory))
                    except OSError:
                        pass

                if basename:
                    with open(os.path.join(simulated_root, filename), "w"):
                        pass

            data_mock = MagicMock(
                output={"sample": field_value},
                process=MagicMock(output_schema=[field_schema]),
                descriptor={},
                descriptor_schema=[],
            )
            refs = referenced_files(data_mock)
            collected, deleted = collect_and_purge(simulated_root, refs)

            def strip_slash(filename):
                return filename[:-1] if filename[-1] == "/" else filename

            collected = [strip_slash(e) for e in collected]
            deleted = [strip_slash(e) for e in deleted]

            for filename in not_removed:
                self.assertIn(
                    strip_slash(filename),
                    collected,
                )

            for filename in removed:
                filename = strip_slash(filename)
                self.assertNotIn(filename, collected)
                self.assertIn(filename, deleted)
                deleted.remove(filename)

            # Ensure that nothing more is removed.
            self.assertEqual(len(deleted), 0)
        finally:
            shutil.rmtree(simulated_root)
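A hypothetical invocation of the helper above: the field type string follows Resolwe's output-schema conventions, but the file names and scripts are illustrative, not taken from the test suite:

    # Hypothetical call: keep.txt is referenced by the output field and must
    # survive the purge; stray.txt is unreferenced and must be removed.
    self.assertFieldWorks(
        "basic:file:",
        {"file": "keep.txt"},
        script_setup="touch keep.txt stray.txt",
        script_save="re-save-file sample keep.txt",
        removed=["stray.txt"],
        not_removed=["keep.txt"],
    )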
Example 5
    def handle_get_referenced_files(self, message: Message,
                                    manager: "Processor") -> Response[List]:
        """Get a list of files referenced by the data object.

        The list also depends on the Data object itself, more specifically on
        its output field, so the method is not idempotent.
        """
        return message.respond_ok(referenced_files(manager.data))
Example 6
    def _prepare_data_dir(self, data: Data) -> Path:
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: pathlib.Path
        """
        logger.debug(
            __("Preparing data directory for Data with id {}.", data.id))
        with transaction.atomic():
            # Create Worker object and set its status to preparing if needed.
            if not Worker.objects.filter(data=data).exists():
                Worker.objects.get_or_create(data=data,
                                             status=Worker.STATUS_PREPARING)

            file_storage = FileStorage.objects.create()
            # Create StorageLocation with default connector.
            # We must also specify status since it is Uploading by default.
            data_location = StorageLocation.objects.create(
                file_storage=file_storage,
                url=str(file_storage.id),
                status=StorageLocation.STATUS_PREPARING,
                connector_name=STORAGE_LOCAL_CONNECTOR,
            )
            file_storage.data.add(data)

            # Reference 'special' files.
            for file_ in referenced_files(data, include_descriptor=False):
                referenced_path = ReferencedPath.objects.create(path=file_)
                referenced_path.storage_locations.add(data_location)

        output_path = self._get_per_data_dir("DATA_DIR", data_location.url)
        logger.debug(__("Dispatcher creating data dir {}.", output_path))
        logger.debug(
            __(
                "Prepared location {} for data with id {}: {}.",
                data.location,
                data.id,
                output_path,
            ))
        dir_mode = getattr(settings, "FLOW_EXECUTOR",
                           {}).get("DATA_DIR_MODE", 0o755)
        output_path.mkdir(mode=dir_mode, parents=True)
        return output_path
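Note that Path.mkdir, like os.mkdir, has its mode argument masked by the process umask, which is why Example 2 follows os.mkdir with an explicit os.chmod. A small sketch of guaranteeing the exact mode with pathlib:

    from pathlib import Path

    def make_dir_with_mode(path: Path, mode: int = 0o755) -> None:
        # mkdir's mode is subject to the umask; chmod afterwards to
        # guarantee the exact permissions.
        path.mkdir(mode=mode, parents=True, exist_ok=True)
        path.chmod(mode)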
Example 7
    def handle_update(self, obj, internal_call=False):
        """Handle an incoming ``Data`` object update request.

        :param obj: The Channels message object. Command object format:

            .. code-block:: none

                {
                    'command': 'update',
                    'data_id': [id of the :class:`~resolwe.flow.models.Data`
                               object this command changes],
                    'changeset': {
                        [keys to be changed]
                    }
                }

        :param internal_call: If ``True``, this is an internal delegate
            call, so a reply to the executor won't be sent.
        """
        data_id = obj[ExecutorProtocol.DATA_ID]
        changeset = obj[ExecutorProtocol.UPDATE_CHANGESET]
        if not internal_call:
            logger.debug(
                __("Handling update for Data with id {} (handle_update).",
                   data_id),
                extra={
                    "data_id": data_id,
                    "packet": obj
                },
            )
        try:
            d = Data.objects.get(pk=data_id)
        except Data.DoesNotExist:
            logger.warning(
                "Data object does not exist (handle_update).",
                extra={"data_id": data_id},
            )

            if not internal_call:
                async_to_sync(self._send_reply)(
                    obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR}
                )

            async_to_sync(consumer.send_event)(
                {
                    WorkerProtocol.COMMAND: WorkerProtocol.ABORT,
                    WorkerProtocol.DATA_ID: obj[ExecutorProtocol.DATA_ID],
                    WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
                        "executor": getattr(settings, "FLOW_EXECUTOR", {}).get(
                            "NAME", "resolwe.flow.executors.local"
                        ),
                    },
                }
            )

            return

        if changeset.get("status", None) == Data.STATUS_ERROR:
            logger.error(
                __(
                    "Error occurred while running process '{}' (handle_update).",
                    d.process.slug,
                ),
                extra={
                    "data_id": data_id,
                    "api_url": "{}{}".format(
                        getattr(settings, "RESOLWE_HOST_URL", ""),
                        reverse("resolwe-api:data-detail", kwargs={"pk": data_id}),
                    ),
                },
            )
            self.unlock_all_inputs(data_id)

        if d.status == Data.STATUS_ERROR:
            changeset["status"] = Data.STATUS_ERROR

        if not d.started:
            changeset["started"] = now()
        changeset["modified"] = now()

        for key, val in changeset.items():
            if key in ["process_error", "process_warning", "process_info"]:
                # Trim process_* entries so they do not exceed the maximum
                # length of the database field.
                max_length = Data._meta.get_field(key).base_field.max_length
                for i, entry in enumerate(val):
                    if len(entry) > max_length:
                        val[i] = entry[: max_length - 3] + "..."

                getattr(d, key).extend(val)

            elif key != "output":
                setattr(d, key, val)

        if "output" in changeset:
            if not isinstance(d.output, dict):
                d.output = {}
            for key, val in changeset["output"].items():
                dict_dot(d.output, key, val)

        try:
            d.save(update_fields=list(changeset.keys()))
        except ValidationError as exc:
            logger.error(
                __(
                    "Validation error when saving Data object of process '{}' (handle_update):\n\n{}",
                    d.process.slug,
                    traceback.format_exc(),
                ),
                extra={"data_id": data_id},
            )
            d.refresh_from_db()
            d.process_error.append(exc.message)
            d.status = Data.STATUS_ERROR
            with suppress(Exception):
                d.save(update_fields=["process_error", "status"])
            self.unlock_all_inputs(data_id)

        except Exception:
            logger.error(
                __(
                    "Error when saving Data object of process '{}' (handle_update):\n\n{}",
                    d.process.slug,
                    traceback.format_exc(),
                ),
                extra={"data_id": data_id},
            )

        try:
            # Update the referenced files. Since the entire output is sent
            # every time, simply delete and recreate the objects; computing a
            # diff and updating in place would probably be slower.
            if "output" in changeset:
                storage_location = d.location.default_storage_location
                ReferencedPath.objects.filter(
                    storage_locations=storage_location).delete()
                referenced_paths = [
                    ReferencedPath(path=path)
                    for path in referenced_files(d, include_descriptor=False)
                ]
                ReferencedPath.objects.bulk_create(referenced_paths)
                storage_location.files.add(*referenced_paths)
        except Exception:
            logger.error(
                __(
                    "Error when saving ReferencedFile objects of process '{}' (handle_update):\n\n{}",
                    d.process.slug,
                    traceback.format_exc(),
                ),
                extra={"data_id": data_id},
            )

        if not internal_call:
            async_to_sync(self._send_reply)(
                obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK}
            )
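For illustration, a hypothetical 'update' command payload matching the docstring above, followed by a sketch of how dict_dot merges a dotted output key (behaviour inferred from this handler; ids and values are made up):

    # Hypothetical update message; process_* lists are trimmed and appended,
    # while 'output' keys are merged via dict_dot.
    command = {
        "command": "update",
        "data_id": 42,
        "changeset": {
            "process_info": ["Processing finished."],
            "output": {"stats.reads": 1000},
        },
    }

    # Assumed dict_dot set semantics: write a value into a nested dict along
    # the dotted path.
    output = {}
    dict_dot(output, "stats.reads", 1000)
    assert output == {"stats": {"reads": 1000}}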