def update_tools_configmaps(self, core_api):
        """Create or update configmaps for tools."""

        def dict_from_directory(directory: Path) -> Dict[str, str]:
            """Get dictionary from given directory.

            File names are keys and corresponding file contents are values.
            """
            return {
                entry.name: entry.read_text()
                for entry in directory.glob("*")
                if entry.is_file()
            }

        configmaps = dict()
        for app_name, tool_path in get_apps_tools().items():
            logger.info(__("Processing '{}' from '{}'.", app_name, tool_path))
            data = dict_from_directory(tool_path)
            data_md5 = hashlib.md5(
                json.dumps(data, sort_keys=True).encode()
            ).hexdigest()
            configmap_name = sanitize_kubernetes_label(f"tools-{app_name}-{data_md5}")
            logger.info(__("Assigned configmap name '{}'.", configmap_name))
            self.update_configmap(configmap_name, data, core_api)
            configmaps[sanitize_kubernetes_label(app_name)] = configmap_name

        description_configmap_name = getattr(
            settings, "KUBERNETES_TOOLS_CONFIGMAPS", "tools-configmaps"
        )
        logger.info(__("Updating main configmap '{}'", description_configmap_name))
        self.update_configmap(description_configmap_name, configmaps, core_api)
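
A standalone sketch of the content-addressed naming used above, assuming only the standard library (the helper name configmap_name_for is invented): hashing the sorted JSON dump of the tool files yields a name that changes exactly when any file changes, so unchanged tools keep their existing configmap.

import hashlib
import json
from pathlib import Path


def configmap_name_for(app_name: str, directory: Path) -> str:
    """Derive a deterministic, content-addressed configmap name (sketch)."""
    data = {
        entry.name: entry.read_text()
        for entry in directory.glob("*")
        if entry.is_file()
    }
    digest = hashlib.md5(json.dumps(data, sort_keys=True).encode()).hexdigest()
    # The real code additionally sanitizes the result as a Kubernetes label.
    return f"tools-{app_name}-{digest}"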
Example #2
    def discover_engines(self, executor=None):
        """Discover configured engines.

        :param executor: Optional executor module override
        """
        if executor is None:
            executor = getattr(settings, 'FLOW_EXECUTOR',
                               {}).get('NAME', 'resolwe.flow.executors.local')
        self.executor = self.load_executor(executor)
        logger.info(
            __("Loaded '{}' executor.",
               str(self.executor.__class__.__module__).replace('.prepare',
                                                               '')))

        expression_engines = getattr(settings, 'FLOW_EXPRESSION_ENGINES',
                                     ['resolwe.flow.expression_engines.jinja'])
        self.expression_engines = self.load_expression_engines(
            expression_engines)
        logger.info(
            __("Found {} expression engines: {}", len(self.expression_engines),
               ', '.join(self.expression_engines.keys())))

        execution_engines = getattr(settings, 'FLOW_EXECUTION_ENGINES',
                                    ['resolwe.flow.execution_engines.bash'])
        self.execution_engines = self.load_execution_engines(execution_engines)
        logger.info(
            __("Found {} execution engines: {}", len(self.execution_engines),
               ', '.join(self.execution_engines.keys())))
Example #3
    def build(self, obj=None, push=True):
        """Build indexes."""
        if obj:
            if self.queryset.model != obj._meta.model:  # pylint: disable=protected-access
                return
            if not self.queryset.filter(pk=self.get_object_id(obj)).exists():
                return

        queryset = [obj] if obj else self.queryset.all()

        for obj in queryset:
            if self.filter(obj) is False:
                continue

            try:
                obj = self.preprocess_object(obj)
            except:  # pylint: disable=bare-except
                logger.exception(__(
                    "Error occurred while preprocessing '{}' Elasticsearch index.",
                    self.__class__.__name__),
                                 extra={
                                     'object_type': self.object_type,
                                     'obj_id': obj.pk
                                 })

            try:
                self.process_object(obj, push)
            except:  # pylint: disable=bare-except
                logger.exception(__(
                    "Error occurred while processing '{}' Elasticsearch index.",
                    self.__class__.__name__),
                                 extra={
                                     'object_type': self.object_type,
                                     'obj_id': obj.pk
                                 })
Example #4
    def _cleanup(self, storage_location: StorageLocation):
        """Delete data from StorageLocation object."""
        # Make sure this gets written to the database. The problem is that we
        # are inside a transaction, so what happens if the connection to the
        # database fails or the database crashes?
        connector = storage_location.connector
        if connector is None:
            logger.error(
                __(
                    "Unable to cleanup StorageLocation {}: connector not found.",
                    storage_location.id,
                )
            )
            return
        try:
            # Delete the storage location (files will also be removed).
            logger.info(__("Deleting StorageLocation {}.", storage_location.pk))
            storage_location.delete()

        except Exception:
            logger.exception(
                __(
                    "Exception deleting StorageLocation {}.",
                    storage_location.id,
                )
            )
Example #5
    async def communicate(self,
                          data_id=None,
                          run_sync=False,
                          save_settings=True):
        """Scan database for resolving Data objects and process them.

        This is submitted as a task to the manager's channel workers.

        :param data_id: Optional id of Data object which (+ its
            children) should be processed. If it is not given, all
            resolving objects are processed.
        :param run_sync: If ``True``, wait until all processes spawned
            from this point on have finished processing. If no processes
            are spawned, this results in a deadlock, since counts are
            handled on process finish.
        :param save_settings: If ``True``, save the current Django
            settings context to the global state. This should never be
            ``True`` for "automatic" calls, such as from Django signals,
            which can be invoked from inappropriate contexts (such as in
            the listener). For user code, it should be left at the
            default value. The saved settings are in effect until the
            next such call.
        """
        executor = getattr(settings, 'FLOW_EXECUTOR',
                           {}).get('NAME', 'resolwe.flow.executors.local')
        logger.debug(
            __(
                "Manager sending communicate command on '{}' triggered by Data with id {}.",
                state.MANAGER_CONTROL_CHANNEL,
                data_id,
            ))

        saved_settings = self.state.settings_override
        if save_settings:
            saved_settings = self._marshal_settings()
            self.state.settings_override = saved_settings

        try:
            await consumer.send_event({
                WorkerProtocol.COMMAND: WorkerProtocol.COMMUNICATE,
                WorkerProtocol.COMMUNICATE_SETTINGS: saved_settings,
                WorkerProtocol.COMMUNICATE_EXTRA: {
                    'data_id': data_id,
                    'executor': executor,
                },
            })
        except ChannelFull:
            logger.exception(
                "ChannelFull error occurred while sending communicate message."
            )

        if run_sync and not self.sync_counter.active:
            logger.debug(
                __("Manager on channel '{}' entering synchronization block.",
                   state.MANAGER_CONTROL_CHANNEL))
            await self.execution_barrier()
            logger.debug(
                __("Manager on channel '{}' exiting synchronization block.",
                   state.MANAGER_CONTROL_CHANNEL))
Example #6
    def submit(self, data: Data, argv):
        """Run process with SLURM.

        For details, see
        :meth:`~resolwe.flow.managers.workload_connectors.base.BaseConnector.submit`.
        """
        limits = data.get_resource_limits()
        logger.debug(
            __(
                "Connector '{}' running for Data with id {} ({}).",
                self.__class__.__module__,
                data.id,
                repr(argv),
            ))

        # Compute target partition.
        partition = getattr(settings, "FLOW_SLURM_PARTITION_DEFAULT", None)
        if data.process.slug in getattr(settings,
                                        "FLOW_SLURM_PARTITION_OVERRIDES", {}):
            partition = settings.FLOW_SLURM_PARTITION_OVERRIDES[
                data.process.slug]

        try:
            # Make sure the resulting file is executable on creation.
            runtime_dir = storage_settings.FLOW_VOLUMES["runtime"]["config"][
                "path"]
            script_path = os.path.join(runtime_dir,
                                       "slurm-{}.sh".format(data.pk))
            file_descriptor = os.open(script_path,
                                      os.O_WRONLY | os.O_CREAT,
                                      mode=0o555)
            with os.fdopen(file_descriptor, "wt") as script:
                script.write("#!/bin/bash\n")
                script.write(
                    "#SBATCH --mem={}M\n".format(limits["memory"] +
                                                 EXECUTOR_MEMORY_OVERHEAD))
                script.write("#SBATCH --cpus-per-task={}\n".format(
                    limits["cores"]))
                if partition:
                    script.write("#SBATCH --partition={}\n".format(partition))
                    script.write(
                        "#SBATCH --output slurm-url-{}-job-%j.out\n".format(
                            data.location.subpath))

                # Render the argument vector into a command line.
                line = " ".join(map(shlex.quote, argv))
                script.write(line + "\n")

            command = ["/usr/bin/env", "sbatch", script_path]
            subprocess.Popen(command,
                             cwd=runtime_dir,
                             stdin=subprocess.DEVNULL).wait()
        except OSError as err:
            logger.error(
                __(
                    "OSError occurred while preparing SLURM script for Data {}: {}",
                    data.id,
                    err,
                ))
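
A self-contained sketch of the script-generation pattern above (paths, limits, and the function name are invented; this is not the Resolwe connector): opening with O_CREAT and mode 0o555 makes the script executable from the start, and shlex.quote keeps the rendered command line safe.

import os
import shlex
import tempfile


def write_sbatch_script(argv, memory_mb=4096, cores=1, partition=None):
    """Write a minimal sbatch script and return its path (illustration only)."""
    runtime_dir = tempfile.mkdtemp()
    script_path = os.path.join(runtime_dir, "slurm-example.sh")
    # Permissions are applied at creation time; the open descriptor stays writable.
    file_descriptor = os.open(script_path, os.O_WRONLY | os.O_CREAT, mode=0o555)
    with os.fdopen(file_descriptor, "wt") as script:
        script.write("#!/bin/bash\n")
        script.write("#SBATCH --mem={}M\n".format(memory_mb))
        script.write("#SBATCH --cpus-per-task={}\n".format(cores))
        if partition:
            script.write("#SBATCH --partition={}\n".format(partition))
        script.write(" ".join(map(shlex.quote, argv)) + "\n")
    return script_path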
Example #7
    def _data_execute(self, data: Data):
        """Execute the Data object.

        The activities carried out here include target directory
        preparation, executor copying, setting serialization and actual
        execution of the object.

        :param data: The :class:`~resolwe.flow.models.Data` object to
            execute.
        """
        logger.debug(__("Manager preparing Data with id {} for processing.", data.id))

        # Prepare the executor's environment.
        try:
            self._prepare_data_dir(data)

            executor_module = ".{}".format(
                getattr(settings, "FLOW_EXECUTOR", {})
                .get("NAME", "resolwe.flow.executors.local")
                .rpartition(".executors.")[-1]
            )
            self._lock_inputs_local_storage_locations(data)

            argv = [
                "/bin/bash",
                "-c",
                getattr(settings, "FLOW_EXECUTOR", {}).get(
                    "PYTHON", "/usr/bin/env python"
                )
                + " -m executors "
                + executor_module
                + " {}".format(data.pk),
            ]
            self.executor.prepare_for_execution(data)
        except PermissionDenied as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append("Permission denied for process: {}".format(error))
            data.save()
            if hasattr(data, "worker"):
                data.worker.status = Worker.STATUS_ERROR_PREPARING
                data.worker.save()
            return
        except OSError as err:
            logger.exception(
                __(
                    "OSError occurred while preparing data {} (will skip): {}",
                    data.id,
                    err,
                )
            )
            if hasattr(data, "worker"):
                data.worker.status = Worker.STATUS_ERROR_PREPARING
                data.worker.save()
            return

        # Hand off to the run() method for execution.
        logger.info(__("Running executor for data with id {}", data.pk))
        self.run(data, argv)
Example #8
def data_purge(data_ids=None, delete=False, verbosity=0):
    """Print files not referenced from meta data.

    If data_ids not given, run on all data objects.
    If delete is True, delete unreferenced files.

    """
    data_path = settings.FLOW_EXECUTOR['DATA_DIR']
    runtime_path = settings.FLOW_EXECUTOR['RUNTIME_DIR']
    unreferenced_files = set()

    data_qs = Data.objects.filter(
        status__in=[Data.STATUS_DONE, Data.STATUS_ERROR])
    if data_ids is not None:
        data_qs = data_qs.filter(pk__in=data_ids)

    for data in data_qs:
        root = os.path.join(data_path, str(data.id))

        unreferenced_files.update(
            get_purge_files(root, data.output, data.process.output_schema,
                            data.descriptor,
                            getattr(data.descriptor_schema, 'schema', [])))

    # Remove any folders which do not belong to any data object.
    if data_ids is None:
        for base_path in (data_path, runtime_path):
            for directory in os.listdir(base_path):
                directory_path = os.path.join(base_path, directory)
                if not os.path.isdir(directory_path):
                    continue

                try:
                    data_id = int(directory)
                except ValueError:
                    continue

                # Check if a data object with the given identifier exists.
                if not Data.objects.filter(pk=data_id).exists():
                    unreferenced_files.add(directory_path)

    if verbosity >= 1:
        # Print unreferenced files
        if unreferenced_files:
            logger.info(__("Unreferenced files ({}):",
                           len(unreferenced_files)))
            for name in unreferenced_files:
                logger.info(__("  {}", name))
        else:
            logger.info("No unreferenced files")

    # Go through unreferenced files and delete them.
    if delete:
        for name in unreferenced_files:
            if os.path.isfile(name) or os.path.islink(name):
                os.remove(name)
            elif os.path.isdir(name):
                shutil.rmtree(name)
Example #9
    async def communicate(self, data_id=None, run_sync=False, save_settings=True):
        """Scan database for resolving Data objects and process them.

        This is submitted as a task to the manager's channel workers.

        :param data_id: Optional id of Data object which (+ its
            children) should be processed. If it is not given, all
            resolving objects are processed.
        :param run_sync: If ``True``, wait until all processes spawned
            from this point on have finished processing. If no processes
            are spawned, this results in a deadlock, since counts are
            handled on process finish.
        :param save_settings: If ``True``, save the current Django
            settings context to the global state. This should never be
            ``True`` for "automatic" calls, such as from Django signals,
            which can be invoked from inappropriate contexts (such as in
            the listener). For user code, it should be left at the
            default value. The saved settings are in effect until the
            next such call.
        """
        executor = getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local')
        logger.debug(__(
            "Manager sending communicate command on '{}' triggered by Data with id {}.",
            state.MANAGER_CONTROL_CHANNEL,
            data_id,
        ))

        saved_settings = self.state.settings_override
        if save_settings:
            saved_settings = self._marshal_settings()
            self.state.settings_override = saved_settings

        if run_sync:
            self._ensure_counter()
        await self.sync_counter.inc('communicate')
        try:
            await consumer.send_event({
                WorkerProtocol.COMMAND: WorkerProtocol.COMMUNICATE,
                WorkerProtocol.COMMUNICATE_SETTINGS: saved_settings,
                WorkerProtocol.COMMUNICATE_EXTRA: {
                    'data_id': data_id,
                    'executor': executor,
                },
            })
        except ChannelFull:
            logger.exception("ChannelFull error occurred while sending communicate message.")
            await self.sync_counter.dec('communicate')

        if run_sync and not self.sync_counter.active:
            logger.debug(__(
                "Manager on channel '{}' entering synchronization block.",
                state.MANAGER_CONTROL_CHANNEL
            ))
            await self.execution_barrier()
            logger.debug(__(
                "Manager on channel '{}' exiting synchronization block.",
                state.MANAGER_CONTROL_CHANNEL
            ))
Example #10
    async def handle_control_event(self, message):
        """Handle an event from the Channels layer.

        Channels layer callback, do not call directly.
        """
        cmd = message[WorkerProtocol.COMMAND]
        logger.debug(__("Manager worker got channel command '{}'.", cmd))

        # Prepare settings for use; Django overlaid by state overlaid by
        # anything immediate in the current packet.
        immediates = {}
        if cmd == WorkerProtocol.COMMUNICATE:
            immediates = message.get(WorkerProtocol.COMMUNICATE_SETTINGS, {}) or {}
        override = self.state.settings_override or {}
        override.update(immediates)
        self.settings_actual = self._marshal_settings()
        self.settings_actual.update(override)

        if cmd == WorkerProtocol.COMMUNICATE:
            try:
                await database_sync_to_async(self._data_scan)(**message[WorkerProtocol.COMMUNICATE_EXTRA])
            finally:
                await self.sync_counter.dec('communicate')

        elif cmd == WorkerProtocol.FINISH:
            try:
                data_id = message[WorkerProtocol.DATA_ID]
                if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
                    try:
                        def handle_error(func, path, exc_info):
                            """Handle permission errors while removing data directories."""
                            if isinstance(exc_info[1], PermissionError):
                                os.chmod(path, 0o700)
                                shutil.rmtree(path)

                        # Remove secrets directory, but leave the rest of the runtime directory
                        # intact. Runtime directory will be removed during data purge, when the
                        # data object is removed.
                        secrets_dir = os.path.join(
                            self._get_per_data_dir('RUNTIME_DIR', data_id),
                            ExecutorFiles.SECRETS_DIR
                        )
                        shutil.rmtree(secrets_dir, onerror=handle_error)
                    except OSError:
                        logger.exception("Manager exception while removing data runtime directory.")

                if message[WorkerProtocol.FINISH_SPAWNED]:
                    await database_sync_to_async(self._data_scan)(**message[WorkerProtocol.FINISH_COMMUNICATE_EXTRA])
            finally:
                await self.sync_counter.dec('executor')

        elif cmd == WorkerProtocol.ABORT:
            await self.sync_counter.dec('executor')

        else:
            logger.error(__("Ignoring unknown manager control command '{}'.", cmd))
Example #11
    async def communicate(self, data_id=None, run_sync=False):
        """Scan database for resolving Data objects and process them.

        This is submitted as a task to the manager's channel workers.

        :param data_id: Optional id of Data object which (+ its
            children) should be processed. If it is not given, all
            resolving objects are processed.
        :param run_sync: If ``True``, wait until all processes spawned
            from this point on have finished processing.
        """
        first_sync_call = False
        if run_sync and self._sync_finished_event is None:
            first_sync_call = True
            self._sync_finished_event = asyncio.Event()

        self._messages_processing += 1
        logger.debug(
            __(
                "Manager sending communicate command on '{}' triggered by Data with id {}.",
                state.MANAGER_CONTROL_CHANNEL,
                data_id,
            )
        )
        try:
            await consumer.send_event(
                {
                    WorkerProtocol.COMMAND: WorkerProtocol.COMMUNICATE,
                    WorkerProtocol.COMMUNICATE_EXTRA: {
                        "data_id": data_id,
                    },
                }
            )
        except ChannelFull:
            logger.exception(
                "ChannelFull error occurred while sending communicate message."
            )
        except:
            logger.exception(
                "Unknown error occurred while sending communicate message."
            )

        if first_sync_call:
            logger.debug(
                __(
                    "Manager on channel '{}' entering synchronization block.",
                    state.MANAGER_CONTROL_CHANNEL,
                )
            )
            await self.execution_barrier()
            logger.debug(
                __(
                    "Manager on channel '{}' exiting synchronization block.",
                    state.MANAGER_CONTROL_CHANNEL,
                )
            )
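
The run_sync branch above waits on a one-shot barrier; a minimal standalone sketch of that idea with invented names, where the first synchronous caller creates an asyncio.Event and whoever finishes the last pending task sets it:

import asyncio


class OneShotBarrier:
    """Sketch of the first_sync_call / execution-barrier pattern (invented names)."""

    def __init__(self):
        self._finished = None
        self._pending = 0

    async def submit(self, work, run_sync=False):
        first_sync_call = run_sync and self._finished is None
        if first_sync_call:
            self._finished = asyncio.Event()
        self._pending += 1
        asyncio.get_running_loop().create_task(self._run(work))
        if first_sync_call:
            await self._finished.wait()
            self._finished = None

    async def _run(self, work):
        try:
            await work()
        finally:
            self._pending -= 1
            if self._pending == 0 and self._finished is not None:
                self._finished.set()

A call such as asyncio.run(OneShotBarrier().submit(some_coroutine_function, run_sync=True)) only returns once every submitted task has finished.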
    def create_expressions(self, num, gene_ids):
        """Generate expressions."""
        expressions = []
        sample_name = 'Deseq2_{}'.format(self.get_random_word(4))

        for i in range(num):
            expression_file = 'exp_{}.tab.gz'.format(random.choice([1, 2, 3]))

            # Create expressions
            started = timezone.now()
            exp = Data.objects.create(
                name='Smpl_Ex_{}_rep{}'.format(sample_name, i + 1),
                process=get_process(slug='upload-expression'),
                contributor=get_superuser(),
                started=started,
                finished=started + datetime.timedelta(minutes=60),
                status=Data.STATUS_PROCESSING,
                input={'exp': {'file': expression_file},
                       'exp_type': 'FPKM',
                       'exp_name': 'Expression',
                       'source': 'UCSC'})

            os.mkdir(os.path.join(self.data_dir, str(exp.id)))
            self.generate_expressions(gene_ids, os.path.join(self.data_dir, str(exp.id)),
                                      expression_file)

            json_object = Storage.objects.create(
                json=json.load(open(os.path.join(self.data_dir, str(exp.id), 'expressions.json'))),
                contributor=get_superuser(),
                name='{}_storage'.format(exp.name),
                data=exp)

            exp.output = {
                'exp': {'file': expression_file},
                'exp_type': 'FPKM',
                'exp_json': json_object.id,
                'source': 'UCSC'
            }
            exp.status = Data.STATUS_DONE
            exp.save()

            sample = Sample.objects.filter(data=exp)[0]
            sample.descriptor = generate_sample_descriptor('Hs_')
            sample.annotated = True
            sample.save()

            with open(os.path.join(self.data_dir, str(exp.id), 'stdout.txt'), 'w') as stdout:
                stdout.write('Upload gene expressions. Sample was created '
                             'with the generate_diffexpr_deseq django-admin command.')

            logger.info(__('Created sample: {} (id={})', sample.name, sample.id))
            logger.info(__('\tData object: (id={})', exp.id))
            expressions.append(exp)

        return expressions
    def create_expressions(self, num, gene_ids):
        """Generate expressions."""
        expressions = []
        sample_name = 'Deseq2_{}'.format(self.get_random_word(4))

        for i in range(num):
            expression_file = 'exp_{}.tab.gz'.format(random.choice([1, 2, 3]))

            # Create expressions
            started = timezone.now()
            exp = Data.objects.create(
                name='Smpl_Ex_{}_rep{}'.format(sample_name, i + 1),
                process=get_process(slug='upload-expression'),
                contributor=get_superuser(),
                started=started,
                finished=started + datetime.timedelta(minutes=60),
                status=Data.STATUS_PROCESSING,
                input={'exp': {'file': expression_file},
                       'exp_type': 'FPKM',
                       'exp_name': 'Expression',
                       'source': 'UCSC'})

            os.mkdir(os.path.join(self.data_dir, str(exp.id)))
            self.generate_expressions(gene_ids, os.path.join(self.data_dir, str(exp.id)),
                                      expression_file)

            json_object = Storage.objects.create(
                json=json.load(open(os.path.join(self.data_dir, str(exp.id), 'expressions.json'))),
                contributor=get_superuser(),
                name='{}_storage'.format(exp.name),
                data=exp)

            exp.output = {
                'exp': {'file': expression_file},
                'exp_type': 'FPKM',
                'exp_json': json_object.id,
                'source': 'UCSC'
            }
            exp.status = Data.STATUS_DONE
            exp.save()

            sample = Sample.objects.filter(data=exp)[0]
            sample.presample = False
            sample.descriptor = generate_sample_desciptor('Hs_')
            sample.save()

            with open(os.path.join(self.data_dir, str(exp.id), 'stdout.txt'), 'w') as stdout:
                stdout.write('Upload gene expressions. Sample was created '
                             'with the generate_diffexpr_deseq django-admin command.')

            logger.info(__('Created sample: {} (id={})', sample.name, sample.id))
            logger.info(__('\tData object: (id={})', exp.id))
            expressions.append(exp)

        return expressions
Example #14
    def _process_file_storage(self, file_storage: FileStorage):
        """Delete all data from FileStorage object."""
        # Do not remove locked StorageLocation.
        logger.info(__("Processing FileStorage {}.", file_storage.pk))
        for storage_location in file_storage.storage_locations.filter(
            status=StorageLocation.STATUS_DELETING
        ):
            self._cleanup(storage_location)

        if file_storage.storage_locations.count() == 0:
            logger.info(__("Deleting FileStorage {}.", file_storage.pk))
            file_storage.delete()
Example #15
    def _data_execute(self, data, program, executor):
        """Execute the Data object.

        The activities carried out here include target directory
        preparation, executor copying, setting serialization and actual
        execution of the object.

        :param data: The :class:`~resolwe.flow.models.Data` object to
            execute.
        :param program: The process text the manager got out of
            execution engine evaluation.
        :param executor: The executor to use for this object.
        """
        if not program:
            return

        logger.debug(__("Manager preparing Data with id {} for processing.", data.id))

        # Prepare the executor's environment.
        try:
            executor_env_vars = self.get_executor().get_environment_variables()
            program = self._include_environment_variables(program, executor_env_vars)
            data_dir = self._prepare_data_dir(data)
            executor_module, runtime_dir = self._prepare_executor(data, executor)

            # Execute execution engine specific runtime preparation.
            execution_engine = data.process.run.get('language', None)
            volume_maps = self.get_execution_engine(execution_engine).prepare_runtime(runtime_dir, data)

            self._prepare_context(data.id, data_dir, runtime_dir, RUNTIME_VOLUME_MAPS=volume_maps)
            self._prepare_script(runtime_dir, program)

            argv = [
                '/bin/bash',
                '-c',
                self.settings_actual.get('FLOW_EXECUTOR', {}).get('PYTHON', '/usr/bin/env python')
                + ' -m executors ' + executor_module
            ]
        except PermissionDenied as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append("Permission denied for process: {}".format(error))
            data.save()
            return
        except OSError as err:
            logger.error(__(
                "OSError occurred while preparing data {} (will skip): {}",
                data.id, err
            ))
            return

        # Hand off to the run() method for execution.
        logger.info(__("Running {}", runtime_dir))
        self.run(data, runtime_dir, argv)
Example #16
    def _data_execute(self, data, program, executor):
        """Execute the Data object.

        The activities carried out here include target directory
        preparation, executor copying, setting serialization and actual
        execution of the object.

        :param data: The :class:`~resolwe.flow.models.Data` object to
            execute.
        :param program: The process text the manager got out of
            execution engine evaluation.
        :param executor: The executor to use for this object.
        """
        if not program:
            return

        logger.debug(__("Manager preparing Data with id {} for processing.", data.id))

        # Prepare the executor's environment.
        try:
            executor_env_vars = self.get_executor().get_environment_variables()
            program = self._include_environment_variables(program, executor_env_vars)
            data_dir = self._prepare_data_dir(data)
            executor_module, runtime_dir = self._prepare_executor(data, executor)

            # Execute execution engine specific runtime preparation.
            execution_engine = data.process.run.get('language', None)
            volume_maps = self.get_execution_engine(execution_engine).prepare_runtime(runtime_dir, data)

            self._prepare_context(data.id, data_dir, runtime_dir, RUNTIME_VOLUME_MAPS=volume_maps)
            self._prepare_script(runtime_dir, program)

            argv = [
                '/bin/bash',
                '-c',
                self.settings_actual.get('FLOW_EXECUTOR', {}).get('PYTHON', '/usr/bin/env python')
                + ' -m executors ' + executor_module
            ]
        except PermissionDenied as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append("Permission denied for process: {}".format(error))
            data.save()
            return
        except OSError as err:
            logger.error(__(
                "OSError occurred while preparing data {} (will skip): {}",
                data.id, err
            ))
            return

        # Hand off to the run() method for execution.
        logger.info(__("Running {}", runtime_dir))
        self.run(data, runtime_dir, argv)
Example #17
    def update_data_status(self, **kwargs):
        """Update (PATCH) data object."""
        data = Data.objects.get(pk=self.data_id)
        for key, value in kwargs.items():
            setattr(data, key, value)

        update_fields = list(kwargs.keys())
        try:
            # Ensure that we only update the fields that were changed.
            data.save(update_fields=update_fields)

            if kwargs.get('status', None) == Data.STATUS_ERROR:
                self.process_failed = True
                logger.error(__("Error occured while running a '{}' process.",
                                self.process.name),
                             extra={
                                 'data_id':
                                 self.data_id,
                                 'api_url':
                                 '{}{}'.format(
                                     getattr(settings, 'RESOLWE_HOST_URL', ''),
                                     reverse('resolwe-api:data-detail',
                                             kwargs={'pk': self.data_id})),
                             })

        except ValidationError as exc:
            data = Data.objects.get(pk=self.data_id)

            data.process_error.append(exc.message)
            data.status = Data.STATUS_ERROR
            self.process_failed = True

            logger.error(__("Error occured while running a '{}' process.",
                            self.process.name),
                         exc_info=True,
                         extra={
                             'data_id':
                             self.data_id,
                             'api_url':
                             '{}{}'.format(
                                 getattr(settings, 'RESOLWE_HOST_URL', ''),
                                 reverse('resolwe-api:data-detail',
                                         kwargs={'pk': self.data_id})),
                         })

            update_fields = ['process_error', 'status']

            try:
                data.save(update_fields=update_fields)
            except:  # pylint: disable=bare-except
                pass
Example #18
def _storage_purge_all(delete=False, verbosity=0):
    """Purge unreferenced storages."""
    orphaned_storages = Storage.objects.filter(data=None)

    if verbosity >= 1:
        if orphaned_storages.exists():
            logger.info(__("Unreferenced storages ({}):", orphaned_storages.count()))
            for storage_id in orphaned_storages.values_list('id', flat=True):
                logger.info(__("  {}", storage_id))
        else:
            logger.info("No unreferenced storages")

    if delete:
        orphaned_storages.delete()
Example #19
    def __init__(
        self,
        hosts: List[str],
        port: int,
        protocol: str,
        zmq_socket: Optional[zmq.asyncio.Socket] = None,
    ):
        """Initialize."""
        if zmq_socket is None:
            zmq_context: zmq.asyncio.Context = zmq.asyncio.Context.instance()
            zmq_socket = zmq_context.socket(zmq.ROUTER)
            zmq_socket.setsockopt(zmq.ROUTER_HANDOVER, 1)
            for host in hosts:
                zmq_socket.bind(f"{protocol}://{host}:{port}")

        super().__init__(
            ZMQCommunicator(zmq_socket, "listener <-> workers", logger,
                            self.peer_status_changed),
            logger,
        )

        # Mapping from the data ID to the Worker instance. Each data object is
        # processed by (at most) one Worker so the map is one-to-one.
        self.peers: MutableMapping[PeerIdentity, Processor] = dict()

        # The code below is used when preparing the script for the worker when
        # it is requested. Maybe there is a better way?
        self._execution_engines = self._load_execution_engines()
        logger.info(
            __(
                "Found {} execution engines: {}",
                len(self._execution_engines),
                ", ".join(self._execution_engines.keys()),
            ))
        self._expression_engines = self._load_expression_engines()
        logger.info(
            __(
                "Found {} expression engines: {}",
                len(self._expression_engines),
                ", ".join(self._expression_engines.keys()),
            ))
        self._executor_preparer = self._load_executor_preparer()

        self._parallel_commands_counter = 0
        self._max_parallel_commands = 10
        self._parallel_commands_semaphore: Optional[asyncio.Semaphore] = None
        self._get_program_lock = threading.Lock()

        self._bootstrap_cache: Dict[str, Any] = defaultdict(dict)
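
A standalone sketch of the socket setup performed above when no socket is injected, assuming pyzmq is installed (the function name is invented):

import zmq
import zmq.asyncio


def bind_router_socket(hosts, port, protocol="tcp"):
    """Create a ROUTER socket bound on every configured host (sketch)."""
    zmq_context = zmq.asyncio.Context.instance()
    zmq_socket = zmq_context.socket(zmq.ROUTER)
    # Let a peer that reconnects with the same identity take over the old handle.
    zmq_socket.setsockopt(zmq.ROUTER_HANDOVER, 1)
    for host in hosts:
        zmq_socket.bind(f"{protocol}://{host}:{port}")
    return zmq_socket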
Example #20
 async def push_stats(self):
     """Push current stats to Redis."""
     snapshot = self._make_stats()
     try:
         serialized = json.dumps(snapshot)
         await self._call_redis(aioredis.Redis.set,
                                state.MANAGER_LISTENER_STATS, serialized)
     except TypeError:
         logger.error(
             __("Listener can't serialize statistics:\n\n{}",
                traceback.format_exc()))
     except aioredis.RedisError:
         logger.error(
             __("Listener can't store updated statistics:\n\n{}",
                traceback.format_exc()))
Example #21
    def create_expressions(self, num):
        """Generate expressions."""
        expressions = []
        sample_name = 'Cuffdiff_{}'.format(self.get_random_word(4))

        for i in range(num):
            cuffquant_file = 'cuffquant_{}.cxb'.format(random.choice([1, 2]))

            # Create expressions
            exp = Data.objects.create(
                name='Smpl_Ex_{}_rep{}'.format(sample_name, i + 1),
                process=get_process('upload-cxb'),
                contributor=get_superuser(),
                status=Data.STATUS_PROCESSING,
                input={
                    'src': {'file': cuffquant_file},
                    'source': 'UCSC',
                    'species': 'Homo sapiens',
                    'build': 'hg19',
                    'feature_type': 'gene'})

            os.mkdir(os.path.join(self.data_dir, str(exp.id)))
            shutil.copy(os.path.join(self.test_files_path, cuffquant_file), os.path.join(self.data_dir, str(exp.id)))

            exp.output = {
                'cxb': {'file': cuffquant_file},
                'source': 'UCSC',
                'species': 'Homo sapiens',
                'build': 'hg19',
                'feature_type': 'gene'
            }
            exp.status = Data.STATUS_DONE
            exp.save()

            sample = Sample.objects.filter(data=exp)[0]
            sample.descriptor = generate_sample_descriptor('Hs_')
            sample.annotated = True
            sample.save()

            with open(os.path.join(self.data_dir, str(exp.id), 'stdout.txt'), 'w') as stdout:
                stdout.write('Upload gene expressions. Sample was created '
                             'with the generate_diffexpr_cuffdiff django-admin command.')

            logger.info(__('Created sample: {} (id={})', sample.name, sample.id))
            logger.info(__('\tData object: (id={})', exp.id))
            expressions.append(exp)

        return expressions
Example #22
 def start(self, *args, **kwargs):
     """Start the listener thread."""
     signal(SIGINT, self._sigint)
     logger.info(
         __("Starting Resolwe listener on channel '{}'.",
            state.MANAGER_EXECUTOR_CHANNELS.queue))
     super().start(*args, **kwargs)
Example #23
 def terminate(self):
     """Stop the standalone manager."""
     logger.info(__(
         "Terminating Resolwe listener on channel '{}'.",
         state.MANAGER_EXECUTOR_CHANNELS.queue
     ))
     self._should_stop = True
Example #24
    def _sanitize_kubernetes_label(self, label: str) -> str:
        """Make sure kubernetes label complies with the rules.

        See the URL below for details.

        https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
        """
        registry = ""
        image_name = label
        max_length = 63
        if "/" in label:
            possible_registry, possible_image_name = label.split("/",
                                                                 maxsplit=1)
            if "." in possible_registry or ":" in possible_registry:
                registry = possible_registry
                image_name = possible_image_name

        sanitized_image_name = re.sub(r"[^0-9a-zA-Z._\-]+", "_",
                                      image_name).strip("-_.")
        if len(sanitized_image_name) > max_length:
            logger.warning(__("Label {} is too long and was truncated.",
                              label))
            sanitized_image_name = sanitized_image_name[-max_length:].strip(
                "-_.")

        if registry:
            return f"{registry}/{sanitized_image_name}"
        else:
            return sanitized_image_name
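
A hypothetical call tracing the logic above (the image reference is invented):

    # self._sanitize_kubernetes_label("docker.io/library/python:3.10-slim")
    # "docker.io" is detected as a registry and preserved; "/" and ":" in the
    # image name collapse to "_", giving "docker.io/library_python_3.10-slim".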
    def create_genome_annotation(self, filename):
        """Create a genome annotation."""
        ann = Data.objects.create(
            name='Annotation_{}'.format(filename.split('.')[0]),
            process=get_process('upload-gtf'),
            contributor=get_superuser(),
            status=Data.STATUS_PROCESSING,
            input={'src': {'file': filename}, 'source': 'hg19'})

        os.mkdir(os.path.join(self.data_dir, str(ann.id)))

        with gzip.open(os.path.join(self.test_files_path, filename), 'rb') as gzfile:
            with open(os.path.join(self.data_dir, str(ann.id), filename[:-3]), 'wb') as outfile:
                shutil.copyfileobj(gzfile, outfile)

        ann.output = {
            'gtf': {'file': filename[:-3]},
            'source': 'hg19'
        }
        ann.status = Data.STATUS_DONE
        ann.save()

        with open(os.path.join(self.data_dir, str(ann.id), 'stdout.txt'), 'w') as stdout:
            stdout.write('Upload genome annotation with the '
                         'generate_diffexpr_cuffdiff django-admin command.')

        logger.info(__('Genome annotation created: {} (id={})', filename, ann.id))

        return ann
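
The decompression step above is a general streaming pattern; a standalone sketch with invented paths:

import gzip
import shutil


def gunzip_file(src_gz, dest_path):
    """Stream-decompress a .gz file without loading it into memory (sketch)."""
    with gzip.open(src_gz, "rb") as gzfile, open(dest_path, "wb") as outfile:
        shutil.copyfileobj(gzfile, outfile)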
Example #26
    def _prepare_data_dir(self, data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: str
        """
        logger.debug(__("Preparing data directory for Data with id {}.", data.id))

        with transaction.atomic():
            # Create a temporary random location and then override it with data
            # location id since object has to be created first.
            # TODO Find a better solution, e.g. defer the database constraint.
            temporary_location_string = uuid.uuid4().hex[:10]
            data_location = DataLocation.objects.create(subpath=temporary_location_string)
            data_location.subpath = str(data_location.id)
            data_location.save()
            data_location.data.add(data)

        output_path = self._get_per_data_dir('DATA_DIR', data_location.subpath)
        dir_mode = self.settings_actual.get('FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)
        os.mkdir(output_path, mode=dir_mode)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        return output_path
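
The explicit chmod above matters because the mode passed to os.mkdir is filtered by the process umask; a tiny standalone illustration (the path is invented):

import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "data")
os.mkdir(path, mode=0o755)  # effective mode is 0o755 masked by the umask
os.chmod(path, 0o755)       # enforce the exact mode regardless of the umask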
Example #27
 async def inc(self, tag):
     """Increase executor count by 1."""
     await self.condition.acquire()
     self.value += 1
     logger.debug(__("Sync semaphore increased to {}, tag {}.", self.value, tag))
     self.tag_sequence.append(tag + '-up')
     self.condition.release()
Example #28
    def _prepare_data_dir(self, data: Data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: str
        """
        logger.debug(__("Preparing data directory for Data with id {}.", data.id))
        connector_name = self._get_data_connector_name()
        with transaction.atomic():
            # Create Worker object and set its status to preparing if needed.
            if not Worker.objects.filter(data=data).exists():
                Worker.objects.get_or_create(data=data, status=Worker.STATUS_PREPARING)

            file_storage = FileStorage.objects.create()
            # Data produced by the processing container will be uploaded to the
            # created location.
            data_location = StorageLocation.objects.create(
                file_storage=file_storage,
                url=str(file_storage.id),
                status=StorageLocation.STATUS_PREPARING,
                connector_name=connector_name,
            )
            file_storage.data.add(data)

            # Reference 'special' files.
            for file_ in referenced_files(data, include_descriptor=False):
                referenced_path = ReferencedPath.objects.create(path=file_)
                referenced_path.storage_locations.add(data_location)

        dir_mode = getattr(settings, "FLOW_EXECUTOR", {}).get("DATA_DIR_MODE", 0o755)
        connectors[connector_name].prepare_url(data_location.url, dir_mode=dir_mode)
Example #29
        async def __aexit__(self, exc_type, exc_val, exc_tb):
            """Wait for executors to finish, then return."""
            logger.info(__("Waiting for executor count to drop to 0, now it is {}", self.value))

            await self.condition.acquire()
            try:
                await self.condition.wait()
            finally:
                self.condition.release()
            logger.debug(__(
                "Sync semaphore dropped to 0, tag sequence was {}.",
                self.tag_sequence
            ))

            self.active = False
            return False
 def handle_update_status(self, message: Message[str],
                          manager: "Processor") -> Response[str]:
     """Update data status."""
     new_status = manager._choose_worst_status(message.message_data,
                                               manager.data.status)
     if new_status != manager.data.status:
         manager._update_data({"status": new_status})
         if new_status == Data.STATUS_ERROR:
             logger.error(
                 __(
                     "Error occured while running process '{}' (handle_update).",
                     manager.data.process.slug,
                 ),
                 extra={
                     "data_id":
                     manager.data.id,
                     "api_url":
                     "{}{}".format(
                         getattr(settings, "RESOLWE_HOST_URL", ""),
                         reverse(
                             "resolwe-api:data-detail",
                             kwargs={"pk": manager.data.id},
                         ),
                     ),
                 },
             )
     return message.respond_ok(new_status)
Example #31
    def _prepare_executor(self, data, executor):
        """Copy executor sources into the destination directory.

        :param data: The :class:`~resolwe.flow.models.Data` object being
            prepared for.
        :param executor: The fully qualified name of the executor that
            is to be used for this data object.
        :return: Tuple containing the relative fully qualified name of
            the executor class ('relative' to how the executor will be
            run) and the path to the directory where the executor will
            be deployed.
        :rtype: (str, str)
        """
        logger.debug(__("Preparing executor for Data with id {}", data.id))

        # Both of these imports are here only to get the packages' paths.
        import resolwe.flow.executors as executor_package

        exec_dir = os.path.dirname(inspect.getsourcefile(executor_package))
        dest_dir = self._get_per_data_dir('RUNTIME_DIR', data.location.subpath)
        dest_package_dir = os.path.join(dest_dir, 'executors')
        shutil.copytree(exec_dir, dest_package_dir)
        dir_mode = self.settings_actual.get('FLOW_EXECUTOR', {}).get('RUNTIME_DIR_MODE', 0o755)
        os.chmod(dest_dir, dir_mode)

        class_name = executor.rpartition('.executors.')[-1]
        return '.{}'.format(class_name), dest_dir
Example #32
 async def inc(self, tag):
     """Increase executor count by 1."""
     await self.condition.acquire()
     self.value += 1
     logger.debug(__("Sync semaphore increased to {}, tag {}.", self.value, tag))
     self.tag_sequence.append(tag + '-up')
     self.condition.release()
Example #33
    async def run(self):
        """Run the main listener run loop.

        Doesn't return until :meth:`terminate` is called.
        """
        logger.info(
            __(f"Starting Resolwe listener on '{self.protocol}://{self.hosts}:{self.port}'.")
        )
        communicator_future = asyncio.ensure_future(
            self.listener_protocol.communicate())
        await self.should_stop.wait()
        self.listener_protocol.stop_communicate()
        await communicator_future
        logger.info(
            __(f"Stopping Resolwe listener on '{self.protocol}://{self.hosts}:{self.port}'.")
        )
Example #34
        async def __aexit__(self, exc_type, exc_val, exc_tb):
            """Wait for executors to finish, then return."""
            logger.info(__("Waiting for executor count to drop to 0, now it is {}", self.value))

            await self.condition.acquire()
            try:
                await self.condition.wait()
            finally:
                self.condition.release()
            logger.debug(__(
                "Sync semaphore dropped to 0, tag sequence was {}.",
                self.tag_sequence
            ))

            self.active = False
            return False
Example #35
    def _prepare_data_dir(self, data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: str
        """
        logger.debug(
            __("Preparing data directory for Data with id {}.", data.id))

        with transaction.atomic():
            file_storage = FileStorage.objects.create()
            # Create StorageLocation with default connector.
            # We must also specify status since it is Uploading by default.
            data_location = StorageLocation.objects.create(
                file_storage=file_storage,
                url=str(file_storage.id),
                status=StorageLocation.STATUS_PREPARING,
                connector_name=STORAGE_LOCAL_CONNECTOR,
            )
            file_storage.data.add(data)

            # Reference 'special' files.
            for file_ in referenced_files(data, include_descriptor=False):
                referenced_path = ReferencedPath.objects.create(path=file_)
                referenced_path.storage_locations.add(data_location)

        output_path = self._get_per_data_dir("DATA_DIR", data_location.url)
        dir_mode = self.settings_actual.get("FLOW_EXECUTOR",
                                            {}).get("DATA_DIR_MODE", 0o755)
        os.mkdir(output_path, mode=dir_mode)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        return output_path
Example #36
    def _prepare_executor(self, data, executor):
        """Copy executor sources into the destination directory.

        :param data: The :class:`~resolwe.flow.models.Data` object being
            prepared for.
        :param executor: The fully qualified name of the executor that
            is to be used for this data object.
        :return: Tuple containing the relative fully qualified name of
            the executor class ('relative' to how the executor will be
            run) and the path to the directory where the executor will
            be deployed.
        :rtype: (str, str)
        """
        logger.debug(__("Preparing executor for Data with id {}", data.id))

        # Both of these imports are here only to get the packages' paths.
        import resolwe.flow.executors as executor_package

        exec_dir = os.path.dirname(inspect.getsourcefile(executor_package))
        dest_dir = self._get_per_data_dir(
            "RUNTIME_DIR", data.location.default_storage_location.subpath)
        dest_package_dir = os.path.join(dest_dir, "executors")
        shutil.copytree(exec_dir, dest_package_dir)
        dir_mode = self.settings_actual.get("FLOW_EXECUTOR",
                                            {}).get("RUNTIME_DIR_MODE", 0o755)
        os.chmod(dest_dir, dir_mode)

        class_name = executor.rpartition(".executors.")[-1]
        return ".{}".format(class_name), dest_dir
Example #37
    def _prepare_data_dir(self, data):
        """Prepare destination directory where the data will live.

        :param data: The :class:`~resolwe.flow.models.Data` object for
            which to prepare the private execution directory.
        :return: The prepared data directory path.
        :rtype: str
        """
        logger.debug(__("Preparing data directory for Data with id {}.", data.id))

        with transaction.atomic():
            # Create a temporary random location and then override it with data
            # location id since object has to be created first.
            # TODO Find a better solution, e.g. defer the database constraint.
            temporary_location_string = uuid.uuid4().hex[:10]
            data_location = DataLocation.objects.create(subpath=temporary_location_string)
            data_location.subpath = str(data_location.id)
            data_location.save()
            data_location.data.add(data)

        output_path = self._get_per_data_dir('DATA_DIR', data_location.subpath)
        dir_mode = self.settings_actual.get('FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)
        os.mkdir(output_path, mode=dir_mode)
        # os.mkdir is not guaranteed to set the given mode
        os.chmod(output_path, dir_mode)
        return output_path
Example #38
    def create_genome_annotation(self, filename):
        """Create a genome annotation."""
        ann = Data.objects.create(name='Annotation_{}'.format(
            filename.split('.')[0]),
                                  process=get_process('upload-gtf'),
                                  contributor=get_superuser(),
                                  status=Data.STATUS_PROCESSING,
                                  input={
                                      'src': {
                                          'file': filename
                                      },
                                      'source': 'UCSC'
                                  })

        os.mkdir(os.path.join(self.data_dir, str(ann.id)))

        with gzip.open(os.path.join(self.test_files_path, filename),
                       'rb') as gzfile:
            with open(os.path.join(self.data_dir, str(ann.id), filename[:-3]),
                      'wb') as outfile:
                shutil.copyfileobj(gzfile, outfile)

        ann.output = {'gtf': {'file': filename[:-3]}, 'source': 'UCSC'}
        ann.status = Data.STATUS_DONE
        ann.save()

        with open(os.path.join(self.data_dir, str(ann.id), 'stdout.txt'),
                  'w') as stdout:
            stdout.write('Upload genome annotation with the '
                         'generate_diffexpr_cuffdiff django-admin command.')

        logger.info(
            __('Genome annotation created: {} (id={})', filename, ann.id))

        return ann
Example #39
    def handle(self, *args, **options):
        """Command handle."""
        count_inserted, count_unchanged = 0, 0

        for tab_file_name, line_count, tab_file in decompress(
                options['file_name']):
            logger.info(__("Importing mappings from \"{}\":", tab_file_name))

            reader = csv.DictReader(tab_file, delimiter=str('\t'))
            bar_format = '{desc}{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'

            for row in tqdm(reader, total=line_count, bar_format=bar_format):
                _, created = Mapping.objects.update_or_create(
                    relation_type=row['relation_type'],
                    source_db=row['source_db'],
                    source_id=row['source_id'],
                    target_db=row['target_db'],
                    target_id=row['target_id'])
                if created:
                    count_inserted += 1
                else:
                    count_unchanged += 1

        index_builder.push(index=MappingSearchIndex)

        logger.info("Total mappings: %d. Inserted %d, unchanged %d." %  # pylint: disable=logging-not-lazy
                    (count_inserted + count_unchanged, count_inserted,
                     count_unchanged))
Beispiel #40
0
    async def push_stats(self):
        """Push current stats to Redis."""
        snapshot = self._make_stats()
        try:
            serialized = json.dumps(snapshot)
            await self._call_redis(aioredis.Redis.set, state.MANAGER_LISTENER_STATS, serialized)
            await self._call_redis(aioredis.Redis.expire, state.MANAGER_LISTENER_STATS, 3600)
        except TypeError:
            logger.error(__(
                "Listener can't serialize statistics:\n\n{}",
                traceback.format_exc()
            ))
        except aioredis.RedisError:
            logger.error(__(
                "Listener can't store updated statistics:\n\n{}",
                traceback.format_exc()
            ))
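
The snapshot is stored as a JSON string under the `state.MANAGER_LISTENER_STATS` key with a one-hour expiry, so it can be read back with any Redis client. A minimal sketch using the synchronous `redis` package; the connection parameters and the literal key name are placeholders (the real key is the constant used above):

import json

import redis

client = redis.Redis(host='localhost', port=6379)
raw = client.get('resolwe.flow.manager.listener_stats')  # placeholder for state.MANAGER_LISTENER_STATS
if raw is None:
    print('No stats published (key missing or expired).')
else:
    print(json.loads(raw))
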
Beispiel #41
0
    def submit(self, data, runtime_dir, argv):
        """Run process with SLURM.

        For details, see
        :meth:`~resolwe.flow.managers.workload_connectors.base.BaseConnector.submit`.
        """
        limits = data.process.get_resource_limits()
        logger.debug(__(
            "Connector '{}' running for Data with id {} ({}).",
            self.__class__.__module__,
            data.id,
            repr(argv)
        ))

        # Compute target partition.
        partition = getattr(settings, 'FLOW_SLURM_PARTITION_DEFAULT', None)
        if data.process.slug in getattr(settings, 'FLOW_SLURM_PARTITION_OVERRIDES', {}):
            partition = settings.FLOW_SLURM_PARTITION_OVERRIDES[data.process.slug]

        try:
            # Make sure the resulting file is executable on creation.
            script_path = os.path.join(runtime_dir, 'slurm.sh')
            file_descriptor = os.open(script_path, os.O_WRONLY | os.O_CREAT, mode=0o555)
            with os.fdopen(file_descriptor, 'wt') as script:
                script.write('#!/bin/bash\n')
                script.write('#SBATCH --mem={}M\n'.format(limits['memory'] + EXECUTOR_MEMORY_OVERHEAD))
                script.write('#SBATCH --cpus-per-task={}\n'.format(limits['cores']))
                if partition:
                    script.write('#SBATCH --partition={}\n'.format(partition))

                # Render the argument vector into a command line.
                line = ' '.join(map(shlex.quote, argv))
                script.write(line + '\n')

            command = ['/usr/bin/env', 'sbatch', script_path]
            subprocess.Popen(
                command,
                cwd=runtime_dir,
                stdin=subprocess.DEVNULL
            ).wait()
        except OSError as err:
            logger.error(__(
                "OSError occurred while preparing SLURM script for Data {}: {}",
                data.id, err
            ))
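
To make the shape of the generated `slurm.sh` concrete, the sketch below renders it for made-up resource limits and an invented argument vector; `EXECUTOR_MEMORY_OVERHEAD` and the partition value are placeholders, but the line layout mirrors the writer loop above:

import shlex

EXECUTOR_MEMORY_OVERHEAD = 200  # placeholder; the real constant is defined elsewhere in the module
limits = {'memory': 4096, 'cores': 4}  # hypothetical resource limits
partition = 'main'                     # hypothetical partition setting
argv = ['/usr/bin/python3', '-m', 'executors', '--data-id', '42']  # hypothetical command

lines = [
    '#!/bin/bash',
    '#SBATCH --mem={}M'.format(limits['memory'] + EXECUTOR_MEMORY_OVERHEAD),
    '#SBATCH --cpus-per-task={}'.format(limits['cores']),
]
if partition:
    lines.append('#SBATCH --partition={}'.format(partition))
lines.append(' '.join(map(shlex.quote, argv)))
print('\n'.join(lines))
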
Beispiel #42
0
    def purge_run(self, event):
        """Run purge for the object with ``location_id`` specified in ``event`` argument."""
        location_id = event['location_id']
        verbosity = event['verbosity']

        try:
            logger.info(__("Running purge for location id {}.", location_id))
            location_purge(location_id=location_id, delete=True, verbosity=verbosity)
        except Exception:  # pylint: disable=broad-except
            logger.exception("Error while purging location.", extra={'location_id': location_id})
    def create_expressions(self, num):
        """Generate expressions."""
        expressions = []
        sample_name = 'Cuffdiff_{}'.format(self.get_random_word(4))

        for i in range(num):
            cuffquant_file = 'cuffquant_{}.cxb'.format(random.choice([1, 2]))

            # Create expressions
            exp = Data.objects.create(
                name='Smpl_Ex_{}_rep{}'.format(sample_name, i + 1),
                process=get_process('upload-cxb'),
                contributor=get_superuser(),
                status=Data.STATUS_PROCESSING,
                input={'src': {'file': cuffquant_file}, 'source': 'hg19'})

            os.mkdir(os.path.join(self.data_dir, str(exp.id)))
            shutil.copy(os.path.join(self.test_files_path, cuffquant_file), os.path.join(self.data_dir, str(exp.id)))

            exp.output = {
                'cxb': {'file': cuffquant_file},
                'source': 'hg19'
            }
            exp.status = Data.STATUS_DONE
            exp.save()

            sample = Sample.objects.filter(data=exp)[0]
            sample.presample = False
            sample.descriptor = generate_sample_desciptor('Hs_')
            sample.save()

            with open(os.path.join(self.data_dir, str(exp.id), 'stdout.txt'), 'w') as stdout:
                stdout.write('Upload gene expressions. Sample was created '
                             'with the generate_diffexpr_cuffdiff django-admin command.')

            logger.info(__('Created sample: {} (id={})', sample.name, sample.id))
            logger.info(__('\tData object: (id={})', exp.id))
            expressions.append(exp)

        return expressions
Beispiel #44
0
    def handle(self, *args, **options):
        """Command handle."""
        count_inserted, count_updated, count_unchanged, count_failed = 0, 0, 0, 0

        for tab_file_name, line_count, tab_file in decompress(options['file_name']):
            logger.info(__("Importing features from \"{}\":", tab_file_name))

            reader = csv.DictReader(tab_file, delimiter=str('\t'))
            bar_format = '{desc}{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'

            for row in tqdm(reader, total=line_count, bar_format=bar_format):
                aliases_text = row['Aliases'].strip()
                aliases = []
                if aliases_text and aliases_text != '-':
                    aliases = aliases_text.split(',')

                sub_type = SUBTYPE_MAP.get(row['Gene type'], 'other')

                values = {
                    'species': row['Species'],
                    'type': row['Type'],
                    'sub_type': sub_type,
                    'name': row['Name'],
                    'full_name': row['Full name'],
                    'description': row['Description'],
                    'aliases': aliases,
                }

                try:
                    feature = Feature.objects.get(source=row['Source'], feature_id=row['ID'])

                    is_update = False
                    for attr, value in values.items():
                        if getattr(feature, attr) != value:
                            setattr(feature, attr, value)
                            is_update = True

                    if is_update:
                        feature.save()
                        count_updated += 1
                    else:
                        count_unchanged += 1

                except Feature.DoesNotExist:
                    feature = Feature(source=row['Source'], feature_id=row['ID'], **values)
                    feature.save()
                    count_inserted += 1

        count_total = count_inserted + count_updated + count_unchanged + count_failed
        logger.info("Total features: %d. Inserted %d, updated %d, "  # pylint: disable=logging-not-lazy
                    "unchanged %d, failed %d." %
                    (count_total, count_inserted, count_updated, count_unchanged, count_failed))
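
The importer expects a tab-separated file whose header contains at least the columns referenced above (`Source`, `ID`, `Species`, `Type`, `Gene type`, `Name`, `Full name`, `Description`, `Aliases`). A self-contained sketch of the same parsing over an in-memory file; the single data row is invented:

import csv
import io

tsv = (
    'Source\tID\tSpecies\tType\tGene type\tName\tFull name\tDescription\tAliases\n'
    'ENSEMBL\tENSG00000000003\tHomo sapiens\tgene\tprotein_coding\tTSPAN6\t'
    'tetraspanin 6\tExample description\tT245,TM4SF6\n'
)

for row in csv.DictReader(io.StringIO(tsv), delimiter='\t'):
    aliases_text = row['Aliases'].strip()
    aliases = aliases_text.split(',') if aliases_text and aliases_text != '-' else []
    print(row['Source'], row['ID'], row['Type'], aliases)
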
Beispiel #45
0
    def discover_engines(self, executor=None):
        """Discover configured engines.

        :param executor: Optional executor module override
        """
        if executor is None:
            executor = getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local')
        self.executor = self.load_executor(executor)
        logger.info(
            __("Loaded '{}' executor.", str(self.executor.__class__.__module__).replace('.prepare', ''))
        )

        expression_engines = getattr(settings, 'FLOW_EXPRESSION_ENGINES', ['resolwe.flow.expression_engines.jinja'])
        self.expression_engines = self.load_expression_engines(expression_engines)
        logger.info(__(
            "Found {} expression engines: {}", len(self.expression_engines), ', '.join(self.expression_engines.keys())
        ))

        execution_engines = getattr(settings, 'FLOW_EXECUTION_ENGINES', ['resolwe.flow.execution_engines.bash'])
        self.execution_engines = self.load_execution_engines(execution_engines)
        logger.info(__(
            "Found {} execution engines: {}", len(self.execution_engines), ', '.join(self.execution_engines.keys())
        ))
Beispiel #46
0
    def communicate(self, run_sync=False, verbosity=1):
        """Resolving task dependancy and execution."""
        queue = []
        try:
            with transaction.atomic():
                for data in Data.objects.select_for_update().filter(status=Data.STATUS_RESOLVING):

                    dep_status = dependency_status(data)

                    if dep_status == Data.STATUS_ERROR:
                        data.status = Data.STATUS_ERROR
                        data.process_error.append("One or more inputs have status ERROR")
                        data.process_rc = 1
                        data.save()
                        continue

                    elif dep_status != Data.STATUS_DONE:
                        data.status = Data.STATUS_RESOLVING
                        data.save()
                        continue

                    if data.process.run:
                        try:
                            execution_engine = data.process.run.get('language', None)
                            program = self.get_execution_engine(execution_engine).evaluate(data)
                        except (ExecutionError, InvalidEngineError) as error:
                            data.status = Data.STATUS_ERROR
                            data.process_error.append('Error in process script: {}'.format(error))
                            data.save()
                            continue
                    else:
                        # If there is no run section, then we should not try to run anything. But the
                        # program must not be set to None as then the process will be stuck in waiting state.
                        program = ''

                    data.status = Data.STATUS_WAITING
                    data.save(render_name=True)

                    if program is not None:
                        queue.append((data.id, program))

        except IntegrityError as exp:
            logger.error(__("IntegrityError in manager {}", exp))
            return

        for data_id, program in queue:
            if verbosity >= 1:
                print("Running", program)
            self.run(data_id, program, verbosity=verbosity)
Beispiel #47
0
        async def dec(self, tag):
            """Decrease executor count by 1.

            Return ``True`` if the count dropped to 0 as a result.
            """
            ret = False
            await self.condition.acquire()
            try:
                self.value -= 1
                logger.debug(__("Sync semaphore decreased to {}, tag {}.", self.value, tag))
                self.tag_sequence.append(tag + '-down')
                ret = self.value == 0
                if self.active and self.value == 0:
                    self.condition.notify_all()
            finally:
                self.condition.release()
            return ret
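
The counter above pairs an integer with a condition variable so that waiters can be woken exactly when the count reaches zero. A stripped-down, standalone version of the same pattern (independent of the surrounding class, with simplified field names):

import asyncio


class CountDown:
    """Minimal counting helper illustrating the dec()/notify pattern."""

    def __init__(self, value):
        self.value = value
        self.condition = asyncio.Condition()

    async def dec(self):
        """Decrease the count by 1 and return True if it dropped to 0."""
        async with self.condition:
            self.value -= 1
            if self.value == 0:
                self.condition.notify_all()
            return self.value == 0

    async def wait_zero(self):
        """Block until the count reaches 0."""
        async with self.condition:
            await self.condition.wait_for(lambda: self.value == 0)


async def demo():
    counter = CountDown(2)
    waiter = asyncio.ensure_future(counter.wait_zero())
    print(await counter.dec())  # False
    print(await counter.dec())  # True
    await waiter


asyncio.run(demo())
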
Beispiel #48
0
    def submit(self, data, runtime_dir, argv):
        """Run process.

        For details, see
        :meth:`~resolwe.flow.managers.workload_connectors.base.BaseConnector.submit`.
        """
        queue = 'ordinary'
        if data.process.scheduling_class == Process.SCHEDULING_CLASS_INTERACTIVE:
            queue = 'hipri'

        logger.debug(__(
            "Connector '{}' running for Data with id {} ({}) in celery queue {}, EAGER is {}.",
            self.__class__.__module__,
            data.id,
            repr(argv),
            queue,
            getattr(settings, 'CELERY_ALWAYS_EAGER', None)
        ))
        celery_run.apply_async((data.id, runtime_dir, argv), queue=queue)
Beispiel #49
0
    def create_geneset(self):
        """Create gene set object."""
        started = timezone.now()
        geneset = Data.objects.create(
            name='GeneSet_{}_{}'.format(random.choice(['Hs', 'Mm']), self.get_random_word(3)),
            process=get_process('upload-geneset'),
            contributor=get_superuser(),
            started=started,
            finished=started + datetime.timedelta(seconds=5),
            descriptor_schema=get_descriptorschema('geneset'),
            descriptor={'description': 'Gene set description.'},
            status=Data.STATUS_PROCESSING,
            input={'src': {'file': 'geneset.tab.gz'}, 'source': 'UCSC'})

        mouse_genes = os.path.join(self.test_files_path, 'mouse_genes.tab.gz')

        os.mkdir(os.path.join(self.data_dir, str(geneset.id)))
        self.generate_geneset_file(mouse_genes,
                                   random.randint(15, 150),
                                   os.path.join(self.data_dir, str(geneset.id)))

        json_object = Storage.objects.create(
            json=json.load(open(os.path.join(self.data_dir, str(geneset.id), 'geneset.json'))),
            contributor=get_superuser(),
            name='{}_storage'.format(geneset.name),
            data=geneset)

        os.remove(os.path.join(self.data_dir, str(geneset.id), 'geneset.json'))

        geneset.output = {
            'geneset': {'file': 'geneset.tab.gz'},
            'geneset_json': json_object.id,
            'source': 'UCSC'
        }

        geneset.status = Data.STATUS_DONE
        geneset.save()

        with open(os.path.join(self.data_dir, str(geneset.id), 'stdout.txt'), 'w') as stdout:
            stdout.write('Generate gene set. Gene set was created '
                         'with the generate_geneset django-admin command.')

        logger.info(__('Created Gene set object: (id={})', geneset.id))
    def create_geneset(self):
        """Create gene set object."""
        started = timezone.now()
        geneset = Data.objects.create(
            name="GeneSet_{}_{}".format(random.choice(["Hs", "Mm"]), self.get_random_word(3)),
            process=get_process("upload-geneset"),
            contributor=get_superuser(),
            started=started,
            finished=started + datetime.timedelta(seconds=5),
            descriptor_schema=get_descriptorschema("geneset"),
            descriptor={"description": "Gene set description."},
            status=Data.STATUS_PROCESSING,
            input={"src": {"file": "geneset.tab.gz"}, "source": "UCSC"},
        )

        mouse_genes = os.path.join(self.test_files_path, "mouse_genes.tab.gz")

        os.mkdir(os.path.join(self.data_dir, str(geneset.id)))
        self.generate_geneset_file(mouse_genes, random.randint(15, 150), os.path.join(self.data_dir, str(geneset.id)))

        json_object = Storage.objects.create(
            json=json.load(open(os.path.join(self.data_dir, str(geneset.id), "geneset.json"))),
            contributor=get_superuser(),
            name="{}_storage".format(geneset.name),
            data=geneset,
        )

        os.remove(os.path.join(self.data_dir, str(geneset.id), "geneset.json"))

        geneset.output = {"geneset": {"file": "geneset.tab.gz"}, "geneset_json": json_object.id, "source": "UCSC"}

        geneset.status = Data.STATUS_DONE
        geneset.save()

        with open(os.path.join(self.data_dir, str(geneset.id), "stdout.txt"), "w") as stdout:
            stdout.write("Generate gene set. Gene set was created " "with the generate_geneset django-admin command.")

        logger.info(__("Created Gene set object: (id={})", geneset.id))
Beispiel #51
0
    def handle(self, *args, **options):
        """Command handle."""
        count_inserted, count_unchanged = 0, 0

        for tab_file_name, line_count, tab_file in decompress(options['file_name']):
            logger.info(__("Importing mappings from \"{}\":", tab_file_name))

            reader = csv.DictReader(tab_file, delimiter=str('\t'))
            bar_format = '{desc}{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'

            for row in tqdm(reader, total=line_count, bar_format=bar_format):
                _, created = Mapping.objects.update_or_create(relation_type=row['relation_type'],
                                                              source_db=row['source_db'],
                                                              source_id=row['source_id'],
                                                              target_db=row['target_db'],
                                                              target_id=row['target_id'])
                if created:
                    count_inserted += 1
                else:
                    count_unchanged += 1

        logger.info("Total mappings: %d. Inserted %d, unchanged %d." %  # pylint: disable=logging-not-lazy
                    (count_inserted + count_unchanged, count_inserted, count_unchanged))
    def generate_diffexp_data(self, group_size):
        """Generate differential expression data."""
        de_name = 'cuffdiff'

        human_genes = os.path.join(self.test_files_path, 'human_genes.tab.gz')

        logger.info('---- case samples ----')
        case_samples = self.create_expressions(group_size)

        logger.info('---- control samples ----')
        control_samples = self.create_expressions(group_size)

        logger.info('---- upload annotation ----')

        case_input = [sample.id for sample in case_samples]
        control_input = [sample.id for sample in control_samples]
        genome_annotation_input = self.create_genome_annotation('hg19_chr20_small.gtf.gz')

        de_descriptor = {
            'thresholds': {
                'prob_field': 'fdr',
                'logfc': 2,
                'prob': 0.05},
            'case_label': 'Case group',
            'control_label': 'Control group'
        }

        de_inputs = {
            'case': case_input,
            'control': control_input,
            'annotation': genome_annotation_input.id
        }

        # Create DE data object
        started = timezone.now()

        de_obj = Data.objects.create(
            name=self.get_name(de_name),
            started=started,
            finished=started + datetime.timedelta(minutes=20),
            status=Data.STATUS_PROCESSING,
            descriptor_schema=get_descriptorschema('diff-exp'),
            descriptor=de_descriptor,
            process=get_process(de_name),
            contributor=get_superuser(),
            input=de_inputs)

        # Create data directory
        os.mkdir(os.path.join(self.data_dir, str(de_obj.id)))
        self.generate_raw_data(human_genes, os.path.join(self.data_dir, str(de_obj.id)))

        with open(os.path.join(self.data_dir, str(de_obj.id), 'test.txt'), 'w') as _:
            pass

        json_object = Storage.objects.create(
            json=json.load(open(os.path.join(self.data_dir, str(de_obj.id), 'de_json.json'))),
            contributor=get_superuser(),
            name='{}_storage'.format(de_obj.name),
            data=de_obj)

        os.remove(os.path.join(self.data_dir, str(de_obj.id), 'de_json.json'))

        # TODO: reference on existing true files
        de_obj.output = {
            'raw': {'file': 'de_raw.tab.gz'},
            'de_json': json_object.id,
            'de_file': {'file': 'de_file.tab.gz'},
            'transcript_diff_exp': {'file': 'test.txt'},
            'cds_diff_exp': {'file': 'test.txt'},
            'tss_group_diff_exp': {'file': 'test.txt'},
            'cuffdiff_output': {'file': 'test.txt'},
            'source': 'hg19'
        }

        de_obj.status = Data.STATUS_DONE
        de_obj.save()

        logger.info('---- new differential expression ----')
        logger.info(__('DE created with id: {}', de_obj.id))

        # Create stdout file
        with open(os.path.join(self.data_dir, str(de_obj.id), 'stdout.txt'), 'w') as stdout:
            stdout.write('Differential expression was '
                         'created with the generate_diffexpr_cuffdiff django-admin command.')
    def handle(self, *args, **options):
        """Command handle."""
        count_total, count_inserted = 0, 0
        to_index = []

        relation_type_choices = list(zip(*Mapping.RELATION_TYPE_CHOICES))[0]

        for tab_file_name, tab_file in decompress(options['file_name']):
            logger.info(__("Importing mappings from \"{}\"...", tab_file_name))

            mappings = set()
            for row in csv.DictReader(tab_file, delimiter=str('\t')):
                if row['relation_type'] not in relation_type_choices:
                    raise ValidationError(
                        "Unknown relation type: {}".format(row['relation_type'])
                    )

                # NOTE: For performance reasons this is a tuple instead of a dict.
                #       A tuple can be hashed, so it can be used with the `in`
                #       operator, and it serializes to a JSON list.
                #       Make sure that any changes are also reflected in the SQL
                #       query below.
                mapping = (
                    row['relation_type'],
                    row['source_db'],
                    row['source_id'],
                    row['source_species'],
                    row['target_db'],
                    row['target_id'],
                    row['target_species'],
                )

                if mapping in mappings:
                    raise ValidationError(
                        "Duplicated mapping (relation type: '{}', source db: '{}', source id: "
                        "'{}', source species: {}, target db: '{}', target id: '{}', "
                        "target species: {}) found in '{}'".format(
                            row['relation_type'], row['source_db'], row['source_id'],
                            row['source_species'], row['target_db'], row['target_id'],
                            row['target_species'], tab_file_name
                        )
                    )

                mappings.add(mapping)

            with connection.cursor() as cursor:
                cursor.execute(
                    """
                    WITH tmp AS(
                        INSERT INTO {table_name} (
                            relation_type, source_db, source_id, source_species,
                            target_db, target_id, target_species
                        )
                        SELECT
                            value->>0, value->>1, value->>2, value->>3,
                            value->>4, value->>5, value->>6
                        FROM json_array_elements(%s)
                        ON CONFLICT DO NOTHING -- conflict means that mapping is already present
                        RETURNING id
                    )
                    SELECT
                        COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                        COUNT(*) AS count_inserted
                    FROM tmp;
                    """.format(
                        table_name=Mapping._meta.db_table,  # pylint: disable=no-member,protected-access
                    ),
                    params=[json.dumps(list(mappings))]
                )
                result = cursor.fetchone()

            to_index.extend(result[0])

            count_total += len(mappings)
            count_inserted += result[1]

        index_builder.build(queryset=Mapping.objects.filter(id__in=to_index))

        logger.info(  # pylint: disable=logging-not-lazy
            "Total mappings: %d. Inserted %d, unchanged %d." %
            (count_total, count_inserted, count_total - count_inserted)
        )
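
Each mapping tuple is serialized into the JSON payload as an array, which is what `json_array_elements(%s)` iterates over; `value->>0` through `value->>6` then pull out the individual columns as text. A tiny sketch of the Python side of that contract, with one invented mapping:

import json

mappings = {
    ('ortholog', 'ENSEMBL', 'ENSG00000139618', 'Homo sapiens',
     'ENSEMBL', 'ENSMUSG00000041147', 'Mus musculus'),
}

payload = json.dumps(list(mappings))
# Inside the query, json_array_elements(payload) yields one JSON array per
# mapping, and value->>0 .. value->>6 map to relation_type, source_db,
# source_id, source_species, target_db, target_id and target_species.
print(payload)
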
Beispiel #54
0
    async def run(self):
        """Run the main listener run loop.

        Doesn't return until :meth:`terminate` is called.
        """
        logger.info(__(
            "Starting Resolwe listener on channel '{}'.",
            state.MANAGER_EXECUTOR_CHANNELS.queue
        ))
        while not self._should_stop:
            await self.push_stats()
            ret = await self._call_redis(aioredis.Redis.blpop, state.MANAGER_EXECUTOR_CHANNELS.queue, timeout=1)
            if ret is None:
                self.load_avg.add(0)
                continue
            remaining = await self._call_redis(aioredis.Redis.llen, state.MANAGER_EXECUTOR_CHANNELS.queue)
            self.load_avg.add(remaining + 1)
            self.check_critical_load()
            _, item = ret
            try:
                item = item.decode('utf-8')
                logger.debug(__("Got command from executor: {}", item))
                obj = json.loads(item)
            except json.JSONDecodeError:
                logger.error(__(
                    "Undecodable command packet:\n\n{}",
                    traceback.format_exc()
                ))
                continue

            command = obj.get(ExecutorProtocol.COMMAND, None)
            if command is None:
                continue

            service_start = time.perf_counter()

            handler = getattr(self, 'handle_' + command, None)
            if handler:
                try:
                    with PrioritizedBatcher.global_instance():
                        await database_sync_to_async(handler)(obj)
                except Exception:  # pylint: disable=broad-except
                    logger.error(__(
                        "Executor command handling error:\n\n{}",
                        traceback.format_exc()
                    ))
            else:
                logger.error(
                    __("Unknown executor command '{}'.", command),
                    extra={'decoded_packet': obj}
                )

            # We do want to measure wall-clock time elapsed, because
            # system load will impact event handling performance. On
            # a lagging system, good internal performance is meaningless.
            service_end = time.perf_counter()
            self.service_time.update(service_end - service_start)
        logger.info(__(
            "Stopping Resolwe listener on channel '{}'.",
            state.MANAGER_EXECUTOR_CHANNELS.queue
        ))
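
The loop above consumes JSON packets pushed onto a Redis list, so an executor-side producer is essentially an RPUSH of a serialized command object. A minimal sketch with the synchronous `redis` package; the queue name literal stands in for the `state.MANAGER_EXECUTOR_CHANNELS.queue` constant and the packet contents are invented:

import json

import redis

client = redis.Redis(host='localhost', port=6379)
packet = {
    'command': 'update',                    # dispatched to the matching handle_<command> method
    'data_id': 42,
    'changeset': {'process_progress': 10},
}
client.rpush('resolwe.flow.manager.executor_queue', json.dumps(packet))
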
    def handle(self, *args, **options):
        """Command handle."""
        count_total, count_inserted, count_updated = 0, 0, 0
        to_index = []

        type_choices = list(zip(*Feature.TYPE_CHOICES))[0]
        subtype_choices = list(zip(*Feature.SUBTYPE_CHOICES))[0]

        for tab_file_name, tab_file in decompress(options['file_name']):
            logger.info(__("Importing features from \"{}\"...", tab_file_name))

            features = []
            unique_features = set()
            for row in csv.DictReader(tab_file, delimiter=str('\t')):
                sub_type = SUBTYPE_MAP.get(row['Gene type'], 'other')

                if row['Type'] not in type_choices:
                    raise ValidationError("Unknown type: {}".format(row['Type']))
                if sub_type not in subtype_choices:
                    raise ValidationError("Unknown subtype: {}".format(sub_type))

                aliases_text = row['Aliases'].strip()
                aliases = []
                if aliases_text and aliases_text != '-':
                    aliases = aliases_text.split(',')

                if (row['Source'], row['ID']) in unique_features:
                    raise ValidationError(
                        "Duplicated feature (source: '{}', id: '{}') found in '{}'".format(
                            row['Source'], row['ID'], tab_file_name
                        )
                    )

                # NOTE: For performance reasons this is a list instead of a dict.
                #       Make sure that any changes also reflect in the SQL query
                #       below.
                features.append([
                    row['Source'],
                    row['ID'],
                    row['Species'],
                    row['Type'],
                    sub_type,
                    row['Name'],
                    row['Full name'],
                    row['Description'],
                    aliases,
                ])
                unique_features.add((row['Source'], row['ID']))

            with connection.cursor() as cursor:
                cursor.execute(
                    """
                    WITH tmp AS (
                        INSERT INTO {table_name} (
                            source, feature_id, species, type,
                            sub_type, name, full_name, description,
                            aliases
                        )
                        SELECT
                            value->>0, value->>1, value->>2, value->>3,
                            value->>4, value->>5, value->>6, value->>7,
                            ARRAY(SELECT json_array_elements_text(value->8))
                        FROM json_array_elements(%s)
                        ON CONFLICT (species, source, feature_id) DO UPDATE
                        SET
                            type = EXCLUDED.type,
                            sub_type = EXCLUDED.sub_type,
                            name = EXCLUDED.name,
                            full_name = EXCLUDED.full_name,
                            description = EXCLUDED.description,
                            aliases = EXCLUDED.aliases
                        WHERE (
                            {table_name}.type, {table_name}.sub_type, {table_name}.name,
                            {table_name}.full_name, {table_name}.description, {table_name}.aliases
                        ) IS DISTINCT FROM (
                            EXCLUDED.type, EXCLUDED.sub_type, EXCLUDED.name,
                            EXCLUDED.full_name, EXCLUDED.description, EXCLUDED.aliases
                        )
                        RETURNING id, xmax
                    )
                    SELECT
                        COALESCE(array_agg(id), ARRAY[]::INTEGER[]) AS ids,
                        COUNT(CASE WHEN xmax = 0 THEN 1 END) AS count_inserted,
                        COUNT(CASE WHEN xmax != 0 THEN 1 END) AS count_updated
                    FROM tmp;
                    """.format(
                        table_name=Feature._meta.db_table,  # pylint: disable=no-member,protected-access
                    ),
                    params=[json.dumps(features)]
                )
                result = cursor.fetchone()

            to_index.extend(result[0])

            count_total += len(features)
            count_inserted += result[1]
            count_updated += result[2]

        index_builder.build(queryset=Feature.objects.filter(id__in=to_index))

        logger.info(  # pylint: disable=logging-not-lazy
            "Total features: %d. Inserted %d, updated %d, unchanged %d." %
            (count_total, count_inserted, count_updated,
             count_total - count_inserted - count_updated)
        )
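
The `RETURNING id, xmax` trick relies on a PostgreSQL detail: a freshly inserted row has the system column `xmax` equal to 0, while a row rewritten by the `ON CONFLICT ... DO UPDATE` branch gets a non-zero `xmax`. Rows skipped by the `IS DISTINCT FROM` guard are not returned at all, which is why "unchanged" is derived by subtraction, roughly like this (numbers invented):

count_total = 1000    # features sent in the JSON payload
count_inserted = 120  # RETURNING rows with xmax = 0
count_updated = 35    # RETURNING rows with xmax != 0
count_unchanged = count_total - count_inserted - count_updated
print(count_unchanged)  # 845 rows matched an existing, identical feature
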
Beispiel #56
0
    def handle_finish(self, obj):
        """Handle an incoming ``Data`` finished processing request.

        :param obj: The Channels message object. Command object format:

            .. code-block:: none

                {
                    'command': 'finish',
                    'data_id': [id of the :class:`~resolwe.flow.models.Data` object
                               this command changes],
                    'process_rc': [exit status of the processing]
                    'spawn_processes': [optional; list of spawn dictionaries],
                    'exported_files_mapper': [if spawn_processes present]
                }
        """
        data_id = obj[ExecutorProtocol.DATA_ID]
        logger.debug(
            __("Finishing Data with id {} (handle_finish).", data_id),
            extra={
                'data_id': data_id,
                'packet': obj
            }
        )
        spawning_failed = False
        with transaction.atomic():
            # Spawn any new jobs in the request.
            spawned = False
            if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj:
                if is_testing():
                    # NOTE: This is a work-around for Django issue #10827
                    # (https://code.djangoproject.com/ticket/10827), same as in
                    # TestCaseHelpers._pre_setup(). Because the listener is running
                    # independently, it must clear the cache on its own.
                    ContentType.objects.clear_cache()

                spawned = True
                exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES]
                logger.debug(
                    __("Spawning new Data objects for Data with id {} (handle_finish).", data_id),
                    extra={
                        'data_id': data_id
                    }
                )

                try:
                    # This transaction is needed because we're running
                    # asynchronously with respect to the main Django code
                    # here; the manager can get nudged from elsewhere.
                    with transaction.atomic():
                        parent_data = Data.objects.get(pk=data_id)

                        # Spawn processes.
                        for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]:
                            d['contributor'] = parent_data.contributor
                            d['process'] = Process.objects.filter(slug=d['process']).latest()
                            d['tags'] = parent_data.tags

                            for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                                type_ = field_schema['type']
                                name = field_schema['name']
                                value = fields[name]

                                if type_ == 'basic:file:':
                                    fields[name] = self.hydrate_spawned_files(
                                        exported_files_mapper, value, data_id
                                    )
                                elif type_ == 'list:basic:file:':
                                    fields[name] = [self.hydrate_spawned_files(exported_files_mapper, fn, data_id)
                                                    for fn in value]

                            with transaction.atomic():
                                d = Data.objects.create(**d)
                                DataDependency.objects.create(
                                    parent=parent_data,
                                    child=d,
                                    kind=DataDependency.KIND_SUBPROCESS,
                                )

                                # Copy permissions.
                                copy_permissions(parent_data, d)

                                # The entity is added to the collection only when it has just
                                # been created, i.e. when it contains exactly one Data object.
                                entities = Entity.objects.filter(data=d).annotate(num_data=Count('data')).filter(
                                    num_data=1)

                                # Copy collections.
                                for collection in parent_data.collection_set.all():
                                    collection.data.add(d)

                                    # Add entities to which data belongs to the collection.
                                    for entity in entities:
                                        entity.collections.add(collection)

                except Exception:  # pylint: disable=broad-except
                    logger.error(
                        __(
                            "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}",
                            parent_data.process.slug,
                            traceback.format_exc()
                        ),
                        extra={
                            'data_id': data_id
                        }
                    )
                    spawning_failed = True

            # Data wrap up happens last, so that any triggered signals
            # already see the spawned children. What the children themselves
            # see is guaranteed by the transaction we're in.
            if ExecutorProtocol.FINISH_PROCESS_RC in obj:
                process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC]

                try:
                    d = Data.objects.get(pk=data_id)
                except Data.DoesNotExist:
                    logger.warning(
                        "Data object does not exist (handle_finish).",
                        extra={
                            'data_id': data_id,
                        }
                    )
                    async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR})
                    return

                changeset = {
                    'process_progress': 100,
                    'finished': now(),
                }

                if spawning_failed:
                    changeset['status'] = Data.STATUS_ERROR
                    changeset['process_error'] = ["Error while preparing spawned Data objects"]

                elif process_rc == 0 and not d.status == Data.STATUS_ERROR:
                    changeset['status'] = Data.STATUS_DONE

                else:
                    changeset['status'] = Data.STATUS_ERROR
                    changeset['process_rc'] = process_rc

                obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset
                self.handle_update(obj, internal_call=True)

        if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
            # Purge worker is not running in test runner, so we should skip triggering it.
            if not is_testing():
                channel_layer = get_channel_layer()
                try:
                    async_to_sync(channel_layer.send)(
                        CHANNEL_PURGE_WORKER,
                        {
                            'type': TYPE_PURGE_RUN,
                            'location_id': d.location.id,
                            'verbosity': self._verbosity,
                        }
                    )
                except ChannelFull:
                    logger.warning(
                        "Cannot trigger purge because channel is full.",
                        extra={'data_id': data_id}
                    )

        # Notify the executor that we're done.
        async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK})

        # Now nudge the main manager to perform final cleanup. This is
        # needed even if there was no spawn baggage, since the manager
        # may need to know when executors have finished, to keep count
        # of them and manage synchronization.
        async_to_sync(consumer.send_event)({
            WorkerProtocol.COMMAND: WorkerProtocol.FINISH,
            WorkerProtocol.DATA_ID: data_id,
            WorkerProtocol.FINISH_SPAWNED: spawned,
            WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
                'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
            },
        })
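
Combining the docstring with the handler body, a 'finish' packet with spawning might look roughly as follows. The literal keys follow the docstring; in the real protocol they come from `ExecutorProtocol` constants, and the spawn/input contents here are purely illustrative:

finish_packet = {
    'command': 'finish',
    'data_id': 42,                       # Data object being finished
    'process_rc': 0,                     # exit status reported by the executor
    'spawn_processes': [
        {
            'process': 'spawned-process-slug',  # resolved via Process.objects.filter(slug=...)
            'input': {},                        # file fields are remapped via exported_files_mapper
        },
    ],
    'exported_files_mapper': {},         # consumed by hydrate_spawned_files()
}
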
Beispiel #57
0
    def __init__(self, *args, **kwargs):
        """Initialize arguments."""
        self.discover_engines()
        self.state = state.ManagerState(state.MANAGER_STATE_PREFIX)

        # Don't call the full self.reset() here, that's only meant for testing
        # since it also runs a dummy consumer to drain channels.
        with suppress(RedisConnectionError):
            # It's awkward to handle documentation and migration testing
            # any other way.
            self.state.reset()

        # The number of executors currently running; used for test synchronization.
        # We need to start out with a dummy object, so that the async
        # infrastructure isn't started too early. In particular, this handles
        # counter functions that are called before any actual synchronization
        # is wanted: in a test, what's called first is the Django signal.
        # This will call communicate(), which (necessarily) first tries to
        # increment the counter; the future that call produces can be scheduled
        # onto the wrong event loop (the application one is started further
        # down communicate() in the synchronization block), which then
        # leads to exceedingly obscure crashes further down the line.
        self.sync_counter = self._SynchronizationManagerDummy()

        # Django's override_settings should be avoided at all cost here
        # to keep the manager as independent as possible, and in
        # particular to avoid overriding with dangerous variables, such
        # as the one controlling Django signal synchronicity. To make
        # such avoidance possible, all settings lookups in the worker
        # should use a private dictionary instead of the global
        # configuration. This variable is maintained around
        # _data_scan() calls, effectively emulating Django's
        # settings but having no effect on anything outside the worker
        # code (in particular, the signal triggers).
        self.settings_actual = {}

        # Ensure there is only one manager instance per process. This
        # is required as other parts of the code access the global
        # manager instance.
        with suppress(ImportError):
            from resolwe.flow import managers
            assert not hasattr(managers, 'manager')

        self.scheduling_class_map = dict(Process.SCHEDULING_CLASS_CHOICES)

        # Check settings for consistency.
        flow_manager = getattr(settings, 'FLOW_MANAGER', {})
        if 'DISPATCHER_MAPPING' in flow_manager and 'NAME' in flow_manager:
            raise ImproperlyConfigured("Key 'DISPATCHER_MAPPING' conflicts with key 'NAME' in FLOW_MANAGER settings.")

        if 'DISPATCHER_MAPPING' in flow_manager:
            mapping = flow_manager['DISPATCHER_MAPPING']

            scheduling_classes = set(self.scheduling_class_map.values())
            map_keys = set(mapping.keys())
            class_difference = scheduling_classes - map_keys
            if class_difference:
                raise ImproperlyConfigured(
                    "Dispatcher manager mapping settings incomplete, missing {}.".format(class_difference)
                )
            connector_list = [mapping[klass] for klass in scheduling_classes]
        else:
            connector_list = [flow_manager.get('NAME', DEFAULT_CONNECTOR)]

        # Pre-load all needed connectors.
        self.connectors = {}
        for module_name in connector_list:
            connector_module = import_module(module_name)
            self.connectors[module_name] = connector_module.Connector()

        logger.info(__(
            "Found {} workload connectors: {}", len(self.connectors), ', '.join(self.connectors.keys())
        ))

        super().__init__(*args, **kwargs)
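
The consistency check above means `FLOW_MANAGER` carries either a single `NAME` or a `DISPATCHER_MAPPING` that covers every scheduling class. A hypothetical settings fragment of the mapping form; the keys mirror the display names from `Process.SCHEDULING_CLASS_CHOICES` and the connector module paths are examples from the `workload_connectors` package referenced earlier:

FLOW_MANAGER = {
    'DISPATCHER_MAPPING': {
        # scheduling class -> workload connector module
        'Interactive': 'resolwe.flow.managers.workload_connectors.local',
        'Batch': 'resolwe.flow.managers.workload_connectors.slurm',
    },
}
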
Beispiel #58
0
    async def handle_control_event(self, message):
        """Handle an event from the Channels layer.

        Channels layer callback, do not call directly.
        """
        cmd = message[WorkerProtocol.COMMAND]
        logger.debug(__("Manager worker got channel command '{}'.", cmd))

        # Prepare settings for use: Django settings, overlaid by the saved state,
        # overlaid by anything immediate in the current packet.
        immediates = {}
        if cmd == WorkerProtocol.COMMUNICATE:
            immediates = message.get(WorkerProtocol.COMMUNICATE_SETTINGS, {}) or {}
        override = self.state.settings_override or {}
        override.update(immediates)
        self.settings_actual = self._marshal_settings()
        self.settings_actual.update(override)

        if cmd == WorkerProtocol.COMMUNICATE:
            try:
                await database_sync_to_async(self._data_scan)(**message[WorkerProtocol.COMMUNICATE_EXTRA])
            except Exception:
                logger.exception("Unknown error occured while processing communicate control command.")
                raise
            finally:
                await self.sync_counter.dec('communicate')

        elif cmd == WorkerProtocol.FINISH:
            try:
                data_id = message[WorkerProtocol.DATA_ID]
                data_location = DataLocation.objects.get(data__id=data_id)
                if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
                    try:
                        def handle_error(func, path, exc_info):
                            """Handle permission errors while removing data directories."""
                            if isinstance(exc_info[1], PermissionError):
                                os.chmod(path, 0o700)
                                shutil.rmtree(path)

                        # Remove secrets directory, but leave the rest of the runtime directory
                        # intact. Runtime directory will be removed during data purge, when the
                        # data object is removed.
                        secrets_dir = os.path.join(
                            self._get_per_data_dir('RUNTIME_DIR', data_location.subpath),
                            ExecutorFiles.SECRETS_DIR
                        )
                        shutil.rmtree(secrets_dir, onerror=handle_error)
                    except OSError:
                        logger.exception("Manager exception while removing data runtime directory.")

                if message[WorkerProtocol.FINISH_SPAWNED]:
                    await database_sync_to_async(self._data_scan)(**message[WorkerProtocol.FINISH_COMMUNICATE_EXTRA])
            except Exception:
                logger.exception(
                    "Unknown error occured while processing finish control command.",
                    extra={'data_id': data_id}
                )
                raise
            finally:
                await self.sync_counter.dec('executor')

        elif cmd == WorkerProtocol.ABORT:
            await self.sync_counter.dec('executor')

        else:
            logger.error(__("Ignoring unknown manager control command '{}'.", cmd))
Beispiel #59
0
    def _data_scan(self, data_id=None, executor='resolwe.flow.executors.local', **kwargs):
        """Scan for new Data objects and execute them.

        :param data_id: Optional id of Data object which (+ its
            children) should be scanned. If it is not given, all
            resolving objects are processed.
        :param executor: The fully qualified name of the executor to use
            for all :class:`~resolwe.flow.models.Data` objects
            discovered in this pass.
        """
        def process_data_object(data):
            """Process a single data object."""
            # Lock for update. Note that we want this transaction to be as short as possible in
            # order to reduce contention and avoid deadlocks. This is why we do not lock all
            # resolving objects for update, but instead only lock one object at a time. This
            # allows managers running in parallel to process different objects.
            data = Data.objects.select_for_update().get(pk=data.pk)
            if data.status != Data.STATUS_RESOLVING:
                # The object might have already been processed while waiting for the lock to be
                # obtained. In this case, skip the object.
                return

            dep_status = dependency_status(data)

            if dep_status == Data.STATUS_ERROR:
                data.status = Data.STATUS_ERROR
                data.process_error.append("One or more inputs have status ERROR")
                data.process_rc = 1
                data.save()
                return

            elif dep_status != Data.STATUS_DONE:
                return

            if data.process.run:
                try:
                    execution_engine = data.process.run.get('language', None)
                    # Evaluation by the execution engine may spawn additional data objects and
                    # perform other queries on the database. Queries of all possible execution
                    # engines need to be audited for possibilities of deadlocks in case any
                    # additional locks are introduced. Currently, we only take an explicit lock on
                    # the currently processing object.
                    program = self.get_execution_engine(execution_engine).evaluate(data)
                except (ExecutionError, InvalidEngineError) as error:
                    data.status = Data.STATUS_ERROR
                    data.process_error.append("Error in process script: {}".format(error))
                    data.save()
                    return

                # Set allocated resources:
                resource_limits = data.process.get_resource_limits()
                data.process_memory = resource_limits['memory']
                data.process_cores = resource_limits['cores']
            else:
                # If there is no run section, then we should not try to run anything. But the
                # program must not be set to None as then the process will be stuck in waiting
                # state.
                program = ''

            if data.status != Data.STATUS_DONE:
                # The data object may already be marked as done by the execution engine. In this
                # case we must not revert the status to STATUS_WAITING.
                data.status = Data.STATUS_WAITING
            data.save(render_name=True)

            # Actually run the object only if there was nothing wrong with the transaction.
            transaction.on_commit(
                # Make sure the closure gets the right values here, since they're
                # changed in the loop.
                lambda d=data, p=program: self._data_execute(d, p, executor)
            )

        logger.debug(__("Manager processing communicate command triggered by Data with id {}.", data_id))

        if is_testing():
            # NOTE: This is a work-around for Django issue #10827
            # (https://code.djangoproject.com/ticket/10827), same as in
            # TestCaseHelpers._pre_setup(). Because the worker is running
            # independently, it must clear the cache on its own.
            ContentType.objects.clear_cache()

            # Ensure settings overrides apply
            self.discover_engines(executor=executor)

        try:
            queryset = Data.objects.filter(status=Data.STATUS_RESOLVING)
            if data_id is not None:
                # Scan only given data object and its children.
                queryset = queryset.filter(Q(parents=data_id) | Q(id=data_id)).distinct()

            for data in queryset:
                try:
                    with transaction.atomic():
                        process_data_object(data)

                        # All data objects created by the execution engine are committed after this
                        # point and may be processed by other managers running in parallel. At the
                        # same time, the lock for the current data object is released.
                except Exception as error:  # pylint: disable=broad-except
                    logger.exception(__(
                        "Unhandled exception in _data_scan while processing data object {}.",
                        data.pk
                    ))

                    # Unhandled error while processing a data object. We must set its
                    # status to STATUS_ERROR to prevent the object from being retried
                    # on next _data_scan run. We must perform this operation without
                    # using the Django ORM as using the ORM may be the reason the error
                    # occurred in the first place.
                    error_msg = "Internal error: {}".format(error)
                    process_error_field = Data._meta.get_field('process_error')  # pylint: disable=protected-access
                    max_length = process_error_field.base_field.max_length
                    if len(error_msg) > max_length:
                        error_msg = error_msg[:max_length - 3] + '...'

                    try:
                        with connection.cursor() as cursor:
                            cursor.execute(
                                """
                                    UPDATE {table}
                                    SET
                                        status = %(status)s,
                                        process_error = process_error || (%(error)s)::varchar[]
                                    WHERE id = %(id)s
                                """.format(
                                    table=Data._meta.db_table  # pylint: disable=protected-access
                                ),
                                {
                                    'status': Data.STATUS_ERROR,
                                    'error': [error_msg],
                                    'id': data.pk
                                }
                            )
                    except Exception as error:  # pylint: disable=broad-except
                        # If object's state cannot be changed due to some database-related
                        # issue, at least skip the object for this run.
                        logger.exception(__(
                            "Unhandled exception in _data_scan while trying to emit error for {}.",
                            data.pk
                        ))

        except IntegrityError as exp:
            logger.error(__("IntegrityError in manager {}", exp))
            return
Beispiel #60
0
    def handle_update(self, obj, internal_call=False):
        """Handle an incoming ``Data`` object update request.

        :param obj: The Channels message object. Command object format:

            .. code-block:: none

                {
                    'command': 'update',
                    'data_id': [id of the :class:`~resolwe.flow.models.Data`
                               object this command changes],
                    'changeset': {
                        [keys to be changed]
                    }
                }

        :param internal_call: If ``True``, this is an internal delegate
            call, so a reply to the executor won't be sent.
        """
        data_id = obj[ExecutorProtocol.DATA_ID]
        changeset = obj[ExecutorProtocol.UPDATE_CHANGESET]
        if not internal_call:
            logger.debug(
                __("Handling update for Data with id {} (handle_update).", data_id),
                extra={
                    'data_id': data_id,
                    'packet': obj
                }
            )
        try:
            d = Data.objects.get(pk=data_id)
        except Data.DoesNotExist:
            logger.warning(
                "Data object does not exist (handle_update).",
                extra={
                    'data_id': data_id,
                }
            )

            if not internal_call:
                async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR})

            async_to_sync(consumer.send_event)({
                WorkerProtocol.COMMAND: WorkerProtocol.ABORT,
                WorkerProtocol.DATA_ID: obj[ExecutorProtocol.DATA_ID],
                WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
                    'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
                },
            })

            return

        if changeset.get('status', None) == Data.STATUS_ERROR:
            logger.error(
                __("Error occured while running process '{}' (handle_update).", d.process.slug),
                extra={
                    'data_id': data_id,
                    'api_url': '{}{}'.format(
                        getattr(settings, 'RESOLWE_HOST_URL', ''),
                        reverse('resolwe-api:data-detail', kwargs={'pk': data_id})
                    ),
                }
            )

        if d.status == Data.STATUS_ERROR:
            changeset['status'] = Data.STATUS_ERROR

        if not d.started:
            changeset['started'] = now()
        changeset['modified'] = now()

        for key, val in changeset.items():
            if key in ['process_error', 'process_warning', 'process_info']:
                # Trim process_* fields to not exceed max length of the database field.
                for i, entry in enumerate(val):
                    max_length = Data._meta.get_field(key).base_field.max_length  # pylint: disable=protected-access
                    if len(entry) > max_length:
                        val[i] = entry[:max_length - 3] + '...'

                getattr(d, key).extend(val)

            elif key != 'output':
                setattr(d, key, val)

        if 'output' in changeset:
            if not isinstance(d.output, dict):
                d.output = {}
            for key, val in changeset['output'].items():
                dict_dot(d.output, key, val)

        try:
            d.save(update_fields=list(changeset.keys()))
        except ValidationError as exc:
            logger.error(
                __(
                    "Validation error when saving Data object of process '{}' (handle_update):\n\n{}",
                    d.process.slug,
                    traceback.format_exc()
                ),
                extra={
                    'data_id': data_id
                }
            )

            d.refresh_from_db()

            d.process_error.append(exc.message)
            d.status = Data.STATUS_ERROR

            try:
                d.save(update_fields=['process_error', 'status'])
            except Exception:  # pylint: disable=broad-except
                pass
        except Exception:  # pylint: disable=broad-except
            logger.error(
                __(
                    "Error when saving Data object of process '{}' (handle_update):\n\n{}",
                    d.process.slug,
                    traceback.format_exc()
                ),
                extra={
                    'data_id': data_id
                }
            )

        if not internal_call:
            async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK})
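
For comparison with the docstring above, a made-up 'update' packet; note that keys inside `changeset['output']` are dotted paths expanded by `dict_dot()`, while the `process_*` list fields are appended (and trimmed) rather than replaced:

update_packet = {
    'command': 'update',
    'data_id': 42,
    'changeset': {
        'process_progress': 50,
        'process_info': ['Aligning reads...'],
        'output': {'stats.mapped': 123456},  # dotted key handled by dict_dot()
    },
}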