def run(self, data: Data, argv: List):
    """Select a concrete connector and run the process through it.

    :param data: The :class:`~resolwe.flow.models.Data` object that is to
        be run.
    :param argv: The argument vector used to spawn the executor.
    """
    manager_settings = getattr(settings, "FLOW_MANAGER", {})
    scheduling = self.scheduling_class_map[data.process.scheduling_class]

    # An explicit dispatcher mapping (per scheduling class) takes precedence
    # over the single configured connector name.
    if "DISPATCHER_MAPPING" in manager_settings:
        class_name = settings.FLOW_MANAGER["DISPATCHER_MAPPING"][scheduling]
    else:
        class_name = manager_settings.get("NAME", DEFAULT_CONNECTOR)

    data.scheduled = now()
    data.save(update_fields=["scheduled"])

    # Append the listener contact point to the executor command line so the
    # spawned process knows where to report back.
    workload_class = class_name.rsplit(".", maxsplit=1)[1]
    host, port, protocol = self._get_listener_settings(data, workload_class)
    argv[-1] += " {} {} {}".format(host, port, protocol)

    return self.connectors[class_name].submit(data, argv)
def _data_execute(self, data: Data):
    """Execute the Data object.

    The activities carried out here include target directory preparation,
    executor copying, setting serialization and actual execution of the
    object.

    :param data: The :class:`~resolwe.flow.models.Data` object to execute.
    """
    logger.debug(__("Manager preparing Data with id {} for processing.", data.id))

    # Prepare the executor's environment.
    try:
        self._prepare_data_dir(data)
        # Executor module name relative to the executors package: everything
        # after ".executors." in the configured dotted path, prefixed with a
        # dot (e.g. "resolwe.flow.executors.local" -> ".local").
        executor_module = ".{}".format(
            getattr(settings, "FLOW_EXECUTOR", {})
            .get("NAME", "resolwe.flow.executors.local")
            .rpartition(".executors.")[-1]
        )
        self._lock_inputs_local_storage_locations(data)
        # Spawn through bash so PYTHON may be an arbitrary command line
        # (default "/usr/bin/env python"); the data primary key is passed as
        # the executor's argument.
        argv = [
            "/bin/bash",
            "-c",
            getattr(settings, "FLOW_EXECUTOR", {}).get(
                "PYTHON", "/usr/bin/env python"
            )
            + " -m executors "
            + executor_module
            + " {}".format(data.pk),
        ]
        self.executor.prepare_for_execution(data)
    except PermissionDenied as error:
        # Record the failure on the data object and its worker (if one
        # exists), then bail out without running.
        data.status = Data.STATUS_ERROR
        data.process_error.append("Permission denied for process: {}".format(error))
        data.save()
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save()
        return
    except OSError as err:
        # Preparation I/O failed; log with traceback and skip this object.
        logger.exception(
            __(
                "OSError occurred while preparing data {} (will skip): {}",
                data.id,
                err,
            )
        )
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save()
        return

    # Hand off to the run() method for execution.
    logger.info(__("Running executor for data with id {}", data.pk))
    self.run(data, argv)
def run(self, data: Data, runtime_dir: Path, argv):
    """Select a concrete connector and run the process through it.

    :param data: The :class:`~resolwe.flow.models.Data` object that is to
        be run.
    :param runtime_dir: The directory the executor is run from.
    :param argv: The argument vector used to spawn the executor.
    """
    manager_settings = getattr(settings, "FLOW_MANAGER", {})
    scheduling = self.scheduling_class_map[data.process.scheduling_class]

    # A per-scheduling-class dispatcher mapping, when configured, overrides
    # the single connector name.
    if "DISPATCHER_MAPPING" in manager_settings:
        connector_name = settings.FLOW_MANAGER["DISPATCHER_MAPPING"][scheduling]
    else:
        connector_name = manager_settings.get("NAME", DEFAULT_CONNECTOR)

    data.scheduled = now()
    data.save(update_fields=["scheduled"])

    return self.connectors[connector_name].submit(data, runtime_dir, argv)
class BackendTest(TestCase):
    """Tests for running processes through the manager backend."""

    def setUp(self):
        """Create a superuser plus a minimal test process and data object."""
        u = get_user_model().objects.create_superuser('test', '*****@*****.**', 'test')
        self.p = Process(slug='test-processor',
                         name='Test Process',
                         contributor=u,
                         type='data:test',
                         version=1)
        self.p.save()

        self.d = Data(slug='test-data',
                      name='Test Data',
                      contributor=u,
                      process=self.p)
        self.d.save()

    def tearDown(self):
        """Remove the data directories created while processing."""
        for data in Data.objects.all():
            data_dir = os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(data.id))
            shutil.rmtree(data_dir, ignore_errors=True)

    def test_manager(self):
        """Smoke-test that the manager communicate loop runs."""
        manager.communicate(verbosity=0)

    def test_dtlbash(self):
        """Run a bash-based (dtlbash) process script."""
        self.p.slug = 'test-processor-dtlbash'
        self.p.run = {'script': """
gen-info \"Test processor info\"
gen-warning \"Test processor warning\"
echo '{"proc.info": "foo"}'
"""}
        self.p.save()

        self.d.slug = 'test-data-dtlbash'
        self.d.process = self.p
        self.d.save()
        # Rebind to a fresh, unsaved stub referencing the same primary key.
        self.d = Data(id=self.d.id)
class BackendTest(TestCase):
    """Tests for running processes through the manager backend."""

    def setUp(self):
        """Create a minimal test process and a data object for it."""
        super(BackendTest, self).setUp()

        process = Process(slug='test-processor',
                          name='Test Process',
                          contributor=self.contributor,
                          type='data:test',
                          version=1)
        process.save()
        self.p = process

        data = Data(slug='test-data',
                    name='Test Data',
                    contributor=self.contributor,
                    process=process)
        data.save()
        self.d = data

    def test_manager(self):
        """Smoke-test that the manager communicate loop runs."""
        manager.communicate(verbosity=0)

    def test_dtlbash(self):
        """Run a bash-based (dtlbash) process script."""
        process = self.p
        process.slug = 'test-processor-dtlbash'
        process.run = {
            'script': """
gen-info \"Test processor info\"
gen-warning \"Test processor warning\"
echo '{"proc.info": "foo"}'
"""
        }
        process.save()

        data = self.d
        data.slug = 'test-data-dtlbash'
        data.process = process
        data.save()
        # Rebind to a fresh, unsaved stub referencing the same primary key.
        self.d = Data(id=data.id)
def migrate_data(self, data):
    """Migrate one legacy data document into a :class:`Data` instance.

    Creates (or reuses) the matching descriptor schema and process, builds
    the new ``Data`` object, re-links collections, indexes JSON outputs for
    storage migration and copies permissions.

    :param data: A dict holding the legacy (document-style) data record.
    """
    contributor = self.get_contributor(data["author_id"])

    # DESCRIPTOR SCHEMA ############################################
    ds_fields = []
    ds_fields.extend(data.get("static_schema", []))
    ds_fields.extend(data.get("var_template", []))
    ds_fields.sort(key=lambda d: d["name"])
    # The JSON dump is used as a cheap structural fingerprint so identical
    # schemas are only created once.
    ds_fields_dumped = json.dumps(ds_fields)

    if ds_fields_dumped in self.descriptor_schema_index:
        descriptor_schema = self.descriptor_schema_index[ds_fields_dumped]
    else:
        descriptor_schema = DescriptorSchema(schema=ds_fields)
        descriptor_schema.name = "data_{}_descriptor".format(data["_id"])
        descriptor_schema.contributor = contributor
        descriptor_schema.save()
        self.descriptor_schema_index[ds_fields_dumped] = descriptor_schema

    descriptor = {}
    descriptor.update(data.get("static", {}))
    descriptor.update(data.get("var", {}))

    # PROCESS ######################################################
    if "processor_version" not in data:
        data["processor_version"] = "0.0.0"

    process_slug = self.process_slug(data["processor_name"])
    process_version = data["processor_version"]
    try:
        process = Process.objects.get(slug=process_slug, version=process_version)
    except Process.DoesNotExist:
        latest = Process.objects.filter(slug=process_slug).order_by("-version").first()

        if latest:
            # The exact version is gone; create a stub copy of the latest
            # version whose script only reports an error when run.
            process = Process()
            process.name = latest.name
            process.slug = latest.slug
            process.category = latest.category
            process.description = latest.description
            process.contributor = latest.contributor

            process.version = process_version
            process.type = data["type"]
            process.output_schema = data["output_schema"]
            process.input_schema = data.get("input_schema", {})
            process.persistence = self.persistence_dict[data["persistence"]]

            # Fixed typo in the emitted message ("Depricated" -> "Deprecated").
            process.run["script"] = 'gen-require common\ngen-error "Deprecated process, use the latest version."'

            # XXX
            # process.created =
            # process.modified =
            process.save()

            # Copy permissions from the latest process version.
            # NOTE: dict.iteritems() is Python 2 only; items() works on both.
            for user, perms in get_users_with_perms(latest, attach_perms=True).items():
                for perm in perms:
                    assign_perm(perm, user, process)
            for group, perms in get_groups_with_perms(latest, attach_perms=True).items():
                for perm in perms:
                    assign_perm(perm, group, process)
        else:
            # Create dummy processor if there is no other version.
            dummy_name = "Dummy processor of type {}".format(data["type"])
            try:
                process = Process.objects.get(name=dummy_name)
            except Process.DoesNotExist:
                process = Process.objects.create(
                    name=dummy_name,
                    slug="non-existent",
                    contributor=get_user_model().objects.filter(is_superuser=True).first(),
                    type=data["type"],
                    category="data:non-existent",
                    # The script value must be a plain string; the previous
                    # inner braces accidentally created a one-element set.
                    run={"script": 'gen-require common\ngen-error "This processor is not intended to be run."'},
                )

    # DATA #########################################################
    new = Data()
    new.name = data.get("static", {}).get("name", "")
    if len(new.name) > 100:
        # Keep the full name for reporting, then truncate to fit the field.
        self.long_names.append(new.name)
        new.name = new.name[:97] + "..."

    new.status = self.status_dict[data["status"]]
    new.process = process
    new.contributor = contributor
    new.input = data["input"] if "input" in data else {}
    new.output = data["output"]
    new.descriptor_schema = descriptor_schema
    new.descriptor = descriptor
    new.checksum = data.get("checksum", "")
    # XXX: Django will change this on create
    new.created = data["date_created"]
    # XXX: Django will change this on save
    new.modified = data["date_modified"]

    # Fill started/finished from whichever legacy timestamps exist, falling
    # back to the epoch when neither is present.
    if "date_start" in data and "date_finish" in data:
        new.started = data["date_start"]
        new.finished = data["date_finish"]
    elif "date_finish" in data:
        new.started = data["date_finish"]
        new.finished = data["date_finish"]
    elif "date_start" in data:
        new.started = data["date_start"]
        new.finished = data["date_start"]
    else:
        new.started = datetime.fromtimestamp(0)
        new.finished = datetime.fromtimestamp(0)

    new.save()

    # Re-link the object into its (already migrated) collections; remember
    # the ones that are missing so they can be reported later.
    for case_id in data["case_ids"]:
        try:
            collection = Collection.objects.get(pk=self.id_mapping["collection"][str(case_id)])
        except KeyError:
            self.missing_collections.add(str(case_id))
            continue
        collection.data.add(new)

    # Index JSON outputs so the storage migration can find their new owner.
    for field_schema, fields, path in iterate_fields(data["output"], data["output_schema"], ""):
        if "type" in field_schema and field_schema["type"].startswith("basic:json:"):
            self.storage_index[fields[field_schema["name"]]] = {"id": new.pk, "path": path}

    self.migrate_permissions(new, data)

    self.id_mapping["data"][str(data["_id"])] = new.pk

    # DESCRIPTOR SCHEMA PERMISSIONS ################################
    for user in get_users_with_perms(new):
        assign_perm("view_descriptorschema", user, obj=descriptor_schema)

    for group in get_groups_with_perms(new):
        assign_perm("view_descriptorschema", group, obj=descriptor_schema)
def process_data_object(data: Data):
    """Process a single data object."""
    # Lock for update. Note that we want this transaction to be as short as possible in
    # order to reduce contention and avoid deadlocks. This is why we do not lock all
    # resolving objects for update, but instead only lock one object at a time. This
    # allows managers running in parallel to process different objects.
    data = Data.objects.select_for_update().get(pk=data.pk)
    if data.status != Data.STATUS_RESOLVING:
        # The object might have already been processed while waiting for the lock to be
        # obtained. In this case, skip the object.
        return

    dep_status = dependency_status(data)

    if dep_status == Data.STATUS_ERROR:
        # Propagate input failure to this object and its worker (if any).
        data.status = Data.STATUS_ERROR
        data.process_error.append("One or more inputs have status ERROR")
        data.process_rc = 1
        data.save()
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save(update_fields=["status"])
        return

    elif dep_status != Data.STATUS_DONE:
        # Inputs are not finished yet; the object stays resolving for a
        # later pass.
        return

    run_in_executor = False
    if data.process.run:
        try:
            # Check if execution engine is sound and evaluate workflow.
            execution_engine_name = data.process.run.get("language", None)
            execution_engine = self.get_execution_engine(execution_engine_name)
            # Workflows are evaluated in-place by the manager; everything
            # else is handed off to an executor after the commit.
            run_in_executor = execution_engine_name != "workflow"
            if not run_in_executor:
                execution_engine.evaluate(data)
            else:
                # Set allocated resources
                resource_limits = data.process.get_resource_limits()
                data.process_memory = resource_limits["memory"]
                data.process_cores = resource_limits["cores"]
        except (ExecutionError, InvalidEngineError) as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append(
                "Error in process script: {}".format(error)
            )
            data.save()
            if hasattr(data, "worker"):
                data.worker.status = Worker.STATUS_ERROR_PREPARING
                data.worker.save(update_fields=["status"])
            return

    if data.status != Data.STATUS_DONE:
        # The data object may already be marked as done by the execution engine. In this
        # case we must not revert the status to STATUS_WAITING.
        data.status = Data.STATUS_WAITING
        data.save(render_name=True)

    # Actually run the object only if there was nothing with the
    # transaction and was not already evaluated.
    if run_in_executor:
        transaction.on_commit(
            # Make sure the closure gets the right values here, since they're
            # changed in the loop.
            lambda d=data: self._data_execute(d)
        )
def migrate_data(self, data):
    """Migrate one legacy data document into a :class:`Data` instance."""
    contributor = self.get_contributor(data[u'author_id'])

    # DESCRIPTOR SCHEMA ############################################
    ds_fields = []
    ds_fields.extend(data.get(u'static_schema', []))
    ds_fields.extend(data.get(u'var_template', []))
    ds_fields.sort(key=lambda d: d[u'name'])
    # The JSON dump serves as a structural fingerprint so identical schemas
    # are created only once and reused afterwards.
    ds_fields_dumped = json.dumps(ds_fields)

    if ds_fields_dumped in self.descriptor_schema_index:
        descriptor_schema = self.descriptor_schema_index[ds_fields_dumped]
    else:
        descriptor_schema = DescriptorSchema(schema=ds_fields)
        descriptor_schema.name = 'data_{}_descriptor'.format(data[u'_id'])
        descriptor_schema.contributor = contributor
        descriptor_schema.save()
        self.descriptor_schema_index[ds_fields_dumped] = descriptor_schema

    descriptor = {}
    descriptor.update(data.get(u'static', {}))
    descriptor.update(data.get(u'var', {}))

    # PROCESS ######################################################
    if u'processor_version' not in data:
        data[u'processor_version'] = '0.0.0'

    process_slug = self.process_slug(data[u'processor_name'])
    process_version = data[u'processor_version']
    try:
        process = Process.objects.get(slug=process_slug, version=process_version)
    except Process.DoesNotExist:
        latest = Process.objects.filter(slug=process_slug).order_by('-version').first()

        if latest:
            # The exact version no longer exists; create a stub copy of the
            # latest version whose script only reports an error when run.
            process = Process()
            process.name = latest.name
            process.slug = latest.slug
            process.category = latest.category
            process.description = latest.description
            process.contributor = latest.contributor

            process.version = process_version
            process.type = data[u'type']
            process.output_schema = data[u'output_schema']
            process.input_schema = data.get(u'input_schema', {})
            process.persistence = self.persistence_dict[data[u'persistence']]

            process.run['script'] = 'gen-require common\ngen-error "Depricated process, use the latest version."'  # noqa pylint: disable=unsubscriptable-object

            # XXX
            # process.created =
            # process.modified =
            process.save()

            # copy permissions from latest process
            for user, perms in six.iteritems(get_users_with_perms(latest, attach_perms=True)):
                for perm in perms:
                    assign_perm(perm, user, process)
            for group, perms in six.iteritems(get_groups_with_perms(latest, attach_perms=True)):
                for perm in perms:
                    assign_perm(perm, group, process)
        else:
            # Create dummy processor if there is no other version
            dummy_name = 'Dummy processor of type {}'.format(data[u'type'])
            try:
                process = Process.objects.get(name=dummy_name)
            except Process.DoesNotExist:
                process = Process.objects.create(
                    name=dummy_name,
                    slug='non-existent',
                    contributor=get_user_model().objects.filter(is_superuser=True).first(),
                    type=data[u'type'],
                    category='data:non-existent',
                    # NOTE(review): the inner braces make this value a
                    # one-element set, not a string — probably meant
                    # run={'script': '...'}; confirm before relying on it.
                    run={'script': {'gen-require common\ngen-error "This processor is not intendent to be run."'}},
                )

    # DATA #########################################################
    new = Data()
    new.name = data.get(u'static', {}).get(u'name', '')
    if len(new.name) > 100:
        # Remember the full name for reporting, then truncate to fit.
        self.long_names.append(new.name)
        new.name = new.name[:97] + '...'

    new.status = self.status_dict[data[u'status']]
    new.process = process
    new.contributor = contributor
    new.input = data[u'input'] if u'input' in data else {}
    new.output = data[u'output']
    new.descriptor_schema = descriptor_schema
    new.descriptor = descriptor
    new.checksum = data.get(u'checksum', '')
    # XXX: Django will change this on create
    new.created = data[u'date_created']
    # XXX: Django will change this on save
    new.modified = data[u'date_modified']

    # Fill started/finished from whichever legacy timestamps exist, falling
    # back to the epoch when neither is present.
    if u'date_start' in data and u'date_finish' in data:
        new.started = data[u'date_start']
        new.finished = data[u'date_finish']
    elif u'date_finish' in data:
        new.started = data[u'date_finish']
        new.finished = data[u'date_finish']
    elif u'date_start' in data:
        new.started = data[u'date_start']
        new.finished = data[u'date_start']
    else:
        new.started = datetime.fromtimestamp(0)
        new.finished = datetime.fromtimestamp(0)

    new.save()

    # Re-link the object into its migrated collections; remember missing ones.
    for case_id in data[u'case_ids']:
        try:
            collection = Collection.objects.get(pk=self.id_mapping[u'collection'][str(case_id)])
        except KeyError:
            self.missing_collections.add(str(case_id))
            continue
        collection.data.add(new)

    # Index JSON outputs so the storage migration can find their new owner.
    for field_schema, fields, path in iterate_fields(data[u'output'], data[u'output_schema'], ''):
        if 'type' in field_schema and field_schema['type'].startswith('basic:json:'):
            self.storage_index[fields[field_schema['name']]] = {
                'id': new.pk,
                'path': path,
            }

    self.migrate_permissions(new, data)

    self.id_mapping['data'][str(data[u'_id'])] = new.pk

    # DESCRIPTOR SCHEMA PERMISSIONS ################################
    for user in get_users_with_perms(new):
        assign_perm('view_descriptorschema', user, obj=descriptor_schema)

    for group in get_groups_with_perms(new):
        assign_perm('view_descriptorschema', group, obj=descriptor_schema)
def migrate_data(self, data):
    """Migrate one legacy data document into a :class:`Data` instance."""
    contributor = self.get_contributor(data[u'author_id'])

    # DESCRIPTOR SCHEMA ############################################
    ds_fields = []
    ds_fields.extend(data.get(u'static_schema', []))
    ds_fields.extend(data.get(u'var_template', []))
    ds_fields.sort(key=lambda d: d[u'name'])
    # The JSON dump serves as a structural fingerprint so identical schemas
    # are created only once and reused afterwards.
    ds_fields_dumped = json.dumps(ds_fields)

    if ds_fields_dumped in self.descriptor_schema_index:
        descriptor_schema = self.descriptor_schema_index[ds_fields_dumped]
    else:
        descriptor_schema = DescriptorSchema(schema=ds_fields)
        descriptor_schema.name = 'data_{}_descriptor'.format(data[u'_id'])
        descriptor_schema.contributor = contributor
        descriptor_schema.save()
        self.descriptor_schema_index[ds_fields_dumped] = descriptor_schema

    descriptor = {}
    descriptor.update(data.get(u'static', {}))
    descriptor.update(data.get(u'var', {}))

    # PROCESS ######################################################
    if u'processor_version' not in data:
        data[u'processor_version'] = '0.0.0'

    process_slug = self.process_slug(data[u'processor_name'])
    process_version = data[u'processor_version']
    try:
        process = Process.objects.get(slug=process_slug, version=process_version)
    except Process.DoesNotExist:
        latest = Process.objects.filter(
            slug=process_slug).order_by('-version').first()

        if latest:
            # The exact version no longer exists; create a stub copy of the
            # latest version whose script only reports an error when run.
            process = Process()
            process.name = latest.name
            process.slug = latest.slug
            process.category = latest.category
            process.description = latest.description
            process.contributor = latest.contributor

            process.version = process_version
            process.type = data[u'type']
            process.output_schema = data[u'output_schema']
            process.input_schema = data.get(u'input_schema', {})
            process.persistence = self.persistence_dict[
                data[u'persistence']]

            process.run[
                'script'] = 'gen-require common\ngen-error "Depricated process, use the latest version."'  # noqa pylint: disable=unsubscriptable-object

            # XXX
            # process.created =
            # process.modified =
            process.save()

            # copy permissions from latest process
            for user, perms in six.iteritems(
                    get_users_with_perms(latest, attach_perms=True)):
                for perm in perms:
                    assign_perm(perm, user, process)
            for group, perms in six.iteritems(
                    get_groups_with_perms(latest, attach_perms=True)):
                for perm in perms:
                    assign_perm(perm, group, process)
        else:
            # Create dummy processor if there is no other version
            dummy_name = 'Dummy processor of type {}'.format(data[u'type'])
            try:
                process = Process.objects.get(name=dummy_name)
            except Process.DoesNotExist:
                process = Process.objects.create(
                    name=dummy_name,
                    slug='non-existent',
                    contributor=get_user_model().objects.filter(
                        is_superuser=True).first(),
                    type=data[u'type'],
                    category='data:non-existent',
                    # NOTE(review): the inner braces make this value a
                    # one-element set, not a string — probably meant
                    # run={'script': '...'}; confirm before relying on it.
                    run={
                        'script': {
                            'gen-require common\ngen-error "This processor is not intendent to be run."'
                        }
                    },
                )

    # DATA #########################################################
    new = Data()
    new.name = data.get(u'static', {}).get(u'name', '')
    if len(new.name) > 100:
        # Remember the full name for reporting, then truncate to fit.
        self.long_names.append(new.name)
        new.name = new.name[:97] + '...'

    new.status = self.status_dict[data[u'status']]
    new.process = process
    new.contributor = contributor
    new.input = data[u'input'] if u'input' in data else {}
    new.output = data[u'output']
    new.descriptor_schema = descriptor_schema
    new.descriptor = descriptor
    new.checksum = data.get(u'checksum', '')
    # XXX: Django will change this on create
    new.created = data[u'date_created']
    # XXX: Django will change this on save
    new.modified = data[u'date_modified']

    # Fill started/finished from whichever legacy timestamps exist, falling
    # back to the epoch when neither is present.
    if u'date_start' in data and u'date_finish' in data:
        new.started = data[u'date_start']
        new.finished = data[u'date_finish']
    elif u'date_finish' in data:
        new.started = data[u'date_finish']
        new.finished = data[u'date_finish']
    elif u'date_start' in data:
        new.started = data[u'date_start']
        new.finished = data[u'date_start']
    else:
        new.started = datetime.fromtimestamp(0)
        new.finished = datetime.fromtimestamp(0)

    new.save()

    # Re-link the object into its migrated collections; remember missing ones.
    for case_id in data[u'case_ids']:
        try:
            collection = Collection.objects.get(
                pk=self.id_mapping[u'collection'][str(case_id)])
        except KeyError:
            self.missing_collections.add(str(case_id))
            continue
        collection.data.add(new)

    # Index JSON outputs so the storage migration can find their new owner.
    for field_schema, fields, path in iterate_fields(
            data[u'output'], data[u'output_schema'], ''):
        if 'type' in field_schema and field_schema['type'].startswith(
                'basic:json:'):
            self.storage_index[fields[field_schema['name']]] = {
                'id': new.pk,
                'path': path,
            }

    self.migrate_permissions(new, data)

    self.id_mapping['data'][str(data[u'_id'])] = new.pk

    # DESCRIPTOR SCHEMA PERMISSIONS ################################
    for user in get_users_with_perms(new):
        assign_perm('view_descriptorschema', user, obj=descriptor_schema)

    for group in get_groups_with_perms(new):
        assign_perm('view_descriptorschema', group, obj=descriptor_schema)
def process_data_object(data: Data):
    """Process a single data object."""
    # Lock for update. Note that we want this transaction to be as short as possible in
    # order to reduce contention and avoid deadlocks. This is why we do not lock all
    # resolving objects for update, but instead only lock one object at a time. This
    # allows managers running in parallel to process different objects.
    data = Data.objects.select_for_update().get(pk=data.pk)
    if data.status != Data.STATUS_RESOLVING:
        # The object might have already been processed while waiting for the lock to be
        # obtained. In this case, skip the object.
        return

    dep_status = dependency_status(data)

    if dep_status == Data.STATUS_ERROR:
        # Propagate input failure to this object and its worker (if any).
        data.status = Data.STATUS_ERROR
        data.process_error.append(
            "One or more inputs have status ERROR")
        data.process_rc = 1
        data.save()
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save(update_fields=["status"])
        return

    elif dep_status != Data.STATUS_DONE:
        # Inputs are not finished yet; try again on a later pass.
        return

    if data.process.run:
        try:
            execution_engine = data.process.run.get("language", None)
            # Evaluation by the execution engine may spawn additional data objects and
            # perform other queries on the database. Queries of all possible execution
            # engines need to be audited for possibilities of deadlocks in case any
            # additional locks are introduced. Currently, we only take an explicit lock on
            # the currently processing object.
            program = self.get_execution_engine(
                execution_engine).evaluate(data)
        except (ExecutionError, InvalidEngineError) as error:
            data.status = Data.STATUS_ERROR
            data.process_error.append(
                "Error in process script: {}".format(error))
            data.save()
            if hasattr(data, "worker"):
                data.worker.status = Worker.STATUS_ERROR_PREPARING
                data.worker.save(update_fields=["status"])
            return

        # Set allocated resources:
        resource_limits = data.process.get_resource_limits()
        data.process_memory = resource_limits["memory"]
        data.process_cores = resource_limits["cores"]
    else:
        # If there is no run section, then we should not try to run
        # anything. But the program must not be set to None as then
        # the process will be stuck in waiting state.
        program = ""

    if data.status != Data.STATUS_DONE:
        # The data object may already be marked as done by the execution engine. In this
        # case we must not revert the status to STATUS_WAITING.
        data.status = Data.STATUS_WAITING
        data.save(render_name=True)

    # Actually run the object only if there was nothing with the transaction.
    transaction.on_commit(
        # Make sure the closure gets the right values here, since they're
        # changed in the loop.
        lambda d=data, p=program: self._data_execute(d, p))
def _data_execute(self, data: Data, program: str):
    """Execute the Data object.

    The activities carried out here include target directory preparation,
    executor copying, setting serialization and actual execution of the
    object.

    :param data: The :class:`~resolwe.flow.models.Data` object to execute.
    :param program: The process text the manager got out of execution
        engine evaluation.
    """
    # Notify dispatcher if there is nothing to do so it can check whether
    # conditions for raising runtime barrier are fulfilled.
    if not program:
        return

    logger.debug(
        __("Manager preparing Data with id {} for processing.", data.id))

    # Prepare the executor's environment.
    try:
        # Inject executor environment variables into the program text before
        # it is written out as the run script.
        executor_env_vars = self.get_executor().get_environment_variables()
        program = self._include_environment_variables(
            program, executor_env_vars)
        data_dir = self._prepare_data_dir(data)
        executor_module, runtime_dir = self._prepare_executor(data)
        self._prepare_storage_connectors(runtime_dir)
        self._lock_inputs_local_storage_locations(data)

        # Execute execution engine specific runtime preparation.
        execution_engine = data.process.run.get("language", None)
        volume_maps = self.get_execution_engine(
            execution_engine).prepare_runtime(runtime_dir, data)

        self._prepare_context(data, data_dir, runtime_dir,
                              RUNTIME_VOLUME_MAPS=volume_maps)
        self._prepare_script(runtime_dir, program)

        # Spawn through bash so PYTHON may be an arbitrary command line
        # (default "/usr/bin/env python").
        argv = [
            "/bin/bash",
            "-c",
            getattr(settings, "FLOW_EXECUTOR", {}).get(
                "PYTHON", "/usr/bin/env python")
            + " -m executors " + executor_module,
        ]
    except PermissionDenied as error:
        # Record the failure on the data object and its worker (if any),
        # then bail out without running.
        data.status = Data.STATUS_ERROR
        data.process_error.append(
            "Permission denied for process: {}".format(error))
        data.save()
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save()
        return
    except OSError as err:
        # Preparation I/O failed; log and skip this object.
        logger.error(
            __(
                "OSError occurred while preparing data {} (will skip): {}",
                data.id,
                err,
            ))
        if hasattr(data, "worker"):
            data.worker.status = Worker.STATUS_ERROR_PREPARING
            data.worker.save()
        return

    # Hand off to the run() method for execution.
    logger.info(__("Running {}", runtime_dir))
    self.run(data, runtime_dir, argv)