def purge_run(self, event):
    """Run purge for the object with ``data_id`` specified in ``event`` argument."""
    data_id = event['data_id']
    verbosity = event['verbosity']

    try:
        logger.info(__("Running purge for Data id {}.", data_id))
        data_purge(data_ids=[data_id], verbosity=verbosity, delete=True)
    except Exception:  # pylint: disable=broad-except
        logger.exception("Error while purging data object.", extra={'data_id': data_id})
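# A minimal sketch of the event payload ``purge_run`` consumes; the handler only
# reads the ``data_id`` and ``verbosity`` keys. The concrete values and the
# ``listener`` instance name are hypothetical, used purely for illustration.
#
#     listener.purge_run({'data_id': 42, 'verbosity': 1})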
def create_and_run_processor(self, processor, **kwargs):
    processor_slug = get_random_string(6)
    Process.objects.create(slug=processor_slug,
                           name='Test Purge Process',
                           contributor=self.admin,
                           type='data:test',
                           version=1,
                           **processor)

    data = self.run_process(processor_slug, **kwargs)
    # Purge is normally called in an async worker, so we have to emulate the call.
    purge.data_purge(data_ids=[data.id], delete=True)

    return data
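# A hedged usage sketch for ``create_and_run_processor``: the ``processor`` dict
# is unpacked into ``Process.objects.create`` and the remaining keyword
# arguments are forwarded to ``run_process``. The schema format follows the
# tests in this section; the ``run`` section shown here is an assumption and is
# not taken from this module.
#
#     data = self.create_and_run_processor(
#         processor=dict(
#             input_schema=[{'name': 'src', 'type': 'basic:string:'}],
#             output_schema=[{'name': 'sample', 'type': 'basic:file:'}],
#             run={'language': 'bash', 'program': 'echo "test"'},  # assumed format
#         ),
#         input_={'src': 'foo'},
#     )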
def test_remove(self, manager_mock):
    user = get_user_model().objects.create(username="******")

    processor = Process.objects.create(
        name='Test process',
        contributor=user,
        output_schema=[
            {'name': 'sample', 'type': 'basic:file:'}
        ]
    )

    data = {
        'name': 'Test data',
        'contributor': user,
        'process': processor,
    }

    completed_data = Data.objects.create(**data)
    completed_data.status = Data.STATUS_DONE
    completed_data.output = {'sample': {'file': 'test-file'}}
    self.create_test_file(completed_data, 'test-file')
    self.create_test_file(completed_data, 'removeme')
    completed_data.save()

    pending_data = Data.objects.create(**data)
    self.create_test_file(pending_data, 'test-file')
    self.create_test_file(pending_data, 'donotremoveme')

    # Check that nothing is removed if delete is False (the default).
    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge()
        os_mock.remove.assert_not_called()

    # Check that only the 'removeme' file from the completed Data object is removed
    # and files from the second (not completed) Data object are unchanged.
    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge(delete=True)
        os_mock.remove.assert_called_once_with(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(completed_data.pk), 'removeme'))

    # Create another data object and check that if remove is called on one object,
    # only that object's data is removed.
    another_data = Data.objects.create(**data)
    another_data.status = Data.STATUS_DONE
    another_data.output = {'sample': {'file': 'test-file'}}
    self.create_test_file(another_data, 'test-file')
    self.create_test_file(another_data, 'removeme')
    another_data.save()

    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge(data_ids=[another_data.pk], delete=True)
        os_mock.remove.assert_called_once_with(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(another_data.pk), 'removeme'))
def handle_finish(self, obj):
    """Handle an incoming ``Data`` finished processing request.

    :param obj: The Channels message object. Command object format:

        .. code-block:: none

            {
                'command': 'finish',
                'data_id': [id of the :class:`~resolwe.flow.models.Data` object
                            this command changes],
                'process_rc': [exit status of the processing]
                'spawn_processes': [optional; list of spawn dictionaries],
                'exported_files_mapper': [if spawn_processes present]
            }
    """
    data_id = obj[ExecutorProtocol.DATA_ID]
    logger.debug(
        __("Finishing Data with id {} (handle_finish).", data_id),
        extra={
            'data_id': data_id,
            'packet': obj
        }
    )

    with transaction.atomic():
        # Spawn any new jobs in the request.
        spawned = False

        if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj:
            if is_testing():
                # NOTE: This is a work-around for Django issue #10827
                # (https://code.djangoproject.com/ticket/10827), same as in
                # TestCaseHelpers._pre_setup(). Because the listener is running
                # independently, it must clear the cache on its own.
                ContentType.objects.clear_cache()

            spawned = True
            exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES]
            logger.debug(
                __("Spawning new Data objects for Data with id {} (handle_finish).", data_id),
                extra={'data_id': data_id}
            )

            try:
                # This transaction is needed because we're running
                # asynchronously with respect to the main Django code
                # here; the manager can get nudged from elsewhere.
                with transaction.atomic():
                    parent_data = Data.objects.get(pk=data_id)

                    # Spawn processes.
                    for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]:
                        d['contributor'] = parent_data.contributor
                        d['process'] = Process.objects.filter(slug=d['process']).latest()

                        for field_schema, fields in iterate_fields(
                                d.get('input', {}), d['process'].input_schema):
                            type_ = field_schema['type']
                            name = field_schema['name']
                            value = fields[name]

                            if type_ == 'basic:file:':
                                fields[name] = self.hydrate_spawned_files(
                                    exported_files_mapper, value, data_id)
                            elif type_ == 'list:basic:file:':
                                fields[name] = [
                                    self.hydrate_spawned_files(exported_files_mapper, fn, data_id)
                                    for fn in value
                                ]

                        with transaction.atomic():
                            d = Data.objects.create(**d)
                            DataDependency.objects.create(
                                parent=parent_data,
                                child=d,
                                kind=DataDependency.KIND_SUBPROCESS,
                            )

                            # Copy permissions.
                            copy_permissions(parent_data, d)

                            # Entity is added to the collection only when it is
                            # created - when it only contains 1 Data object.
                            entities = Entity.objects.filter(data=d).annotate(
                                num_data=Count('data')).filter(num_data=1)

                            # Copy collections.
                            for collection in parent_data.collection_set.all():
                                collection.data.add(d)

                                # Add entities to which data belongs to the collection.
                                for entity in entities:
                                    entity.collections.add(collection)

            except Exception:  # pylint: disable=broad-except
                logger.error(
                    __(
                        "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}",
                        parent_data.process.slug,
                        traceback.format_exc()
                    ),
                    extra={'data_id': data_id}
                )

        # Data wrap up happens last, so that any triggered signals
        # already see the spawned children. What the children themselves
        # see is guaranteed by the transaction we're in.
        if ExecutorProtocol.FINISH_PROCESS_RC in obj:
            process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC]

            try:
                d = Data.objects.get(pk=data_id)
            except Data.DoesNotExist:
                logger.warning(
                    "Data object does not exist (handle_finish).",
                    extra={
                        'data_id': data_id,
                    }
                )
                async_to_sync(self._send_reply)(obj, {
                    ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR
                })
                return

            if process_rc == 0 and not d.status == Data.STATUS_ERROR:
                changeset = {
                    'status': Data.STATUS_DONE,
                    'process_progress': 100,
                    'finished': now()
                }
            else:
                changeset = {
                    'status': Data.STATUS_ERROR,
                    'process_progress': 100,
                    'process_rc': process_rc,
                    'finished': now()
                }

            obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset
            self.handle_update(obj, internal_call=True)

            if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
                try:
                    # Clean up after process.
                    data_purge(data_ids=[data_id], delete=True, verbosity=self._verbosity)
                except Exception:  # pylint: disable=broad-except
                    logger.error(
                        __("Purge error:\n\n{}", traceback.format_exc()),
                        extra={'data_id': data_id}
                    )

    # Notify the executor that we're done.
    async_to_sync(self._send_reply)(obj, {
        ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK
    })

    # Now nudge the main manager to perform final cleanup. This is
    # needed even if there was no spawn baggage, since the manager
    # may need to know when executors have finished, to keep count
    # of them and manage synchronization.
    async_to_sync(consumer.send_event)({
        WorkerProtocol.COMMAND: WorkerProtocol.FINISH,
        WorkerProtocol.DATA_ID: data_id,
        WorkerProtocol.FINISH_SPAWNED: spawned,
        WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
            'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
        },
    })
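# A sketch of a 'finish' command object as described by the docstring above.
# All concrete values are hypothetical, and the nesting of
# 'exported_files_mapper' (data id -> original file name -> exported name) is
# inferred from how the executor populates its mapper elsewhere in this
# section; treat it as an assumption.
#
#     {
#         'command': 'finish',
#         'data_id': 42,
#         'process_rc': 0,
#         'spawn_processes': [
#             {'process': 'child-process-slug', 'input': {'src': 'out.txt'}},
#         ],
#         'exported_files_mapper': {42: {'out.txt': 'export_3f9c2a1b'}},
#     }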
def test_remove(self, manager_mock):
    user = get_user_model().objects.create(username="******")

    processor = Process.objects.create(
        name='Test process',
        contributor=user,
        output_schema=[
            {'name': 'sample', 'type': 'basic:file:'}
        ]
    )

    data = {
        'name': 'Test data',
        'contributor': user,
        'process': processor,
    }

    completed_data = Data.objects.create(**data)
    completed_data.status = Data.STATUS_DONE
    completed_data.output = {'sample': {'file': 'test-file'}}
    self.create_test_file(completed_data, 'test-file')
    self.create_test_file(completed_data, 'removeme')
    completed_data.save()

    pending_data = Data.objects.create(**data)
    self.create_test_file(pending_data, 'test-file')
    self.create_test_file(pending_data, 'donotremoveme')

    # Check that nothing is removed if delete is False (the default).
    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge()
        os_mock.remove.assert_not_called()

    # Check that only the 'removeme' file from the completed Data object is removed
    # and files from the second (not completed) Data object are unchanged.
    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge(delete=True)
        os_mock.remove.assert_called_once_with(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(completed_data.pk), 'removeme'))

    # Create dummy data directories for non-existent data objects.
    self.create_test_file(990, 'dummy')
    self.create_test_file(991, 'dummy')

    # Check that only the 'removeme' file from the completed Data object is removed,
    # together with directories not belonging to any data objects.
    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge(delete=True)
        self.assertEqual(os_mock.remove.call_count, 3)
        os_mock.remove.assert_any_call(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(completed_data.pk), 'removeme'))
        os_mock.remove.assert_any_call(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], '990'))
        os_mock.remove.assert_any_call(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], '991'))

    # Create another data object and check that if remove is called on one object,
    # only that object's data is removed.
    another_data = Data.objects.create(**data)
    another_data.status = Data.STATUS_DONE
    another_data.output = {'sample': {'file': 'test-file'}}
    self.create_test_file(another_data, 'test-file')
    self.create_test_file(another_data, 'removeme')
    another_data.save()

    with patch('resolwe.flow.utils.purge.os', wraps=os) as os_mock:
        os_mock.path.isfile = MagicMock(return_value=True)
        os_mock.remove = MagicMock()
        purge.data_purge(data_ids=[another_data.pk], delete=True)
        os_mock.remove.assert_called_once_with(
            os.path.join(settings.FLOW_EXECUTOR['DATA_DIR'], str(another_data.pk), 'removeme'))
def run(self, data_id, script, verbosity=1):
    """Execute the script and save results."""
    if verbosity >= 1:
        print('RUN: {} {}'.format(data_id, script))

    self.data_id = data_id

    # Fetch data instance to get any executor requirements.
    self.requirements = Data.objects.get(pk=data_id).process.requirements.get(
        'executor', {}).get(self.name, {})

    data_dir = settings.FLOW_EXECUTOR['DATA_DIR']
    dir_mode = getattr(settings, 'FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)

    output_path = os.path.join(data_dir, str(data_id))

    os.mkdir(output_path)
    # os.mkdir is not guaranteed to set the given mode
    os.chmod(output_path, dir_mode)
    os.chdir(output_path)

    log_file = open('stdout.txt', 'w+')
    json_file = open('jsonout.txt', 'w+')

    proc_pid = self.start()

    self.update_data_status(
        status=Data.STATUS_PROCESSING,
        started=now(),
        process_pid=proc_pid
    )

    # Run processor and handle intermediate results
    self.run_script(script)
    spawn_processors = []
    output = {}
    process_error, process_warning, process_info = [], [], []
    process_progress, process_rc = 0, 0

    # read processor output
    try:
        stdout = self.get_stdout()
        while True:
            line = stdout.readline()
            if not line:
                break

            try:
                if line.strip().startswith('run'):
                    # Save processor and spawn if no errors
                    log_file.write(line)
                    log_file.flush()

                    for obj in iterjson(line[3:].strip()):
                        spawn_processors.append(obj)
                elif line.strip().startswith('export'):
                    file_name = line[6:].strip()

                    export_folder = settings.FLOW_EXECUTOR['UPLOAD_DIR']
                    unique_name = 'export_{}'.format(uuid.uuid4().hex)
                    export_path = os.path.join(export_folder, unique_name)

                    self.exported_files_mapper[self.data_id][file_name] = unique_name
                    shutil.move(file_name, export_path)
                else:
                    # If JSON, save to MongoDB
                    updates = {}
                    for obj in iterjson(line):
                        for key, val in six.iteritems(obj):
                            if key.startswith('proc.'):
                                if key == 'proc.error':
                                    process_error.append(val)
                                    if not process_rc:
                                        process_rc = 1
                                        updates['process_rc'] = process_rc
                                    updates['process_error'] = process_error
                                    updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.warning':
                                    process_warning.append(val)
                                    updates['process_warning'] = process_warning
                                elif key == 'proc.info':
                                    process_info.append(val)
                                    updates['process_info'] = process_info
                                elif key == 'proc.rc':
                                    process_rc = int(val)
                                    updates['process_rc'] = process_rc
                                    if process_rc != 0:
                                        updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.progress':
                                    process_progress = int(float(val) * 100)
                                    updates['process_progress'] = process_progress
                            else:
                                dict_dot(output, key, val)
                                updates['output'] = output

                    if updates:
                        updates['modified'] = now()
                        self.update_data_status(**updates)

                    if process_rc > 0:
                        log_file.close()
                        json_file.close()
                        os.chdir(CWD)
                        return

                    # Debug output
                    # Not referenced in Data object
                    json_file.write(line)
                    json_file.flush()
            except ValueError as ex:
                # Ignore if not JSON
                log_file.write(line)
                log_file.flush()
    except MemoryError as ex:
        logger.error(__("Out of memory: {}", ex))
    except IOError as ex:
        # TODO: if ex.errno == 28: no more free space
        raise ex
    finally:
        # Store results
        log_file.close()
        json_file.close()
        os.chdir(CWD)

    return_code = self.end()

    if process_rc < return_code:
        process_rc = return_code

    if spawn_processors and process_rc == 0:
        parent_data = Data.objects.get(pk=self.data_id)

        # Spawn processors
        for d in spawn_processors:
            d['contributor'] = parent_data.contributor
            d['process'] = Process.objects.filter(slug=d['process']).order_by('version').last()

            for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                type_ = field_schema['type']
                name = field_schema['name']
                value = fields[name]

                if type_ == 'basic:file:':
                    fields[name] = self.hydrate_spawned_files(value, data_id)
                elif type_ == 'list:basic:file:':
                    fields[name] = [self.hydrate_spawned_files(fn, data_id) for fn in value]

            with transaction.atomic():
                d = Data.objects.create(**d)
                for collection in parent_data.collection_set.all():
                    collection.data.add(d)

    if process_rc == 0:
        self.update_data_status(
            status=Data.STATUS_DONE,
            process_progress=100,
            finished=now()
        )
    else:
        self.update_data_status(
            status=Data.STATUS_ERROR,
            process_progress=100,
            process_rc=process_rc,
            finished=now()
        )

    try:
        # Cleanup after processor
        data_purge(data_ids=[data_id], delete=True, verbosity=verbosity)
    except:  # pylint: disable=bare-except
        logger.error(__("Purge error:\n\n{}", traceback.format_exc()))
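# Illustrative stdout lines the reader loop above understands (the contents are
# made up): a 'run' line queues a process to spawn, an 'export' line moves a
# file to the upload directory, and plain JSON lines update the Data object,
# with 'proc.*' keys handled specially and other keys merged into ``output``
# via ``dict_dot``.
#
#     run {"process": "child-process-slug", "input": {}}
#     export results.txt
#     {"proc.progress": 0.5}
#     {"stats.file": "stats.json"}
#     {"proc.rc": 0}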
def run_process(self, process_slug, input_={}, assert_status=Data.STATUS_DONE,
                descriptor=None, descriptor_schema=None, verbosity=0, tags=None):
    """Run the specified process with the given inputs.

    If input is a file, file path should be given relative to the
    ``tests/files`` directory of a Django application.
    If ``assert_status`` is given, check if
    :class:`~resolwe.flow.models.Data` object's status matches it
    after the process has finished.

    .. note::

        If you need to delay calling the manager, you must put the
        desired code in a ``with transaction.atomic()`` block.

    :param str process_slug: slug of the
        :class:`~resolwe.flow.models.Process` to run

    :param dict ``input_``: :class:`~resolwe.flow.models.Process`'s
        input parameters

        .. note::

            You don't have to specify parameters with defined
            default values.

    :param str ``assert_status``: desired status of the
        :class:`~resolwe.flow.models.Data` object

    :param dict descriptor: descriptor to set on the
        :class:`~resolwe.flow.models.Data` object

    :param dict descriptor_schema: descriptor schema to set on the
        :class:`~resolwe.flow.models.Data` object

    :param list tags: list of tags that will be added to the created
        :class:`~resolwe.flow.models.Data` object

    :return: object created by :class:`~resolwe.flow.models.Process`
    :rtype: ~resolwe.flow.models.Data
    """
    # Copy input_, to avoid mutation that would occur in ``mock_upload``
    input_ = input_.copy()

    # backward compatibility
    process_slug = slugify(process_slug.replace(':', '-'))

    # Enforce correct process tags.
    if getattr(settings, 'TEST_PROCESS_REQUIRE_TAGS', False) and not self._preparation_stage:
        test = getattr(self, self._testMethodName)
        if not has_process_tag(test, process_slug):
            self.fail(
                'Tried to run process with slug "{0}" outside of preparation_stage\n'
                'block while test is not tagged for this process. Either tag the\n'
                'test using tag_process decorator or move this under the preparation\n'
                'stage block if this process is only used to prepare upstream inputs.\n'
                '\n'
                'To tag the test you can add the following decorator:\n'
                '    @tag_process(\'{0}\')\n'
                ''.format(process_slug)
            )

    self._executed_processes.add(process_slug)

    process = Process.objects.filter(slug=process_slug).order_by('-version').first()

    if process is None:
        self.fail('No process with slug "{}"'.format(process_slug))

    def mock_upload(file_path):
        """Mock file upload."""
        def is_url(path):
            """Check if path is a URL."""
            validate = URLValidator()
            try:
                validate(path)
            except (ValueError, ValidationError):
                return False
            return True

        if is_url(file_path):
            return {
                'file': file_path,
                'file_temp': file_path,
                'is_remote': True,
            }
        else:
            old_path = os.path.join(self.files_path, file_path)
            if not os.path.isfile(old_path):
                raise RuntimeError('Missing file: {}'.format(old_path))

            file_temp = '{}_{}'.format(file_path, uuid.uuid4())
            upload_file_path = os.path.join(self.upload_dir, file_temp)
            # create directories needed by new_path
            upload_file_dir = os.path.dirname(upload_file_path)
            if not os.path.exists(upload_file_dir):
                os.makedirs(upload_file_dir)

            shutil.copy2(old_path, upload_file_path)
            self._upload_files.append(upload_file_path)
            return {
                'file': file_path,
                'file_temp': file_temp,
            }

    for field_schema, fields in iterate_fields(input_, process.input_schema):
        # copy referenced files to upload dir
        if field_schema['type'] == "basic:file:":
            fields[field_schema['name']] = mock_upload(fields[field_schema['name']])
        elif field_schema['type'] == "list:basic:file:":
            file_list = [mock_upload(file_path) for file_path in fields[field_schema['name']]]
            fields[field_schema['name']] = file_list

        # convert primary keys to strings
        if field_schema['type'].startswith('data:'):
            fields[field_schema['name']] = fields[field_schema['name']]
        if field_schema['type'].startswith('list:data:'):
            fields[field_schema['name']] = [obj for obj in fields[field_schema['name']]]

    data = Data.objects.create(
        input=input_,
        contributor=self.admin,
        process=process,
        slug=get_random_string(length=6),
        tags=tags or [],
        descriptor_schema=descriptor_schema,
        descriptor=descriptor or {})

    self.collection.data.add(data)

    # Fetch latest Data object from database
    data = Data.objects.get(pk=data.pk)
    if assert_status:
        if not transaction.get_autocommit() and assert_status == Data.STATUS_DONE:
            # We are in an atomic transaction block, hence the data object will not be done
            # until after the block. Therefore the expected status is resolving.
            assert_status = Data.STATUS_RESOLVING
        self.assertStatus(data, assert_status)

    # Purge is normally called in an async worker, so we have to emulate the call.
    purge.data_purge(data_ids=[data.id], delete=True)

    return data
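# A hedged example of calling ``run_process`` from a test case; the process
# slug and input name are hypothetical:
#
#     data = self.run_process('upload-file', {'src': 'reads.fastq.gz'})
#     self.assertEqual(data.status, Data.STATUS_DONE)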
def run(self, data_id, script, verbosity=1):
    """Execute the script and save results."""
    if verbosity >= 1:
        print('RUN: {} {}'.format(data_id, script))

    self.data_id = data_id

    data_dir = settings.FLOW_EXECUTOR['DATA_DIR']
    dir_mode = getattr(settings, 'FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)

    output_path = os.path.join(data_dir, str(data_id))

    os.mkdir(output_path)
    # os.mkdir is not guaranteed to set the given mode
    os.chmod(output_path, dir_mode)
    os.chdir(output_path)

    log_file = open('stdout.txt', 'w+')
    json_file = open('jsonout.txt', 'w+')

    proc_pid = self.start()

    self.update_data_status(
        status=Data.STATUS_PROCESSING,
        started=now(),
        process_pid=proc_pid
    )

    # Run processor and handle intermediate results
    self.run_script(script)
    spawn_processors = []
    output = {}
    process_error, process_warning, process_info = [], [], []
    process_progress, process_rc = 0, 0

    # read processor output
    try:
        stdout = self.get_stdout()
        while True:
            line = stdout.readline()
            if not line:
                break

            try:
                if line.strip().startswith('run'):
                    # Save processor and spawn if no errors
                    log_file.write(line)
                    log_file.flush()

                    for obj in iterjson(line[3:].strip()):
                        spawn_processors.append(obj)
                elif line.strip().startswith('export'):
                    file_name = line[6:].strip()

                    export_folder = settings.FLOW_EXECUTOR['UPLOAD_DIR']
                    unique_name = 'export_{}'.format(uuid.uuid4().hex)
                    export_path = os.path.join(export_folder, unique_name)

                    EXPORTED_FILES_MAPPER[file_name] = unique_name
                    shutil.move(file_name, export_path)
                else:
                    # If JSON, save to MongoDB
                    updates = {}
                    for obj in iterjson(line):
                        for key, val in six.iteritems(obj):
                            if key.startswith('proc.'):
                                if key == 'proc.error':
                                    process_error.append(val)
                                    if not process_rc:
                                        process_rc = 1
                                        updates['process_rc'] = process_rc
                                    updates['process_error'] = process_error
                                    updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.warning':
                                    process_warning.append(val)
                                    updates['process_warning'] = process_warning
                                elif key == 'proc.info':
                                    process_info.append(val)
                                    updates['process_info'] = process_info
                                elif key == 'proc.rc':
                                    process_rc = int(val)
                                    updates['process_rc'] = process_rc
                                    if process_rc != 0:
                                        updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.progress':
                                    process_progress = int(float(val) * 100)
                                    updates['process_progress'] = process_progress
                            else:
                                dict_dot(output, key, val)
                                updates['output'] = output

                    if updates:
                        updates['modified'] = now()
                        self.update_data_status(**updates)

                    if process_rc > 0:
                        log_file.close()
                        json_file.close()
                        os.chdir(CWD)
                        return

                    # Debug output
                    # Not referenced in Data object
                    json_file.write(line)
                    json_file.flush()
            except ValueError as ex:
                # Ignore if not JSON
                log_file.write(line)
                log_file.flush()
    except MemoryError as ex:
        logger.error(__("Out of memory: {}", ex))
    except IOError as ex:
        # TODO: if ex.errno == 28: no more free space
        raise ex
    finally:
        # Store results
        log_file.close()
        json_file.close()
        os.chdir(CWD)

    return_code = self.end()

    if process_rc < return_code:
        process_rc = return_code

    if spawn_processors and process_rc == 0:
        parent_data = Data.objects.get(pk=self.data_id)

        # Spawn processors
        for d in spawn_processors:
            d['contributor'] = parent_data.contributor
            d['process'] = Process.objects.filter(slug=d['process']).order_by('version').last()

            for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                type_ = field_schema['type']
                name = field_schema['name']
                value = fields[name]

                if type_ == 'basic:file:':
                    fields[name] = hydrate_spawned_files(value, data_id)
                elif type_ == 'list:basic:file:':
                    fields[name] = [hydrate_spawned_files(fn, data_id) for fn in value]

            with transaction.atomic():
                d = Data.objects.create(**d)
                for collection in parent_data.collection_set.all():
                    collection.data.add(d)

    if process_rc == 0:
        self.update_data_status(
            status=Data.STATUS_DONE,
            process_progress=100,
            finished=now()
        )
    else:
        self.update_data_status(
            status=Data.STATUS_ERROR,
            process_progress=100,
            process_rc=process_rc,
            finished=now()
        )

    try:
        # Cleanup after processor
        if data_id != 'no_data_id':
            data_purge(data_ids=[data_id], delete=True, verbosity=verbosity)
    except:  # pylint: disable=bare-except
        logger.error(__("Purge error:\n\n{}", traceback.format_exc()))
def handle(self, *args, **options):
    """Call :func:`~resolwe.flow.utils.purge.data_purge`."""
    data_purge(options['data'], options['force'])
def handle(self, *args, **options):
    """Call :func:`~resolwe.flow.utils.purge.data_purge`."""
    data_purge(options['data'], options['force'], options['verbosity'])
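# The positional call above maps onto the keyword form used elsewhere in this
# section; the parameter order (data_ids, delete, verbosity) is inferred from
# those keyword calls and should be treated as an assumption:
#
#     data_purge(data_ids=options['data'], delete=options['force'],
#                verbosity=options['verbosity'])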