def update_mongo(archive_blob_name: str, job_id: str):
    """Updates MongoDB by removing tasks and logs, and setting the job status."""

    job_oid = bson.ObjectId(job_id)
    tasks_coll = current_flamenco.db('tasks')
    logs_coll = current_flamenco.db('task_logs')
    jobs_coll = current_flamenco.db('jobs')

    log.info('Purging Flamenco tasks and task logs for job %s', job_id)

    # Task log entries don't have a job ID, so we have to fetch the task IDs first.
    task_ids = [task['_id'] for task in tasks_coll.find({'job': job_oid})]
    logs_coll.delete_many({'task_id': {'$in': task_ids}})
    tasks_coll.delete_many({'job': job_oid})

    # Update the job's archive blob name.
    res = jobs_coll.update_one({'_id': job_oid},
                               {'$set': {'archive_blob_name': archive_blob_name}})
    if res.matched_count != 1:
        raise ArchivalError(
            f"Unable to update job {job_oid} to set archive_blob_name={archive_blob_name!r}, "
            f"matched count={res.matched_count}")

    # Update the job status to 'archived'.
    res = current_flamenco.job_manager.api_set_job_status(job_oid, 'archived')
    if res.matched_count != 1:
        raise ArchivalError(
            f"Unable to update job {job_oid} to status 'archived', "
            f"matched count={res.matched_count}")
def api_set_job_priority(self, job_id: ObjectId, new_priority: int):
    """API-level call to update the job priority."""
    assert isinstance(new_priority, int)
    self._log.debug('Setting job %s priority to %r', job_id, new_priority)

    jobs_coll = current_flamenco.db('jobs')
    curr_job = jobs_coll.find_one({'_id': job_id}, projection={'priority': 1})
    old_priority = curr_job['priority']

    if old_priority == new_priority:
        self._log.debug('Job %s is already at priority %r', job_id, old_priority)
        return

    new_etag = random_etag()
    now = utcnow()
    result = jobs_coll.update_one({'_id': job_id},
                                  {'$set': {'priority': new_priority,
                                            '_updated': now,
                                            '_etag': new_etag,
                                            }})
    if result.matched_count != 1:
        self._log.warning('Matched %d jobs while setting job %s to priority %r',
                          result.matched_count, job_id, new_priority)

    tasks_coll = current_flamenco.db('tasks')
    result = tasks_coll.update_many({'job': job_id},
                                    {'$set': {'job_priority': new_priority,
                                              '_updated': now,
                                              '_etag': new_etag,
                                              }})
    self._log.debug('Matched %d tasks while setting job %s to priority %r',
                    result.matched_count, job_id, new_priority)
def download_task_and_log(storage_path: str, task_id: str):
    """Downloads task + task log and stores them."""
    import gzip
    import pymongo

    task_oid = bson.ObjectId(task_id)
    log.info('Archiving task %s to %s', task_oid, storage_path)

    tasks_coll = current_flamenco.db('tasks')
    logs_coll = current_flamenco.db('task_logs')

    task = tasks_coll.find_one({'_id': task_oid})
    logs = logs_coll.find({'task': task_oid}).sort([
        ('received_on_manager', pymongo.ASCENDING),
        ('_id', pymongo.ASCENDING),
    ])

    # Save the task as JSON.
    spath = pathlib.Path(storage_path)
    task_path = spath / f'task-{task_id}.json'
    with open(task_path, mode='w', encoding='utf8') as outfile:
        outfile.write(dumps(task, indent=4, sort_keys=True))

    # Get the task log bits and write to compressed file.
    log_path = spath / f'task-{task_id}.log.gz'
    with gzip.open(log_path, mode='wb') as outfile:
        for log_entry in logs:
            outfile.write(log_entry['log'].encode())
def api_set_job_status(self, job_id: bson.ObjectId, new_status: str,
                       *, now: datetime.datetime = None) -> pymongo.results.UpdateResult:
    """API-level call to update the job status."""
    assert new_status
    self._log.debug('Setting job %s status to "%s"', job_id, new_status)

    jobs_coll = current_flamenco.db('jobs')
    curr_job = jobs_coll.find_one({'_id': job_id}, projection={'status': 1})
    old_status = curr_job['status']

    # Go through all necessary status transitions.
    result = None  # make sure that 'result' always has a value.
    while new_status:
        result = current_flamenco.update_status('jobs', job_id, new_status, now=now)
        next_status = self.handle_job_status_change(job_id, old_status, new_status)
        old_status, new_status = new_status, next_status

    return result
def api_recreate_job(self, job_id: bson.ObjectId):
    """Deletes all tasks of a job, then recompiles the job to construct new tasks.

    The job MUST be in state 'canceled', to ensure that the manager has
    stopped task execution.

    As this functionality requires access to both the task manager and the
    job manager, this is implemented on FlamencoExtension itself.
    """
    from flamenco import job_compilers
    from flamenco.jobs import RECREATABLE_JOB_STATES

    jobs_coll = current_flamenco.db('jobs')
    job_doc = jobs_coll.find_one({'_id': job_id})
    if not job_doc:
        raise ValueError(f'Job ID {job_id} not found')

    if job_doc['status'] not in RECREATABLE_JOB_STATES:
        raise ValueError('Job recreation is only possible on jobs in state %s.'
                         % ', '.join(RECREATABLE_JOB_STATES))

    # Delete the tasks and revert the job to 'under-construction' status before recompiling it.
    self._log.info('Recreating job %s', job_id)
    self.job_manager.api_set_job_status(job_id, 'under-construction')
    self.task_manager.api_delete_tasks_for_job(job_id)
    job_compilers.compile_job(job_doc)
    self._log.info('Recreated job %s', job_id)
def schedule_checks():
    """Schedules a runnability check for all active jobs."""
    jobs_coll = current_flamenco.db('jobs')
    for job in jobs_coll.find({'status': 'active'}, projection={'_id': True}):
        log.info('Scheduling runnability check of job %s', job['_id'])
        runnability_check.delay(str(job['_id']))
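# Illustrative wiring, not from the original source: like remove_waiting_for_files()
# further below, schedule_checks() is meant to run periodically from Celery Beat.
# A minimal sketch of such a beat entry; the module path
# 'flamenco.celery.job_runnability_check' is an assumption and may differ.
CELERYBEAT_SCHEDULE_SKETCH = {
    'schedule_runnability_checks': {
        'task': 'flamenco.celery.job_runnability_check.schedule_checks',
        'schedule': 600,  # every N seconds
    },
}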
def _insert_rna_overrides_task(self, job: dict,
                               parent_task_selector: dict) -> bson.ObjectId:
    # Find the task that is supposed to be the parent of the new task.
    tasks_coll = current_flamenco.db('tasks')
    if parent_task_selector:
        parent_task = tasks_coll.find_one({'job': job['_id'], **parent_task_selector},
                                          projection={'_id': True})
        if not parent_task:
            raise ValueError('unable to find move-out-of-way task, cannot update this job')
        parents_kwargs = {'parents': [parent_task['_id']]}
    else:
        parents_kwargs = {}

    # Construct the new task.
    cmd = rna_overrides_command(job)
    task_id = self._create_task(job, [cmd], RNA_OVERRIDES_TASK_NAME,
                                'file-management', priority=80, status='queued',
                                **parents_kwargs)
    self._log.info('Inserted RNA Overrides task %s into job %s', task_id, job['_id'])

    # Update existing render tasks to have the new task as parent.
    new_etag = random_etag()
    now = utcnow()
    result = tasks_coll.update_many({
        'job': job['_id'],
        'task_type': 'blender-render',
        **parents_kwargs,
    }, {'$set': {
        '_etag': new_etag,
        '_updated': now,
        'parents': [task_id],
    }})
    self._log.debug('Updated %d task parent pointers to %s',
                    result.modified_count, task_id)
    return task_id
def runnability_check(job_id: str):
    log.info('checking job %s', job_id)
    job_oid = ObjectId(job_id)

    jobs_coll = current_flamenco.db('jobs')
    job = jobs_coll.find_one({'_id': job_oid})
    if not job:
        log.info('job %s does not exist (any more)', job_id)
        return
    if job['status'] != 'active':
        log.info('job %s is not active any more (status=%r now)', job_id, job['status'])
        return

    unrunnable_task_ids = _nonrunnable_tasks(job_oid)
    if not unrunnable_task_ids:
        log.info('job %s has no non-runnable tasks', job_id)
        return

    log.info('Non-runnable tasks in job %s, failing job: %s',
             job_id, ', '.join([str(tid) for tid in unrunnable_task_ids]))
    reason = f'{len(unrunnable_task_ids)} tasks have a failed/cancelled parent ' \
             f'and will not be able to run.'
    current_flamenco.job_manager.api_set_job_status(
        job_oid, new_status='fail-requested', reason=reason)
def archive_job(self, job: dict):
    """Initiates job archival by creating a Celery task for it."""
    from flamenco.celery import job_archival

    job_id = job['_id']
    job_status = job['status']

    if job_status in ARCHIVE_JOB_STATES:
        msg = f'Job {job_id} cannot be archived, it has status {job_status}'
        self._log.info(msg)
        raise wz_exceptions.UnprocessableEntity(msg)

    # Store current job status in a special key so that it can be restored before
    # writing to the archive ZIP file as JSON.
    jobs_coll = current_flamenco.db('jobs')
    jobs_coll.update_one({'_id': job_id},
                         {'$set': {'pre_archive_status': job_status}})

    # Immediately set job status to 'archiving', as this should be reflected ASAP in the
    # database + web interface, rather than waiting for a Celery Worker to pick it up.
    self.api_set_job_status(job_id, 'archiving')

    self._log.info('Creating Celery background task for archival of job %s', job_id)
    job_archival.archive_job.delay(str(job_id))
def patch_set_task_status(self, task_id: bson.ObjectId, patch: dict):
    """Updates a task's status in the database."""
    from flamenco import current_flamenco
    from pillar.api.utils.authentication import current_user_id

    tasks_coll = current_flamenco.db('tasks')
    task = tasks_coll.find_one({'_id': task_id},
                               projection={'job': 1, 'manager': 1, 'status': 1})

    if not current_flamenco.manager_manager.user_may_use(mngr_doc_id=task['manager']):
        log.warning('patch_set_task_status(%s, %r): User %s is not allowed to use manager %s!',
                    task_id, patch, current_user_id(), task['manager'])
        raise wz_exceptions.Forbidden()

    new_status = patch['status']
    try:
        current_flamenco.task_manager.api_set_task_status(task, new_status)
    except ValueError:
        raise wz_exceptions.UnprocessableEntity('Invalid status')
def update_rna_overrides_task(self, job: dict):
    """Update or create an RNA Overrides task of an existing job."""
    tasks_coll = current_flamenco.db('tasks')
    task = tasks_coll.find_one({'job': job['_id'], 'name': RNA_OVERRIDES_TASK_NAME},
                               projection={'_id': True})
    if not task:
        self.insert_rna_overrides_task(job)
        return

    cmd = rna_overrides_command(job)
    new_etag = random_etag()
    now = utcnow()
    result = tasks_coll.update_one(task, {'$set': {
        '_etag': new_etag,
        '_updated': now,
        'status': 'queued',
        'commands': [cmd.to_dict()],
    }})

    self._log.info('Modified %d RNA override task (%s) of job %s',
                   result.modified_count, task['_id'], job['_id'])
def api_construct_job(self,
                      job_id: ObjectId,
                      new_job_settings: typing.Optional[typing.Dict[str, typing.Any]] = None,
                      *, reason: str):
    """Construct the tasks for a job."""
    jobs_coll = current_flamenco.db('jobs')
    job = jobs_coll.find_one({'_id': job_id})
    if not job:
        raise ValueError(f'Job {job_id} does not exist')

    if new_job_settings:
        self._log.info('Updating settings for job %s: %s', job_id, new_job_settings)
        job_settings = job.setdefault('settings', {})
        job_settings.update(new_job_settings)
        result = jobs_coll.update_one({'_id': job_id},
                                      {'$set': {'settings': job_settings}})
        if result.matched_count != 1:
            raise ValueError(f'Could not find job {job_id} for updating new settings')

    self.api_set_job_status(job_id, 'under-construction', reason=reason)
    self._log.info('Generating tasks for job %s', job_id)
    try:
        job_compilers.compile_job(job)
    except Exception as ex:
        self._log.exception('Compiling job %s failed', job_id)
        current_flamenco.job_manager.api_set_job_status(
            job_id, 'construction-failed', reason=f'{reason}; compilation failed: {ex}')
def remove_waiting_for_files():
    """Deletes jobs that are stuck in 'waiting-for-files' status.

    These jobs are waiting for an external PATCH call to initiate job
    compilation, queueing, and execution. If this PATCH call doesn't come,
    the job is stuck in this status. After a certain time of waiting, this
    function will automatically delete those jobs.

    Be sure to add a schedule to the Celery Beat like this:

    'remove_waiting_for_files': {
        'task': 'flamenco.celery.job_cleanup.remove_waiting_for_files',
        'schedule': 3600,  # every N seconds
    }
    """
    age = current_app.config['FLAMENCO_WAITING_FOR_FILES_MAX_AGE']  # type: datetime.timedelta
    assert isinstance(age, datetime.timedelta), \
        f'FLAMENCO_WAITING_FOR_FILES_MAX_AGE should be a timedelta, not {age!r}'
    threshold = utcnow() - age

    log.info('Deleting jobs stuck in "waiting-for-files" status that have not been '
             'updated since %s', threshold)

    jobs_coll = current_flamenco.db('jobs')
    result = jobs_coll.delete_many({
        'status': 'waiting-for-files',
        '_updated': {'$lt': threshold},
    })

    # No need to delete the tasks, because those jobs don't have any.
    log.info('Deleted %d jobs stuck in "waiting-for-files" status', result.deleted_count)
def startup(manager_id, notification):
    from flamenco import current_flamenco
    import uuid
    import datetime

    log.info('Received startup notification from manager %s %s', manager_id, notification)

    mngr_coll = current_flamenco.db('managers')
    update_res = mngr_coll.update_one(
        {'_id': manager_id},
        {'$set': {
            '_updated': datetime.datetime.utcnow(),
            '_etag': uuid.uuid4().hex,
            'url': notification['manager_url'],
            'variables': notification['variables'],
            'path_replacement': notification['path_replacement'],
            'stats.nr_of_workers': notification['nr_of_workers'],
        }})
    if update_res.matched_count != 1:
        log.warning('Updating manager %s matched %i documents.',
                    manager_id, update_res.matched_count)
        raise wz_exceptions.InternalServerError('Unable to update manager in database.')

    return '', 204
def api_set_job_status(self, job_id: ObjectId, new_status: str,
                       *, reason='',
                       now: datetime.datetime = None) -> pymongo.results.UpdateResult:
    """API-level call to update the job status."""
    assert new_status
    self._log.debug('Setting job %s status to "%s", reason: %r', job_id, new_status, reason)

    jobs_coll = current_flamenco.db('jobs')
    curr_job = jobs_coll.find_one({'_id': job_id}, projection={'status': 1})
    old_status = curr_job['status']

    if reason:
        extra_updates = {'status_reason': reason}  # type: typing.Optional[dict]
    else:
        extra_updates = None

    # Go through all necessary status transitions.
    result = None  # make sure that 'result' always has a value.
    while new_status:
        result = current_flamenco.update_status('jobs', job_id, new_status,
                                                extra_updates=extra_updates, now=now)
        extra_updates = None  # Only pass it to the first status update change.
        next_status = self.handle_job_status_change(job_id, old_status, new_status)
        old_status, new_status = new_status, next_status

    return result
def exchange():
    """Receives a secret key from a Manager that wants to link.

    Stores the secret key, and returns the ObjectID of that document.
    """
    from flamenco import current_flamenco
    import datetime

    # See if we got a key at all.
    data = request.get_json()
    secret_key_hex = data.get('key')
    if not secret_key_hex:
        raise wz_exceptions.BadRequest('No key given')

    # Transform the key from hex to binary data.
    try:
        secret_key = binascii.a2b_hex(secret_key_hex)
    except binascii.Error:
        raise wz_exceptions.BadRequest('Malformed key')

    # Store the key in the database.
    log.info('Received secret key from manager at %s', request.remote_addr)
    mngr_key_coll = current_flamenco.db('manager_linking_keys')
    insert_res: pymongo.results.InsertOneResult = mngr_key_coll.insert_one({
        'secret_key': secret_key,
        'remove_after': datetime.datetime.now(tz=tz_util.utc) + EXPIRE_AFTER,
    })

    identifier = insert_res.inserted_id
    if not identifier:
        log.error('No inserted_id after inserting secret key!')
        raise wz_exceptions.InternalServerError('Unable to store key')

    return jsonify({'identifier': str(identifier)})
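# Illustrative client-side counterpart of exchange() above, not part of the
# original source: a Manager generates a random secret key, hex-encodes it, and
# POSTs it as JSON under the 'key' field. Only the payload shape is taken from
# the handler; the endpoint path and the use of 'requests' are assumptions.
import os

import requests  # assumed available; any HTTP client would do


def link_exchange_sketch(server_url: str):
    secret_key = os.urandom(32)
    resp = requests.post(f'{server_url}/api/flamenco/managers/link/exchange',
                         json={'key': secret_key.hex()})
    resp.raise_for_status()
    # The server responds with the ObjectID of the stored key document.
    return secret_key, resp.json()['identifier']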
def api_update_rna_overrides(self, job_id: ObjectId, rna_overrides: typing.List[str]):
    """API-level call to create or update an RNA override task of a Blender Render job."""
    new_etag = random_etag()
    now = utcnow()
    jobs_coll = current_flamenco.db('jobs')

    # Check that the job exists and is a Blender-related job.
    job = jobs_coll.find_one({'_id': job_id})
    if not job:
        self._log.warning('Unable to update RNA overrides of non-existing job %s', job_id)
        return None

    compiler = job_compilers.construct_job_compiler(job)
    if not isinstance(compiler, blender_render.AbstractBlenderJobCompiler):
        self._log.warning('Job compiler %r is not an AbstractBlenderJobCompiler, unable '
                          'to update RNA overrides for job %s of type %r',
                          type(compiler), job_id, job['job_type'])
        return None

    # Update the job itself before updating its tasks. Ideally this would happen in the
    # same transaction.
    # TODO(Sybren): put into one transaction when we upgrade to MongoDB 4+.
    job['settings']['rna_overrides'] = rna_overrides
    result = jobs_coll.update_one({'_id': job_id}, {'$set': {
        'settings.rna_overrides': rna_overrides,
        '_updated': now,
        '_etag': new_etag,
    }})
    if result.matched_count != 1:
        self._log.warning('Matched %d jobs while setting job %s RNA overrides',
                          result.matched_count, job_id)

    compiler.update_rna_overrides_task(job)
def patch_request_task_log_file(self, task_id: bson.ObjectId, patch: dict):
    """Queue a request to the Manager to upload this task's log file."""
    from flamenco import current_flamenco
    from pillar.api.utils.authentication import current_user_id

    tasks_coll = current_flamenco.db('tasks')
    task = tasks_coll.find_one({'_id': task_id},
                               projection={'job': 1, 'manager': 1, 'log_file': 1,
                                           'project': 1, 'status': 1})

    if not current_flamenco.manager_manager.user_may_use(mngr_doc_id=task['manager']):
        log.warning('request_task_log_file(%s, %r): User %s is not allowed to use manager %s!',
                    task_id, patch, current_user_id(), task['manager'])
        raise wz_exceptions.Forbidden()

    status = task['status']
    if status not in LOG_UPLOAD_REQUESTABLE_TASK_STATES:
        ok = ', '.join(LOG_UPLOAD_REQUESTABLE_TASK_STATES)
        raise wz_exceptions.BadRequest(
            f'Log file not requestable while task is in status {status}, must be in {ok}')

    # Check that the log file hasn't arrived yet (this may not be the
    # first request for this task).
    force_rerequest = patch.get('force_rerequest', False)
    if task.get('log_file') and not force_rerequest:
        url = url_for('flamenco.tasks.perproject.download_task_log_file',
                      project_url=get_project_url(task['project']),
                      task_id=task_id)
        # Using 409 Conflict because a 303 See Other (which would be more
        # appropriate) cannot be intercepted by some AJAX calls.
        return redirect(url, code=409)

    current_flamenco.manager_manager.queue_task_log_request(
        task['manager'], task['job'], task_id)
def patch_requeue(self, task_id: bson.ObjectId, patch: dict):
    """Re-queue a task and its successors."""
    from flamenco import current_flamenco
    from pillar.api.utils.authentication import current_user_id

    tasks_coll = current_flamenco.db('tasks')
    task = tasks_coll.find_one({'_id': task_id}, projection={'job': 1, 'manager': 1})

    if not current_flamenco.manager_manager.user_may_use(mngr_doc_id=task['manager']):
        log.warning('patch_requeue(%s, %r): User %s is not allowed to use manager %s!',
                    task_id, patch, current_user_id(), task['manager'])
        raise wz_exceptions.Forbidden()

    current_flamenco.task_manager.api_requeue_task_and_successors(task_id)

    # Also inspect other tasks of the same job, and possibly update the job status as well.
    current_flamenco.job_manager.update_job_after_task_status_change(
        task['job'], task_id, 'queued')
def managers_for_project(self, project_id: bson.ObjectId) -> typing.List[bson.ObjectId]:
    """Returns a list of Manager object IDs assigned to the given project."""
    assert isinstance(project_id, bson.ObjectId)

    managers_coll = current_flamenco.db('managers')
    managers = managers_coll.find({'projects': project_id}, {'_id': 1})
    return [m['_id'] for m in managers]
def _task_log_request(self, manager_id: bson.ObjectId, operation: dict):
    managers_coll = current_flamenco.db('managers')
    managers_coll.update_one({'_id': manager_id}, {
        **operation,
        '$set': {
            '_updated': utcnow(),
            '_etag': random_etag(),
        },
    })
def update_status_q(self, collection_name, query, new_status, *,
                    extra_updates: typing.Optional[dict] = None,
                    extra_unset: typing.Optional[typing.Set[str]] = None,
                    now: datetime.datetime = None):
    """Updates the status for the queried objects.

    :param extra_updates: dictionary of extra updates to set on the document(s).
    :param extra_unset: set of fields to unset.
    :param now: the _updated field is set to this timestamp; use this to set
        multiple objects to the same _updated field.
    :returns: the result of the collection.update_many() call
    :rtype: pymongo.results.UpdateResult
    """
    from flamenco import eve_settings, current_flamenco
    import uuid

    singular_name = collection_name.rstrip('s')  # jobs -> job
    schema = eve_settings.DOMAIN['flamenco_%s' % collection_name]['schema']
    valid_statuses = schema['status']['allowed']

    if new_status not in valid_statuses:
        raise ValueError('Invalid %s status %s' % (singular_name, new_status))

    # Generate random ETag since we can't compute it from the entire document.
    # This means that a subsequent PUT will change the etag even when the document doesn't
    # change; this is unavoidable without fetching the entire document.
    etag = uuid.uuid4().hex

    if now is None:
        from bson import tz_util
        now = datetime.datetime.now(tz=tz_util.utc)

    collection = current_flamenco.db(collection_name)
    update = {'$set': {
        **(extra_updates or {}),
        'status': new_status,
        '_updated': now,
        '_etag': etag,
    }}
    if extra_unset:
        update['$unset'] = {field_name: True for field_name in extra_unset}

    result = collection.update_many(query, update)

    self._log.debug('Updated status of %i %s %s to %s',
                    result.modified_count, singular_name, query, new_status)
    return result
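# Minimal usage sketch of update_status_q(), not from the original source: cancel
# all still-runnable tasks of one job in a single update_many() call, stamping
# them with the same _updated timestamp. It assumes the method is exposed on
# current_flamenco in the same way as update_status(); the status values in the
# query are made up for illustration.
def cancel_job_tasks_sketch(job_id):
    now = utcnow()
    return current_flamenco.update_status_q(
        'tasks',
        {'job': job_id, 'status': {'$in': ['queued', 'claimed-by-manager']}},
        'canceled',
        now=now)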
def download_task_and_log(storage_path: str, task_id: str):
    """Downloads task + task log and stores them."""
    import gzip
    import pymongo

    task_oid = bson.ObjectId(task_id)
    log.info('Archiving task %s to %s', task_oid, storage_path)

    tasks_coll = current_flamenco.db('tasks')
    logs_coll = current_flamenco.db('task_logs')

    task = tasks_coll.find_one({'_id': task_oid})

    # Use the exact same sort as we've created an index for.
    logs = logs_coll.find({'task': task_oid}).sort([
        ('task', pymongo.ASCENDING),
        ('received_on_manager', pymongo.ASCENDING),
    ])

    # Save the task as JSON.
    spath = pathlib.Path(storage_path)
    task_path = spath / f'task-{task_id}.json'
    with open(task_path, mode='w', encoding='utf8') as outfile:
        outfile.write(dumps(task, indent=4, sort_keys=True))

    # Get the task log bits and write to compressed file.
    log_path = spath / f'task-{task_id}.log.gz'
    with gzip.open(log_path, mode='wb') as outfile:
        for log_entry in logs:
            try:
                log_contents = log_entry['log']
            except KeyError:
                # No 'log' in this log entry. Bit weird, but we shouldn't crash on it.
                continue
            outfile.write(log_contents.encode())
def assert_job_access(self, job_id: bson.ObjectId) -> dict:
    # TODO: possibly store job and project into flask.g to reduce the nr of Mongo queries.
    job = current_flamenco.db('jobs').find_one({'_id': job_id},
                                               {'project': 1, 'status': 1})
    auth = current_flamenco.auth

    if not auth.current_user_may(auth.Actions.USE, job['project']):
        log.info('User %s wants to PATCH job %s, but has no right to use Flamenco on '
                 'project %s', current_user_id(), job_id, job['project'])
        raise wz_exceptions.Forbidden('Denied Flamenco use on this project')

    return job
def api_set_job_status(self, job_id, new_status,
                       *, now: datetime.datetime = None) -> pymongo.results.UpdateResult:
    """API-level call to update the job status."""
    self._log.info('Setting job %s status to "%s"', job_id, new_status)

    jobs_coll = current_flamenco.db('jobs')
    curr_job = jobs_coll.find_one({'_id': job_id}, projection={'status': 1})
    old_status = curr_job['status']

    result = current_flamenco.update_status('jobs', job_id, new_status, now=now)
    self.handle_job_status_change(job_id, old_status, new_status)

    return result
def handle_notification(manager_id: str, notification: dict):
    """Handle startup and update notifications.

    These notifications contain info about the Manager, including the task
    types supported by its Workers.
    """
    from flamenco import current_flamenco
    import uuid
    import datetime

    if not notification:
        raise wz_exceptions.BadRequest('no JSON payload received')

    settings_version = notification.get('_meta', {}).get('version', 1)
    updates_unset = {}
    try:
        updates_set = {
            '_updated': datetime.datetime.utcnow(),
            '_etag': uuid.uuid4().hex,
            'url': notification['manager_url'],
            'settings_version': settings_version,
            'variables': notification['variables'],
            'stats.nr_of_workers': notification['nr_of_workers'],
        }
        if settings_version <= 1:
            updates_set['path_replacement'] = notification['path_replacement']
        else:
            updates_unset['path_replacement'] = True
    except KeyError as ex:
        raise wz_exceptions.BadRequest(f'Missing key {ex}')

    try:
        updates_set['worker_task_types'] = notification['worker_task_types']
    except KeyError:
        pass

    mngr_coll = current_flamenco.db('managers')
    updates = {'$set': updates_set}
    if updates_unset:
        updates['$unset'] = updates_unset
    update_res = mngr_coll.update_one({'_id': manager_id}, updates)
    if update_res.matched_count != 1:
        log.warning('Updating manager %s matched %i documents.',
                    manager_id, update_res.matched_count)
        raise wz_exceptions.InternalServerError('Unable to update manager in database.')

    return '', 204
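# Illustrative notification payload for handle_notification() above; the field
# names come straight from the handler, the values are made up. A version-2
# payload omits 'path_replacement', which the handler then unsets on the
# Manager document.
EXAMPLE_NOTIFICATION = {
    '_meta': {'version': 2},
    'manager_url': 'https://manager.example.com:8083/',
    'variables': {'blender': '/opt/blender/blender'},
    'nr_of_workers': 4,
    'worker_task_types': ['blender-render', 'file-management'],  # optional key
}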
def attach_task_log(manager_id: ObjectId, _, task_id: str):
    """Store the POSTed task log as a file in the storage backend.

    Also updates the task itself to have a reference to the file.
    """
    # We only want to deal with GZipped files.
    if 'logfile' not in request.files:
        raise wz_exceptions.BadRequest("Missing uploaded file named 'logfile'")
    uploaded_file: werkzeug.datastructures.FileStorage = request.files['logfile']
    if not uploaded_file.filename.endswith('.gz'):
        # The test HTTP client doesn't support setting per-part headers.
        raise wz_exceptions.BadRequest('GZIP your file!')

    # De-queue now; if the task or project doesn't exist, the Manager shouldn't be asked again.
    task_oid = str2id(task_id)
    current_flamenco.manager_manager.dequeue_task_log_request(manager_id, task_oid)

    # Check whether this Manager may attach to this Task.
    tasks_coll = current_flamenco.db('tasks')
    task = tasks_coll.find_one({'_id': task_oid, 'manager': manager_id})
    if not task:
        raise wz_exceptions.NotFound('No such task exists')

    proj_coll = current_app.db('projects')
    project = proj_coll.find_one({'_id': task['project'], '_deleted': {'$ne': True}},
                                 projection={'url': True})
    if not project:
        log.warning('attach_task_log(%s, %s): project %s does not exist!',
                    manager_id, task_id, task['project'])
        raise wz_exceptions.NotFound(f'Project for task {task_oid} does not exist')

    preexisting = current_flamenco.task_manager.api_attach_log(task, uploaded_file)

    resp = jsonify({'_message': 'ok'}, status=200 if preexisting else 201)
    resp.headers['Location'] = url_for(
        'flamenco.tasks.perproject.download_task_log_file',
        project_url=project['url'], task_id=task_id)
    return resp
def tasks_cancel_requested(manager_id):
    """Returns a set of tasks of status cancel-requested."""
    from flamenco import current_flamenco, eve_settings

    tasks_coll = current_flamenco.db('tasks')

    task_ids = {
        task['_id']
        for task in tasks_coll.find({'manager': manager_id, 'status': 'cancel-requested'},
                                    projection={'_id': 1})
    }

    log.debug('Returning %i tasks to be canceled by manager %s', len(task_ids), manager_id)
    return task_ids
def resume_job_archiving():
    """Resumes archiving of jobs that are stuck in status "archiving".

    Finds all jobs in status "archiving" that have not been updated for
    FLAMENCO_RESUME_ARCHIVING_AGE and calls archive_job with each of them.
    """
    age = current_app.config['FLAMENCO_RESUME_ARCHIVING_AGE']

    jobs_coll = current_flamenco.db('jobs')
    archiving = jobs_coll.find({
        'status': 'archiving',
        '_updated': {'$lte': utcnow() - age},
    })

    log.info('Resume archiving %d jobs', archiving.count())
    for job in archiving:
        log.debug('Resume archiving job %s', job['_id'])
        archive_job.delay(str(job['_id']))
def owned_managers(self, user_group_ids: typing.List[bson.ObjectId],
                   projection: typing.Optional[dict] = None) -> pymongo.cursor.Cursor:
    """Returns a Mongo cursor of Manager object IDs owned by the given user.

    :param user_group_ids: list of the group IDs of the user.
    :param projection: When not None, it is used instead of the default {'_id': 1}.
    """
    if projection is None:
        projection = {'_id': 1}

    managers_coll = current_flamenco.db('managers')
    managers = managers_coll.find({'owner': {'$in': user_group_ids}}, projection)
    return managers
def current_user_may(self, action: Actions, project_id: bson.ObjectId) -> bool:
    """Returns True iff the user is authorised to use/view Flamenco on the given project.

    This is linked to the Managers assigned to this project. As a result, you
    cannot use Flamenco until one or more Managers is assigned.
    """
    from pillar.api.projects.utils import user_rights_in_project
    import pillar.auth
    from flamenco import current_flamenco

    # Get the actual user object to prevent multiple passes through the LocalProxy.
    user: pillar.auth.UserClass = current_user._get_current_object()
    if user.is_anonymous:
        self._log.debug('Anonymous user never has access to Flamenco.')
        return False

    cap = req_cap[action]
    if not user.has_cap(cap):
        self._log.info('User %s does not have capability %r required for action %s; '
                       'denying access to Flamenco', user.user_id, cap, action)
        return False

    # TODO Sybren: possibly split this up into a manager-fetching func + authorisation func.
    # TODO: possibly store the user rights on the current project in the current_user object?
    allowed_on_proj = user_rights_in_project(project_id)
    if not allowed_on_proj.intersection(PROJECT_METHODS_TO_USE_FLAMENCO):
        self._log.info('User %s has no %s access to project %s.',
                       user.user_id, PROJECT_METHODS_TO_USE_FLAMENCO, project_id)
        return False

    if user.has_cap('flamenco-admin'):
        self._log.debug('User is flamenco-admin, so has access to all Managers')
        return True

    managers_coll = current_flamenco.db('managers')
    managers_count = managers_coll.count_documents({'projects': project_id})

    return managers_count > 0
def current_user_may(self, action: Actions, project_id: bson.ObjectId) -> bool:
    """Returns True iff the user is authorised to use/view Flamenco on the given project.

    This is linked to the Managers assigned to this project. As a result, you
    cannot use Flamenco until one or more Managers is assigned.
    """
    from pillar.api.projects.utils import user_rights_in_project
    import pillar.auth
    from flamenco import current_flamenco

    # Get the actual user object to prevent multiple passes through the LocalProxy.
    user: pillar.auth.UserClass = current_user._get_current_object()
    if user.is_anonymous:
        self._log.debug('Anonymous user never has access to Flamenco.')
        return False

    cap = req_cap[action]
    if not user.has_cap(cap):
        self._log.info('User %s does not have capability %r required for action %s; '
                       'denying access to Flamenco', user.user_id, cap, action)
        return False

    # TODO Sybren: possibly split this up into a manager-fetching func + authorisation func.
    # TODO: possibly store the user rights on the current project in the current_user object?
    allowed_on_proj = user_rights_in_project(project_id)
    if not allowed_on_proj.intersection(PROJECT_METHODS_TO_USE_FLAMENCO):
        self._log.info('User %s has no %s access to project %s.',
                       user.user_id, PROJECT_METHODS_TO_USE_FLAMENCO, project_id)
        return False

    if user.has_cap('flamenco-admin'):
        self._log.debug('User is flamenco-admin, so has access to all Managers')
        return True

    managers_coll = current_flamenco.db('managers')
    managers = managers_coll.find({'projects': project_id})

    if self._log.isEnabledFor(logging.DEBUG):
        self._log.debug('User has access to the following managers for this project: %s',
                        [m['_id'] for m in managers])

    return managers.count() > 0
def owned_managers(self, user_group_ids: typing.List[bson.ObjectId],
                   projection: typing.Optional[dict] = None) \
        -> typing.Tuple[pymongo.cursor.Cursor, int]:
    """Returns a Mongo cursor of Manager object IDs owned by the given user.

    :param user_group_ids: list of the group IDs of the user.
    :param projection: When not None, it is used instead of the default {'_id': 1}.
    :return: tuple (cursor, manager count)
    """
    if projection is None:
        projection = {'_id': 1}

    managers_coll = current_flamenco.db('managers')
    query = {'owner': {'$in': user_group_ids}}
    manager_cursor = managers_coll.find(query, projection)
    manager_count = managers_coll.count_documents(query)
    return manager_cursor, manager_count
def _do_check_completion(self, job_id, new_status) -> str:
    """Completes the job if all tasks are completed.

    :returns: the new job status, if this status transition should be
        followed by another one.
    """
    tasks_coll = current_flamenco.db('tasks')
    total_tasks = tasks_coll.find({'job': job_id}).count()
    completed_tasks = tasks_coll.find({'job': job_id, 'status': 'completed'}).count()
    if completed_tasks < total_tasks:
        # Not yet completed, so just stay at current status.
        self._log.debug('Job %s has %d of %d tasks completed, staying at status %r',
                        job_id, completed_tasks, total_tasks, new_status)
        return ''

    self._log.info("Job %s has all %d tasks completed, transition from %r to 'completed'",
                   job_id, total_tasks, new_status)
    return 'completed'
def unused_manager_owners():
    """Lists all email addresses of unused Manager owners."""
    from flamenco import current_flamenco

    mngr_coll = current_flamenco.db('managers')
    found = mngr_coll.aggregate([
        {'$match': {'url': {'$exists': False}}},
        {'$lookup': {
            'from': 'users',
            'localField': 'owner',
            'foreignField': 'groups',
            'as': 'owners',
        }},
        {'$unwind': {'path': '$owners'}},
        {'$match': {'owners.settings.email_communications': {'$ne': 0}}},
        {'$group': {'_id': '$owners.email'}},
    ])

    emails = ', '.join(sorted(result['_id'] for result in found))
    print(emails)
def patch_edit_from_web(self, manager_id: bson.ObjectId, patch: dict):
    """Updates Manager fields from the web."""
    from pymongo.results import UpdateResult

    if not current_flamenco.manager_manager.user_is_owner(mngr_doc_id=manager_id):
        log.warning('User %s uses PATCH to edit manager %s, '
                    'but user is not owner of that Manager. Request denied.',
                    current_user_id(), manager_id)
        raise wz_exceptions.Forbidden()

    # Only take known fields from the patch, don't just copy everything.
    update = {'name': patch['name'],
              'description': patch['description']}
    self.log.info('User %s edits Manager %s: %s', current_user_id(), manager_id, update)

    validator = current_app.validator_for_resource('flamenco_managers')
    if not validator.validate_update(update, manager_id, persisted_document={}):
        resp = jsonify({
            '_errors': validator.errors,
            '_message': ', '.join(f'{field}: {error}'
                                  for field, error in validator.errors.items()),
        })
        resp.status_code = 422
        return resp

    managers_coll = current_flamenco.db('managers')
    result: UpdateResult = managers_coll.update_one({'_id': manager_id},
                                                    {'$set': update})

    if result.matched_count != 1:
        self.log.warning('User %s edits Manager %s but update matched %i items',
                         current_user_id(), manager_id, result.matched_count)
        raise wz_exceptions.BadRequest()

    return '', 204
def patch_edit_from_web(self, manager_id: bson.ObjectId, patch: dict):
    """Updates Manager fields from the web."""
    from pymongo.results import UpdateResult

    if not current_flamenco.manager_manager.user_is_owner(mngr_doc_id=manager_id):
        log.warning('User %s uses PATCH to edit manager %s, '
                    'but user is not owner of that Manager. Request denied.',
                    current_user_id(), manager_id)
        raise wz_exceptions.Forbidden()

    # Only take known fields from the patch, don't just copy everything.
    update = {'name': patch['name'],
              'description': patch['description']}
    self.log.info('User %s edits Manager %s: %s', current_user_id(), manager_id, update)

    validator = current_app.validator_for_resource('flamenco_managers')
    if not validator.validate_update(update, manager_id):
        resp = jsonify({
            '_errors': validator.errors,
            '_message': ', '.join(f'{field}: {error}'
                                  for field, error in validator.errors.items()),
        })
        resp.status_code = 422
        return resp

    managers_coll = current_flamenco.db('managers')
    result: UpdateResult = managers_coll.update_one({'_id': manager_id},
                                                    {'$set': update})

    if result.matched_count != 1:
        self.log.warning('User %s edits Manager %s but update matched %i items',
                         current_user_id(), manager_id, result.matched_count)
        raise wz_exceptions.BadRequest()

    return '', 204
def api_recreate_job(self, job_id: bson.ObjectId):
    """Delete all tasks of a job, then recompile the job.

    The job state MUST be in RECREATABLE_JOB_STATES, to ensure that the
    manager has stopped task execution.

    As this functionality requires access to both the task manager and the
    job manager, this is implemented on FlamencoExtension itself.
    """
    from flamenco import job_compilers
    from flamenco.jobs import RECREATABLE_JOB_STATES

    jobs_coll = current_flamenco.db('jobs')
    job_doc = jobs_coll.find_one({'_id': job_id})
    if not job_doc:
        raise ValueError(f'Job ID {job_id} not found')

    if job_doc['status'] not in RECREATABLE_JOB_STATES:
        raise ValueError('Job recreation is only possible on jobs in state %s.'
                         % ', '.join(RECREATABLE_JOB_STATES))

    # Delete the tasks and revert the job to 'under-construction' status before recompiling it.
    self._log.info('Recreating job %s', job_id)
    self.job_manager.api_set_job_status(
        job_id, 'under-construction',
        reason=f'Recreated by {current_user.full_name} (@{current_user.username})')
    self.task_manager.api_delete_tasks_for_job(job_id)

    try:
        job_compilers.compile_job(job_doc)
    except Exception as ex:
        self._log.exception('Recreating job %s failed', job_id)
        self.job_manager.api_set_job_status(job_id, 'construction-failed')
        raise ValueError('Job recreation failed: %s' % ex)
    else:
        self._log.info('Recreated job %s', job_id)
def reset_token():
    """Generates a new authentication token for the Manager.

    The Manager must have exchanged a secret key first, which must be linked
    to a Manager ID before this function can be called.
    """
    from flamenco import current_flamenco
    from .linking_routes import check_hmac

    data = request.get_json()
    identifier = str2id(data.get('identifier'))
    manager_id = str2id(data.get('manager_id'))
    padding = data.get('padding', '')
    mac = data.get('hmac')

    log.info('Received request to reset auth token for Manager %s', manager_id)
    mngr_key_coll = current_flamenco.db('manager_linking_keys')
    key_info = mngr_key_coll.find_one({'_id': identifier, 'manager_id': manager_id})
    if not key_info or not key_info.get('secret_key'):
        log.warning('No secret key found for identifier %s, manager %s',
                    identifier, manager_id)
        raise wz_exceptions.BadRequest('No secret key exchanged')

    check_hmac(key_info['secret_key'],
               f'{padding}-{identifier}-{manager_id}'.encode('ascii'),
               mac)

    auth_token_info = current_flamenco.manager_manager.gen_new_auth_token(manager_id)
    if not auth_token_info:
        raise wz_exceptions.NotFound()

    del_res = mngr_key_coll.delete_many({'manager_id': manager_id})
    log.info('Authentication token reset for Manager %s, all %d secret key(s) for this'
             ' manager have been removed.', manager_id, del_res.deleted_count)

    return jsonify(attr.asdict(auth_token_info))
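# Illustrative client-side MAC computation for reset_token() above, not from the
# original source. The message layout ('{padding}-{identifier}-{manager_id}') is
# taken from the check_hmac() call; HMAC-SHA256 as the digest is an assumption
# about what check_hmac() verifies.
import hashlib
import hmac


def compute_reset_mac_sketch(secret_key: bytes, padding: str,
                             identifier: str, manager_id: str) -> str:
    msg = f'{padding}-{identifier}-{manager_id}'.encode('ascii')
    return hmac.new(secret_key, msg, hashlib.sha256).hexdigest()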
def archive_job(job_id: str):
    """Archives a given job.

    - Sets job status "archiving" (if not already that status).
    - For each task, de-chunks the task logs and gz-compresses them.
    - Creates a ZIP file with the job+task definitions in JSON and compressed logs.
    - Uploads the ZIP to the project's file storage.
    - Records the link of the ZIP in the job document.
    - Deletes the tasks and task logs in MongoDB.
    - Sets the job status to "archived".
    """
    import tempfile
    import celery

    try:
        job_oid = bson.ObjectId(job_id)
    except bson.errors.InvalidId as ex:
        log.error('%s', ex)
        return

    jobs_coll = current_flamenco.db('jobs')
    job = jobs_coll.find_one({'_id': job_oid})
    if job is None:
        log.info('Job %s does not exist, not archiving', job_oid)
        return

    if job['status'] == 'archived':
        log.info('Job %s already archived, not archiving again', job_oid)
        return

    log.info('Archiving job %s', job_oid)

    # Create a temporary directory for the file operations.
    storage_path = tempfile.mkdtemp(prefix=f'job-archival-{job_id}-')
    zip_path = pathlib.Path(storage_path) / f'flamenco-job-{job_id}.zip'
    log.info('Job archival path: %s', storage_path)
    # TODO: store the ZIP link in the job JSON in MongoDB.

    # Write the job to JSON.
    pre_archive_status = job.get('pre_archive_status')
    if pre_archive_status:
        job['status'] = pre_archive_status
        del job['pre_archive_status']

    job_json_path = pathlib.Path(storage_path) / f'job-{job_id}.json'
    with job_json_path.open(mode='w', encoding='utf8') as outfile:
        outfile.write(dumps(job, indent=4, sort_keys=True))

    # Set job status to 'archiving'.
    res = current_flamenco.job_manager.api_set_job_status(job_oid, 'archiving')
    if res.matched_count != 1:
        raise ArchivalError(f'Unable to update job {job_oid}, matched count={res.matched_count}')

    # Run each task log compression in a separate Celery task.
    tasks_coll = current_flamenco.db('tasks')
    tasks = tasks_coll.find({'job': job_oid}, {'_id': 1})

    # The chain of everything except downloading tasks & logs. Celery can't handle empty
    # groups, so we have to be careful in constructing the download_tasks group.
    chain = (
        create_upload_zip.si(str(job['project']), storage_path, str(zip_path)) |
        update_mongo.s(job_id) |
        cleanup.si(storage_path)
    )

    if tasks.count():
        download_tasks = celery.group(*(
            download_task_and_log.si(storage_path, str(task['_id']))
            for task in tasks))
        chain = download_tasks | chain

    chain()
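# The group-into-chain construction in archive_job() is a general Celery pattern:
# run N independent tasks in parallel, then a linear tail once all of them have
# finished. A minimal self-contained sketch under assumed task names (download,
# finalize); broker configuration is omitted.
import celery

app = celery.Celery('archival_sketch')  # hypothetical app, broker config omitted


@app.task
def download(item):
    ...


@app.task
def finalize():
    ...


def build_workflow(items):
    tail = finalize.si()
    if not items:
        # Mirror the guard above: Celery can't handle empty groups.
        return tail
    return celery.group(download.si(item) for item in items) | tail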
def update_job_after_task_status_change(self, job_id, task_id, new_task_status):
    """Updates the job status based on the status of this task and other tasks in the job."""

    jobs_coll = current_flamenco.db('jobs')
    tasks_coll = current_flamenco.db('tasks')

    def __job_status_if_a_then_b(if_status: str, then_new_status: str):
        """Set job to active if it was queued."""
        job = jobs_coll.find_one(job_id, projection={'status': 1})
        if job['status'] == if_status:
            self._log.info('Job %s became %s because one of its tasks %s changed '
                           'status to %s', job_id, then_new_status, task_id,
                           new_task_status)
            self.api_set_job_status(job_id, then_new_status)

    if new_task_status == 'queued':
        # Re-queueing a task on a completed job should re-queue the job too.
        __job_status_if_a_then_b('completed', 'queued')
        return

    if new_task_status == 'claimed-by-manager':
        # See if there are any active tasks left. If the job was active, but a task
        # goes to 'claimed-by-manager', the task was likely active and is now re-queued.
        statuses = tasks_coll.distinct('status', {'job': job_id})
        if 'active' not in statuses:
            __job_status_if_a_then_b('active', 'queued')
        return

    if new_task_status in {'cancel-requested', 'claimed-by-manager'}:
        # A task being claimed by the manager also doesn't change job status.
        # Also, canceling a single task has no influence on the job itself.
        return

    if new_task_status == 'canceled':
        # Only trigger cancellation/failure of the job if that was actually requested.
        # A user can also cancel a single task from the Server web UI or API.
        job = jobs_coll.find_one(job_id, projection={'status': 1})
        job_status = job['status']
        if job_status in {'cancel-requested', 'fail-requested'}:
            # This could be the last cancel-requested task to go to 'canceled'.
            statuses = tasks_coll.distinct('status', {'job': job_id})
            if 'cancel-requested' not in statuses:
                self._log.info('Last task %s of job %s went from cancel-requested '
                               'to canceled', task_id, job_id)
                next_status = job_status.replace('-requested', 'ed')
                self.api_set_job_status(job_id, next_status)
        return

    if new_task_status == 'failed':
        # Count the number of failed tasks. If it is more than 10%, fail the job.
        total_count = tasks_coll.find({'job': job_id}).count()
        fail_count = tasks_coll.find({'job': job_id, 'status': 'failed'}).count()
        fail_perc = fail_count / float(total_count) * 100
        if fail_perc >= TASK_FAIL_JOB_PERCENTAGE:
            msg = f'Failing job {job_id} because {fail_count} of its {total_count} tasks ' \
                  f'({int(fail_perc)}%) failed'
            self._log.info(msg)
            self.api_set_job_status(job_id, 'failed', reason=msg)
        else:
            self._log.info('Task %s of job %s failed; '
                           'only %i of its %i tasks failed (%i%%), so ignoring for now',
                           task_id, job_id, fail_count, total_count, fail_perc)
            __job_status_if_a_then_b('queued', 'active')
        return

    if new_task_status in {'active', 'processing', 'soft-failed'}:
        job = jobs_coll.find_one(job_id, projection={'status': 1})
        if job['status'] not in {'active', 'fail-requested', 'cancel-requested'}:
            self._log.info('Job %s became active because one of its tasks %s changed '
                           'status to %s', job_id, task_id, new_task_status)
            self.api_set_job_status(job_id, 'active')
        return

    if new_task_status == 'completed':
        # Maybe all tasks are completed, which should complete the job.
        statuses = tasks_coll.distinct('status', {'job': job_id})
        if statuses == ['completed']:
            self._log.info('All tasks (last one was %s) of job %s are completed, '
                           'setting job to completed.', task_id, job_id)
            self.api_set_job_status(job_id, 'completed')
        else:
            __job_status_if_a_then_b('queued', 'active')
        return

    self._log.warning('Task %s of job %s obtained status %s, '
                      'which we do not know how to handle.',
                      task_id, job_id, new_task_status)