import asyncio

from concurrent.futures import FIRST_COMPLETED
from concurrent.futures import ThreadPoolExecutor

# Package-local helpers assumed to be in scope:
#   ExecutionEvent -- event type placed on the event queue
#   async_job      -- coroutine which runs a single job's stages
#   job_server     -- token-dispensing job server (GNU make jobserver style)
#   split          -- partition a list by a predicate


async def execute_jobs(
        verb,
        jobs,
        locks,
        event_queue,
        log_path,
        max_toplevel_jobs=None,
        continue_on_failure=False,
        continue_without_deps=False):
    """Process a number of jobs asynchronously.

    :param verb: The verb being executed, passed through to async_job.
    :param jobs: A list of topologically-sorted Jobs with no circular dependencies.
    :param locks: Lock objects passed through to async_job.
    :param event_queue: A Python queue for reporting events.
    :param log_path: The path in which logfiles can be written.
    :param max_toplevel_jobs: Maximum number of top-level jobs run concurrently.
    :param continue_on_failure: Keep running jobs even if one fails.
    :param continue_without_deps: Run jobs even if their dependencies fail.
    """
    # Map of jid -> job
    job_map = dict([(j.jid, j) for j in jobs])
    # Jobs which are not yet ready to be executed
    pending_jobs = []
    # Jobs which are ready to be executed once workers are available
    queued_jobs = []
    # List of active jobs
    active_jobs = []
    # Set of active job futures
    active_job_fs = set()
    # Dict of completed jobs: job_id -> succeeded
    completed_jobs = {}
    # List of jobs whose deps failed
    abandoned_jobs = []

    # Make sure the job server has been initialized
    if not job_server.initialized():
        raise RuntimeError('JobServer has not been initialized.')

    # Create a thread pool executor for blocking python stages in the asynchronous jobs
    threadpool = ThreadPoolExecutor(max_workers=job_server.max_jobs())

    # Immediately abandon jobs with missing dependencies
    pending_jobs, new_abandoned_jobs = split(
        jobs, lambda j: all([d in job_map for d in j.deps]))
    for abandoned_job in new_abandoned_jobs:
        abandoned_jobs.append(abandoned_job)
        event_queue.put(ExecutionEvent(
            'ABANDONED_JOB',
            job_id=abandoned_job.jid,
            reason='MISSING_DEPS',
            dep_ids=[d for d in abandoned_job.deps if d not in job_map]))

    # Initialize the lists of ready jobs and pending jobs (jobs not yet ready to be executed)
    queued_jobs, pending_jobs = split(pending_jobs, lambda j: len(j.deps) == 0)

    # Process all jobs asynchronously until there are none left
    while len(active_job_fs) + len(queued_jobs) + len(pending_jobs) > 0:

        # Activate jobs while the jobserver dispenses tokens
        while ((len(queued_jobs) > 0) and
               ((max_toplevel_jobs is None) or (len(active_jobs) < max_toplevel_jobs)) and
               (job_server.try_acquire() is not None)):

            # Pop a job off of the job queue
            job = queued_jobs.pop(0)

            # Label it (for debugging)
            job_server.add_label(job.jid)

            # Notify that the job is being started
            event_queue.put(ExecutionEvent(
                'STARTED_JOB',
                job_id=job.jid))

            # Start the job coroutine (wrapped in a task so asyncio.wait can track it)
            active_jobs.append(job)
            active_job_fs.add(asyncio.ensure_future(
                async_job(verb, job, threadpool, locks, event_queue, log_path)))

        # Report running jobs
        event_queue.put(ExecutionEvent(
            'JOB_STATUS',
            pending=[j.jid for j in pending_jobs],
            queued=[j.jid for j in queued_jobs],
            active=[j.jid for j in active_jobs],
            abandoned=[j.jid for j in abandoned_jobs],
            completed=completed_jobs))

        # Process jobs as they complete asynchronously
        done_job_fs, active_job_fs = await asyncio.wait(
            active_job_fs,
            timeout=0.10,
            return_when=FIRST_COMPLETED)

        for done_job_f in done_job_fs:
            # Capture the result once the job has finished
            job_id, succeeded = await done_job_f

            # Release a jobserver token now that this job has finished
            job_server.release(job_id)
            active_jobs = [j for j in active_jobs if j.jid != job_id]

            # Generate an event with the result of this job
            event_queue.put(ExecutionEvent(
                'FINISHED_JOB',
                job_id=job_id,
                succeeded=succeeded))

            # Add the job to the completed list
            completed_jobs[job_id] = succeeded

            # Handle failure modes
            if not succeeded:

                # Handle the different abandoning policies
                if not continue_on_failure:
                    # Abandon all queued and pending jobs if any job fails
                    new_abandoned_jobs = queued_jobs + pending_jobs
                    queued_jobs = []
                    pending_jobs = []

                    # Notify that the jobs have been abandoned
                    for abandoned_job in new_abandoned_jobs:
                        abandoned_jobs.append(abandoned_job)
                        event_queue.put(ExecutionEvent(
                            'ABANDONED_JOB',
                            job_id=abandoned_job.jid,
                            reason='PEER_FAILED',
                            peer_job_id=job_id))

                elif not continue_without_deps:
                    unhandled_abandoned_job_ids = [job_id]

                    # Abandon jobs which depend (transitively) on abandoned jobs
                    while len(unhandled_abandoned_job_ids) > 0:
                        # Get the next abandoned job id
                        abandoned_job_id = unhandled_abandoned_job_ids.pop(0)

                        # Abandon all pending jobs which depend on this job_id
                        unhandled_abandoned_jobs, pending_jobs = split(
                            pending_jobs,
                            lambda j: abandoned_job_id in j.deps)

                        # Handle each newly abandoned job
                        for abandoned_job in unhandled_abandoned_jobs:
                            abandoned_jobs.append(abandoned_job)
                            # Notify that this job has been abandoned
                            event_queue.put(ExecutionEvent(
                                'ABANDONED_JOB',
                                job_id=abandoned_job.jid,
                                reason='DEP_FAILED',
                                direct_dep_job_id=abandoned_job_id,
                                dep_job_id=job_id))

                        # Add additional job ids to check
                        unhandled_abandoned_job_ids.extend(
                            [j.jid for j in unhandled_abandoned_jobs])

        # Update the list of ready jobs (based on completed job dependencies)
        new_queued_jobs, pending_jobs = split(
            pending_jobs, lambda j: j.all_deps_completed(completed_jobs))
        queued_jobs.extend(new_queued_jobs)

        # Notify of newly queued jobs
        for queued_job in new_queued_jobs:
            event_queue.put(ExecutionEvent(
                'QUEUED_JOB',
                job_id=queued_job.jid))

        # Report running jobs
        event_queue.put(ExecutionEvent(
            'JOB_STATUS',
            pending=[j.jid for j in pending_jobs],
            queued=[j.jid for j in queued_jobs],
            active=[j.jid for j in active_jobs],
            abandoned=[j.jid for j in abandoned_jobs],
            completed=completed_jobs))

    return all(completed_jobs.values())
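

# ---------------------------------------------------------------------------
# The definitions below are minimal sketches, not the package's real
# implementations: they show one plausible shape for the helpers and the Job
# interface that execute_jobs relies on (split, Job.jid, Job.deps,
# Job.all_deps_completed). Treat the names and bodies as assumptions inferred
# from the call sites above.
# ---------------------------------------------------------------------------


def split(values, cond):
    """Split values into ([v where cond(v)], [v where not cond(v)]).

    Sketch of the partition helper used throughout execute_jobs; the ordering
    (matches first) is inferred from how its results are unpacked above.
    """
    matches, rest = [], []
    for v in values:
        (matches if cond(v) else rest).append(v)
    return matches, rest


class Job(object):
    """Sketch of the minimal Job interface assumed by execute_jobs."""

    def __init__(self, jid, deps):
        self.jid = jid    # unique job identifier
        self.deps = deps  # list of jids this job depends on

    def all_deps_completed(self, completed_jobs):
        """Return True once every dependency appears in completed_jobs.

        completed_jobs maps job_id -> succeeded; only membership is checked
        here, since the abandonment logic above already handles failed deps.
        """
        return all(dep_id in completed_jobs for dep_id in self.deps)


# Hypothetical driver for execute_jobs (illustrative only; the jobs, locks,
# and log_path values are placeholders, and the real ExecutionEvent,
# async_job, and job_server helpers must be in scope):
#
#   import queue
#   event_queue = queue.Queue()
#   jobs = [Job('a', []), Job('b', ['a'])]
#   ok = asyncio.get_event_loop().run_until_complete(
#       execute_jobs('build', jobs, {}, event_queue, '/tmp/logs'))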