def run(argv, jobinfo, job, runid):
    memstat.stats()

    # List of pending tasks
    tasklist = {}

    # Keep an extra list of completed tasks
    completed = {0: 0}

    # Start the job manager
    logging.info('Starting job manager for job %d...', runid)

    # Create the job manager from the job module
    jm = job.spits_job_manager_new(argv, jobinfo)
    jmthread = threading.Thread(target=jobmanager,
        args=(argv, job, runid, jm, tasklist, completed))
    jmthread.start()

    # Start the committer
    logging.info('Starting committer for job %d...', runid)

    # Create the committer from the job module
    co = job.spits_committer_new(argv, jobinfo)
    cothread = threading.Thread(target=committer,
        args=(argv, job, runid, co, tasklist, completed))
    cothread.start()

    # Wait for both threads
    jmthread.join()
    cothread.join()

    # Commit the job
    logging.info('Committing Job...')
    r, res, ctx = job.spits_committer_commit_job(co, 0x12345678)
    logging.debug('Job committed.')

    # Finalize the job manager
    logging.debug('Finalizing Job Manager...')
    job.spits_job_manager_finalize(jm)

    # Finalize the committer
    logging.debug('Finalizing Committer...')
    job.spits_committer_finalize(co)
    memstat.stats()

    if res is None:
        logging.error('Job did not push any result!')
        return messaging.res_module_noans, None

    if ctx != 0x12345678:
        logging.error('Context verification failed for job!')
        return messaging.res_module_ctxer, None

    logging.debug('Job %d finished successfully.', runid)
    return r, res[0]
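
# run() only assumes a small contract from the job binary: constructors
# (spits_job_manager_new / spits_committer_new), a commit_job call that echoes
# the 0x12345678 cookie back as `ctx` and returns the pushed result in res[0],
# and matching finalizers. The stub below is a minimal, hypothetical
# pure-Python stand-in for that part of the contract; the real `job` object is
# a JobBinary wrapper around a native module, and every name here except the
# spits_* entry points is illustrative only.
class _StubJobBinary(object):
    def spits_job_manager_new(self, argv, jobinfo):
        return {}                          # opaque job manager state

    def spits_committer_new(self, argv, jobinfo):
        return {'result': b'ok'}           # opaque committer state

    def spits_committer_commit_job(self, co, ctx):
        # (return code, list of pushed results, echoed context)
        return 0, [co['result']], ctx

    def spits_job_manager_finalize(self, jm):
        pass

    def spits_committer_finalize(self, co):
        pass
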
def committer(argv, job, runid, co, tasklist, completed):
    logging.info('Committer running...')
    memstat.stats()

    # Load the list of nodes to connect to
    tmlist = load_tm_list()

    # Result pulling loop
    while True:
        # Reload the list of task managers at each
        # run so new tms can be added on the fly
        try:
            newtmlist = load_tm_list()
            if len(newtmlist) > 0:
                tmlist = newtmlist
            elif len(tmlist) > 0:
                logging.warning('New list of task managers is ' +
                    'empty and will not be updated!')
        except:
            logging.error('Failed parsing task manager list!')

        for name, tm in tmlist.items():
            logging.debug('Connecting to %s:%d...', tm.address, tm.port)

            # Open the connection to the task manager and query if it is
            # possible to send data
            if not setup_endpoint_for_pulling(tm):
                continue

            logging.debug('Pulling tasks from %s:%d...', tm.address, tm.port)

            # Task pulling loop
            commit_tasks(job, runid, co, tm, tasklist, completed)
            memstat.stats()

            # Close the connection with the task manager
            tm.Close()

            logging.debug('Finished pulling tasks from %s:%d.',
                tm.address, tm.port)

        if len(tasklist) == 0 and completed[0] == 1:
            logging.info('All tasks committed.')
            logging.debug('Committer exiting...')
            return

        # Refresh the tasklist
        for taskid in completed:
            tasklist.pop(taskid, 0)

        time.sleep(jm_recv_backoff)
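
# Both the committer and the job manager only rely on load_tm_list() returning
# a dict of name -> endpoint, where each endpoint exposes .address and .port
# and can be Close()d after use. The sketch below shows that shape with a
# made-up one-"host:port"-per-line file format; the project's actual helper,
# its node file name and its endpoint class are not reproduced here, so treat
# everything except the expected return shape as an assumption.
class _SimpleEndpoint(object):
    def __init__(self, address, port):
        self.address = address
        self.port = port

    def Close(self):
        pass                               # the real endpoint closes a socket

def _load_tm_list_sketch(filename='nodes.txt'):
    tms = {}
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            address, port = line.split(':')
            tms[line] = _SimpleEndpoint(address, int(port))
    return tms
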
def main(argv):
    # Print usage
    if len(argv) <= 1:
        abort('USAGE: jm module [module args]')

    # Parse the arguments
    args = Args.Args(argv[1:])
    parse_global_config(args.args)

    # Setup logging
    setup_log()
    logging.debug('Hello!')

    # Enable memory debugging
    if jm_memstat == 1:
        memstat.enable()
    memstat.stats()

    # Load the module
    module = args.margs[0]
    job = JobBinary(module)

    # Remove JM arguments when passing to the module
    margv = args.margs

    # Keep a run identifier
    runid = [0]

    # Wrapper to include job module
    def run_wrapper(argv, jobinfo):
        runid[0] = runid[0] + 1
        return run(argv, jobinfo, job, runid[0])

    # Wrapper for the heartbeat
    finished = [False]

    def heartbeat_wrapper():
        heartbeat(finished)

    # Start the heartbeat
    threading.Thread(target=heartbeat_wrapper).start()

    # Run the module
    logging.info('Running module')
    memstat.stats()
    r = job.spits_main(margv, run_wrapper)
    memstat.stats()

    # Stop the heartbeat thread
    finished[0] = True

    # Kill the workers
    if jm_killtms:
        killtms()

    # Print final memory report
    memstat.stats()

    # Finalize
    logging.debug('Bye!')
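
# main() stops the heartbeat thread by flipping finished[0], so heartbeat() is
# expected to poll that one-element list and return once it becomes True. The
# loop below is only a hedged sketch of that shutdown handshake; whatever
# periodic work the real heartbeat performs between polls is not shown, and
# the 1-second interval is an arbitrary placeholder rather than a configured
# value.
def _heartbeat_sketch(finished):
    while not finished[0]:
        # ... periodic heartbeat work would go here ...
        time.sleep(1)
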
def jobmanager(argv, job, runid, jm, tasklist, completed):
    logging.info('Job manager running...')
    memstat.stats()

    # Load the list of nodes to connect to
    tmlist = load_tm_list()

    # Store some metadata
    submissions = []  # (taskid, submission time, [sent to])

    # Task generation loop
    taskid = 0
    task = None
    finished = False
    while True:
        # Reload the list of task managers at each
        # run so new tms can be added on the fly
        try:
            newtmlist = load_tm_list()
            if len(newtmlist) > 0:
                tmlist = newtmlist
            elif len(tmlist) > 0:
                logging.warning('New list of task managers is ' +
                    'empty and will not be updated!')
        except:
            logging.error('Failed parsing task manager list!')

        for name, tm in tmlist.items():
            logging.debug('Connecting to %s:%d...', tm.address, tm.port)

            # Open the connection to the task manager and query if it is
            # possible to send data
            if not setup_endpoint_for_pushing(tm):
                finished = False
            else:
                logging.debug('Pushing tasks to %s:%d...', tm.address, tm.port)

                # Task pushing loop
                memstat.stats()
                finished, taskid, task, sent = push_tasks(job, runid, jm, tm,
                    taskid, task, tasklist, completed[0] == 1)

                # Add the sent tasks to the submission list
                submissions = submissions + sent

                # Close the connection with the task manager
                tm.Close()

                logging.debug('Finished pushing tasks to %s:%d.',
                    tm.address, tm.port)

            if finished and completed[0] == 0:
                # Tell everyone the task generation was completed
                logging.info('All tasks generated.')
                completed[0] = 1

        # Exit the job manager when done
        if len(tasklist) == 0 and completed[0] == 1:
            logging.debug('Job manager exiting...')
            return

        # Keep sending the uncommitted tasks
        # TODO: WARNING this will flood the system
        # with repeated tasks
        if finished and len(tasklist) > 0:
            if len(submissions) == 0:
                logging.critical(
                    'The submission list is empty but '
                    'the task list is not! Some tasks were lost!')

            # Select the oldest task that is not already completed
            while True:
                taskid, task = submissions.pop(0)
                if taskid in tasklist:
                    break

            # Remove the committed tasks from the submission list
            submissions = [x for x in submissions if x[0] in tasklist]

        time.sleep(jm_send_backoff)
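
# The resubmission step above depends on two invariants: committed task ids
# are removed from `tasklist` by the committer, and `submissions` stays in
# submission order. The snippet below replays that selection logic with
# fabricated data to show the intent: the oldest entry whose task is still
# pending is picked for resending and stale entries are pruned. The
# two-element (taskid, task) entries mirror the unpacking above and exist only
# for illustration; this helper is not part of the module.
def _pick_resubmission(submissions, tasklist):
    # Oldest submission whose task is still pending
    while True:
        taskid, task = submissions.pop(0)
        if taskid in tasklist:
            break
    # Drop submissions for tasks that are no longer pending
    submissions[:] = [x for x in submissions if x[0] in tasklist]
    return taskid, task

# Example: tasks 1 and 3 were already committed, so task 2 is resent first:
# _pick_resubmission([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')],
#                    {2: 'b', 4: 'd'})  -> (2, 'b'), leaving [(4, 'd')]
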