class JobComm(StatefulService.StatefulService):
    """
    JobComm: This thread handles all the AthenaMP related payload communication.

    Pilot/Droid launches AthenaMP and starts listening to its messages.
    AthenaMP finishes initialization and sends "Ready for events" to
    Pilot/Droid. Pilot/Droid sends back an event range. AthenaMP sends
    NWorkers-1 more times "Ready for events" to Pilot/Droid. On each of these
    messages Pilot/Droid replies with a new event range. After that all
    AthenaMP workers are busy processing events. Once some AthenaMP worker is
    available to take the next event range, AthenaMP sends "Ready for events"
    to Pilot/Droid. Pilot/Droid sends back an event range. Once some output
    file becomes available, AthenaMP sends full path, RangeID, CPU time and
    Wall time to Pilot/Droid and does not expect any answer on this message.
    Here is an example of such message:

    "/build1/tsulaia/20.3.7.5/run-es/athenaMP-workers-AtlasG4Tf-sim/worker_1/myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1"

    If Pilot/Droid receives "Ready for events" and there are no more ranges
    to process, it answers with "No more events". This is a signal for
    AthenaMP that no more events are expected. In this case AthenaMP waits
    for all its workers to finish processing current event ranges, reports
    all outputs back to Pilot/Droid and exits.

    The event range format is json and is this:

    [{"eventRangeID": "8848710-3005316503-6391858827-3-10",
      "LFN":"EVNT.06402143._012906.pool.root.1",
      "lastEvent": 3, "startEvent": 3, "scope": "mc15_13TeV",
      "GUID": "63A015D3-789D-E74D-BAA9-9F95DB068EE9"}]
    """

    WAITING_FOR_JOB = 'WAITING_FOR_JOB'
    REQUEST_EVENT_RANGES = 'REQUEST_EVENT_RANGES'
    WAITING_FOR_EVENT_RANGES = 'WAITING_FOR_EVENT_RANGES'
    # NOTE(review): MONITORING is declared but is neither listed in STATES
    # nor used by run(); kept for backward compatibility.
    MONITORING = 'MONITORING'
    WAIT_FOR_PAYLOAD_MESSAGE = 'WAIT_FOR_PAYLOAD_MESSAGE'
    MESSAGE_RECEIVED = 'MESSAGE_RECEIVED'
    SEND_EVENT_RANGE = 'SEND_EVENT_RANGE'
    SEND_OUTPUT_FILE = 'SEND_OUTPUT_FILE'
    EXITED = 'EXITED'

    STATES = [WAITING_FOR_JOB, WAITING_FOR_EVENT_RANGES,
              REQUEST_EVENT_RANGES, WAIT_FOR_PAYLOAD_MESSAGE,
              MESSAGE_RECEIVED, SEND_EVENT_RANGE, SEND_OUTPUT_FILE, EXITED]

    def __init__(self, config, queues, droid_working_path, droid_output_path, yampl_socket_name):
        """
        config: the ConfigParser handle for yoda
        queues: A dictionary of SerialQueue.SerialQueue objects through which
                this service exchanges messages with other Droid components
                (run() reads from 'JobComm' and writes to 'MPIService').
        droid_working_path: The location of the Droid working area
        droid_output_path: The location of output files from the Payload
        yampl_socket_name: socket name used when communicating with the
                           payload via yampl
        """
        # call base class init function
        super(JobComm, self).__init__()

        # dictionary of queues for sending messages to Droid components
        self.queues = queues

        # configuration of Yoda
        self.config = config

        # working path for droid
        self.working_path = droid_working_path

        # where output files will be placed if stage_outputs is set to True
        self.staging_path = droid_output_path

        # socket name to pass to transform for use when communicating via yampl
        self.yampl_socket_name = yampl_socket_name

        # flag to set when all work is done and thread is exiting
        self.all_work_done = Event()

        # set some defaults (both may be overridden by read_config)
        self.debug_message_char_length = 100
        self.stage_outputs = False

        # set initial state
        self.set_state(self.WAITING_FOR_JOB)

    def no_more_work(self):
        """Return True once run() has flagged that all event ranges are done."""
        return self.all_work_done.is_set()

    def run(self):
        """
        this is the function run as a subthread when the user
        runs jobComm_instance.start()

        It drives the state machine declared in STATES until the exit flag
        is set, then sends any output files still waiting to MPIService and
        moves to the EXITED state.
        """
        self.read_config()

        logger.debug('start yampl payloadcommunicator')
        athpayloadcomm = athena_payloadcommunicator(self.yampl_socket_name)
        payload_msg = ''

        # current list of output files to send via MPI
        output_files = []
        last_output_file_mpi_send = time.time()

        # list of event ranges
        eventranges = EventRangeList.EventRangeList()
        no_more_events = False
        waiting_for_eventranges = False
        event_range_request_counter = 0

        # current panda job that AthenaMP is configured to run
        current_job = None

        while not self.exit.is_set():
            logger.debug('start loop: state: %s', self.get_state())

            # in debug mode, report event ranges status
            if logger.getEffectiveLevel() == logging.DEBUG:
                ready_events = eventranges.number_ready()
                number_completed = eventranges.number_completed()
                total = len(eventranges)
                logger.debug('number of ready events %s; number of completed events %s; total events %s',
                             ready_events, number_completed, total)

            # don't want to hammer Yoda with lots of little messages for output files
            # so aggregate output files for some time period then send as a group
            if len(output_files) == 0:
                last_output_file_mpi_send = time.time()
            elif (time.time() - last_output_file_mpi_send) > self.aggregate_output_files_time:
                # send output file data to Yoda/FileManager
                self._send_output_files(output_files)
                # set time for next send
                last_output_file_mpi_send = time.time()
                # start a new list; the queued message keeps the old one
                output_files = []

            ##################
            # WAITING_FOR_JOB: waiting for the job definition to arrive, before
            #        it does, it is assumed that there is no payload running
            ######################################################################
            if self.get_state() == self.WAITING_FOR_JOB:
                logger.info(' waiting for job definition, blocking on message queue for %s ', self.loop_timeout)
                try:
                    qmsg = self.queues['JobComm'].get(block=True, timeout=self.loop_timeout)
                except Queue.Empty:
                    logger.debug('no message on queue')
                else:
                    # shorten our message for printing
                    self._log_queue_message(qmsg)

                    # verify message type is as expected
                    if 'type' not in qmsg or qmsg['type'] != MessageTypes.NEW_JOB or 'job' not in qmsg:
                        logger.error('received unexpected message format: %s', qmsg)
                    else:
                        logger.info('received job definition')
                        current_job = qmsg['job']

                        # change state
                        self.set_state(self.REQUEST_EVENT_RANGES)
                    qmsg = None

            ##################
            # REQUEST_EVENT_RANGES: Request event ranges from Yoda
            ######################################################################
            elif self.get_state() == self.REQUEST_EVENT_RANGES:
                # the threshold check at the bottom of the loop may already
                # have sent the request, so only send if none is pending
                if not waiting_for_eventranges:
                    logger.info('sending request for event ranges')
                    # send MPI message to Yoda for more event ranges
                    self.request_events(current_job)
                    waiting_for_eventranges = True
                # change state
                self.set_state(self.WAITING_FOR_EVENT_RANGES)

            ##################
            # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda
            ######################################################################
            elif self.get_state() == self.WAITING_FOR_EVENT_RANGES:
                logger.info('waiting for event ranges, blocking on message queue for %s', self.loop_timeout)
                try:
                    qmsg = self.queues['JobComm'].get(block=True, timeout=self.loop_timeout)
                except Queue.Empty:
                    logger.debug('no message on queue')
                else:
                    # shorten our message for printing
                    self._log_queue_message(qmsg)

                    if 'type' not in qmsg:
                        logger.error('received unexpected message format: %s', qmsg)
                    elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES:
                        logger.info('received event ranges, adding to list')
                        eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])

                        # change state
                        self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                    elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                        logger.info('no more event ranges for PandaID %s', qmsg['PandaID'])
                        no_more_events = True

                        # check for running events
                        if len(eventranges) == eventranges.number_completed():
                            logger.info('no eventranges left to send so triggering exit')
                            self.stop()
                        else:
                            logger.info('still have events to process so continuing')
                            self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                    else:
                        logger.error('unknown message type: %s', qmsg['type'])

                    waiting_for_eventranges = False
                    qmsg = None

            ##################
            # WAIT_FOR_PAYLOAD_MESSAGE: initiates
            #          a request for a message from the payload
            ######################################################################
            # This is deliberately a fresh "if" (not "elif") so that a state
            # change made by the chain above is acted on in the same pass.
            if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE:

                # first check if there is an incoming message
                logger.debug('checking for queue message')
                try:
                    qmsg = self.queues['JobComm'].get(block=False)
                except Queue.Empty:
                    logger.debug('no messages on queue')
                else:
                    # guard against messages without a type so that a
                    # malformed message cannot kill this thread with KeyError
                    if 'type' not in qmsg:
                        logger.error('received unexpected message format: %s', qmsg)
                    # compare for equality; a containment test would be a
                    # substring match on the type string
                    elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES:
                        logger.info('received new event range')
                        eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])
                        waiting_for_eventranges = False
                    elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                        logger.info('no more event ranges for PandaID %s', qmsg['PandaID'])
                        no_more_events = True

                        # check for running events
                        if len(eventranges) == eventranges.number_completed():
                            logger.info('no eventranges left to send so triggering exit')
                            self.stop()
                        else:
                            logger.info('still have events to process so continuing')
                    else:
                        logger.error('received message of unknown type: %s', qmsg)

                logger.info('checking for message from payload, block for %s, pending event range requests: %s',
                            self.loop_timeout, event_range_request_counter)

                payload_msg = athpayloadcomm.recv(self.loop_timeout)

                if len(payload_msg) > 0:
                    logger.debug('received message: %s', payload_msg)
                    self.set_state(self.MESSAGE_RECEIVED)
                else:
                    logger.debug('did not receive message from payload')
                    # the payload asked for ranges earlier that could not be
                    # served at the time, so retry sending one now
                    if event_range_request_counter > 0:
                        logger.debug('have %s pending event range requests so will try sending one.',
                                     event_range_request_counter)
                        self.set_state(self.SEND_EVENT_RANGE)

            ##################
            # MESSAGE_RECEIVED: this state indicates that a message has been
            #          received from the payload and its meaning will be parsed
            ######################################################################
            elif self.get_state() == self.MESSAGE_RECEIVED:

                # if ready for events, send them or wait for some
                if athena_payloadcommunicator.READY_FOR_EVENTS in payload_msg:
                    logger.info('payload is ready for event range')
                    self.set_state(self.SEND_EVENT_RANGE)
                    # increment counter to keep track of how many requests are queued
                    event_range_request_counter += 1

                # OUTPUT File received
                elif len(payload_msg.split(',')) == 4:
                    # Athena sent details of an output file
                    logger.info('received output file from AthenaMP')
                    self.set_state(self.SEND_OUTPUT_FILE)

                else:
                    logger.error('failed to parse message from Athena: %s', payload_msg)
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            ##################
            # SEND_EVENT_RANGE: send the payload an event range if one is ready
            ######################################################################
            elif self.get_state() == self.SEND_EVENT_RANGE:
                logger.debug('sending event to payload')
                # if event ranges available, send one
                try:
                    logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready())
                    local_eventranges = eventranges.get_next()
                # no more event ranges available
                except EventRangeList.NoMoreEventRanges:
                    logger.debug('there are no more event ranges to process')
                    # if we have been told there are no more eventranges,
                    # then tell the AthenaMP worker there are no more events
                    if no_more_events:
                        logger.info('sending AthenaMP NO_MORE_EVENTS')
                        athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
                    # otherwise wait for more events
                    else:
                        logger.info('waiting for more events ranges')
                    # either way, return to state requesting a message
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                # something wrong with the index in the EventRangeList index
                except EventRangeList.RequestedMoreRangesThanAvailable:
                    logger.error('requested more event ranges than available, waiting for more event ranges')
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                else:
                    logger.info('sending %s eventranges to AthenaMP', len(local_eventranges))
                    # append full path to file name for AthenaMP
                    for evtrg in local_eventranges:
                        evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN'])

                    # send AthenaMP the new event ranges
                    athpayloadcomm.send(serializer.serialize(local_eventranges))
                    # decrement counter since we sent some events
                    event_range_request_counter -= 1

                    # return to state requesting a message
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                payload_msg = None

            ##################
            # SEND_OUTPUT_FILE: send output file data to MPIService
            ######################################################################
            elif self.get_state() == self.SEND_OUTPUT_FILE:
                logger.debug('send output file information')
                self._record_output_file(payload_msg, current_job, eventranges, output_files)

                # return to state requesting a message
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                payload_msg = None

            # if ready_events is below the threshold and the no more events flag has not been set
            # request more event ranges
            if (eventranges.number_ready() < self.get_more_events_threshold
                    and not no_more_events
                    and not waiting_for_eventranges
                    and current_job is not None):
                logger.info('number of ready events %s below request threshold %s, asking for more.',
                            eventranges.number_ready(), self.get_more_events_threshold)
                # send MPI message to Yoda for more event ranges
                self.request_events(current_job)
                waiting_for_eventranges = True

            # if the number of completed events equals the number of event ranges
            # available, and no more events flag is set, then exit.
            elif (eventranges.number_ready() == 0
                    and eventranges.number_completed() == len(eventranges)
                    and no_more_events):
                logger.info('no more events to process, exiting')
                self.stop()
                self.all_work_done.set()

        # send any remaining output files to Yoda before exiting.
        if len(output_files) > 0:
            self._send_output_files(output_files)
            # reset output file list
            output_files = []

        self.set_state(self.EXITED)

        logger.info('JobComm exiting')

    def read_config(self):
        """
        Read this service's settings from the config_section of self.config.

        Optional keys (a warning is logged and the current value kept when
        absent): loglevel, loop_timeout, debug_message_char_length,
        stage_outputs.
        Required keys: get_more_events_threshold, aggregate_output_files_time.

        Raises Exception if the section or a required key is missing.
        """
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' % config_section)

        section = self.config[config_section]

        # read loglevel:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning('no "loglevel" in "%s" section of config file, keeping default', config_section)

        # read loop_timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section, self.loop_timeout)
        else:
            # NOTE(review): relies on the base class having set self.loop_timeout - confirm
            logger.warning('no "loop_timeout" in "%s" section of config file, keeping default %s',
                           config_section, self.loop_timeout)

        # read get_more_events_threshold:
        if 'get_more_events_threshold' in section:
            self.get_more_events_threshold = int(section['get_more_events_threshold'])
            logger.info('%s get_more_events_threshold: %s', config_section, self.get_more_events_threshold)
        else:
            raise Exception('must specify "get_more_events_threshold" in "%s" section of config file' % config_section)

        # read aggregate_output_files_time:
        if 'aggregate_output_files_time' in section:
            self.aggregate_output_files_time = int(section['aggregate_output_files_time'])
            logger.info('%s aggregate_output_files_time: %s', config_section, self.aggregate_output_files_time)
        else:
            raise Exception('must specify "aggregate_output_files_time" in "%s" section of config file' % config_section)

        # read debug_message_char_length:
        if 'debug_message_char_length' in section:
            self.debug_message_char_length = int(section['debug_message_char_length'])
            logger.info('%s debug_message_char_length: %s', config_section, self.debug_message_char_length)
        else:
            logger.warning('no "debug_message_char_length" in "%s" section of config file, using default %s',
                           config_section, self.debug_message_char_length)

        # read stage_outputs:
        if 'stage_outputs' in section:
            self.stage_outputs = self.get_boolean(section['stage_outputs'])
            logger.info('%s stage_outputs: %s', config_section, self.stage_outputs)
        else:
            logger.warning('no "stage_outputs" in "%s" section of config file, using default %s',
                           config_section, self.stage_outputs)

    def get_boolean(self, string):
        """
        Return True if 'true' occurs anywhere in string (case-insensitive),
        otherwise False. Note this is a substring test, not an exact match.
        """
        return 'true' in string.lower()

    def request_events(self, current_job):
        """
        Put a REQUEST_EVENT_RANGES message for current_job on the MPIService
        queue, addressed to rank 0. current_job must provide the keys
        'PandaID', 'taskID' and 'jobsetID'.
        """
        msg = {
            'type': MessageTypes.REQUEST_EVENT_RANGES,
            'PandaID': current_job['PandaID'],
            'taskID': current_job['taskID'],
            'jobsetID': current_job['jobsetID'],
            'destination_rank': 0,  # YODA rank
        }
        self.queues['MPIService'].put(msg)

    def send_output_file(self, payload_msg, current_job, eventranges, output_files):
        """
        Parse an output-file message from the payload, append its description
        to output_files and mark its event range as completed in eventranges.
        """
        logger.debug('sending output file information')
        self._record_output_file(payload_msg, current_job, eventranges, output_files)

    def send_eventrange(self, eventranges, athpayloadcomm, no_more_events):
        """
        Send the next ready event ranges from eventranges to the payload.

        When none are ready and no_more_events is set, NO_MORE_EVENTS is sent
        to the payload instead.

        Raises EventRangeList.NoMoreEventRanges when nothing could be sent
        and more ranges may still arrive, so the caller can request more.
        """
        logger.debug('sending event to payload')
        # if event ranges available, send one
        try:
            logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready())
            local_eventranges = eventranges.get_next()
        # no more event ranges available
        except EventRangeList.NoMoreEventRanges:
            logger.debug('there are no more event ranges to process')
            # if we have been told there are no more eventranges,
            # then tell the AthenaMP worker there are no more events
            if no_more_events:
                logger.info('sending AthenaMP NO_MORE_EVENTS')
                athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
            else:
                # otherwise, raise the Exception to trigger an event request
                raise

        # something wrong with the index in the EventRangeList index
        except EventRangeList.RequestedMoreRangesThanAvailable:
            logger.error('requested more event ranges than available, going to try waiting for more events')
            raise EventRangeList.NoMoreEventRanges()

        else:
            logger.info('sending eventranges to AthenaMP: %s', local_eventranges)
            # append full path to file name for AthenaMP
            for evtrg in local_eventranges:
                evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN'])

            # send AthenaMP the new event ranges
            athpayloadcomm.send(serializer.serialize(local_eventranges))

    def _log_queue_message(self, qmsg):
        """Log qmsg at DEBUG level, truncated to debug_message_char_length characters."""
        if logger.getEffectiveLevel() == logging.DEBUG:
            tmpmsg = str(qmsg)
            if len(tmpmsg) > self.debug_message_char_length:
                tmpmsg = tmpmsg[:self.debug_message_char_length] + '...'
            logger.debug('received queue message: %s', tmpmsg)

    def _send_output_files(self, output_files):
        """Put one OUTPUT_FILE message carrying output_files on the MPIService queue (rank 0)."""
        logger.info('sending %s output files to Yoda/FileManager', len(output_files))
        mpi_message = {
            'type': MessageTypes.OUTPUT_FILE,
            'filelist': output_files,
            'destination_rank': 0,
        }
        self.queues['MPIService'].put(mpi_message)

    def _record_output_file(self, payload_msg, current_job, eventranges, output_files):
        """Parse a payload output-file message, queue it in output_files and mark its range completed."""
        # there should be four parts:
        # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1"
        parts = payload_msg.split(',')
        if len(parts) != 4:
            logger.error('failed to parse output file')
            return

        # parse the parts
        outputfilename = parts[0]
        eventrangeid = parts[1].replace('ID:', '')
        cpu = parts[2].replace('CPU:', '')
        wallclock = parts[3].replace('WALL:', '')

        # if staging, stage and change output filename
        if self.stage_outputs:
            # move file to staging_path
            logger.debug('shutil.move(%s,%s)', outputfilename, self.staging_path)
            shutil.move(outputfilename, self.staging_path)
            # change output filename
            outputfilename = os.path.join(self.staging_path, os.path.basename(outputfilename))
        logger.info('outputfilename - %s', outputfilename)

        # build the data for Harvester output file
        output_file_data = {
            'type': MessageTypes.OUTPUT_FILE,
            'filename': outputfilename,
            'eventrangeid': eventrangeid,
            'cpu': cpu,
            'wallclock': wallclock,
            'scope': current_job['scopeOut'],
            'pandaid': current_job['PandaID'],
            'eventstatus': 'finished',
            'destination_rank': 0,
        }

        # append output file data to list of files for transfer via MPI
        output_files.append(output_file_data)
        logger.info('received output file from AthenaMP; %s output files now on waiting list', len(output_files))

        # set event range to completed:
        logger.debug('mark event range id %s as completed', output_file_data['eventrangeid'])
        try:
            eventranges.mark_completed(output_file_data['eventrangeid'])
        except Exception:
            # an unknown range id means local bookkeeping is inconsistent,
            # so ask the service to stop
            logger.error('failed to mark eventrangeid %s as completed', output_file_data['eventrangeid'])
            self.stop()
class FileManager(Process):
    """
    FileManager: collects output file descriptions arriving on the
    'FileManager' queue and hands them to the Harvester messenger for
    stage-out.
    """

    IDLE = 'IDLE'
    STAGE_OUT = 'STAGE_OUT'
    STAGING_OUT = 'STAGING_OUT'
    EXITED = 'EXITED'

    STATES = [IDLE, STAGE_OUT, STAGING_OUT, EXITED]

    def __init__(self, config, queues, yoda_working_path, harvester_messenger):
        """
        config: the configuration of Yoda (indexed by section name)
        queues: dictionary of queues; run() reads from queues['FileManager']
        yoda_working_path: the working directory of yoda
        harvester_messenger: harvester communication module; must provide
                             stage_out_file_exists() and stage_out_files()
        """
        super(FileManager, self).__init__()

        # dictionary of queues for sending messages to Droid components
        self.queues = queues

        # configuration of Yoda
        self.config = config

        # this is used to trigger the exit of the run() loop
        self.exit = Event()

        # this is the working directory of yoda
        self.yoda_working_path = yoda_working_path

        # harvester communication module
        self.harvester_messenger = harvester_messenger

        # default harvester_output_timeout
        self.harvester_output_timeout = 10

        # default loop_timeout, used as the blocking timeout on the message
        # queue; without it read_config and run() fail with AttributeError
        # when "loop_timeout" is absent from the config.
        # NOTE(review): 30 is a chosen default - confirm it suits deployment.
        self.loop_timeout = 30

        # must be supplied by the config file; see read_config
        self.output_file_type = None

    def stop(self):
        """
        this function can be called from outside to cause the
        FileManager run() loop to exit.
        """
        self.exit.set()

    def run(self):
        """
        Main loop: read the configuration, then accumulate the file lists of
        incoming OUTPUT_FILE messages and stage them out via the Harvester
        messenger. Files still waiting when the loop exits are staged out
        before returning.
        """
        logger.debug('starting thread')

        # read configuration info from file
        self.read_config()

        local_filelist = []

        last_check = time.time()

        while not self.exit.is_set():
            logger.debug('starting loop, local_filelist size: %s', len(local_filelist))

            # process incoming messages
            try:
                qmsg = self.queues['FileManager'].get(timeout=self.loop_timeout)
            except Queue.Empty:
                logger.debug('queue is empty')
            else:
                logger.debug('message received: %s', qmsg)

                if qmsg['type'] == MessageTypes.OUTPUT_FILE:

                    local_filelist += qmsg['filelist']
                    logger.info('received output file, waiting list contains %s files', len(local_filelist))

                    # I don't want to constantly check to see if the output file exists
                    # so I'll only check every few seconds
                    # NOTE(review): this check only runs when a message
                    # arrives, so waiting files are not staged while the
                    # queue is idle - confirm this is intended.
                    if time.time() - last_check > self.harvester_output_timeout:
                        last_check = time.time()

                        # if an output file already exists,
                        # wait for harvester to read in file, so keep the
                        # files on the local running list
                        if not self.harvester_messenger.stage_out_file_exists():
                            # add file to Harvester stage out
                            logger.info('staging %s files to Harvester', len(local_filelist))
                            self.harvester_messenger.stage_out_files(
                                local_filelist,
                                self.output_file_type
                            )
                            local_filelist = []
                        else:
                            logger.warning(
                                'Harvester has not yet consumed output files, currently waiting to dump %s output files',
                                len(local_filelist))
                else:
                    logger.error('message type not recognized')

        # stage out whatever is still waiting before exiting
        if local_filelist:
            logger.info('staging %s files to Harvester', len(local_filelist))
            self.harvester_messenger.stage_out_files(local_filelist, self.output_file_type)

        # exit
        logger.info('FileManager exiting')

    def read_config(self):
        """
        Read this component's settings from the config_section of self.config.

        Optional keys (a warning is logged and the default kept when absent):
        loglevel, loop_timeout, harvester_output_timeout.
        Required key: output_file_type.

        Raises Exception if the section or output_file_type is missing.
        """
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' % config_section)

        section = self.config[config_section]

        # read log level:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning('no "loglevel" in "%s" section of config file, keeping default', config_section)

        # read droid loop timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section, self.loop_timeout)
        else:
            logger.warning('no "loop_timeout" in "%s" section of config file, keeping default %s',
                           config_section, self.loop_timeout)

        # read harvester_output_timeout:
        if 'harvester_output_timeout' in section:
            self.harvester_output_timeout = int(section['harvester_output_timeout'])
            logger.info('%s harvester_output_timeout: %s', config_section, self.harvester_output_timeout)
        else:
            logger.warning('no "harvester_output_timeout" in "%s" section of config file, keeping default %s',
                           config_section, self.harvester_output_timeout)

        # read output_file_type:
        if 'output_file_type' in section:
            self.output_file_type = section['output_file_type']
            logger.info('%s output_file_type: %s', config_section, self.output_file_type)
        else:
            # there is no usable default for this setting, so log and fail
            # with the explanatory exception below
            logger.error('no "output_file_type" in "%s" section of config file', config_section)
            raise Exception(
                'must specify "output_file_type" in %s section of config file. Typically set to "es_output"'
                % config_section)
class JobComm(StatefulService.StatefulService): # noqa: C901 """ JobComm: This thread handles all the AthenaMP related payloadcommunication Pilot/Droid launches AthenaMP and starts listening to its messages. AthenaMP finishes initialization and sends "Ready for events" to Pilot/Droid. Pilot/Droid sends back an event range. AthenaMP sends NWorkers-1 more times "Ready for events" to Pilot/Droid. On each of these messages Pilot/Droid replies with a new event range. After that all AthenaMP workers are busy processing events. Once some AthenaMP worker is available to take the next event range, AthenaMP sends "Ready for events" to Pilot/Droid. Pilot/Droid sends back an event range. Once some output file becomes available, AthenaMP sends full path, RangeID, CPU time and Wall time to Pilot/Droid and does not expect any answer on this message. Here is an example of such message: "/build1/tsulaia/20.3.7.5/run-es/athenaMP-workers-AtlasG4Tf-sim/worker_1/myHITS.pool.root_000.Range-6,ID:Range-6, CPU:1,WALL:1" If Pilot/Droid receives "Ready for events"and there are no more ranges to process, it answers with "No more events". This is a signal for AthenaMP that no more events are expected. In this case AthenaMP waits for all its workers to finish processing current event ranges, reports all outputs back to Pilot/Droid and exits. 
The event range format is json and is this: [{"eventRangeID": "8848710-3005316503-6391858827-3-10", "LFN":"EVNT.06402143._012906.pool.root.1", "lastEvent": 3, "startEvent": 3, "scope": "mc15_13TeV", "GUID": "63A015D3-789D-E74D-BAA9-9F95DB068EE9"}] """ WAITING_FOR_JOB = 'WAITING_FOR_JOB' REQUEST_EVENT_RANGES = 'REQUEST_EVENT_RANGES' WAITING_FOR_EVENT_RANGES = 'WAITING_FOR_EVENT_RANGES' MONITORING = 'MONITORING' WAIT_FOR_PAYLOAD_MESSAGE = 'WAIT_FOR_PAYLOAD_MESSAGE' MESSAGE_RECEIVED = 'MESSAGE_RECEIVED' SEND_EVENT_RANGE = 'SEND_EVENT_RANGE' SEND_OUTPUT_FILE = 'SEND_OUTPUT_FILE' EXITED = 'EXITED' STATES = [ WAITING_FOR_JOB, WAITING_FOR_EVENT_RANGES, REQUEST_EVENT_RANGES, WAIT_FOR_PAYLOAD_MESSAGE, MESSAGE_RECEIVED, SEND_EVENT_RANGE, SEND_OUTPUT_FILE, EXITED ] def __init__(self, config, queues, droid_working_path, droid_output_path, yampl_socket_name): """ queues: A dictionary of SerialQueue.SerialQueue objects where the JobManager can send messages to other Droid components about errors, etc. 
config: the ConfigParser handle for yoda droid_working_path: The location of the Droid working area droid_output_path: The location of output files from the Payload """ # call base class init function super(JobComm, self).__init__() # dictionary of queues for sending messages to Droid components self.queues = queues # configuration of Yoda self.config = config # working path for droid self.working_path = droid_working_path # where output files will be placed if stage_outputs is set to True self.staging_path = droid_output_path # socket name to pass to transform for use when communicating via yampl self.yampl_socket_name = yampl_socket_name # flag to set when all work is done and thread is exiting self.all_work_done = Event() # set some defaults self.debug_message_char_length = 100 self.stage_outputs = False # set initial state self.set_state(self.WAITING_FOR_JOB) # to be set self.loglevel = None def no_more_work(self): return self.all_work_done.is_set() def run(self): # noqa: C901 """ this is the function run as a subthread when the user runs jobComm_instance.start() """ self.read_config() logger.debug('start yampl payloadcommunicator') athpayloadcomm = AthenaPayloadCommunicator(self.yampl_socket_name) payload_msg = '' # current list of output files to send via MPI output_files = [] last_output_file_mpi_send = time.time() # list of event ranges eventranges = EventRangeList.EventRangeList() no_more_events = False waiting_for_eventranges = False event_range_request_counter = 0 # current panda job that AthenaMP is configured to run current_job = None while not self.exit.is_set(): logger.debug('start loop: state: %s', self.get_state()) # in debug mode, report evenranges status if logger.getEffectiveLevel() == logging.DEBUG: ready_events = eventranges.number_ready() number_completed = eventranges.number_completed() total = len(eventranges) logger.debug( 'number of ready events %s; number of completed events %s; total events %s', ready_events, number_completed, total) # 
don't want to hammer Yoda with lots of little messages for output files # so aggregate output files for some time period then send as a group if len(output_files) == 0: last_output_file_mpi_send = time.time() elif (time.time() - last_output_file_mpi_send ) > self.aggregate_output_files_time: # send output file data to Yoda/FileManager logger.info('sending %s output files to Yoda/FileManager', len(output_files)) mpi_message = { 'type': MessageTypes.OUTPUT_FILE, 'filelist': output_files, 'destination_rank': 0 } self.queues['MPIService'].put(mpi_message) # set time for next send last_output_file_mpi_send = time.time() # reset output file list output_files = [] ################## # WAITING_FOR_JOB: waiting for the job definition to arrive, before # it does, it is assumed that there is no payload running ###################################################################### if self.get_state() == self.WAITING_FOR_JOB: logger.info( ' waiting for job definition, blocking on message queue for %s ', self.loop_timeout) try: qmsg = self.queues['JobComm'].get( block=True, timeout=self.loop_timeout) except Queue.Empty: logger.debug('no message on queue') else: # shorten our message for printing if logger.getEffectiveLevel() == logging.DEBUG: tmpmsg = str(qmsg) if len(tmpmsg) > self.debug_message_char_length: tmpslice = slice(0, self.debug_message_char_length) tmpmsg = tmpmsg[tmpslice] + '...' 
logger.debug('received queue message: %s', tmpmsg) # verify message type is as expected if 'type' not in qmsg or qmsg[ 'type'] != MessageTypes.NEW_JOB or 'job' not in qmsg: logger.error('received unexpected message format: %s', qmsg) else: logger.info('received job definition') current_job = qmsg['job'] # change state self.set_state(self.REQUEST_EVENT_RANGES) qmsg = None ################## # REQUEST_EVENT_RANGES: Request event ranges from Yoda ###################################################################### elif self.get_state() == self.REQUEST_EVENT_RANGES: if not waiting_for_eventranges: logger.info('sending request for event ranges') # send MPI message to Yoda for more event ranges self.request_events(current_job) waiting_for_eventranges = True # change state self.set_state(self.WAITING_FOR_EVENT_RANGES) ################## # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda ###################################################################### elif self.get_state() == self.WAITING_FOR_EVENT_RANGES: logger.info( 'waiting for event ranges, blocking on message queue for %s', self.loop_timeout) try: qmsg = self.queues['JobComm'].get( block=True, timeout=self.loop_timeout) except Queue.Empty: logger.debug('no message on queue') else: # shorten our message for printing if logger.getEffectiveLevel() == logging.DEBUG: tmpmsg = str(qmsg) if len(tmpmsg) > self.debug_message_char_length: tmpslice = slice(0, self.debug_message_char_length) tmpmsg = tmpmsg[tmpslice] + '...' 
logger.debug('received queue message: %s', tmpmsg) if 'type' not in qmsg: logger.error('received unexpected message format: %s', qmsg) elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES: logger.info('received event ranges, adding to list') eventranges += EventRangeList.EventRangeList( qmsg['eventranges']) # add event ranges to payload messenger list # payloadcomm.add_eventranges(eventranges) # change state self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES: logger.info('no more event ranges for PandaID %s', qmsg['PandaID']) no_more_events = True # check for running events if len(eventranges) == eventranges.number_completed(): logger.info( 'no eventranges left to send so triggering exit' ) self.stop() else: logger.info( 'still have events to process so continuing') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.error('unknown message type: %s', qmsg['type']) waiting_for_eventranges = False qmsg = None ################## # WAIT_FOR_PAYLOAD_MESSAGE: initiates # a request for a message from the payload ###################################################################### if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE: # first check if there is an incoming message try: logger.debug('checking for queue message') qmsg = self.queues['JobComm'].get(block=False) if MessageTypes.NEW_EVENT_RANGES in qmsg['type']: logger.info('received new event range') eventranges += EventRangeList.EventRangeList( qmsg['eventranges']) waiting_for_eventranges = False elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES: logger.info('no more event ranges for PandaID %s', qmsg['PandaID']) no_more_events = True # check for running events if len(eventranges) == eventranges.number_completed(): logger.info( 'no eventranges left to send so triggering exit' ) self.stop() else: logger.info( 'still have events to process so continuing') else: logger.error('received message of unknown type: %s', qmsg) except Queue.Empty: 
logger.debug('no messages on queue') logger.info( 'checking for message from payload, block for %s, pending event range requests: %s', self.loop_timeout, event_range_request_counter) payload_msg = athpayloadcomm.recv(self.loop_timeout) if len(payload_msg) > 0: logger.debug('received message: %s', payload_msg) self.set_state(self.MESSAGE_RECEIVED) else: logger.debug('did not receive message from payload') if event_range_request_counter > 0: logger.debug( 'have %s pending event range requests so will try sending one.', event_range_request_counter) self.set_state(self.SEND_EVENT_RANGE) # time.sleep(self.loop_timeout) ################## # MESSAGE_RECEIVED: this state indicates that a message has been # received from the payload and its meaning will be parsed ###################################################################### elif self.get_state() == self.MESSAGE_RECEIVED: # if ready for events, send them or wait for some if AthenaPayloadCommunicator.READY_FOR_EVENTS in payload_msg: logger.info('payload is ready for event range') self.set_state(self.SEND_EVENT_RANGE) # increment counter to keep track of how many requests are queued event_range_request_counter += 1 #### OUTPUT File received elif len(payload_msg.split(',')) == 4: # Athena sent details of an output file logger.info('received output file from AthenaMP') self.set_state(self.SEND_OUTPUT_FILE) else: logger.error('failed to parse message from Athena: %s', payload_msg) self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) ################## # SEND_EVENT_RANGE: wait until more event ranges are sent by JobComm ###################################################################### elif self.get_state() == self.SEND_EVENT_RANGE: logger.debug('sending event to payload') # if event ranges available, send one try: logger.debug( 'have %d ready event ranges to send to AthenaMP', eventranges.number_ready()) local_eventranges = eventranges.get_next() # no more event ranges available except EventRangeList.NoMoreEventRanges: 
logger.debug('there are no more event ranges to process') # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events if no_more_events: logger.info('sending AthenaMP NO_MORE_EVENTS') athpayloadcomm.send( AthenaPayloadCommunicator.NO_MORE_EVENTS) # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) # otherwise wait for more events else: logger.info('waiting for more events ranges') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) # something wrong with the index in the EventRangeList index except EventRangeList.RequestedMoreRangesThanAvailable: logger.error( 'requested more event ranges than available, waiting for more event ranges' ) self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.info('sending %s eventranges to AthenaMP', len(local_eventranges)) # append full path to file name for AthenaMP # and adjust event counter by the number of files # input_files = self.job_def.get()['inFiles'].split(',') # logger.debug('%s: found %s input files',self.prelog,len(input_files)) for evtrg in local_eventranges: evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN']) # send AthenaMP the new event ranges athpayloadcomm.send( serializer.serialize(local_eventranges)) # decrement counter since we sent some events event_range_request_counter -= 1 # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) payload_msg = None ################## # SEND_OUTPUT_FILE: send output file data to MPIService ###################################################################### elif self.get_state() == self.SEND_OUTPUT_FILE: logger.debug('send output file information') # parse message parts = payload_msg.split(',') # there should be four parts: # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1" if len(parts) == 4: # parse the parts outputfilename = parts[0] eventrangeid = parts[1].replace('ID:', '') cpu = parts[2].replace('CPU:', '') wallclock = parts[3].replace('WALL:', '') # 
if staging, stage and change output filename if self.stage_outputs: # move file to staging_path logger.debug('shutil.move(%s,%s)', outputfilename, self.staging_path) shutil.move(outputfilename, self.staging_path) # change output filename outputfilename = os.path.join( self.staging_path, os.path.basename(outputfilename)) logger.info('outputfilename - %s', outputfilename) # build the data for Harvester output file output_file_data = { 'type': MessageTypes.OUTPUT_FILE, 'filename': outputfilename, 'eventrangeid': eventrangeid, 'cpu': cpu, 'wallclock': wallclock, 'scope': current_job['scopeOut'], 'pandaid': current_job['PandaID'], 'eventstatus': 'finished', 'destination_rank': 0, } # self.output_file_data.set(output_file_data) # append output file data to list of files for transfer via MPI output_files.append(output_file_data) logger.info( 'received output file from AthenaMP; %s output files now on waiting list', len(output_files)) # set event range to completed: logger.debug('mark event range id %s as completed', output_file_data['eventrangeid']) try: eventranges.mark_completed( output_file_data['eventrangeid']) except Exception: logger.error( 'failed to mark eventrangeid %s as completed', output_file_data['eventrangeid']) self.stop() # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.error('failed to parse output file') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) payload_msg = None # if ready_events is below the threshold and the no more events flag has not been set # request more event ranges if eventranges.number_ready( ) < self.get_more_events_threshold and not no_more_events and not waiting_for_eventranges and current_job is not None: logger.info( 'number of ready events %s below request threshold %s, asking for more.', eventranges.number_ready(), self.get_more_events_threshold) # send MPI message to Yoda for more event ranges self.request_events(current_job) waiting_for_eventranges = True # if the number of 
completed events equals the number of event ranges # available, and no more events flag is set, then kill subprocess and exit. elif eventranges.number_ready( ) == 0 and eventranges.number_completed() == len( eventranges) and no_more_events: logger.info('no more events to process, exiting') self.stop() self.all_work_done.set() # else: # logger.info('sleeping for %s',self.loop_timeout) # self.exit.wait(timeout=self.loop_timeout) # send any remaining output files to Yoda before exitingn. # don't want to hammer Yoda with lots of little messages for output files # so aggregate output files for some time period then send as a group if len(output_files) > 0: # send output file data to Yoda/FileManager logger.info('sending %s output files to Yoda/FileManager', len(output_files)) mpi_message = { 'type': MessageTypes.OUTPUT_FILE, 'filelist': output_files, 'destination_rank': 0 } self.queues['MPIService'].put(mpi_message) # reset output file list output_files = [] self.set_state(self.EXITED) logger.info('JobComm exiting') def read_config(self): if config_section in self.config: # read loglevel: if 'loglevel' in self.config[config_section]: self.loglevel = self.config[config_section]['loglevel'] logger.info('%s loglevel: %s', config_section, self.loglevel) logger.setLevel(logging.getLevelName(self.loglevel)) else: logger.warning( 'no "loglevel" in "%s" section of config file, keeping default', config_section) # read loop_timeout: if 'loop_timeout' in self.config[config_section]: self.loop_timeout = int( self.config[config_section]['loop_timeout']) logger.info('%s loop_timeout: %s', config_section, self.loop_timeout) else: logger.warning( 'no "loop_timeout" in "%s" section of config file, keeping default %s', config_section, self.loop_timeout) # read get_more_events_threshold: if 'get_more_events_threshold' in self.config[config_section]: self.get_more_events_threshold = int( self.config[config_section]['get_more_events_threshold']) logger.info('%s get_more_events_threshold: 
%s', config_section, self.get_more_events_threshold) else: raise Exception( 'must specify "get_more_events_threshold" in "%s" section of config file' % config_section) # read aggregate_output_files_time: if 'aggregate_output_files_time' in self.config[config_section]: self.aggregate_output_files_time = int( self.config[config_section]['aggregate_output_files_time']) logger.info('%s aggregate_output_files_time: %s', config_section, self.aggregate_output_files_time) else: raise Exception( 'must specify "aggregate_output_files_time" in "%s" section of config file' % config_section) # read debug_message_char_length: if 'debug_message_char_length' in self.config[config_section]: self.debug_message_char_length = int( self.config[config_section]['debug_message_char_length']) logger.info('%s debug_message_char_length: %s', config_section, self.debug_message_char_length) else: logger.warning( 'no "debug_message_char_length" in "%s" section of config file, using default %s', config_section, self.debug_message_char_length) # read stage_outputs: if 'stage_outputs' in self.config[config_section]: self.stage_outputs = self.get_boolean( self.config[config_section]['stage_outputs']) logger.info('%s stage_outputs: %s', config_section, self.stage_outputs) else: logger.warning( 'no "stage_outputs" in "%s" section of config file, using default %s', config_section, self.stage_outputs) else: raise Exception('no %s section in the configuration' % config_section) def get_boolean(self, string): if 'true' in string.lower(): return True return False def request_events(self, current_job): msg = { 'type': MessageTypes.REQUEST_EVENT_RANGES, 'PandaID': current_job['PandaID'], 'taskID': current_job['taskID'], 'jobsetID': current_job['jobsetID'], 'destination_rank': 0, # YODA rank } self.queues['MPIService'].put(msg) def send_output_file(self, payload_msg, current_job, eventranges, output_files): logger.debug('sending output file information') # parse message parts = payload_msg.split(',') # there 
should be four parts: # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1" if len(parts) == 4: # parse the parts outputfilename = parts[0] eventrangeid = parts[1].replace('ID:', '') cpu = parts[2].replace('CPU:', '') wallclock = parts[3].replace('WALL:', '') # if staging, stage and change output filename if self.stage_outputs: # move file to staging_path logger.debug('shutil.move(%s,%s)', outputfilename, self.staging_path) shutil.move(outputfilename, self.staging_path) # change output filename outputfilename = os.path.join(self.staging_path, os.path.basename(outputfilename)) logger.info('outputfilename - %s', outputfilename) # build the data for Harvester output file output_file_data = { 'type': MessageTypes.OUTPUT_FILE, 'filename': outputfilename, 'eventrangeid': eventrangeid, 'cpu': cpu, 'wallclock': wallclock, 'scope': current_job['scopeOut'], 'pandaid': current_job['PandaID'], 'eventstatus': 'finished', 'destination_rank': 0, } # self.output_file_data.set(output_file_data) # append output file data to list of files for transfer via MPI output_files.append(output_file_data) logger.info( 'received output file from AthenaMP; %s output files now on waiting list', len(output_files)) # return to state requesting a message # self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) # set event range to completed: logger.debug('mark event range id %s as completed', output_file_data['eventrangeid']) try: eventranges.mark_completed(output_file_data['eventrangeid']) except Exception: logger.error('failed to mark eventrangeid %s as completed', output_file_data['eventrangeid']) self.stop() else: logger.error('failed to parse output file') def send_eventrange(self, eventranges, athpayloadcomm, no_more_events): logger.debug('sending event to payload') # if event ranges available, send one try: logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready()) local_eventranges = eventranges.get_next() # no more event ranges available except 
EventRangeList.NoMoreEventRanges: logger.debug('there are no more event ranges to process') # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events if no_more_events: logger.info('sending AthenaMP NO_MORE_EVENTS') athpayloadcomm.send(AthenaPayloadCommunicator.NO_MORE_EVENTS) else: # otherwise, raise the Exception to trigger an event request raise # something wrong with the index in the EventRangeList index except EventRangeList.RequestedMoreRangesThanAvailable: logger.error( 'requested more event ranges than available, going to try waiting for more events' ) raise EventRangeList.NoMoreEventRanges() else: logger.info('sending eventranges to AthenaMP: %s', local_eventranges) # append full path to file name for AthenaMP # and adjust event counter by the number of files # input_files = self.job_def.get()['inFiles'].split(',') # logger.debug('found %s input files',len(input_files)) for evtrg in local_eventranges: evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN']) # send AthenaMP the new event ranges athpayloadcomm.send(serializer.serialize(local_eventranges))
class FileManager(Process):
    ''' FileManager: this thread receives output file descriptions on the
        'FileManager' queue and hands them to Harvester in batches through
        the harvester messenger module. '''

    IDLE = 'IDLE'
    STAGE_OUT = 'STAGE_OUT'
    STAGING_OUT = 'STAGING_OUT'
    EXITED = 'EXITED'

    STATES = [IDLE, STAGE_OUT, STAGING_OUT, EXITED]

    def __init__(self, config, queues, yoda_working_path, harvester_messenger):
        '''
          config: the configuration of Yoda, indexed by section name
          queues: a dictionary of queues; this class reads queues['FileManager']
          yoda_working_path: the working directory of yoda
          harvester_messenger: harvester communication module, must provide
                     stage_out_file_exists() and stage_out_files(filelist, output_file_type)
        '''
        super(FileManager, self).__init__()

        # dictionary of queues for sending messages to Droid components
        self.queues = queues

        # configuration of Yoda
        self.config = config

        # this is used to trigger the thread exit
        self.exit = Event()

        # this is the working directory of yoda
        self.yoda_working_path = yoda_working_path

        # harvester communication module
        self.harvester_messenger = harvester_messenger

        # default harvester_output_timeout (seconds between stage-out checks)
        self.harvester_output_timeout = 10

        # default loop_timeout (seconds to block on the message queue).
        # read_config() reports "keeping default" when the config file has no
        # value, so a default must exist before read_config() is called.
        self.loop_timeout = 30

    def stop(self):
        ''' this function can be called by outside threads to cause the FileManager thread to exit'''
        self.exit.set()

    def run(self):
        ''' message loop: accumulate the file lists carried by OUTPUT_FILE
            messages and periodically stage them out to Harvester. Whatever is
            still pending when the exit flag is set is staged out before returning. '''
        logger.debug('starting thread')

        # read configuration info from file
        self.read_config()

        local_filelist = []
        last_check = time.time()

        while not self.exit.is_set():
            logger.debug('starting loop, local_filelist size: %s', len(local_filelist))

            # process incoming messages
            try:
                qmsg = self.queues['FileManager'].get(timeout=self.loop_timeout)
            except Queue.Empty:
                logger.debug('queue is empty')
            else:
                logger.debug('message received: %s', qmsg)

                if qmsg['type'] == MessageTypes.OUTPUT_FILE:
                    local_filelist += qmsg['filelist']
                    logger.info('received output file, waiting list contains %s files', len(local_filelist))
                else:
                    logger.error('message type not recognized')

            # I don't want to constantly check to see if the output file exists
            # so I'll only check every few seconds. This check runs on every
            # pass of the loop (not only when a message has just arrived) so
            # that pending files are not stuck until the next message shows up.
            if local_filelist and time.time() - last_check > self.harvester_output_timeout:
                last_check = time.time()

                # if an output file already exists, wait for harvester to
                # read it in and keep the files on the local running list
                if not self.harvester_messenger.stage_out_file_exists():
                    # add files to Harvester stage out
                    logger.info('staging %s files to Harvester', len(local_filelist))
                    self.harvester_messenger.stage_out_files(local_filelist,
                                                             self.output_file_type)
                    local_filelist = []
                else:
                    logger.warning('Harvester has not yet consumed output files, currently waiting to dump %s output files', len(local_filelist))

        # flush anything left over before exiting
        if local_filelist:
            logger.info('staging %s files to Harvester', len(local_filelist))
            self.harvester_messenger.stage_out_files(local_filelist, self.output_file_type)

        # exit
        logger.info('FileManager exiting')

    def read_config(self):
        ''' read the settings for this class from the config_section section of
            self.config. "loglevel", "loop_timeout" and "harvester_output_timeout"
            are optional; "output_file_type" is required.
            Raises Exception if the section or "output_file_type" is missing. '''
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' % config_section)

        section = self.config[config_section]

        # read log level:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning('no "loglevel" in "%s" section of config file, keeping default', config_section)

        # read droid loop timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section, self.loop_timeout)
        else:
            logger.warning('no "loop_timeout" in "%s" section of config file, keeping default %s', config_section, self.loop_timeout)

        # read harvester_output_timeout:
        if 'harvester_output_timeout' in section:
            self.harvester_output_timeout = int(section['harvester_output_timeout'])
            logger.info('%s harvester_output_timeout: %s', config_section, self.harvester_output_timeout)
        else:
            logger.warning('no "harvester_output_timeout" in "%s" section of config file, keeping default %s', config_section, self.harvester_output_timeout)

        # read output_file_type:
        if 'output_file_type' in section:
            self.output_file_type = section['output_file_type']
            logger.info('%s output_file_type: %s', config_section, self.output_file_type)
        else:
            # there is no default for this setting, so the log message must not
            # reference self.output_file_type (it does not exist yet)
            logger.error('no "output_file_type" in "%s" section of config file', config_section)
            raise Exception('must specify "output_file_type" in %s section of config file. Typically set to "es_output"' % config_section)
class Yoda(Process):
    ''' Yoda: starts the WorkManager and FileManager subthreads, tracks which
        droid ranks have exited, and sends the exit signals on shutdown. '''

    def __init__(self, queues, config, rank, worldsize):
        '''
          queues: a dictionary of message queues; this class reads queues['Yoda']
                     and writes to queues['MPIService']
          config: configuration of Yoda
          rank: rank number
          worldsize: world size; ranks 1..worldsize-1 are treated as droid ranks
        '''
        # call Thread constructor
        super(Yoda, self).__init__()

        # message queues
        self.queues = queues

        # rank number
        self.rank = rank

        # world size
        self.worldsize = worldsize

        # config settings
        self.config = config

        # keep track of if the wallclock has expired
        self.wallclock_expired = Event()

        # this is used to trigger the thread exit
        self.exit = Event()

        # defaults for the optional config settings. read_config() and subrun()
        # log these values, so they must exist even when the config file does
        # not provide them.
        self.loop_timeout = 30
        self.loglevel = None  # filled in by read_config()

    def stop(self):
        ''' this function can be called by outside threads to cause the Yoda thread to exit'''
        self.exit.set()

    # this runs when 'yoda_instance.start()' is called
    def run(self):
        ''' this is the function called when the user runs yoda_instance.start() '''
        try:
            self.subrun()
        except Exception:
            logger.exception('Yoda failed with uncaught exception')
            raise

    def subrun(self):
        ''' this function is the business logic, but wrapped in exception '''
        self.read_config()

        # set logging level
        logger.info('Yoda Thread starting')
        logger.info('loglevel: %s', self.loglevel)
        logger.info('loop_timeout: %s', self.loop_timeout)

        top_working_path = os.getcwd()
        logger.debug('cwd: %s', top_working_path)

        # setup harvester messenger to share with FileManager and WorkManager
        logger.debug('setup harvester messenger')
        harvester_messenger = self.get_harvester_messenger()
        harvester_messenger.setup(self.config)
        # wait for setup to complete
        harvester_messenger.sfm_har_config_done.wait()

        # a list of ranks that have exited
        self.exited_droids = []

        # a dictionary of subthreads
        subthreads = {}

        # create WorkManager thread
        subthreads['WorkManager'] = WorkManager.WorkManager(self.config, self.queues, harvester_messenger)
        subthreads['WorkManager'].start()

        # create FileManager thread
        subthreads['FileManager'] = FileManager.FileManager(self.config, self.queues, top_working_path, harvester_messenger)
        subthreads['FileManager'].start()

        # start message loop
        while not self.exit.is_set():
            logger.debug('start loop')

            # process incoming messages from other threads or ranks
            self.process_incoming_messages()

            # check if all droids have exited
            if len(self.exited_droids) >= (self.worldsize - 1):
                logger.info('all droids have exited, exiting yoda')
                self.stop()
                break

            # check the status of each subthread
            logger.debug('checking all threads still alive')
            # iterate over a snapshot of the names because dead entries are
            # removed from the dictionary inside the loop
            for name in list(subthreads.keys()):
                if subthreads[name].is_alive():
                    continue
                logger.warning('%s is no longer running.', name)
                del subthreads[name]
                # Yoda exits when the WorkManager is gone
                if name == 'WorkManager':
                    self.stop()

            if not subthreads:
                logger.info('no subthreads remaining, exiting')
                self.stop()
                break

            if self.queues['Yoda'].empty():
                logger.debug('sleeping %s', self.loop_timeout)
                self.exit.wait(timeout=self.loop_timeout)

        # send the exit signal to all droid ranks
        logger.info('sending exit signal to droid ranks')
        for ranknum in range(1, self.worldsize):
            if ranknum in self.exited_droids:
                continue
            if self.wallclock_expired.is_set():
                msgtype = MessageTypes.WALLCLOCK_EXPIRING
            else:
                msgtype = MessageTypes.DROID_EXIT
            self.queues['MPIService'].put({'type': msgtype, 'destination_rank': ranknum})

        # send the exit signal to all subthreads
        # (items() instead of iteritems() so this also runs on Python 3)
        for name, thread in subthreads.items():
            logger.info('sending exit signal to %s', name)
            thread.stop()

        # wait for sub threads to exit
        for name, thread in subthreads.items():
            logger.info('waiting for %s to join', name)
            thread.join()
            logger.info('%s has joined', name)

        while not self.queues['MPIService'].empty():
            logger.info('waiting for MPIService to send exit messages to Droid, sleep for %s', self.loop_timeout)
            time.sleep(self.loop_timeout)

        logger.info('Yoda is exiting')

    def read_config(self):
        ''' read the settings for this class from the config_section section of
            self.config. "loglevel" and "loop_timeout" are optional;
            "messenger_plugin_module" is required.
            Raises Exception if the section or "messenger_plugin_module" is missing. '''
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' % config_section)

        section = self.config[config_section]

        # read log level:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning('no "loglevel" in "%s" section of config file, keeping default', config_section)
            # record the level actually in effect so it can be reported later
            self.loglevel = logging.getLevelName(logger.getEffectiveLevel())

        # read loop timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section, self.loop_timeout)
        else:
            logger.warning('no "loop_timeout" in "%s" section of config file, keeping default %s', config_section, self.loop_timeout)

        # messenger_plugin_module
        if 'messenger_plugin_module' in section:
            self.messenger_plugin_module = section['messenger_plugin_module']
        else:
            raise Exception('Failed to retrieve "messenger_plugin_module" from config file section %s' % config_section)

    def process_incoming_messages(self):
        ''' drain queues['Yoda'] and record the source rank of every
            DROID_HAS_EXITED message in self.exited_droids '''
        while not self.queues['Yoda'].empty():
            qmsg = self.queues['Yoda'].get(block=False)

            # process message
            logger.debug('received message: %s', qmsg)

            if qmsg['type'] == MessageTypes.DROID_HAS_EXITED:
                source_rank = qmsg['source_rank']
                logger.debug(' droid rank %d has exited', source_rank)
                # the length of this list decides when Yoda exits, so a rank
                # that reports twice must only be counted once
                if source_rank not in self.exited_droids:
                    self.exited_droids.append(source_rank)
                logger.debug('%s droid ranks have exited', len(self.exited_droids))
            else:
                logger.error(' could not interpret message: %s', qmsg)

    def get_harvester_messenger(self):
        ''' import and return the module named by self.messenger_plugin_module;
            the ImportError is logged and re-raised if the import fails '''
        # try to import the module specified in the config
        # if it is not in the PYTHONPATH this will fail
        try:
            return importlib.import_module(self.messenger_plugin_module)
        except ImportError:
            logger.exception('Failed to import messenger_plugin: %s', self.messenger_plugin_module)
            raise
class Yoda(Process):
    ''' Yoda: starts the WorkManager and FileManager subthreads, tracks which
        droid ranks have exited, and sends the exit signals on shutdown. '''

    def __init__(self, queues, config, rank, worldsize):
        '''
          queues: a dictionary of message queues; this class reads queues['Yoda']
                     and writes to queues['MPIService']
          config: configuration of Yoda
          rank: rank number
          worldsize: world size; ranks 1..worldsize-1 are treated as droid ranks
        '''
        # call Thread constructor
        super(Yoda, self).__init__()

        # message queues
        self.queues = queues

        # rank number
        self.rank = rank

        # world size
        self.worldsize = worldsize

        # config settings
        self.config = config

        # keep track of if the wallclock has expired
        self.wallclock_expired = Event()

        # this is used to trigger the thread exit
        self.exit = Event()

        # defaults for the optional config settings. read_config() and subrun()
        # log these values, so they must exist even when the config file does
        # not provide them.
        self.loop_timeout = 30
        self.loglevel = None  # filled in by read_config()

    def stop(self):
        ''' this function can be called by outside threads to cause the Yoda thread to exit'''
        self.exit.set()

    # this runs when 'yoda_instance.start()' is called
    def run(self):
        ''' this is the function called when the user runs yoda_instance.start() '''
        try:
            self.subrun()
        except Exception:
            logger.exception('Yoda failed with uncaught exception')
            raise

    def subrun(self):
        ''' this function is the business logic, but wrapped in exception '''
        self.read_config()

        # set logging level
        logger.info('Yoda Thread starting')
        logger.info('loglevel: %s', self.loglevel)
        logger.info('loop_timeout: %s', self.loop_timeout)

        top_working_path = os.getcwd()
        logger.debug('cwd: %s', top_working_path)

        # setup harvester messenger to share with FileManager and WorkManager
        logger.debug('setup harvester messenger')
        harvester_messenger = self.get_harvester_messenger()
        harvester_messenger.setup(self.config)
        # wait for setup to complete
        harvester_messenger.sfm_har_config_done.wait()

        # a list of ranks that have exited
        self.exited_droids = []

        # a dictionary of subthreads
        subthreads = {}

        # create WorkManager thread
        subthreads['WorkManager'] = WorkManager.WorkManager(
            self.config, self.queues, harvester_messenger)
        subthreads['WorkManager'].start()

        # create FileManager thread
        subthreads['FileManager'] = FileManager.FileManager(
            self.config, self.queues, top_working_path, harvester_messenger)
        subthreads['FileManager'].start()

        # start message loop
        while not self.exit.is_set():
            logger.debug('start loop')

            # process incoming messages from other threads or ranks
            self.process_incoming_messages()

            # check if all droids have exited
            if len(self.exited_droids) >= (self.worldsize - 1):
                logger.info('all droids have exited, exiting yoda')
                self.stop()
                break

            # check the status of each subthread
            logger.debug('checking all threads still alive')
            # iterate over a snapshot of the names because dead entries are
            # removed from the dictionary inside the loop
            for name in list(subthreads.keys()):
                if subthreads[name].is_alive():
                    continue
                logger.warning('%s is no longer running.', name)
                del subthreads[name]
                # Yoda exits when the WorkManager is gone
                if name == 'WorkManager':
                    self.stop()

            if not subthreads:
                logger.info('no subthreads remaining, exiting')
                self.stop()
                break

            if self.queues['Yoda'].empty():
                logger.debug('sleeping %s', self.loop_timeout)
                self.exit.wait(timeout=self.loop_timeout)

        # send the exit signal to all droid ranks
        logger.info('sending exit signal to droid ranks')
        for ranknum in range(1, self.worldsize):
            if ranknum in self.exited_droids:
                continue
            if self.wallclock_expired.is_set():
                msgtype = MessageTypes.WALLCLOCK_EXPIRING
            else:
                msgtype = MessageTypes.DROID_EXIT
            self.queues['MPIService'].put({
                'type': msgtype,
                'destination_rank': ranknum
            })

        # send the exit signal to all subthreads
        # (items() instead of iteritems() so this also runs on Python 3)
        for name, thread in subthreads.items():
            logger.info('sending exit signal to %s', name)
            thread.stop()

        # wait for sub threads to exit
        for name, thread in subthreads.items():
            logger.info('waiting for %s to join', name)
            thread.join()
            logger.info('%s has joined', name)

        while not self.queues['MPIService'].empty():
            logger.info(
                'waiting for MPIService to send exit messages to Droid, sleep for %s',
                self.loop_timeout)
            time.sleep(self.loop_timeout)

        logger.info('Yoda is exiting')

    def read_config(self):
        ''' read the settings for this class from the config_section section of
            self.config. "loglevel" and "loop_timeout" are optional;
            "messenger_plugin_module" is required.
            Raises Exception if the section or "messenger_plugin_module" is missing. '''
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' %
                            config_section)

        section = self.config[config_section]

        # read log level:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning(
                'no "loglevel" in "%s" section of config file, keeping default',
                config_section)
            # record the level actually in effect so it can be reported later
            self.loglevel = logging.getLevelName(logger.getEffectiveLevel())

        # read loop timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section,
                        self.loop_timeout)
        else:
            logger.warning(
                'no "loop_timeout" in "%s" section of config file, keeping default %s',
                config_section, self.loop_timeout)

        # messenger_plugin_module
        if 'messenger_plugin_module' in section:
            self.messenger_plugin_module = section['messenger_plugin_module']
        else:
            raise Exception(
                'Failed to retrieve "messenger_plugin_module" from config file section %s'
                % config_section)

    def process_incoming_messages(self):
        ''' drain queues['Yoda'] and record the source rank of every
            DROID_HAS_EXITED message in self.exited_droids '''
        while not self.queues['Yoda'].empty():
            qmsg = self.queues['Yoda'].get(block=False)

            # process message
            logger.debug('received message: %s', qmsg)

            if qmsg['type'] == MessageTypes.DROID_HAS_EXITED:
                source_rank = qmsg['source_rank']
                logger.debug(' droid rank %d has exited', source_rank)
                # the length of this list decides when Yoda exits, so a rank
                # that reports twice must only be counted once
                if source_rank not in self.exited_droids:
                    self.exited_droids.append(source_rank)
                logger.debug('%s droid ranks have exited',
                             len(self.exited_droids))
            else:
                logger.error(' could not interpret message: %s', qmsg)

    def get_harvester_messenger(self):
        ''' import and return the module named by self.messenger_plugin_module;
            the ImportError is logged and re-raised if the import fails '''
        # try to import the module specified in the config
        # if it is not in the PYTHONPATH this will fail
        try:
            return importlib.import_module(self.messenger_plugin_module)
        except ImportError:
            logger.exception('Failed to import messenger_plugin: %s',
                             self.messenger_plugin_module)
            raise
class WorkManager(Process):
    ''' Work Manager: this process manages the work going to the running Droids.

        Messages are read from queues['WorkManager'] and replies are put on
        queues['MPIService']. Two kinds of droid requests are served:
          REQUEST_JOB:          answered with a NEW_JOB message
          REQUEST_EVENT_RANGES: answered with NEW_EVENT_RANGES or
                                NO_MORE_EVENT_RANGES
        When the work is not on hand yet, a RequestHarvesterJob or
        RequestHarvesterEventRanges helper is started and the droid request is
        kept in self.pending_requests so it is processed again later.
    '''

    def __init__(self, config, queues, harvester_messenger, loop_timeout=30):
        '''
          config: the ConfigParser handle for yoda
          queues: A dictionary of SerialQueue.SerialQueue objects where the JobManager
                  can send messages to other Droid components about errors, etc.
          harvester_messenger: harvester communication module, passed on to the
                  request helpers
          loop_timeout: default timeout (passed to the blocking queue get) used
                  when the config file has no "loop_timeout" setting
        '''
        # call base class init function
        super(WorkManager, self).__init__()

        # dictionary of queues for sending messages to Droid components
        self.queues = queues

        # configuration of Yoda
        self.config = config

        # harvester communication module
        self.harvester_messenger = harvester_messenger

        # default which read_config() keeps when the config file does not
        # override it; without it the "keeping default" path had nothing to keep
        self.loop_timeout = loop_timeout

        # this is used to trigger the exit of run()
        self.exit = Event()

    def stop(self):
        ''' this function can be called from outside to cause the WorkManager
            main loop to exit '''
        self.exit.set()

    def run(self):
        ''' this function is executed when the WorkManager is started.
            It loops until stop() is called, serving droid requests, then
            signals the request helpers to stop and waits for them. '''

        # read inputs from config file
        self.read_config()

        # list of all jobs received from Harvester key-ed by panda id
        pandajobs = PandaJobDict.PandaJobDict()

        # pending requests from droid ranks
        self.pending_requests = []
        # helps track which request I am processing
        self.pending_index = 0

        # place holder for Request Harvester Event Ranges instance
        # (wait for job definition before launching)
        requestHarvesterEventRanges = None

        # create a local multiprocessing manager for shared values
        mpmgr = Manager()

        # start a Request Harvester Job helper to begin getting a job
        requestHarvesterJob = RequestHarvesterJob.RequestHarvesterJob(
            self.config, self.queues, mpmgr, self.harvester_messenger)
        requestHarvesterJob.start()

        while not self.exit.is_set():
            logger.debug('start loop')

            # new message from the queue, or a pending request, or None
            qmsg = self._next_message()

            if qmsg:
                logger.debug('received message %s', qmsg)

                # Just a message to cause WorkManager to wake from sleep and process
                if qmsg['type'] == MessageTypes.WAKE_UP:
                    continue

                # DROID requesting new job
                elif qmsg['type'] == MessageTypes.REQUEST_JOB:
                    logger.debug('droid requesting job description')

                    # Do I have a panda job to give out?
                    # if not, create a new request if no request is active
                    if len(pandajobs) == 0:
                        logger.debug('There are no panda jobs')
                        if requestHarvesterJob is None:
                            # this really should never happen.
                            logger.info('launching new job request')
                            requestHarvesterJob = RequestHarvesterJob.RequestHarvesterJob(
                                self.config, self.queues, mpmgr, self.harvester_messenger)
                            requestHarvesterJob.start()
                        elif requestHarvesterJob.running():
                            logger.debug('request is running, adding message to pending and will process again later')
                        elif requestHarvesterJob.exited():
                            logger.debug('request has exited')
                            jobs = requestHarvesterJob.get_jobs()
                            if jobs is None:
                                logger.error('request has exited and returned no events, reseting request object')
                                self._retire_job_request(requestHarvesterJob)
                                requestHarvesterJob = None
                            else:
                                logger.info('new jobs ready, adding to PandaJobDict, then add to pending requests')
                                pandajobs.append_from_dict(jobs)
                                # reset job request
                                requestHarvesterJob = None
                                # back up the pending counter so that the
                                # pend_request() below leaves the index on this
                                # message and it is answered on the next pass
                                self.pending_index += -1
                        elif requestHarvesterJob.state_lifetime() > 60:
                            logger.error('request is stuck in state %s recreating it.',
                                         requestHarvesterJob.get_state())
                            self._retire_job_request(requestHarvesterJob)
                            requestHarvesterJob = None
                        else:
                            logger.debug('request is in %s state, waiting',
                                         requestHarvesterJob.get_state())

                        logger.debug('pending message')
                        # place request on pending_requests queue and reprocess again when job is ready
                        self.pend_request(qmsg)

                    # There is exactly one job, so send it
                    elif len(pandajobs) == 1:
                        logger.debug('There is one job so send it.')
                        # FUTUREDEV: It's unclear if we will ever run more than one PandaID per Yoda job
                        # so in the future this may need to actually search the pandajobs list for the
                        # one with the most jobs to send or something like that.
                        pandaid = list(pandajobs.keys())[0]
                        job = pandajobs[pandaid]

                        # send it to droid rank
                        logger.info('sending droid rank %s panda id %s which has the most ready events %s',
                                    qmsg['source_rank'], pandaid, job.number_ready())
                        self.queues['MPIService'].put({
                            'type': MessageTypes.NEW_JOB,
                            'job': job.job_def,
                            'destination_rank': qmsg['source_rank'],
                        })
                        self._drop_pending(qmsg)

                    # choosing between several jobs is not supported
                    else:
                        logger.error('there are multiple jobs to choose from, this is not yet implimented')
                        raise Exception('there are multiple jobs to choose from, this is not yet implimented')

                # DROID requesting new event ranges
                elif qmsg['type'] == MessageTypes.REQUEST_EVENT_RANGES:
                    logger.debug('droid requesting event ranges')

                    # the droid sent the current running panda id
                    droid_pandaid = str(qmsg['PandaID'])

                    # verify the id BEFORE indexing pandajobs with it, otherwise
                    # an unknown id raises KeyError and ends the WorkManager
                    if droid_pandaid not in pandajobs.keys():
                        logger.error('there is no eventrange for pandaID %s, this should be impossible since every pandaID in the pandajobs dictionary gets an empty EventRangeList object. Something is amiss. panda job ids: %s',
                                     droid_pandaid, pandajobs.keys())
                        # never retry a request that cannot be served
                        self._drop_pending(qmsg)

                    # if there are no event ranges left reply with such
                    elif pandajobs[droid_pandaid].eventranges.no_more_event_ranges:
                        logger.debug('no event ranges left for panda ID %s', droid_pandaid)
                        logger.info('sending NO_MORE_EVENT_RANGES to rank %s', qmsg['source_rank'])
                        self._reply_no_more_event_ranges(qmsg, droid_pandaid)

                    # may still be event ranges for this ID
                    else:
                        logger.debug('retrieving event ranges for panda ID %s', droid_pandaid)
                        logger.debug('EventRangeList object for pandaID %s exists, events ready %s',
                                     droid_pandaid, pandajobs[droid_pandaid].number_ready())

                        # have event ranges ready
                        if pandajobs[droid_pandaid].number_ready() > 0:
                            self.send_eventranges(pandajobs[droid_pandaid].eventranges, qmsg)
                            self._drop_pending(qmsg)

                        # no event ranges remaining, will request more
                        else:
                            logger.debug('no eventranges remain for pandaID %s, can we request more?', droid_pandaid)

                            # check if the no more event ranges flag has been set for this panda id
                            if pandajobs[droid_pandaid].eventranges.no_more_event_ranges:
                                logger.debug('no more event ranges for PandaID: %s', droid_pandaid)
                                logger.debug('sending NO_MORE_EVENT_RANGES to rank %s', qmsg['source_rank'])
                                self._reply_no_more_event_ranges(qmsg, droid_pandaid)

                            # flag is not set, so if no request is running create one
                            elif requestHarvesterEventRanges is None:
                                logger.debug('requestHarvesterEventRanges does not exist, creating new request')
                                requestHarvesterEventRanges = RequestHarvesterEventRanges.RequestHarvesterEventRanges(
                                    self.config,
                                    {
                                        'pandaID': pandajobs[droid_pandaid]['PandaID'],
                                        'jobsetID': pandajobs[droid_pandaid]['jobsetID'],
                                        'taskID': pandajobs[droid_pandaid]['taskID'],
                                        'nRanges': self.request_n_eventranges,
                                    },
                                    mpmgr,
                                    self.harvester_messenger,
                                )
                                requestHarvesterEventRanges.start()
                                # pend the request for later processing
                                self.pend_request(qmsg)

                            # there is a request, if it is running, pend the message for later processing
                            elif requestHarvesterEventRanges.running():
                                logger.debug('requestHarvesterEventRanges is running, will pend this request and check again')
                                self.pend_request(qmsg)

                            # there is a request, if it has exited, check if there are events available
                            elif requestHarvesterEventRanges.exited():
                                logger.debug('requestHarvesterEventRanges exited, will check for new event ranges')
                                requestHarvesterEventRanges.join()

                                # if no more events flag is set, there are no more events for this PandaID
                                if requestHarvesterEventRanges.no_more_eventranges():
                                    # set the flag so we know there are no more events for this PandaID
                                    logger.debug('no more event ranges for PandaID: %s', droid_pandaid)
                                    pandajobs[droid_pandaid].eventranges.no_more_event_ranges = True
                                    logger.debug('sending NO_MORE_EVENT_RANGES to rank %s', qmsg['source_rank'])
                                    self._reply_no_more_event_ranges(qmsg, droid_pandaid)

                                # event ranges received so add them to the list
                                else:
                                    tmpeventranges = requestHarvesterEventRanges.get_eventranges()
                                    if tmpeventranges is not None:
                                        logger.debug('received eventranges: %s',
                                                     ' '.join(('%s:%i' % (tmpid, len(tmpeventranges[tmpid])))
                                                              for tmpid in tmpeventranges.keys()))
                                        # add event ranges to pandajobs dict
                                        for jobid in tmpeventranges.keys():
                                            ers = tmpeventranges[jobid]
                                            pandajobs[jobid].eventranges += EventRangeList.EventRangeList(ers)

                                        # answer from the job the droid asked about,
                                        # not from whichever job id was looped last,
                                        # and only when something is ready to send
                                        if pandajobs[droid_pandaid].number_ready() > 0:
                                            self.send_eventranges(pandajobs[droid_pandaid].eventranges, qmsg)
                                            self._drop_pending(qmsg)
                                        else:
                                            logger.debug('no event ranges received for pandaID %s, will request again',
                                                         droid_pandaid)
                                            self.pend_request(qmsg)
                                    else:
                                        logger.error('no eventranges after requestHarvesterEventRanges exited, starting new request')
                                        self.pend_request(qmsg)

                                # reset request
                                requestHarvesterEventRanges = None

                            else:
                                logger.error('requestHarvesterEventRanges is in strange state %s, restarting',
                                             requestHarvesterEventRanges.get_state())
                                # tell the discarded helper to exit instead of
                                # leaving it running unobserved; no join here so
                                # the main loop cannot block on it
                                if requestHarvesterEventRanges.is_alive():
                                    requestHarvesterEventRanges.stop()
                                requestHarvesterEventRanges = None
                                self.pend_request(qmsg)

                else:
                    logger.error('message type was not recognized: %s', qmsg['type'])

            logger.debug('continuing loop')
            if requestHarvesterJob is not None:
                logger.debug('RequestHarvesterJob: %s', requestHarvesterJob.get_state())
            if requestHarvesterEventRanges is not None:
                logger.debug('requestHarvesterEventRanges: %s', requestHarvesterEventRanges.get_state())

        # signal both helpers first, then wait, so they shut down in parallel
        logger.info('signaling exit to threads')
        if requestHarvesterJob is not None and requestHarvesterJob.is_alive():
            logger.debug('signaling requestHarvesterJob to stop')
            requestHarvesterJob.stop()
        if requestHarvesterEventRanges is not None and requestHarvesterEventRanges.is_alive():
            logger.debug('signaling requestHarvesterEventRanges to stop')
            requestHarvesterEventRanges.stop()

        if requestHarvesterJob is not None and requestHarvesterJob.is_alive():
            logger.debug('waiting for requestHarvesterJob to join')
            requestHarvesterJob.join()
        if requestHarvesterEventRanges is not None and requestHarvesterEventRanges.is_alive():
            logger.debug('waiting for requestHarvesterEventRanges to join')
            requestHarvesterEventRanges.join()

        logger.info('WorkManager is exiting')

    def _next_message(self):
        ''' pick the message to process in this loop iteration.
            Order of preference: a new message on queues['WorkManager'], then
            the pending request at self.pending_index, otherwise block on the
            queue for up to self.loop_timeout. Returns None if nothing arrived. '''
        workqueue = self.queues['WorkManager']

        if not workqueue.empty():
            logger.info('queue has messages')
            qmsg = workqueue.get(block=False)
            # any time I get a message from this queue, I reset the index of the
            # pending request list; this way, I cycle through the pending
            # requests once per new message
            self.pending_index = 0
            return qmsg

        if len(self.pending_requests) > 0:
            if self.pending_index < len(self.pending_requests):
                logger.debug('pending queue has %s messages processing %s',
                             len(self.pending_requests), self.pending_index)
                return self.pending_requests[self.pending_index]
            logger.info('have cycled through all pending requests without a change, will block on queue for %s',
                        self.loop_timeout)
        else:
            logger.info('blocking on queue for %s', self.loop_timeout)

        self.pending_index = 0
        try:
            return workqueue.get(block=True, timeout=self.loop_timeout)
        except Queue.Empty:
            logger.debug('no messages on queue after blocking')
            return None

    def _drop_pending(self, qmsg):
        ''' remove a handled message from the pending list, if it is there '''
        if qmsg in self.pending_requests:
            self.pending_requests.remove(qmsg)

    def _reply_no_more_event_ranges(self, qmsg, pandaid):
        ''' tell the requesting rank there are no more event ranges for pandaid
            and forget the request '''
        self.queues['MPIService'].put({
            'type': MessageTypes.NO_MORE_EVENT_RANGES,
            'destination_rank': qmsg['source_rank'],
            'PandaID': pandaid,
        })
        self._drop_pending(qmsg)

    def _retire_job_request(self, request):
        ''' stop a job request helper that is still alive and wait for it '''
        if request.is_alive():
            request.stop()
            logger.info('waiting for requestHarvesterJob to join')
            request.join()

    def number_eventranges_ready(self, eventranges):
        ''' return the sum of number_ready() over all values of eventranges '''
        return sum(erl.number_ready() for erl in eventranges.values())

    def get_jobid_with_minimum_ready(self, eventranges):
        ''' return the key of the entry with the fewest (but non-zero) ready
            event ranges, or 0 if no entry has any ready '''
        job_id = 0
        job_nready = 999999
        for pandaid, erl in eventranges.items():
            nready = erl.number_ready()
            if 0 < nready < job_nready:
                job_id = pandaid
                job_nready = nready
        return job_id

    def send_eventranges(self, eventranges, qmsg):
        ''' send the requesting MPI rank some event ranges

            eventranges: object providing number_ready() and get_next(n)
            qmsg: the request message; its 'source_rank' receives the reply
        '''
        # get a subset of ready eventranges up to the send_n_eventranges value
        local_eventranges = eventranges.get_next(
            min(eventranges.number_ready(), self.send_n_eventranges))

        # send event ranges to Droid
        logger.info('sending %d new event ranges to droid rank %d',
                    len(local_eventranges), qmsg['source_rank'])
        self.queues['MPIService'].put({
            'type': MessageTypes.NEW_EVENT_RANGES,
            'eventranges': local_eventranges,
            'destination_rank': qmsg['source_rank'],
        })

    def read_config(self):
        ''' read the settings of this component from self.config[config_section].

            loglevel and loop_timeout are optional (the current values are kept),
            send_n_eventranges and request_n_eventranges are mandatory.
            Raises Exception if the section or a mandatory setting is missing. '''
        if config_section not in self.config:
            raise Exception('no %s section in the configuration' % config_section)

        section = self.config[config_section]

        # read log level:
        if 'loglevel' in section:
            self.loglevel = section['loglevel']
            logger.info('%s loglevel: %s', config_section, self.loglevel)
            logger.setLevel(logging.getLevelName(self.loglevel))
        else:
            logger.warning('no "loglevel" in "%s" section of config file, keeping default', config_section)

        # read loop timeout:
        if 'loop_timeout' in section:
            self.loop_timeout = int(section['loop_timeout'])
            logger.info('%s loop_timeout: %s', config_section, self.loop_timeout)
        else:
            logger.warning('no "loop_timeout" in "%s" section of config file, keeping default %s',
                           config_section, self.loop_timeout)

        # read send_n_eventranges:
        if 'send_n_eventranges' in section:
            self.send_n_eventranges = int(section['send_n_eventranges'])
            logger.info('%s send_n_eventranges: %s', config_section, self.send_n_eventranges)
        else:
            raise Exception('configuration section %s has no "send_n_eventranges" setting. This setting is important and should be optimized to your system. Typically you should set it to the number of AthenaMP workers on a single node or some factor of that.' % config_section)

        # read request_n_eventranges:
        if 'request_n_eventranges' in section:
            self.request_n_eventranges = int(section['request_n_eventranges'])
            logger.info('%s request_n_eventranges: %s', config_section, self.request_n_eventranges)
        else:
            raise Exception('configuration section %s has no "request_n_eventranges" setting. This setting is important and should be optimized to your system. Typically you should set it to the number of AthenaMP workers on a single node multiplied by the total number of Droid ranks running or some factor of that.' % config_section)

    def pend_request(self, msg):
        ''' keep msg for later processing.
            A message that is already pending only advances pending_index to the
            next pending request; a new message is appended and the index is
            reset so the cycle through the pending list starts over. '''
        if msg in self.pending_requests:
            self.pending_index += 1
        else:
            self.pending_requests.append(msg)
            self.pending_index = 0