def request_eventranges(job_def):
    """Record a request for more event ranges in the event request file.

    Waits for ``sfm_har_config_done`` to be set, then takes the target path
    from ``sfm_har_config['eventRequestFile']``. The file holds a serialized
    mapping of ``pandaID`` to request:

    * If the file does not exist, a new one is created containing only
      ``job_def``.
    * If it exists, it is first moved to ``<file>.tmp``, read back, and
      updated: a request with the same ``pandaID``, ``jobsetID`` and
      ``taskID`` has ``job_def['nRanges']`` added to its ``nRanges``; a
      request with the same ``pandaID`` but a different ``jobsetID`` or
      ``taskID`` is left unchanged and a warning is logged; an unknown
      ``pandaID`` is added as a new entry.

    In both cases the content is written to ``<file>.tmp`` and then renamed
    to the real filename. All file handles are closed (and therefore
    flushed) before that rename happens.

    Args:
        job_def: mapping providing the keys ``nRanges``, ``pandaID``,
            ``taskID`` and ``jobsetID``.
    """
    global sfm_har_config, sfm_har_config_done
    # do not read the configuration until it has been flagged as available
    sfm_har_config_done.wait()

    # retrieve event request file
    eventRequestFile = sfm_har_config['eventRequestFile']
    eventRequestFile_tmp = eventRequestFile + '.tmp'

    if not os.path.exists(eventRequestFile):
        # need to output a file containing:
        #   {pandaID: {'nRanges': ???, 'pandaID':???, 'taskID':???, 'jobsetID':???}}
        logger.debug(
            'requesting new event ranges by writing to file "%s" with this content: %s',
            eventRequestFile, job_def)
        requests = {job_def['pandaID']: job_def}

    else:
        logger.debug('request file already exists. Adding requests')

        # move current file to the temp name before editing it
        os.rename(eventRequestFile, eventRequestFile_tmp)

        # the context manager guarantees the handle is closed after reading
        with open(eventRequestFile_tmp) as f:
            requests = serializer.deserialize(f.read())

        pandaID = job_def['pandaID']
        if pandaID in requests:
            logger.debug('adding event range count to existing request')
            thisjob = requests[pandaID]
            same_job = (thisjob['jobsetID'] == job_def['jobsetID']
                        and thisjob['taskID'] == job_def['taskID'])
            if same_job:
                thisjob['nRanges'] += job_def['nRanges']
            else:
                # NOTE(review): the "PandaID %s" placeholder receives the
                # whole stored request, not the bare id.
                logger.warning(
                    'existing request for PandaID %s does not match new request details %s',
                    thisjob, job_def)
        else:
            logger.debug('adding new job definition to existing request')
            requests[pandaID] = job_def

    # write the (new or updated) requests to the temp file; the handle must
    # be closed before the rename so the complete content is on disk
    with open(eventRequestFile_tmp, 'w') as f:
        f.write(serializer.serialize(requests))

    # now move tmp filename to real filename
    os.rename(eventRequestFile_tmp, eventRequestFile)
Ejemplo n.º 2
0
def request_eventranges(job_def):
   ''' Add a request for more event ranges to the event request file.

       Blocks on sfm_har_config_done, then reads the file path from
       sfm_har_config['eventRequestFile']. The file contains a serialized
       dictionary keyed by pandaID.

       - no file yet: a new file is written holding only job_def.
       - file exists: it is moved to "<file>.tmp", deserialized and updated.
         A stored request with matching pandaID, jobsetID and taskID gets
         job_def['nRanges'] added to its nRanges; a stored request whose
         jobsetID or taskID differ is left as is and a warning is logged;
         a pandaID not yet present is added.

       Output always goes to "<file>.tmp" first and is then renamed to the
       real filename. Files are opened via context managers so every handle
       is closed (and its data flushed) before the rename.

       job_def: mapping with keys 'nRanges', 'pandaID', 'taskID', 'jobsetID'.
   '''
   global sfm_har_config,sfm_har_config_done
   # wait until the configuration has been flagged as available
   sfm_har_config_done.wait()

   # retrieve event request file
   eventRequestFile = sfm_har_config['eventRequestFile']
   eventRequestFile_tmp = eventRequestFile + '.tmp'

   # create event request file
   if not os.path.exists(eventRequestFile):
      # need to output a file containing:
      #   {pandaID: {'nRanges': ???, 'pandaID':???, 'taskID':???, 'jobsetID':???}}
      logger.debug('requesting new event ranges by writing to file "%s" with this content: %s',eventRequestFile,job_def)

      # get new job definition
      new_job_def = {job_def['pandaID']:job_def}

      # handle is closed on leaving the with-block, i.e. before the rename
      with open(eventRequestFile_tmp,'w') as f:
         f.write(serializer.serialize(new_job_def))

      # now move tmp filename to real filename
      os.rename(eventRequestFile_tmp,eventRequestFile)

   else:
      logger.debug('request file already exists. Adding requests')

      # move current file to temp
      os.rename(eventRequestFile,eventRequestFile_tmp)

      with open(eventRequestFile_tmp) as f:
         requests = serializer.deserialize(f.read())

      pandaID = job_def['pandaID']
      if pandaID in requests:
         logger.debug('adding event range count to existing request')
         thisjob = requests[pandaID]
         if thisjob['jobsetID'] == job_def['jobsetID'] and thisjob['taskID'] == job_def['taskID']:
            thisjob['nRanges'] += job_def['nRanges']
         else:
            # NOTE(review): the "PandaID %s" placeholder is filled with the
            # whole stored request rather than the bare id.
            logger.warning('existing request for PandaID %s does not match new request details %s',thisjob,job_def)
      else:
         logger.debug('adding new job definition to existing request')
         requests[pandaID] = job_def

      # output updated requests to file; close it before renaming
      with open(eventRequestFile_tmp,'w') as f:
         f.write(serializer.serialize(requests))

      # now move tmp filename to real filename
      os.rename(eventRequestFile_tmp,eventRequestFile)
Ejemplo n.º 3
0
   def send_eventrange(self,eventranges,athpayloadcomm,no_more_events):
      ''' Send the next event ranges to the payload, or tell it none are left.

          eventranges:    object providing number_ready() and get_next()
          athpayloadcomm: object whose send() delivers a message to AthenaMP
          no_more_events: truthy once no further event ranges are expected

          Raises EventRangeList.NoMoreEventRanges when nothing could be sent
          and no_more_events is not set (this includes the case where
          get_next() raised RequestedMoreRangesThanAvailable).
      '''
      logger.debug('sending event to payload')

      try:
         logger.debug('have %d ready event ranges to send to AthenaMP',eventranges.number_ready())
         ranges_to_send = eventranges.get_next()

      except EventRangeList.NoMoreEventRanges:
         logger.debug('there are no more event ranges to process')
         if not no_more_events:
            # more ranges may still arrive: propagate so the caller can
            # trigger an event request
            raise
         # we have been told nothing more is coming, so pass that on to
         # the AthenaMP worker and finish
         logger.info('sending AthenaMP NO_MORE_EVENTS')
         athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
         return

      except EventRangeList.RequestedMoreRangesThanAvailable:
         # something wrong with the index in the EventRangeList; report it
         # to the caller as "no ranges available"
         logger.error('requested more event ranges than available, going to try waiting for more events')
         raise EventRangeList.NoMoreEventRanges()

      # only reached when get_next() succeeded
      logger.info('sending eventranges to AthenaMP: %s',ranges_to_send)

      # give AthenaMP the full path of each file: current working
      # directory joined with the range's LFN
      for event_range in ranges_to_send:
         event_range['PFN'] = os.path.join(os.getcwd(),event_range['LFN'])

      # send AthenaMP the new event ranges
      athpayloadcomm.send(serializer.serialize(ranges_to_send))
Ejemplo n.º 4
0
    def send_eventrange(self, eventranges, athpayloadcomm, no_more_events):
        """Send the next event ranges to the payload, or tell it none are left.

        Args:
            eventranges: object providing ``number_ready()`` and
                ``get_next()``.
            athpayloadcomm: object whose ``send()`` delivers a message to
                AthenaMP.
            no_more_events: truthy once no further event ranges are expected.

        Raises:
            EventRangeList.NoMoreEventRanges: nothing could be sent and
                ``no_more_events`` is not set; also raised in place of
                ``RequestedMoreRangesThanAvailable`` from ``get_next()``.
        """
        logger.debug('sending event to payload')

        try:
            logger.debug('have %d ready event ranges to send to AthenaMP',
                         eventranges.number_ready())
            ranges_to_send = eventranges.get_next()

        except EventRangeList.NoMoreEventRanges:
            logger.debug('there are no more event ranges to process')
            if not no_more_events:
                # more ranges may still arrive: propagate so the caller can
                # trigger an event request
                raise
            # nothing more is coming, so pass that on to the AthenaMP worker
            logger.info('sending AthenaMP NO_MORE_EVENTS')
            athpayloadcomm.send(AthenaPayloadCommunicator.NO_MORE_EVENTS)
            return

        except EventRangeList.RequestedMoreRangesThanAvailable:
            # something wrong with the index in the EventRangeList; report
            # it to the caller as "no ranges available"
            logger.error(
                'requested more event ranges than available, going to try waiting for more events'
            )
            raise EventRangeList.NoMoreEventRanges()

        # only reached when get_next() succeeded
        logger.info('sending eventranges to AthenaMP: %s', ranges_to_send)

        # give AthenaMP the full path of each file: current working
        # directory joined with the range's LFN
        for event_range in ranges_to_send:
            event_range['PFN'] = os.path.join(os.getcwd(),
                                              event_range['LFN'])

        # send AthenaMP the new event ranges
        athpayloadcomm.send(serializer.serialize(ranges_to_send))
Ejemplo n.º 5
0
def stage_out_files(file_list,output_type):
   ''' Add a batch of output files to the event status dump file.

       Blocks on sfm_har_config_done, then reads the dump file path from
       sfm_har_config['eventStatusDumpJsonFile']. The dump file holds a
       serialized dictionary mapping pandaID (as a string) to a list of
       file descriptors with the keys eventRangeID, eventStatus, path,
       type, chksum and guid.

       If the dump file already exists it is moved to "<file>.tmp", its
       content is merged with the new descriptors, and the result is written
       to "<file>.tmp" and renamed back. If the move fails because the file
       has disappeared in the meantime, a fresh file with only the new
       descriptors is written. If the move fails while the file is still
       there, the OSError is re-raised (previously this path continued with
       the merged data undefined and crashed later).

       file_list:   iterable of mappings with the keys 'pandaid',
                    'eventrangeid', 'eventstatus', 'filename' and
                    optionally 'chksum'.
       output_type: one of 'output', 'es_output', 'log'.

       Raises Exception for an unknown output_type and OSError when an
       existing dump file cannot be moved aside.
   '''
   global sfm_har_config,sfm_har_config_done
   # wait until the configuration has been flagged as available
   sfm_har_config_done.wait()

   if output_type not in ['output','es_output','log']:
      raise Exception('incorrect type provided: %s' % (output_type))

   # load name of eventStatusDumpJsonFile file
   eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

   # group the new file descriptors by pandaID
   eventStatusDumpData = {}
   for filedata in file_list:

      # make sure pandaID is a string
      pandaID = str(filedata['pandaid'])

      # checksum is optional
      chksum = filedata['chksum'] if 'chksum' in filedata else None

      # format data for file:
      file_descriptor = {'eventRangeID':filedata['eventrangeid'],
                         'eventStatus':filedata['eventstatus'],
                         'path':filedata['filename'],
                         'type':output_type,
                         'chksum': chksum,
                         'guid': None,
                        }
      eventStatusDumpData.setdefault(pandaID,[]).append(file_descriptor)

   # create a temp file to place contents
   # this avoids Harvester trying to read the file while it is being written
   eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

   # if file does not already exists, new data is just what we have
   if not os.path.exists(eventStatusDumpJsonFile):
      data = eventStatusDumpData

   # if the file exists, move it to a tmp filename, update its contents and then recreate it.
   else:

      # first move existing file to tmp so Harvester does not read it while we edit
      try:
         os.rename(eventStatusDumpJsonFile,eventStatusDumpJsonFile_tmp)
      except OSError:
         logger.warning('tried moving %s to a tmp filename to add more output files for Harvester.',eventStatusDumpJsonFile)
         if os.path.exists(eventStatusDumpJsonFile):
            # the file is still there but could not be moved: there is no
            # data to merge into, so surface the real error instead of
            # continuing with 'data' undefined
            raise
         logger.warning('%s file no longer exists so Harvester must have grabbed it. Need to create a new file',eventStatusDumpJsonFile)
         data = eventStatusDumpData
      else:

         # now open and read in the data
         with open(eventStatusDumpJsonFile_tmp,'r') as f:
            data = serializer.deserialize(f.read())
         logger.debug('found existing data for pandaIDs: %s',data.keys())

         for pandaID,file_descriptors in eventStatusDumpData.items():
            # test membership first so only the message that applies is logged
            if pandaID in data:
               # the pandaID already exists, extend its list
               logger.debug('addding data to existing panda list')
               data[pandaID] += file_descriptors
            else:
               # the pandaID does not exist, add a new list
               logger.debug('addding new panda id list')
               data[pandaID] = file_descriptors

   # only build the summary string when debug output is actually enabled
   if logger.isEnabledFor(logging.DEBUG):
      tmpstr = ' '.join('%s:%s' % (x,len(data[x])) for x in data)
      logger.debug('writing output to file %s with keys: %s', eventStatusDumpJsonFile,tmpstr)

   # overwrite the temp file with the updated data
   with open(eventStatusDumpJsonFile_tmp,'w') as f:
      f.write(serializer.serialize(data,pretty_print=True))

   # move tmp file into place
   os.rename(eventStatusDumpJsonFile_tmp,eventStatusDumpJsonFile)

   logger.debug('done')
Ejemplo n.º 6
0
def stage_out_file(output_type,output_path,eventRangeID,eventStatus,pandaID,chksum=None,):
   ''' Add a single output file to the event status dump file.

       Blocks on sfm_har_config_done, then reads the dump file path from
       sfm_har_config['eventStatusDumpJsonFile']. The dump file holds a
       serialized dictionary mapping pandaID (as a string) to a list of
       file descriptors with the keys eventRangeID, eventStatus, path,
       type, chksum and guid.

       If the dump file already exists it is moved to "<file>.tmp", the new
       descriptor is appended to the list for pandaID (or a new list is
       started), and the result is written to "<file>.tmp" and renamed back.
       If the move fails because the file has disappeared in the meantime, a
       fresh file with only the new descriptor is written. If the move fails
       while the file is still there, the OSError is re-raised (previously
       this path continued with the data undefined and crashed later).

       output_type:  one of 'output', 'es_output', 'log'.
       output_path:  path of the output file; must exist.
       eventRangeID: stored under 'eventRangeID'.
       eventStatus:  stored under 'eventStatus'.
       pandaID:      converted to str and used as the dictionary key.
       chksum:       optional checksum, stored under 'chksum'.

       Raises Exception for an unknown output_type or a missing output_path
       and OSError when an existing dump file cannot be moved aside.
   '''
   global sfm_har_config,sfm_har_config_done
   # wait until the configuration has been flagged as available
   sfm_har_config_done.wait()

   if output_type not in ['output','es_output','log']:
      raise Exception('incorrect type provided: %s' % (output_type))

   if not os.path.exists(output_path):
      raise Exception('output file not found: %s' % (output_path))

   # make sure pandaID is a string
   pandaID = str(pandaID)

   # load name of eventStatusDumpJsonFile file
   eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

   # first create a temp file to place contents
   # this avoids Harvester trying to read the file while it is being written
   eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

   # format data for file:
   file_descriptor = {'eventRangeID':eventRangeID,
                      'eventStatus':eventStatus,
                      'path':output_path,
                      'type':output_type,
                      'chksum': chksum,
                      'guid': None,
                     }

   # if file does not already exists, new data is just what we have
   if not os.path.exists(eventStatusDumpJsonFile):
      data = {pandaID: [file_descriptor]}

   # if the file exists, move it to a tmp filename, update its contents and then recreate it.
   else:

      # first move existing file to tmp so Harvester does not read it while we edit
      try:
         os.rename(eventStatusDumpJsonFile,eventStatusDumpJsonFile_tmp)
      except OSError:
         logger.warning('tried moving %s to a tmp filename to add more output files for Harvester.',eventStatusDumpJsonFile)
         if os.path.exists(eventStatusDumpJsonFile):
            # the file is still there but could not be moved: there is no
            # data to merge into, so surface the real error instead of
            # continuing with 'data' undefined
            raise
         logger.warning('%s file no longer exists so Harvester must have grabbed it. Need to create a new file',eventStatusDumpJsonFile)
         data = {pandaID: [file_descriptor]}
      else:

         # now open and read in the data
         with open(eventStatusDumpJsonFile_tmp,'r') as f:
            data = serializer.deserialize(f.read())
         logger.debug('existing data contains %s',data)
         # if the pandaID already exists, just append the new file to that list
         if pandaID in data:
            logger.debug('addding data to existing panda list')
            data[pandaID].append(file_descriptor)
         # if the pandaID does not exist, add a new list
         else:
            logger.debug('addding new panda id list')
            data[pandaID] = [file_descriptor]

   logger.debug('output to file %s: %s',eventStatusDumpJsonFile,data)

   # overwrite the temp file with the updated data
   with open(eventStatusDumpJsonFile_tmp,'w') as f:
      f.write(serializer.serialize(data))

   # move tmp file into place
   os.rename(eventStatusDumpJsonFile_tmp,eventStatusDumpJsonFile)
Ejemplo n.º 7
0
   def run(self):
      ''' this is the function run as a subthread when the user runs jobComm_instance.start()

          After read_config() it opens an athena_payloadcommunicator on
          self.yampl_socket_name and loops until self.exit is set. Each
          iteration first flushes aggregated output-file records to the
          'MPIService' queue if they are older than
          self.aggregate_output_files_time, then acts on self.get_state():

          WAITING_FOR_JOB          block on the 'JobComm' queue for a NEW_JOB
                                   message; store its 'job' as the current job
                                   and go to REQUEST_EVENT_RANGES.
          REQUEST_EVENT_RANGES     call self.request_events() unless a request
                                   is already outstanding; go to
                                   WAITING_FOR_EVENT_RANGES.
          WAITING_FOR_EVENT_RANGES block on the 'JobComm' queue for
                                   NEW_EVENT_RANGES or NO_MORE_EVENT_RANGES.
          WAIT_FOR_PAYLOAD_MESSAGE poll the 'JobComm' queue without blocking,
                                   then wait up to self.loop_timeout for a
                                   message from the payload.
          MESSAGE_RECEIVED         classify the payload message.
          SEND_EVENT_RANGE         send event ranges (or NO_MORE_EVENTS) to
                                   the payload.
          SEND_OUTPUT_FILE         parse an output file message and queue its
                                   record for the next aggregated send.

          At the end of every iteration more event ranges are requested when
          the number of ready ranges is below self.get_more_events_threshold,
          or the loop is stopped when everything is completed and no more
          events are expected. After the loop any remaining output-file
          records are sent and the state is set to EXITED.
      '''

      # NOTE(review): presumably this populates the self.* settings used
      # below (loop_timeout, yampl_socket_name, ...) — confirm in read_config
      self.read_config()


      logger.debug('start yampl payloadcommunicator')
      athpayloadcomm = athena_payloadcommunicator(self.yampl_socket_name)
      # last message received from the payload; set in WAIT_FOR_PAYLOAD_MESSAGE
      # and consumed by the MESSAGE_RECEIVED / SEND_OUTPUT_FILE states
      payload_msg = ''

      # current list of output files to send via MPI
      output_files = []
      # start of the current aggregation window for output_files
      last_output_file_mpi_send = time.time()

      # list of event ranges
      eventranges = EventRangeList.EventRangeList()
      # set once a NO_MORE_EVENT_RANGES message has been received
      no_more_events = False
      # True while a request made via self.request_events() is unanswered
      waiting_for_eventranges = False
      # number of READY_FOR_EVENTS messages from the payload not yet answered
      # with event ranges
      event_range_request_counter = 0

      # current panda job that AthenaMP is configured to run
      current_job = None

      while not self.exit.is_set():
         logger.debug('start loop: state: %s',self.get_state())
         
         # in debug mode, report event ranges status
         if logger.getEffectiveLevel() == logging.DEBUG:
            ready_events = eventranges.number_ready()
            number_completed = eventranges.number_completed()
            total = len(eventranges)
            logger.debug('number of ready events %s; number of completed events %s; total events %s',ready_events,number_completed,total)

         # don't want to hammer Yoda with lots of little messages for output files
         # so aggregate output files for some time period then send as a group.
         # While the list is empty the window start is simply moved forward.
         if len(output_files) == 0:
            last_output_file_mpi_send = time.time()
         elif (time.time() - last_output_file_mpi_send) > self.aggregate_output_files_time:

            # send output file data to Yoda/FileManager
            logger.info('sending %s output files to Yoda/FileManager',len(output_files))
            mpi_message = {'type':MessageTypes.OUTPUT_FILE,
                           'filelist':output_files,
                           'destination_rank': 0
                          }
            self.queues['MPIService'].put(mpi_message)

            # set time for next send
            last_output_file_mpi_send = time.time()
            # reset output file list (a new list, the queued message keeps the old one)
            output_files = []


         ##################
         # WAITING_FOR_JOB: waiting for the job definition to arrive, before
         #        it does, it is assumed that there is no payload running
         ######################################################################
         if self.get_state() == self.WAITING_FOR_JOB:
            logger.info(' waiting for job definition, blocking on message queue for %s ',self.loop_timeout)
            try:
               qmsg = self.queues['JobComm'].get(block=True,timeout=self.loop_timeout)
            except Queue.Empty:
               logger.debug('no message on queue')
            else:
               # shorten our message for printing
               if logger.getEffectiveLevel() == logging.DEBUG:
                  tmpmsg = str(qmsg)
                  if len(tmpmsg) > self.debug_message_char_length:
                     tmpslice = slice(0,self.debug_message_char_length)
                     tmpmsg = tmpmsg[tmpslice] + '...'
                  logger.debug('received queue message: %s',tmpmsg)
               
               # verify message type is as expected; anything else is logged
               # and dropped, leaving the state unchanged
               if 'type' not in qmsg or qmsg['type'] != MessageTypes.NEW_JOB or 'job' not in qmsg:
                  logger.error('received unexpected message format: %s',qmsg)
               else:
                  logger.info('received job definition')
                  current_job = qmsg['job']
                  
                  

                  # change state
                  self.set_state(self.REQUEST_EVENT_RANGES)
            qmsg = None

         ##################
         # REQUEST_EVENT_RANGES: Request event ranges from Yoda
         ######################################################################
         elif self.get_state() == self.REQUEST_EVENT_RANGES:
            # only one request is kept outstanding at a time
            if not waiting_for_eventranges:
               logger.info('sending request for event ranges')
               # send MPI message to Yoda for more event ranges
               self.request_events(current_job)
               waiting_for_eventranges = True
            # change state
            self.set_state(self.WAITING_FOR_EVENT_RANGES)
         
         ##################
         # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda
         ######################################################################
         elif self.get_state() == self.WAITING_FOR_EVENT_RANGES:
            logger.info('waiting for event ranges, blocking on message queue for %s',self.loop_timeout)
            try:
               qmsg = self.queues['JobComm'].get(block=True,timeout=self.loop_timeout)
            except Queue.Empty:
               logger.debug('no message on queue')
            else:
               # shorten our message for printing
               if logger.getEffectiveLevel() == logging.DEBUG:
                  tmpmsg = str(qmsg)
                  if len(tmpmsg) > self.debug_message_char_length:
                     tmpslice = slice(0,self.debug_message_char_length)
                     tmpmsg = tmpmsg[tmpslice] + '...'
                  logger.debug('received queue message: %s',tmpmsg)
               
               if 'type' not in qmsg:
                  logger.error('received unexpected message format: %s',qmsg)
               elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES:
                  logger.info('received event ranges, adding to list')
                  eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])
                  # add event ranges to payload messenger list
                  # payloadcomm.add_eventranges(eventranges)
                  # change state
                  self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
               elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                  logger.info('no more event ranges for PandaID %s',qmsg['PandaID'])
                  no_more_events = True

                  # check for running events: if every known range is
                  # completed there is nothing left to do
                  if len(eventranges) == eventranges.number_completed():
                     logger.info('no eventranges left to send so triggering exit')
                     self.stop()
                  else:
                     logger.info('still have events to process so continuing')
                     self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

               else:
                  logger.error('unknown message type: %s',qmsg['type'])

               # cleared after ANY received message, including malformed or
               # unknown ones, so a new request can be issued
               waiting_for_eventranges = False


            qmsg = None

         ##################
         # WAIT_FOR_PAYLOAD_MESSAGE: initiates
         #          a request for a message from the payload
         ######################################################################
         # NOTE: this is a plain 'if', not 'elif', so it starts a new chain
         # and is evaluated in the same iteration in which one of the states
         # above switched to WAIT_FOR_PAYLOAD_MESSAGE.
         if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE:

            # first check if there is an incoming message (non-blocking)
            try:
               logger.debug('checking for queue message')
               qmsg = self.queues['JobComm'].get(block=False)
               # NOTE(review): this test uses 'in' while the equivalent test
               # in WAITING_FOR_EVENT_RANGES uses '==' — confirm which is intended
               if MessageTypes.NEW_EVENT_RANGES in qmsg['type']:
                  logger.info('received new event range')
                  eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])
                  waiting_for_eventranges = False
               elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                  logger.info('no more event ranges for PandaID %s',qmsg['PandaID'])
                  no_more_events = True

                  # check for running events
                  if len(eventranges) == eventranges.number_completed():
                     logger.info('no eventranges left to send so triggering exit')
                     self.stop()
                  else:
                     logger.info('still have events to process so continuing')

               else:
                  logger.error('received message of unknown type: %s',qmsg)
            except Queue.Empty:
               logger.debug('no messages on queue')
            
            logger.info('checking for message from payload, block for %s, pending event range requests: %s',self.loop_timeout,event_range_request_counter)

            payload_msg = athpayloadcomm.recv(self.loop_timeout)

            # a zero-length result is treated as "nothing received"
            if len(payload_msg) > 0:
               logger.debug('received message: %s',payload_msg)
               self.set_state(self.MESSAGE_RECEIVED)
            else:
               logger.debug('did not receive message from payload')
               # retry unanswered READY_FOR_EVENTS requests from the payload
               if event_range_request_counter > 0:
                  logger.debug('have %s pending event range requests so will try sending one.',event_range_request_counter)
                  self.set_state(self.SEND_EVENT_RANGE)
               # time.sleep(self.loop_timeout)
         
         ##################
         # MESSAGE_RECEIVED: this state indicates that a message has been
         #          received from the payload and its meaning will be parsed
         ######################################################################
         elif self.get_state() == self.MESSAGE_RECEIVED:
            
            # if ready for events, send them or wait for some
            if athena_payloadcommunicator.READY_FOR_EVENTS in payload_msg:
               logger.info('payload is ready for event range')
               self.set_state(self.SEND_EVENT_RANGE)
               # increment counter to keep track of how many requests are queued
               event_range_request_counter += 1

            #### OUTPUT File received
            # an output file message consists of four comma separated parts
            elif len(payload_msg.split(',')) == 4:
               # Athena sent details of an output file
               logger.info('received output file from AthenaMP')
               self.set_state(self.SEND_OUTPUT_FILE)

            else:
               logger.error('failed to parse message from Athena: %s',payload_msg)
               self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

         ##################
         # SEND_EVENT_RANGE: send the next event ranges to the payload, or
         #          NO_MORE_EVENTS when none are left and none are expected
         ######################################################################
         elif self.get_state() == self.SEND_EVENT_RANGE:
            logger.debug('sending event to payload')
            # if event ranges available, send one
            try:
               logger.debug('have %d ready event ranges to send to AthenaMP',eventranges.number_ready())
               local_eventranges = eventranges.get_next()
            # no more event ranges available
            except EventRangeList.NoMoreEventRanges:
               logger.debug('there are no more event ranges to process')
               # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events
               # (event_range_request_counter is not decremented on this path)
               if no_more_events:
                  logger.info('sending AthenaMP NO_MORE_EVENTS')
                  athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
                  # return to state requesting a message
                  self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

               # otherwise wait for more events
               else:
                  logger.info('waiting for more events ranges')
                  self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
               

            # something wrong with the index in the EventRangeList index
            except EventRangeList.RequestedMoreRangesThanAvailable:
               logger.error('requested more event ranges than available, waiting for more event ranges')
               self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
               
            else:
               logger.info('sending %s eventranges to AthenaMP',len(local_eventranges))
               # append full path to file name for AthenaMP: the PFN is the
               # current working directory joined with the LFN
               # input_files = self.job_def.get()['inFiles'].split(',')
               # logger.debug('%s: found %s input files',self.prelog,len(input_files))
               for evtrg in local_eventranges:
                  evtrg['PFN'] = os.path.join(os.getcwd(),evtrg['LFN'])

               # send AthenaMP the new event ranges
               athpayloadcomm.send(serializer.serialize(local_eventranges))
               # decrement counter since we sent some events
               event_range_request_counter -= 1


               # return to state requesting a message
               self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            # the payload message has been handled
            payload_msg = None

         ##################
         # SEND_OUTPUT_FILE: send output file data to MPIService
         ######################################################################
         elif self.get_state() == self.SEND_OUTPUT_FILE:
            logger.debug('send output file information')
            
            # parse message
            parts = payload_msg.split(',')
            # there should be four parts:
            # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1"
            if len(parts) == 4:
               # parse the parts, stripping the 'ID:', 'CPU:' and 'WALL:' labels
               outputfilename = parts[0]
               eventrangeid = parts[1].replace('ID:','')
               cpu = parts[2].replace('CPU:','')
               wallclock = parts[3].replace('WALL:','')

               # if staging, stage and change output filename
               if self.stage_outputs:
                  # move file to staging_path
                  logger.debug('shutil.move(%s,%s)',outputfilename,self.staging_path)
                  shutil.move(outputfilename,self.staging_path)
                  # change output filename to its location inside staging_path
                  outputfilename = os.path.join(self.staging_path,os.path.basename(outputfilename))
                  logger.info('outputfilename - %s',outputfilename)

               # build the data for Harvester output file
               output_file_data = {'type':MessageTypes.OUTPUT_FILE,
                                   'filename':outputfilename,
                                   'eventrangeid':eventrangeid,
                                   'cpu':cpu,
                                   'wallclock':wallclock,
                                   'scope':current_job['scopeOut'],
                                   'pandaid':current_job['PandaID'],
                                   'eventstatus':'finished',
                                   'destination_rank': 0,
                                  }
               # self.output_file_data.set(output_file_data)

               # append output file data to list of files for transfer via MPI;
               # the list is sent by the aggregation step at the top of the loop
               output_files.append(output_file_data)
               logger.info('received output file from AthenaMP; %s output files now on waiting list',len(output_files))
               
               # set event range to completed:
               logger.debug('mark event range id %s as completed',output_file_data['eventrangeid'])
               try:
                  eventranges.mark_completed(output_file_data['eventrangeid'])
               except Exception:
                  # any failure here is logged and stops the loop
                  logger.error('failed to mark eventrangeid %s as completed',output_file_data['eventrangeid'])
                  self.stop()

               # return to state requesting a message
               self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

               
            else:
               logger.error('failed to parse output file')
               self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            # the payload message has been handled
            payload_msg = None
            
         # if ready_events is below the threshold and the no more events flag has not been set
         # request more event ranges (only once a job is known and no request is outstanding)
         if eventranges.number_ready() < self.get_more_events_threshold and not no_more_events and not waiting_for_eventranges and current_job is not None:
            logger.info('number of ready events %s below request threshold %s, asking for more.',eventranges.number_ready(),self.get_more_events_threshold)
            # send MPI message to Yoda for more event ranges
            self.request_events(current_job)
            waiting_for_eventranges = True
         
         # if the number of completed events equals the number of event ranges
         # available, and no more events flag is set, then stop the loop and
         # flag that all work is done.
         elif eventranges.number_ready() == 0 and eventranges.number_completed() == len(eventranges) and no_more_events:
            logger.info('no more events to process, exiting')
            self.stop()
            self.all_work_done.set()
         # else:
         # logger.info('sleeping for %s',self.loop_timeout)
         # self.exit.wait(timeout=self.loop_timeout)

      # send any remaining output files to Yoda before exiting, regardless
      # of how long they have been waiting.
      if len(output_files) > 0:
         
         # send output file data to Yoda/FileManager
         logger.info('sending %s output files to Yoda/FileManager',len(output_files))
         mpi_message = {'type':MessageTypes.OUTPUT_FILE,
                        'filelist':output_files,
                        'destination_rank': 0
                       }
         self.queues['MPIService'].put(mpi_message)

         # reset output file list
         output_files = []
      
      self.set_state(self.EXITED)

      logger.info('JobComm exiting')
Ejemplo n.º 8
0
    def run(self):  # noqa: C901
        """Main loop of JobComm: relay work between Yoda (via queues) and the payload.

        This is the function run as a subthread when the user runs
        ``jobComm_instance.start()``.

        After reading its configuration the method opens an
        ``AthenaPayloadCommunicator`` on ``self.yampl_socket_name`` and then
        loops until ``self.exit`` is set.  Each iteration:

        1. flushes the aggregated output-file records to
           ``self.queues['MPIService']`` if they have been pending longer than
           ``self.aggregate_output_files_time``;
        2. executes the handler for the current state (see the banner comments
           below: WAITING_FOR_JOB, REQUEST_EVENT_RANGES,
           WAITING_FOR_EVENT_RANGES, WAIT_FOR_PAYLOAD_MESSAGE,
           MESSAGE_RECEIVED, SEND_EVENT_RANGE, SEND_OUTPUT_FILE);
        3. requests more event ranges when the number of ready ranges drops
           below ``self.get_more_events_threshold``, or stops and sets
           ``self.all_work_done`` once every known range is completed and no
           more ranges are expected.

        When the loop ends, any output-file records still pending are sent and
        the state is set to ``self.EXITED``.
        """

        self.read_config()

        logger.debug('start yampl payloadcommunicator')
        athpayloadcomm = AthenaPayloadCommunicator(self.yampl_socket_name)
        # last message received from the payload; shared between the
        # MESSAGE_RECEIVED and SEND_OUTPUT_FILE states
        payload_msg = ''

        # current list of output files to send via MPI
        output_files = []
        last_output_file_mpi_send = time.time()

        # list of event ranges
        eventranges = EventRangeList.EventRangeList()
        # set once a NO_MORE_EVENT_RANGES message has been received
        no_more_events = False
        # True while a request for event ranges is outstanding, so that only
        # one request is in flight at a time
        waiting_for_eventranges = False
        # number of READY_FOR_EVENTS requests from the payload not yet answered
        # with event ranges
        event_range_request_counter = 0

        # current panda job that AthenaMP is configured to run
        current_job = None

        while not self.exit.is_set():
            logger.debug('start loop: state: %s', self.get_state())

            # in debug mode, report event ranges status
            # (the counts are only computed when the effective level is exactly DEBUG)
            if logger.getEffectiveLevel() == logging.DEBUG:
                ready_events = eventranges.number_ready()
                number_completed = eventranges.number_completed()
                total = len(eventranges)
                logger.debug(
                    'number of ready events %s; number of completed events %s; total events %s',
                    ready_events, number_completed, total)

            # don't want to hammer Yoda with lots of little messages for output files
            # so aggregate output files for some time period then send as a group
            if len(output_files) == 0:
                # nothing pending, so the aggregation window restarts now
                last_output_file_mpi_send = time.time()
            elif (time.time() - last_output_file_mpi_send
                  ) > self.aggregate_output_files_time:

                # send output file data to Yoda/FileManager
                logger.info('sending %s output files to Yoda/FileManager',
                            len(output_files))
                mpi_message = {
                    'type': MessageTypes.OUTPUT_FILE,
                    'filelist': output_files,
                    'destination_rank': 0
                }
                self.queues['MPIService'].put(mpi_message)

                # set time for next send
                last_output_file_mpi_send = time.time()
                # reset output file list
                # (rebinds to a new list; the list placed on the queue is not modified)
                output_files = []

            ##################
            # WAITING_FOR_JOB: waiting for the job definition to arrive, before
            #        it does, it is assumed that there is no payload running
            ######################################################################
            if self.get_state() == self.WAITING_FOR_JOB:
                logger.info(
                    ' waiting for job definition, blocking on message queue for %s ',
                    self.loop_timeout)
                try:
                    qmsg = self.queues['JobComm'].get(
                        block=True, timeout=self.loop_timeout)
                except Queue.Empty:
                    logger.debug('no message on queue')
                else:
                    # shorten our message for printing
                    if logger.getEffectiveLevel() == logging.DEBUG:
                        tmpmsg = str(qmsg)
                        if len(tmpmsg) > self.debug_message_char_length:
                            tmpslice = slice(0, self.debug_message_char_length)
                            tmpmsg = tmpmsg[tmpslice] + '...'
                        logger.debug('received queue message: %s', tmpmsg)

                    # verify message type is as expected
                    # (anything other than a NEW_JOB message carrying a 'job'
                    # entry is logged and dropped; the state is left unchanged)
                    if 'type' not in qmsg or qmsg[
                            'type'] != MessageTypes.NEW_JOB or 'job' not in qmsg:
                        logger.error('received unexpected message format: %s',
                                     qmsg)
                    else:
                        logger.info('received job definition')
                        current_job = qmsg['job']

                        # change state
                        self.set_state(self.REQUEST_EVENT_RANGES)
                # drop the reference to the handled message
                qmsg = None

            ##################
            # REQUEST_EVENT_RANGES: Request event ranges from Yoda
            ######################################################################
            elif self.get_state() == self.REQUEST_EVENT_RANGES:
                # skip the request if one is already outstanding
                if not waiting_for_eventranges:
                    logger.info('sending request for event ranges')
                    # send MPI message to Yoda for more event ranges
                    self.request_events(current_job)
                    waiting_for_eventranges = True
                # change state
                self.set_state(self.WAITING_FOR_EVENT_RANGES)

            ##################
            # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda
            ######################################################################
            elif self.get_state() == self.WAITING_FOR_EVENT_RANGES:
                logger.info(
                    'waiting for event ranges, blocking on message queue for %s',
                    self.loop_timeout)
                try:
                    qmsg = self.queues['JobComm'].get(
                        block=True, timeout=self.loop_timeout)
                except Queue.Empty:
                    logger.debug('no message on queue')
                else:
                    # shorten our message for printing
                    if logger.getEffectiveLevel() == logging.DEBUG:
                        tmpmsg = str(qmsg)
                        if len(tmpmsg) > self.debug_message_char_length:
                            tmpslice = slice(0, self.debug_message_char_length)
                            tmpmsg = tmpmsg[tmpslice] + '...'
                        logger.debug('received queue message: %s', tmpmsg)

                    if 'type' not in qmsg:
                        logger.error('received unexpected message format: %s',
                                     qmsg)
                    elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES:
                        logger.info('received event ranges, adding to list')
                        eventranges += EventRangeList.EventRangeList(
                            qmsg['eventranges'])
                        # add event ranges to payload messenger list
                        # payloadcomm.add_eventranges(eventranges)
                        # change state
                        self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                    elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                        logger.info('no more event ranges for PandaID %s',
                                    qmsg['PandaID'])
                        no_more_events = True

                        # check for running events
                        if len(eventranges) == eventranges.number_completed():
                            logger.info(
                                'no eventranges left to send so triggering exit'
                            )
                            self.stop()
                        else:
                            logger.info(
                                'still have events to process so continuing')
                            self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                    else:
                        logger.error('unknown message type: %s', qmsg['type'])

                    # any message received here clears the outstanding-request
                    # flag, including malformed or unknown ones
                    waiting_for_eventranges = False

                qmsg = None

            ##################
            # WAIT_FOR_PAYLOAD_MESSAGE: initiates
            #          a request for a message from the payload
            ######################################################################
            # NOTE: this is a plain `if`, which starts a second if/elif chain.
            # A transition to WAIT_FOR_PAYLOAD_MESSAGE made by the chain above
            # is therefore handled in the same loop iteration.
            if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE:

                # first check if there is an incoming message
                try:
                    logger.debug('checking for queue message')
                    # non-blocking: raises Queue.Empty when nothing is queued
                    qmsg = self.queues['JobComm'].get(block=False)
                    # NOTE(review): this uses `in` where the
                    # WAITING_FOR_EVENT_RANGES state uses `==`; presumably
                    # equivalent for the message type values in use - confirm.
                    # Unlike that state, a message without a 'type' key is not
                    # checked for here and would raise KeyError.
                    if MessageTypes.NEW_EVENT_RANGES in qmsg['type']:
                        logger.info('received new event range')
                        eventranges += EventRangeList.EventRangeList(
                            qmsg['eventranges'])
                        waiting_for_eventranges = False
                    elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                        logger.info('no more event ranges for PandaID %s',
                                    qmsg['PandaID'])
                        no_more_events = True

                        # check for running events
                        if len(eventranges) == eventranges.number_completed():
                            logger.info(
                                'no eventranges left to send so triggering exit'
                            )
                            self.stop()
                        else:
                            logger.info(
                                'still have events to process so continuing')

                    else:
                        logger.error('received message of unknown type: %s',
                                     qmsg)
                except Queue.Empty:
                    logger.debug('no messages on queue')

                logger.info(
                    'checking for message from payload, block for %s, pending event range requests: %s',
                    self.loop_timeout, event_range_request_counter)

                # NOTE(review): len() is applied to the result below, so recv
                # presumably returns an empty string (not None) when nothing
                # arrives within the timeout - confirm.
                payload_msg = athpayloadcomm.recv(self.loop_timeout)

                if len(payload_msg) > 0:
                    logger.debug('received message: %s', payload_msg)
                    self.set_state(self.MESSAGE_RECEIVED)
                else:
                    logger.debug('did not receive message from payload')
                    # retry answering requests that could not be served earlier
                    if event_range_request_counter > 0:
                        logger.debug(
                            'have %s pending event range requests so will try sending one.',
                            event_range_request_counter)
                        self.set_state(self.SEND_EVENT_RANGE)
                    # time.sleep(self.loop_timeout)

            ##################
            # MESSAGE_RECEIVED: this state indicates that a message has been
            #          received from the payload and its meaning will be parsed
            ######################################################################
            elif self.get_state() == self.MESSAGE_RECEIVED:

                # if ready for events, send them or wait for some
                if AthenaPayloadCommunicator.READY_FOR_EVENTS in payload_msg:
                    logger.info('payload is ready for event range')
                    self.set_state(self.SEND_EVENT_RANGE)
                    # increment counter to keep track of how many requests are queued
                    event_range_request_counter += 1

                # OUTPUT File received
                # (any message with exactly four comma-separated fields is
                # treated as an output file report)
                elif len(payload_msg.split(',')) == 4:
                    # Athena sent details of an output file
                    logger.info('received output file from AthenaMP')
                    self.set_state(self.SEND_OUTPUT_FILE)

                else:
                    logger.error('failed to parse message from Athena: %s',
                                 payload_msg)
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            ##################
            # SEND_EVENT_RANGE: send the next ready event range(s) to the
            #          payload, or tell it that no more events are coming
            ######################################################################
            elif self.get_state() == self.SEND_EVENT_RANGE:
                logger.debug('sending event to payload')
                # if event ranges available, send one
                try:
                    logger.debug(
                        'have %d ready event ranges to send to AthenaMP',
                        eventranges.number_ready())
                    local_eventranges = eventranges.get_next()
                # no more event ranges available
                except EventRangeList.NoMoreEventRanges:
                    logger.debug('there are no more event ranges to process')
                    # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events
                    # NOTE(review): event_range_request_counter is not
                    # decremented on this path, so WAIT_FOR_PAYLOAD_MESSAGE
                    # will re-enter this state (and resend NO_MORE_EVENTS)
                    # whenever the payload is silent - confirm this is intended.
                    if no_more_events:
                        logger.info('sending AthenaMP NO_MORE_EVENTS')
                        athpayloadcomm.send(
                            AthenaPayloadCommunicator.NO_MORE_EVENTS)
                        # return to state requesting a message
                        self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                    # otherwise wait for more events
                    else:
                        logger.info('waiting for more events ranges')
                        self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                # something wrong with the index in the EventRangeList index
                except EventRangeList.RequestedMoreRangesThanAvailable:
                    logger.error(
                        'requested more event ranges than available, waiting for more event ranges'
                    )
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                else:
                    logger.info('sending %s eventranges to AthenaMP',
                                len(local_eventranges))
                    # append full path to file name for AthenaMP
                    # and adjust event counter by the number of files
                    # input_files = self.job_def.get()['inFiles'].split(',')
                    # logger.debug('%s: found %s input files',self.prelog,len(input_files))
                    # PFN is the LFN resolved against the current working directory
                    for evtrg in local_eventranges:
                        evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN'])

                    # send AthenaMP the new event ranges
                    athpayloadcomm.send(
                        serializer.serialize(local_eventranges))
                    # decrement counter since we sent some events
                    event_range_request_counter -= 1

                    # return to state requesting a message
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                # every branch above moved to WAIT_FOR_PAYLOAD_MESSAGE, which
                # reassigns payload_msg from recv() before it is read again
                payload_msg = None

            ##################
            # SEND_OUTPUT_FILE: send output file data to MPIService
            ######################################################################
            elif self.get_state() == self.SEND_OUTPUT_FILE:
                logger.debug('send output file information')

                # parse message
                parts = payload_msg.split(',')
                # there should be four parts:
                # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1"
                if len(parts) == 4:
                    # parse the parts
                    # (cpu and wallclock are kept as strings, not converted to numbers)
                    outputfilename = parts[0]
                    eventrangeid = parts[1].replace('ID:', '')
                    cpu = parts[2].replace('CPU:', '')
                    wallclock = parts[3].replace('WALL:', '')

                    # if staging, stage and change output filename
                    if self.stage_outputs:
                        # move file to staging_path
                        logger.debug('shutil.move(%s,%s)', outputfilename,
                                     self.staging_path)
                        shutil.move(outputfilename, self.staging_path)
                        # change output filename
                        outputfilename = os.path.join(
                            self.staging_path,
                            os.path.basename(outputfilename))
                        logger.info('outputfilename - %s', outputfilename)

                    # build the data for Harvester output file
                    # (reads the 'scopeOut' and 'PandaID' keys of the current job)
                    output_file_data = {
                        'type': MessageTypes.OUTPUT_FILE,
                        'filename': outputfilename,
                        'eventrangeid': eventrangeid,
                        'cpu': cpu,
                        'wallclock': wallclock,
                        'scope': current_job['scopeOut'],
                        'pandaid': current_job['PandaID'],
                        'eventstatus': 'finished',
                        'destination_rank': 0,
                    }
                    # self.output_file_data.set(output_file_data)

                    # append output file data to list of files for transfer via MPI
                    output_files.append(output_file_data)
                    logger.info(
                        'received output file from AthenaMP; %s output files now on waiting list',
                        len(output_files))

                    # set event range to completed:
                    logger.debug('mark event range id %s as completed',
                                 output_file_data['eventrangeid'])
                    try:
                        eventranges.mark_completed(
                            output_file_data['eventrangeid'])
                    except Exception:
                        # any failure to mark the range is logged and stop()
                        # is called; the record stays in output_files
                        logger.error(
                            'failed to mark eventrangeid %s as completed',
                            output_file_data['eventrangeid'])
                        self.stop()

                    # return to state requesting a message
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                else:
                    logger.error('failed to parse output file')
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

                payload_msg = None

            # if ready_events is below the threshold and the no more events flag has not been set
            # request more event ranges
            # (only once a job is known and no request is already outstanding)
            if eventranges.number_ready(
            ) < self.get_more_events_threshold and not no_more_events and not waiting_for_eventranges and current_job is not None:
                logger.info(
                    'number of ready events %s below request threshold %s, asking for more.',
                    eventranges.number_ready(), self.get_more_events_threshold)
                # send MPI message to Yoda for more event ranges
                self.request_events(current_job)
                waiting_for_eventranges = True

            # if the number of completed events equals the number of event ranges
            # available, and no more events flag is set, then kill subprocess and exit.
            elif eventranges.number_ready(
            ) == 0 and eventranges.number_completed() == len(
                    eventranges) and no_more_events:
                logger.info('no more events to process, exiting')
                self.stop()
                self.all_work_done.set()
            # else:
            # logger.info('sleeping for %s',self.loop_timeout)
            # self.exit.wait(timeout=self.loop_timeout)

        # send any remaining output files to Yoda before exiting.
        # the loop only flushes after the aggregation period has elapsed, so
        # records may still be pending here
        if len(output_files) > 0:

            # send output file data to Yoda/FileManager
            logger.info('sending %s output files to Yoda/FileManager',
                        len(output_files))
            mpi_message = {
                'type': MessageTypes.OUTPUT_FILE,
                'filelist': output_files,
                'destination_rank': 0
            }
            self.queues['MPIService'].put(mpi_message)

            # reset output file list
            output_files = []

        self.set_state(self.EXITED)

        logger.info('JobComm exiting')
def stage_out_files(file_list, output_type):
    """Record a batch of output files in the Harvester event status dump file.

    Each entry of ``file_list`` is turned into a file descriptor and grouped
    by panda ID.  If the dump file named by
    ``sfm_har_config['eventStatusDumpJsonFile']`` already exists, it is first
    renamed to ``<name>.tmp``, its contents are merged with the new
    descriptors, and the result is written back to the ``.tmp`` file, which is
    finally renamed into place.  Writing through the ``.tmp`` name avoids
    Harvester reading the file while it is being written.

    Args:
        file_list: iterable of dicts with the keys ``'pandaid'``,
            ``'eventrangeid'``, ``'eventstatus'`` and ``'filename'``, and
            optionally ``'chksum'``.
        output_type: one of ``'output'``, ``'es_output'`` or ``'log'``.

    Raises:
        Exception: if ``output_type`` is not one of the accepted values.
        OSError: if the existing dump file cannot be moved aside and is still
            present, so its contents can neither be merged nor safely
            overwritten.  (Previously this case failed later with an unrelated
            ``UnboundLocalError``.)
    """
    global sfm_har_config, sfm_har_config_done
    # block until the configuration has been loaded
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # group the new file descriptors by panda ID
    eventStatusDumpData = {}
    for filedata in file_list:

        # make sure pandaID is a string
        pandaID = str(filedata['pandaid'])

        # format data for file:
        file_descriptor = {
            'eventRangeID': filedata['eventrangeid'],
            'eventStatus': filedata['eventstatus'],
            'path': filedata['filename'],
            'type': output_type,
            # the checksum is optional in the incoming record
            'chksum': filedata.get('chksum'),
            'guid': None,
        }
        eventStatusDumpData.setdefault(pandaID, []).append(file_descriptor)

    # create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # unless existing contents are merged in below, the new data is all we have
    data = eventStatusDumpData

    # if the file exists, move it to a tmp filename, update its contents and then recreate it.
    if os.path.exists(eventStatusDumpJsonFile):

        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except OSError:
            logger.warning(
                'tried moving %s to a tmp filename to add more output files for Harvester.',
                eventStatusDumpJsonFile)
            if os.path.exists(eventStatusDumpJsonFile):
                # the file is still there but could not be moved: its contents
                # cannot be merged and overwriting it would lose them, so
                # surface the real error
                raise
            logger.warning(
                '%s file no longer exists so Harvester must have grabbed it. Need to create a new file',
                eventStatusDumpJsonFile)
        else:

            # now open and read in the data
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('found existing data for pandaIDs: %s', data.keys())

            for pandaID, descriptors in eventStatusDumpData.items():
                if pandaID in data:
                    # the pandaID already exists, just append the new files to that list
                    logger.debug('addding data to existing panda list')
                    data[pandaID] += descriptors
                else:
                    # the pandaID does not exist, add a new list
                    logger.debug('addding new panda id list')
                    data[pandaID] = descriptors

    # only build the summary string when debug output will actually be emitted
    if logger.isEnabledFor(logging.DEBUG):
        tmpstr = ' '.join('%s:%s' % (x, len(data[x])) for x in data)
        logger.debug('writing output to file %s with keys: %s',
                     eventStatusDumpJsonFile, tmpstr)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data, pretty_print=True))

    # move tmp file into place
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)

    logger.debug('done')
Ejemplo n.º 10
0
def stage_out_file(
    output_type,
    output_path,
    eventRangeID,
    eventStatus,
    pandaID,
    chksum=None,
):
    """Record a single output file in the Harvester event status dump file.

    A file descriptor is built from the arguments and stored under
    ``str(pandaID)``.  If the dump file named by
    ``sfm_har_config['eventStatusDumpJsonFile']`` already exists, it is first
    renamed to ``<name>.tmp``, the descriptor is added to its contents, and
    the result is written back to the ``.tmp`` file, which is finally renamed
    into place.  Writing through the ``.tmp`` name avoids Harvester reading
    the file while it is being written.

    Args:
        output_type: one of ``'output'``, ``'es_output'`` or ``'log'``.
        output_path: path of the output file; it must exist.
        eventRangeID: stored as ``'eventRangeID'`` in the descriptor.
        eventStatus: stored as ``'eventStatus'`` in the descriptor.
        pandaID: key under which the descriptor is filed (converted to str).
        chksum: optional checksum stored in the descriptor.

    Raises:
        Exception: if ``output_type`` is not an accepted value or
            ``output_path`` does not exist.
        OSError: if the existing dump file cannot be moved aside and is still
            present, so its contents can neither be merged nor safely
            overwritten.  (Previously this case failed later with an unrelated
            ``UnboundLocalError``.)
    """
    global sfm_har_config, sfm_har_config_done
    # block until the configuration has been loaded
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))

    if not os.path.exists(output_path):
        raise Exception('output file not found: %s' % (output_path))

    # make sure pandaID is a string
    pandaID = str(pandaID)

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # first create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # format data for file:
    file_descriptor = {
        'eventRangeID': eventRangeID,
        'eventStatus': eventStatus,
        'path': output_path,
        'type': output_type,
        'chksum': chksum,
        'guid': None,
    }

    # unless existing contents are merged in below, the new data is all we have
    data = {pandaID: [file_descriptor]}

    # if the file exists, move it to a tmp filename, update its contents and then recreate it.
    if os.path.exists(eventStatusDumpJsonFile):

        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except OSError:
            logger.warning(
                'tried moving %s to a tmp filename to add more output files for Harvester.',
                eventStatusDumpJsonFile)
            if os.path.exists(eventStatusDumpJsonFile):
                # the file is still there but could not be moved: its contents
                # cannot be merged and overwriting it would lose them, so
                # surface the real error
                raise
            logger.warning(
                '%s file no longer exists so Harvester must have grabbed it. Need to create a new file',
                eventStatusDumpJsonFile)
        else:

            # now open and read in the data
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('existing data contains %s', data)
            # if the pandaID already exists, just append the new file to that list
            if pandaID in data:
                logger.debug('addding data to existing panda list')
                data[pandaID].append(file_descriptor)
            # if the pandaID does not exist, add a new list
            else:
                logger.debug('addding new panda id list')
                data[pandaID] = [file_descriptor]

    logger.debug('output to file %s: %s', eventStatusDumpJsonFile, data)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data))

    # move tmp file into place
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)