Example #1
0
    def test_communicator_manager(self):
        """
        Make sure that the ES communicator manager thread works as expected.

        The test starts a CommunicationManager, requests jobs from it (two,
        then one), reports the received job as running, and then requests
        event ranges and reports them back: first as 'failed' using a
        version 0 message, then as 'finished' using a version 1 message that
        wraps the ranges together with zip file metadata. Finally the manager
        is stopped and the test checks that its thread is no longer alive.
        """
        communicator_manager = None
        try:
            args = {
                'workflow': 'eventservice_hpc',
                'queue': 'BNL_CLOUD_MCORE',
                'site': 'BNL_CLOUD_MCORE',
                'port': 25443,
                'url': 'https://aipanda007.cern.ch',
                'job_label': 'ptest',
                'pilot_user': '******',
                'node': socket.getfqdn(),
                'mem': 16000,
                'disk_space': 160000,
                'working_group': '',
                'cpu': 2601.0,
                'info': None
            }

            communicator_manager = CommunicationManager()
            communicator_manager.start()
            self.assertTrue(communicator_manager.is_alive())

            jobs = communicator_manager.get_jobs(njobs=2, args=args)
            self.assertEqual(len(jobs), 2)

            # Only the job(s) from this second request are used below.
            jobs = communicator_manager.get_jobs(njobs=1, args=args)
            self.assertEqual(len(jobs), 1)

            # Build one status report per received job.
            job_list = []
            for job in jobs:
                job_list.append({
                    'node': socket.getfqdn(),
                    'pilotErrorCode': 0,
                    'startTime': time.time(),
                    'jobMetrics': 'coreCount=8',
                    'schedulerID': 'unknown',
                    'timestamp': time_stamp(),
                    'exeErrorCode': 0,
                    'pilotID': 'unknown|PR|2.0.0 (80)',
                    'transExitCode': 0,
                    'pilotErrorDiag': '',
                    'exeErrorDiag': '',
                    'jobId': job['PandaID'],
                    'siteName': 'BNL_CLOUD_MCORE',
                    'state': 'running',
                    'attemptNr': job['attemptNr'] + 1
                })
            status = communicator_manager.update_jobs(jobs=job_list)
            self.assertEqual(status[0], True)

            # Request a single event range and report it as failed.
            events = communicator_manager.get_event_ranges(num_event_ranges=1,
                                                           job=jobs[0])
            self.assertEqual(len(events), 1)

            for event in events:
                event_range_status = {
                    "errorCode": 1220,
                    "eventRangeID": event['eventRangeID'],
                    "eventStatus": 'failed'
                }
                event_range_message = {
                    'version': 0,
                    'eventRanges': json.dumps(event_range_status)
                }
                res = communicator_manager.update_events(
                    update_events=event_range_message)
                self.assertEqual(res['StatusCode'], 0)

            # Request two more event ranges and report them as finished in a
            # single message that also carries the zip file description.
            events = communicator_manager.get_event_ranges(num_event_ranges=2,
                                                           job=jobs[0])
            self.assertEqual(len(events), 2)

            update_events = [
                {
                    "eventRangeID": event['eventRangeID'],
                    "eventStatus": 'finished'
                }
                for event in events
            ]
            event_range_status = [{
                "zipFile": {
                    "numEvents": len(update_events),
                    "objstoreID": 1318,
                    "adler32": '000000',
                    "lfn": 'test_file',
                    "fsize": 100,
                    "pathConvention": 1000
                },
                "eventRanges": update_events
            }]

            event_range_message = {
                'version': 1,
                'eventRanges': json.dumps(event_range_status)
            }
            res = communicator_manager.update_events(
                update_events=event_range_message)
            self.assertEqual(res['StatusCode'], 0)

            communicator_manager.stop()
            # Give the manager thread a moment to notice the stop request.
            time.sleep(2)
            self.assertFalse(communicator_manager.is_alive())
        except Exception:
            # Make sure the manager thread is asked to stop on failure.
            if communicator_manager:
                communicator_manager.stop()
            # Bare raise keeps the original traceback (also on Python 2).
            raise
Example #2
0
class BaseExecutor(threading.Thread, PluginFactory):
    """
    Base class for executor threads.

    The executor is a thread that owns a CommunicationManager (created in
    start()) and uses it to retrieve jobs and event ranges and to send job
    and event range updates. Subclasses must implement run().
    """

    def __init__(self, **kwargs):
        """
        Initialise the executor.

        :param kwargs: arbitrary keyword arguments; every item is set as an
                       attribute on the instance, so it can override the
                       queue, payload and args defaults assigned here.
        """
        super(BaseExecutor, self).__init__()
        # Equivalent to the deprecated Thread.setName().
        self.name = "BaseExecutor"
        self.queue = None
        self.payload = None

        self.args = None
        for key in kwargs:
            setattr(self, key, kwargs[key])

        self.__stop = threading.Event()

        # Local buffer of event ranges received but not yet handed out.
        self.__event_ranges = []
        self.__is_set_payload = False
        self.__is_retrieve_payload = False

        self.communication_manager = None

        self.proc = None

    def get_pid(self):
        """
        Return the pid of self.proc, or None when no proc has been set.
        """
        return self.proc.pid if self.proc else None

    def __del__(self):
        """
        Request a stop of the executor and of its communication manager.
        """
        self.stop()
        if self.communication_manager:
            self.communication_manager.stop()

    def is_payload_started(self):
        """
        Tell whether the payload has started; always False in the base class.
        """
        return False

    def start(self):
        """
        Start the communication manager and then the executor thread.

        The communication manager is created and started first so that it
        is already available when run() begins executing in the new thread.
        If the executor thread itself cannot be started, the newly created
        manager is stopped again, the previous value of
        self.communication_manager is restored and the error is re-raised.
        """
        previous_manager = self.communication_manager
        manager = CommunicationManager()
        manager.start()
        self.communication_manager = manager
        try:
            super(BaseExecutor, self).start()
        except Exception:
            manager.stop()
            self.communication_manager = previous_manager
            raise

    def stop(self):
        """
        Set the internal stop event (no-op if it is already set).
        """
        if not self.is_stop():
            self.__stop.set()

    def is_stop(self):
        """
        Return True if the stop event has been set.
        """
        return self.__stop.is_set()

    def stop_communicator(self):
        """
        Stop the communication manager and block until it is no longer alive.
        """
        # Local import: only the polling loop below needs it.
        import time

        logger.info("Stopping communication manager")
        if self.communication_manager:
            while self.communication_manager.is_alive():
                if not self.communication_manager.is_stop():
                    self.communication_manager.stop()
                # Pause between polls instead of spinning at full CPU while
                # the manager thread shuts down.
                time.sleep(0.1)
        logger.info("Communication manager stopped")

    def set_payload(self, payload):
        """
        Store the payload and change into the job work directory, if any.

        :param payload: payload object; get_job() looks up its 'job' entry.
        """
        self.payload = payload
        self.__is_set_payload = True
        job = self.get_job()
        if job and job.workdir:
            os.chdir(job.workdir)

    def is_set_payload(self):
        """
        Return True if set_payload() has been called.
        """
        return self.__is_set_payload

    def set_retrieve_payload(self):
        """
        Mark the executor as one whose payload is to be retrieved.
        """
        self.__is_retrieve_payload = True

    def is_retrieve_payload(self):
        """
        Return True if set_retrieve_payload() has been called.
        """
        return self.__is_retrieve_payload

    def retrieve_payload(self):
        """
        Request one job from the communication manager and build a payload.

        :return: dictionary with the keys 'executable', 'workdir' and 'job',
                 or None if no job was received.
        """
        logger.info("Retrieving payload: %s" % self.args)
        jobs = self.communication_manager.get_jobs(njobs=1, args=self.args)
        logger.info("Received jobs: %s" % jobs)
        if not jobs:
            return None

        job = create_job(jobs[0], queue=self.queue)

        # get the payload command from the user specific code
        pilot_user = os.environ.get('PILOT_USER', 'atlas').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                          locals(), [pilot_user], 0)  # Python 2/3
        cmd = user.get_payload_command(job)
        logger.info("payload execution command: %s" % cmd)

        payload = {'executable': cmd, 'workdir': job.workdir, 'job': job}
        logger.info("Retrieved payload: %s" % payload)
        return payload

    def get_payload(self):
        """
        Return the payload if set_payload() has been called, else None.
        """
        if self.__is_set_payload:
            return self.payload
        return None

    def get_job(self):
        """
        Return the 'job' entry of the payload, or None if it is not there.
        """
        if self.payload and 'job' in self.payload:
            return self.payload['job']
        return None

    def get_event_ranges(self, num_event_ranges=1, queue_factor=2):
        """
        Return up to num_event_ranges event ranges.

        Event ranges are served from a local buffer. When the buffer holds
        fewer than num_event_ranges entries, num_event_ranges * queue_factor
        ranges are requested from the communication manager and appended to
        the buffer first.

        :param num_event_ranges: number of event ranges wanted.
        :param queue_factor: multiplier applied to the size of the request
                             sent to the communication manager; forced to 1
                             when config.Payload.executor_type is 'raythena'.
        :return: list with at most num_event_ranges event ranges.
        """
        if config.Payload.executor_type.lower() == 'raythena':
            old_queue_factor = queue_factor
            queue_factor = 1
            logger.info("raythena - Changing queue_factor from %s to %s" %
                        (old_queue_factor, queue_factor))
        logger.info(
            "Getting event ranges: (num_ranges: %s) (queue_factor: %s)" %
            (num_event_ranges, queue_factor))
        if len(self.__event_ranges) < num_event_ranges:
            received = self.communication_manager.get_event_ranges(
                num_event_ranges=num_event_ranges * queue_factor,
                job=self.get_job())
            self.__event_ranges.extend(received)

        # Hand out the oldest buffered ranges; a non-positive request
        # yields an empty list and leaves the buffer untouched.
        count = max(num_event_ranges, 0)
        ret = self.__event_ranges[:count]
        del self.__event_ranges[:count]
        logger.info("Received event ranges(num:%s): %s" % (len(ret), ret))
        return ret

    def update_events(self, messages):
        """
        Send event range updates through the communication manager.

        :param messages: update message(s), passed on unchanged.
        :return: whatever the communication manager returns.
        """
        logger.info("Updating event ranges: %s" % messages)
        ret = self.communication_manager.update_events(messages)
        logger.info("Updated event ranges status: %s" % ret)
        return ret

    def update_jobs(self, jobs):
        """
        Send job updates through the communication manager.

        :param jobs: job update(s), passed on unchanged.
        :return: whatever the communication manager returns.
        """
        logger.info("Updating jobs: %s" % jobs)
        ret = self.communication_manager.update_jobs(jobs)
        logger.info("Updated jobs status: %s" % ret)
        return ret

    def run(self):
        """
        Main run process; must be implemented by subclasses.

        :raises NotImplementedError: always, in the base class.
        """
        raise NotImplementedError()
Example #3
0
 def start(self):
     """
     Start the communication manager and then the executor thread.

     The communication manager is created and started first so that it is
     already available when run() begins executing in the new thread. If
     the executor thread itself cannot be started, the newly created
     manager is stopped again, the previous value of
     self.communication_manager is restored and the error is re-raised.
     """
     previous_manager = getattr(self, 'communication_manager', None)
     manager = CommunicationManager()
     manager.start()
     self.communication_manager = manager
     try:
         super(BaseExecutor, self).start()
     except Exception:
         manager.stop()
         self.communication_manager = previous_manager
         raise
Example #4
0
    def setUpClass(cls):
        """
        Prepare a payload shared by the tests of this class.

        A job is requested through a CommunicationManager and reported as
        'starting' and then 'running'. After the manager has been stopped,
        the job input files are staged in and the payload command is
        obtained from the user specific code. The resulting payload
        dictionary is stored in cls._payload and the manager in
        cls._communicator_manager.
        """
        try:
            args = {
                'workflow': 'eventservice_hpc',
                'queue': 'BNL_CLOUD_MCORE',
                'site': 'BNL_CLOUD_MCORE',
                'port': 25443,
                'url': 'https://aipanda007.cern.ch',
                'job_label': 'ptest',
                'pilot_user': '******',
                'node': socket.getfqdn(),
                'mem': 16000,
                'disk_space': 160000,
                'working_group': '',
                'cpu': 2601.0,
                'info': None
            }

            communicator_manager = CommunicationManager()
            cls._communicator_manager = communicator_manager
            communicator_manager.start()

            jobs = communicator_manager.get_jobs(njobs=1, args=args)
            job = create_job(jobs[0], 'BNL_CLOUD_MCORE')
            job.workdir = '/tmp/test_esworkexecutor'
            job.corecount = 1
            if not os.path.exists(job.workdir):
                os.makedirs(job.workdir)

            job_data = {
                'jobId': job['PandaID'],
                'siteName': 'BNL_CLOUD_MCORE',
                'state': 'starting',
                'attemptNr': job['attemptNr'] + 1,
                'node': 'pilot2_test',
                'schedulerID': 'pilot2_test',
                'coreCount': 1
            }
            # Report the job as starting, then as running.
            communicator_manager.update_jobs(jobs=[job_data])
            job_data['state'] = 'running'
            communicator_manager.update_jobs(jobs=[job_data])
            communicator_manager.stop()

            # download input files
            client = StageInESClient(job.infosys, logger=logger)
            kwargs = dict(workdir=job.workdir,
                          cwd=job.workdir,
                          usecontainer=False,
                          job=job)
            client.prepare_sources(job.indata)
            client.transfer(job.indata, activity='pr', **kwargs)

            # get the payload command from the user specific code
            pilot_user = os.environ.get('PILOT_USER', 'atlas').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_payload_command(job)
            logger.info("payload execution command: %s" % cmd)

            payload = {
                'executable': cmd,
                'workdir': job.workdir,
                'output_file': 'pilot_test_%s_stdout.txt' % job['PandaID'],
                'error_file': 'pilot_test_%s_stderr.txt' % job['PandaID'],
                'job': job
            }
            cls._payload = payload
        except Exception:
            # The attribute is only assigned inside the try block, so it may
            # not exist yet; a plain attribute access here could raise
            # AttributeError and hide the original error.
            manager = getattr(cls, '_communicator_manager', None)
            if manager:
                manager.stop()
            # Bare raise keeps the original traceback (also on Python 2).
            raise