    def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
        tmpLog = self.make_logger(_logger,
                                  'queue={0}'.format(queue_config.queueName),
                                  method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()
        workSpec.creationTime = datetime.datetime.utcnow()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

        # get info from jobs
        if len(jobspec_list) > 0:
            nRemainingEvents = 0
            for jobspec in jobspec_list:
                if jobspec.nRemainingEvents:
                    nRemainingEvents += jobspec.nRemainingEvents

            nCore, maxWalltime = self.calculate_worker_requirements(
                nRemainingEvents)
            workSpec.nCore = nCore
            workSpec.maxWalltime = maxWalltime

        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
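The helper calculate_worker_requirements used above is not shown on this page. A minimal sketch of what such a helper might look like, assuming a fixed events-per-core throughput; the constants and names below are illustrative, not the actual Harvester implementation:

import math

# Illustrative sizing constants (assumptions, not Harvester values)
EVENTS_PER_CORE_HOUR = 100   # assumed event throughput per core-hour
MAX_CORES = 8                # assumed per-worker core cap
WALLTIME_STEP = 3600         # walltime granularity in seconds

def calculate_worker_requirements(n_remaining_events):
    # Size the worker to drain the remaining events in roughly one
    # walltime step, capped at MAX_CORES cores.
    if n_remaining_events <= 0:
        return 1, WALLTIME_STEP
    n_core = min(MAX_CORES, max(1, n_remaining_events // EVENTS_PER_CORE_HOUR))
    hours = math.ceil(n_remaining_events / (n_core * EVENTS_PER_CORE_HOUR))
    return n_core, hours * WALLTIME_STEP

Example #2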
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                  method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()
        workSpec.creationTime = datetime.datetime.utcnow()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

        # get info from jobs
        if len(jobspec_list) > 0:
            nRemainingEvents = 0
            for jobspec in jobspec_list:
                if jobspec.nRemainingEvents:
                    nRemainingEvents += jobspec.nRemainingEvents

            nCore, maxWalltime = self.calculate_worker_requirements(nRemainingEvents)
            workSpec.nCore = nCore
            workSpec.maxWalltime = maxWalltime

        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
Example #3
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = self.make_logger(_logger,
                                  'queue={0}'.format(queue_config.queueName),
                                  method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()
        workSpec.creationTime = datetime.datetime.utcnow()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})

        unified_queue = queue_dict.get('capability', '') == 'ucore'
        # case of traditional (non-unified) queue: look at the queue configuration
        if not unified_queue:
            workSpec.nCore = queue_dict.get('corecount', 1) or 1
            workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1

        # case of unified queue: look at the resource type and queue configuration
        else:
            catchall = queue_dict.get('catchall', '')
            if 'useMaxRam' in catchall or queue_config.queueName in (
                    'Taiwan-LCG2-HPC2_Unified', 'Taiwan-LCG2-HPC_Unified',
                    'DESY-ZN_UCORE'):
                # temporary hack to debug killed workers in Taiwan queues
                site_corecount = queue_dict.get('corecount', 1) or 1
                site_maxrss = queue_dict.get('maxrss', 1) or 1

                # some cases need to overwrite those values
                if 'SCORE' in resource_type:
                    # the usual pilot streaming use case
                    workSpec.nCore = 1
                    workSpec.minRamCount = int(
                        math.ceil(site_maxrss / site_corecount))
                else:
                    # default values
                    workSpec.nCore = site_corecount
                    workSpec.minRamCount = site_maxrss
            else:
                workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(
                    resource_type, queue_dict)

        # parameters that are independent of traditional vs unified queues
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)
        walltimeLimit_default = getattr(queue_config, 'walltimeLimit', 0)

        if len(jobspec_list) > 0:
            # get info from jobs
            nCore = 0
            minRamCount = 0
            maxDiskCount = 0
            maxWalltime = 0
            ioIntensity = 0
            for jobSpec in jobspec_list:
                job_corecount, job_memory = self.get_job_core_and_memory(
                    queue_dict, jobSpec)
                nCore += job_corecount
                minRamCount += job_memory
                try:
                    maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                try:
                    ioIntensity += jobSpec.jobParams['ioIntensity']
                except Exception:
                    pass
            try:
                # maxWallTime from AGIS or qconf, not trusting job currently
                maxWalltime = queue_dict.get('maxtime', walltimeLimit_default)
            except Exception:
                pass

            if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.nCore = nCore
            if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.minRamCount = minRamCount
            if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
                workSpec.maxDiskCount = maxDiskCount
            if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
                workSpec.maxWalltime = maxWalltime
            if ioIntensity > 0 and 'ioIntensity' in self.jobAttributesToUse:
                workSpec.ioIntensity = ioIntensity
            workSpec.pilotType = jobspec_list[0].get_pilot_type()
        else:
            # when no job
            # randomize pilot type with weighting
            pdpm = getattr(queue_config,
                           'prodSourceLabelRandomWeightsPermille', {})
            choice_list = core_utils.make_choice_list(pdpm=pdpm,
                                                      default='managed')
            tmp_prodsourcelabel = random.choice(choice_list)
            fake_job = JobSpec()
            fake_job.jobParams = {}
            fake_job.jobParams['prodSourceLabel'] = tmp_prodsourcelabel
            workSpec.pilotType = fake_job.get_pilot_type()
            del fake_job
            if workSpec.pilotType in ['RC', 'ALRB', 'PT']:
                tmpLog.info('a worker has pilotType={0}'.format(
                    workSpec.pilotType))
        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
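When no jobs are attached, the pilot type above is randomized via core_utils.make_choice_list. A plausible sketch of that helper, assuming the weights are given in permille and any remaining probability mass falls to the default label (an illustration, not the actual core_utils code):

import random

def make_choice_list(pdpm=None, default=None):
    # Build a 1000-slot list where each value appears as many times as
    # its permille weight; unused slots are filled with the default.
    pdpm = pdpm or {}
    choice_list = []
    for value, permille in pdpm.items():
        choice_list += [value] * int(permille)
    choice_list += [default] * max(0, 1000 - len(choice_list))
    return choice_list

# usage mirroring the no-job branch above (weights are made up)
pdpm = {'rc_test2': 5, 'ptest': 5}
tmp_prodsourcelabel = random.choice(make_choice_list(pdpm=pdpm, default='managed'))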
Example #4
    def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # whether to submit any worker
        to_submit_any = True

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        if self.condorSchedd is None or not self.useSpool:
            try:
                os.mkdir(log_subdir_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                else:
                    pass

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(
                self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(
                self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) or 1
        is_unified_queue = this_panda_queue_dict.get('capability',
                                                     '') == 'ucore'

        # get override requirements from the queue configuration
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # deal with CE
        special_par = ''
        ce_weighting = None
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint') and str(
                        _queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower()
                        in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name',
                                                '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            if n_qualified_ce > 0:
                # Get CE weighting
                tmpLog.debug('Get CE weighting')
                worker_ce_all_tuple = self.get_ce_statistics(
                    self.queueName, nWorkers)
                ce_weighting = _get_ce_weighting(
                    ce_endpoint_list=list(ce_auxilary_dict.keys()),
                    worker_ce_all_tuple=worker_ce_all_tuple)
                stats_weighting_display_str = _get_ce_stats_weighting_display(
                    ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
                tmpLog.debug('CE stats and weighting: {0}'.format(
                    stats_weighting_display_str))
            else:
                tmpLog.error('No valid CE endpoint found')
                to_submit_any = False

        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workspec.workerID),
                                            method_name='_handle_one_worker')
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {
                'workspec': workspec,
                'to_submit': to_submit,
            }
            if to_submit:
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info(
                            'Problem choosing CE with weighting. Choose an arbitrary CE endpoint'
                        )
                        ce_info_dict = random.choice(
                            list(ce_auxilary_dict.values())).copy()
                    # gather info of the chosen CE
                    ce_endpoint_from_queue = ce_info_dict.get(
                        'ce_endpoint', '')
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour',
                                                          '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version',
                                                          '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(
                        r':\w*', '', ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(
                                ce_endpoint_from_queue, default_port)
                    tmpLog.debug(
                        'For site {0} got CE endpoint: "{1}", flavour: "{2}"'.
                        format(self.queueName, ce_endpoint_from_queue,
                               ce_flavour_str))
                    if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}.sdf'.format(
                            ce_flavour_str=ce_flavour_str)
                        self.templateFile = os.path.join(
                            self.CEtemplateDir, sdf_template_filename)
                else:
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(
                                self.ceHostname,
                                list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint,
                                          list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict[
                                    'ce_endpoint'] = random.choice(
                                        list(
                                            zip(self.ceHostname,
                                                self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(
                                    self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    tmpFile = open(self.templateFile)
                    sdf_template_raw = tmpFile.read()
                    tmpFile.close()
                except AttributeError:
                    tmpLog.error(
                        'No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found'
                    )
                    to_submit = False
                    return data
                else:
                    # get batch_log, stdout, stderr filenames, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    if isinstance(self.condorSchedd,
                                  list) and len(self.condorSchedd) > 0:
                        if isinstance(self.condorPool,
                                      list) and len(self.condorPool) > 0:
                            condor_schedd, condor_pool = random.choice(
                                list(zip(self.condorSchedd, self.condorPool)))
                        else:
                            condor_schedd = random.choice(self.condorSchedd)
                            condor_pool = self.condorPool
                    else:
                        condor_schedd = self.condorSchedd
                        condor_pool = self.condorPool
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(
                            r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                            lambda matchobj: matchobj.group(1)
                            if matchobj.group(1) else '', condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]',
                                              schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if log_base_url is not None:
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(
                            value_str=batch_log_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(
                            value_str=stdout_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stderr_path_filename = parse_batch_job_filename(
                            value_str=stderr_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes[
                            'stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')
                # set data dict
                data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                })
            return data

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(
                baseLogger,
                'workerID={0}'.format(workspec.workerID),
                method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # execute submission in parallel threads
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retValList = thread_pool.map(submit_a_worker, dataIterator)
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(
                lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
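submit_workers fans out three times with ThreadPoolExecutor.map: prepare per-worker data, submit, then propagate the returned attributes back onto each workspec. A stripped-down sketch of that pattern with dummy stand-ins (not the actual submitter code):

from concurrent.futures import ThreadPoolExecutor

def prepare(item):          # stand-in for _handle_one_worker
    return {'item': item, 'to_submit': True}

def submit(data):           # stand-in for submit_a_worker
    return (True, {'batchID': str(data['item'])})

def propagate(item, ret):   # stand-in for _propagate_attributes
    ok, attrs = ret
    return ok

items = [1, 2, 3]
with ThreadPoolExecutor(4) as pool:
    data_list = list(pool.map(prepare, items))
with ThreadPoolExecutor(2) as pool:
    ret_list = list(pool.map(submit, data_list))
with ThreadPoolExecutor(2) as pool:
    results = list(pool.map(lambda t: propagate(*t), zip(items, ret_list)))
print(results)  # [True, True, True]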
Example #5
    def update_label(self, site, msg, data):
        """
        Updates a label (=panda queue+CE)
        """
        start_time = time.time()
        tmp_log = core_utils.make_logger(_base_logger,
                                         'harvester_id={0}'.format(
                                             self.harvester_id),
                                         method_name='update_label')

        if not self.__active:
            tmp_log.debug('APFMon reporting not enabled')
            return

        try:
            tmp_log.debug('start')
            data = self.massage_label_data(data)

            # get the active queues from the config mapper
            all_sites = self.queue_config_mapper.get_active_queues().keys()
            panda_queues_dict = PandaQueuesDict()

            site_info = panda_queues_dict.get(site, dict())
            if not site_info:
                tmp_log.warning('No site info for {0}'.format(site))
                return

            # when no CEs are associated with a queue (e.g. P1, HPCs), check whether the
            # local configuration defines one; otherwise set a dummy value
            try:
                ce = self.queue_config_mapper.queueConfig[site].submitter[
                    'ceEndpoint']
                queues = [{'ce_endpoint': ce}]
            except KeyError:
                if site_info['queues']:
                    queues = site_info['queues']
                else:
                    queues = [{'ce_endpoint': NO_CE}]

            for queue in queues:
                try:
                    try:
                        ce = clean_ce(queue['ce_endpoint'])
                    except Exception:
                        ce = ''

                    label_data = {'status': msg, 'data': data}
                    label = '{0}-{1}'.format(site, ce)
                    label_id = '{0}:{1}'.format(self.harvester_id, label)
                    url = '{0}/labels/{1}'.format(self.base_url, label_id)

                    r = requests.post(url,
                                      data=json.dumps(label_data),
                                      timeout=self.__label_timeout)
                    tmp_log.debug(
                        'label update for {0} ended with {1} {2}'.format(
                            label, r.status_code, r.text))
                except Exception:
                    tmp_log.error('Exception for label {0}: {1}'.format(
                        label, traceback.format_exc()))

            end_time = time.time()
            tmp_log.debug('done (took {0})'.format(end_time - start_time))
        except Exception:
            tmp_log.error('Exception: {0}'.format(traceback.format_exc()))
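clean_ce is assumed here to normalize a CE endpoint string into a bare hostname for the label name. A plausible sketch under that assumption (not necessarily the real helper):

import re

def clean_ce(ce_endpoint):
    # Reduce e.g. 'https://ce01.example.org:9619/path' or
    # 'ce01.example.org:2811' to 'ce01.example.org'. Illustrative only.
    host = re.sub(r'^\w+://', '', str(ce_endpoint))  # drop scheme
    host = host.split('/')[0]                        # drop path
    return host.split(':')[0]                        # drop port

print(clean_ce('ce01.example.org:9619'))  # ce01.example.org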
Example #6
    def create_labels(self):
        """
        Creates or updates a collection of labels (=panda queue+CE)
        """
        start_time = time.time()
        tmp_log = core_utils.make_logger(_base_logger,
                                         'harvester_id={0}'.format(
                                             self.harvester_id),
                                         method_name='create_labels')

        if not self.__active:
            tmp_log.debug('APFMon reporting not enabled')
            return

        try:
            tmp_log.debug('start')

            url = '{0}/labels'.format(self.base_url)

            # get the active queues from the config mapper
            all_sites = self.queue_config_mapper.get_active_queues().keys()
            panda_queues_dict = PandaQueuesDict()

            # publish the active queues to APF mon in shards
            for sites in core_utils.create_shards(all_sites, 20):
                labels = []
                for site in sites:
                    try:
                        site_info = panda_queues_dict.get(site, dict())
                        if not site_info:
                            tmp_log.warning(
                                'No site info for {0}'.format(site))
                            continue

                        # when no CEs are associated with a queue (e.g. P1, HPCs), check
                        # whether the local configuration defines one; otherwise set a dummy value
                        try:
                            ce = self.queue_config_mapper.queueConfig[
                                site].submitter['ceEndpoint']
                            queues = [{'ce_endpoint': ce}]
                        except KeyError:
                            if site_info['queues']:
                                queues = site_info['queues']
                            else:
                                queues = [{'ce_endpoint': NO_CE}]

                        for queue in queues:
                            try:
                                ce = clean_ce(queue['ce_endpoint'])
                            except Exception:
                                ce = ''

                            try:
                                ce_queue_id = queue['ce_queue_id']
                            except KeyError:
                                ce_queue_id = 0

                            labels.append({
                                'name': '{0}-{1}'.format(site, ce),
                                'wmsqueue': site,
                                'ce_queue_id': ce_queue_id,
                                'factory': self.harvester_id
                            })
                    except Exception:
                        tmp_log.error('Exception for site {0}: {1}'.format(
                            site, traceback.format_exc()))
                        continue

                payload = json.dumps(labels)

                r = requests.put(url,
                                 data=payload,
                                 timeout=self.__label_timeout)
                tmp_log.debug(
                    'label creation for {0} ended with {1} {2}'.format(
                        sites, r.status_code, r.text))

            end_time = time.time()
            tmp_log.debug('done (took {0})'.format(end_time - start_time))
        except Exception:
            tmp_log.error('Exception: {0}'.format(traceback.format_exc()))
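The labels are published in shards of 20 sites via core_utils.create_shards. A minimal sketch of such a chunking helper, assuming it simply yields successive slices (illustrative, not the actual core_utils code):

def create_shards(input_list, size):
    # Yield successive chunks of at most `size` items.
    shard = []
    for item in input_list:
        shard.append(item)
        if len(shard) == size:
            yield shard
            shard = []
    if shard:
        yield shard

print(list(create_shards(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]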
Example #7
 def load_data(self, refill_table=False):
     mainLog = _make_logger(method_name='QueueConfigMapper.load_data')
     with self.lock:
         # check whether an update is needed
         timeNow_timestamp = time.time()
         if self.lastUpdate is not None:
             last_reload_timestamp = self._get_last_reload_time()
             if (last_reload_timestamp is not None
                     and self.lastUpdate is not None
                     and datetime.datetime.utcfromtimestamp(
                         last_reload_timestamp) < self.lastUpdate
                     and timeNow_timestamp - last_reload_timestamp <
                     self.updateInterval):
                 return
     # start
     with self.lock:
         # update timestamp of last reload, lock with check interval
         got_timestamp_update_lock = self.dbProxy.get_process_lock(
             'qconf_reload', 'qconf_universal', self.updateInterval)
         if got_timestamp_update_lock:
             retVal = self._update_last_reload_time()
             if retVal:
                 mainLog.debug('updated last reload timestamp')
             else:
                 mainLog.warning(
                     'failed to update last reload timestamp. Skipped')
         else:
             mainLog.debug(
                 'did not get qconf_reload timestamp lock. Skipped to update last reload timestamp'
             )
         # init
         newQueueConfig = dict()
         localTemplatesDict = dict()
         remoteTemplatesDict = dict()
         finalTemplatesDict = dict()
         localQueuesDict = dict()
         remoteQueuesDict = dict()
         dynamicQueuesDict = dict()
         allQueuesNameList = set()
         getQueuesDynamic = False
         invalidQueueList = set()
         pandaQueueDict = PandaQueuesDict()
         # get resolver
         resolver = self._get_resolver()
         if resolver is None:
             mainLog.debug('No resolver is configured')
         # load config json from cacher (RT & RQ)
         queueConfigJson_cacher = self._load_config_from_cache()
         if queueConfigJson_cacher is not None:
             for queueName, queueDict in iteritems(queueConfigJson_cacher):
                 if queueDict.get('isTemplateQueue') is True \
                     or queueName.endswith('_TEMPLATE'):
                     # is RT
                     queueDict['isTemplateQueue'] = True
                     queueDict.pop('templateQueueName', None)
                     remoteTemplatesDict[queueName] = queueDict
                 else:
                     # is RQ
                     queueDict['isTemplateQueue'] = False
                     remoteQueuesDict[queueName] = queueDict
         # load config from local json file (LT & LQ)
         queueConfigJson_local = self._load_config_from_file()
         if queueConfigJson_local is not None:
             for queueName, queueDict in iteritems(queueConfigJson_local):
                 if queueDict.get('isTemplateQueue') is True \
                     or queueName.endswith('_TEMPLATE'):
                     # is LT
                     queueDict['isTemplateQueue'] = True
                     queueDict.pop('templateQueueName', None)
                     localTemplatesDict[queueName] = queueDict
                 else:
                     # is LQ
                     queueDict['isTemplateQueue'] = False
                     localQueuesDict[queueName] = queueDict
         else:
             mainLog.warning(
                 'Failed to load config from local json file. Skipped')
         # fill in final template (FT)
         finalTemplatesDict.update(remoteTemplatesDict)
         finalTemplatesDict.update(localTemplatesDict)
         finalTemplatesDict.pop(None, None)
         # remove queues with invalid templateQueueName
         for acr, queuesDict in [('RQ', remoteQueuesDict),
                                 ('LQ', localQueuesDict)]:
             for queueName, queueDict in iteritems(queuesDict.copy()):
                 templateQueueName = queueDict.get('templateQueueName')
                 if templateQueueName is not None \
                     and templateQueueName not in finalTemplatesDict:
                     del queuesDict[queueName]
                     mainLog.warning(
                         'Invalid templateQueueName "{0}" for {1} ({2}). Skipped'
                         .format(templateQueueName, queueName, acr))
         # get queue names from resolver and fill in dynamic queue (DQ)
         if resolver is not None \
             and 'DYNAMIC' in harvester_config.qconf.queueList:
             getQueuesDynamic = True
             dynamicQueuesNameList = resolver.get_all_queue_names()
             for queueName in dynamicQueuesNameList.copy():
                 queueDict = dict()
                 # template and default template via workflow
                 templateQueueName = None
                 resolver_harvester_template = None
                 if resolver is not None:
                     resolver_harvester_template = resolver.get_harvester_template(
                         queueName)
                     resolver_type, resolver_workflow = resolver.get_type_workflow(
                         queueName)
                 if resolver_harvester_template:
                     templateQueueName = resolver_harvester_template
                 elif not (resolver_type is None
                           or resolver_workflow is None):
                     templateQueueName = '{pq_type}.{workflow}'.format(
                         pq_type=resolver_type, workflow=resolver_workflow)
                 else:
                     templateQueueName = harvester_config.qconf.defaultTemplateQueueName
                 if templateQueueName not in finalTemplatesDict:
                     # remove queues with invalid templateQueueName
                     dynamicQueuesNameList.discard(queueName)
                     mainLog.warning(
                         'Invalid templateQueueName "{0}" for {1} (DQ). Skipped'
                         .format(templateQueueName, queueName))
                     continue
                 # parameters
                 resolver_harvester_params = resolver.get_harvester_params(
                     queueName)
                 for key, val in iteritems(resolver_harvester_params):
                     if key in self.dynamic_queue_generic_attrs:
                         queueDict[key] = val
                 # fill in dynamic queue configs
                 queueDict['templateQueueName'] = templateQueueName
                 queueDict['isTemplateQueue'] = False
                 dynamicQueuesDict[queueName] = queueDict
         # fill in all queue name list (names of RQ + DQ + LQ)
         allQueuesNameList |= set(remoteQueuesDict)
         allQueuesNameList |= set(dynamicQueuesDict)
         allQueuesNameList |= set(localQueuesDict)
         allQueuesNameList.discard(None)
         # set attributes
         for queueName in allQueuesNameList:
             # sources of queues and templates
             queueSourceList = []
             templateSourceList = []
             # prepare templateQueueName
             templateQueueName = None
             for queuesDict in [
                     remoteQueuesDict, dynamicQueuesDict, localQueuesDict
             ]:
                 if queueName not in queuesDict:
                     continue
                 tmp_queueDict = queuesDict[queueName]
                 tmp_templateQueueName = tmp_queueDict.get(
                     'templateQueueName')
                 if tmp_templateQueueName is not None:
                     templateQueueName = tmp_templateQueueName
             # prepare queueDict
             queueDict = dict()
             if templateQueueName in finalTemplatesDict:
                 queueDict.update(
                     copy.deepcopy(finalTemplatesDict[templateQueueName]))
             for acr, templatesDict in [('RT', remoteTemplatesDict),
                                        ('LT', localTemplatesDict)]:
                 if templateQueueName in templatesDict:
                     templateSourceList.append(acr)
             # update queueDict
             for acr, queuesDict in [('RQ', remoteQueuesDict),
                                     ('DQ', dynamicQueuesDict),
                                     ('LQ', localQueuesDict)]:
                 if queueName not in queuesDict:
                     continue
                 queueSourceList.append(acr)
                 tmp_queueDict = queuesDict[queueName]
                 for key, val in iteritems(tmp_queueDict):
                     val = copy.deepcopy(val)
                     if key in self.updatable_plugin_attrs \
                         and isinstance(queueDict.get(key), dict) \
                         and isinstance(val, dict):
                         # update plugin parameters instead of overwriting whole plugin section
                         queueDict[key].update(val)
                     else:
                         queueDict[key] = val
             # record sources of the queue config and its templates in log
             if templateQueueName:
                 mainLog.debug(
                     ('queue {queueName} comes from {queueSource} '
                      '(with template {templateName} '
                      'from {templateSource})').format(
                          queueName=queueName,
                          templateName=templateQueueName,
                          queueSource=','.join(queueSourceList),
                          templateSource=','.join(templateSourceList)))
             else:
                 mainLog.debug(
                     'queue {queueName} comes from {queueSource}'.format(
                         queueName=queueName,
                         queueSource=','.join(queueSourceList)))
             # prepare queueConfig
             if queueName in newQueueConfig:
                 queueConfig = newQueueConfig[queueName]
             else:
                 queueConfig = QueueConfig(queueName)
             # queueName = siteName/resourceType
             queueConfig.siteName = queueConfig.queueName.split('/')[0]
             if queueConfig.siteName != queueConfig.queueName:
                 queueConfig.resourceType = queueConfig.queueName.split(
                     '/')[-1]
             # get common attributes
             commonAttrDict = dict()
             if isinstance(queueDict.get('common'), dict):
                 commonAttrDict = queueDict.get('common')
             # according to queueDict
             for key, val in iteritems(queueDict):
                 if isinstance(val,
                               dict) and 'module' in val and 'name' in val:
                     # plugin attributes
                     val = copy.deepcopy(val)
                     # fill in common attributes for all plugins
                     for c_key, c_val in iteritems(commonAttrDict):
                         if c_key not in val and c_key not in ('module',
                                                               'name'):
                             val[c_key] = c_val
                     # check module and class name
                     try:
                         _t3mP_1Mp0R7_mO6U1e__ = importlib.import_module(
                             val['module'])
                         _t3mP_1Mp0R7_N4m3__ = getattr(
                             _t3mP_1Mp0R7_mO6U1e__, val['name'])
                     except Exception as _e:
                         invalidQueueList.add(queueConfig.queueName)
                         mainLog.error(
                             'Module or class not found. Omitted {0} in queue config ({1})'
                             .format(queueConfig.queueName, _e))
                         continue
                     else:
                         del _t3mP_1Mp0R7_mO6U1e__
                         del _t3mP_1Mp0R7_N4m3__
                     # fill in siteName and queueName
                     if 'siteName' not in val:
                         val['siteName'] = queueConfig.siteName
                     if 'queueName' not in val:
                         val['queueName'] = queueConfig.queueName
                     # middleware
                     if 'middleware' in val and val[
                             'middleware'] in queueDict:
                         # keep original config
                         val['original_config'] = copy.deepcopy(val)
                         # overwrite with middleware config
                         for m_key, m_val in iteritems(
                                 queueDict[val['middleware']]):
                             val[m_key] = m_val
                 setattr(queueConfig, key, val)
             # delete isTemplateQueue attribute
             try:
                 if getattr(queueConfig, 'isTemplateQueue'):
                     mainLog.error(
                         'Internal error: isTemplateQueue is True. Omitted {0} in queue config'
                         .format(queueConfig.queueName))
                     invalidQueueList.add(queueConfig.queueName)
                 else:
                     delattr(queueConfig, 'isTemplateQueue')
             except AttributeError as _e:
                 mainLog.error(
                     'Internal error with attr "isTemplateQueue". Omitted {0} in queue config ({1})'
                     .format(queueConfig.queueName, _e))
                 invalidQueueList.add(queueConfig.queueName)
             # get Panda Queue Name
             if resolver is not None:
                 queueConfig.pandaQueueName = resolver.get_panda_queue_name(
                     queueConfig.siteName)
             # additional criteria for getJob
             if queueConfig.getJobCriteria is not None:
                 tmpCriteria = dict()
                 for tmpItem in queueConfig.getJobCriteria.split(','):
                     tmpKey, tmpVal = tmpItem.split('=')
                     tmpCriteria[tmpKey] = tmpVal
                 if len(tmpCriteria) == 0:
                     queueConfig.getJobCriteria = None
                 else:
                     queueConfig.getJobCriteria = tmpCriteria
             # nullify job attributes if NoJob mapType
             if queueConfig.mapType == WorkSpec.MT_NoJob:
                 for attName in [
                         'nQueueLimitJob', 'nQueueLimitJobRatio',
                         'nQueueLimitJobMax', 'nQueueLimitJobMin'
                 ]:
                     setattr(queueConfig, attName, None)
             # heartbeat suppression
             if queueConfig.truePilot and queueConfig.noHeartbeat == '':
                 queueConfig.noHeartbeat = 'running,transferring,finished,failed'
             # set unique name
             queueConfig.set_unique_name()
             # put into new queue configs
             newQueueConfig[queueName] = queueConfig
             # Check existence of mandatory attributes
             if queueName in newQueueConfig:
                 queueConfig = newQueueConfig[queueName]
                 missing_attr_list = []
                 for _attr in self.mandatory_attrs:
                     if not hasattr(queueConfig, _attr):
                         invalidQueueList.add(queueConfig.queueName)
                         missing_attr_list.append(_attr)
                 if missing_attr_list:
                     mainLog.error(
                         'Missing mandatory attributes {0} . Omitted {1} in queue config'
                         .format(','.join(missing_attr_list),
                                 queueConfig.queueName))
         # delete invalid queues
         for invalidQueueName in invalidQueueList:
             if invalidQueueName in newQueueConfig:
                 del newQueueConfig[invalidQueueName]
         # auto blacklisting
         autoBlacklist = False
         if resolver is not None and hasattr(harvester_config.qconf, 'autoBlacklist') and \
                 harvester_config.qconf.autoBlacklist:
             autoBlacklist = True
         # get queue dumps
         queueConfigDumps = self.dbProxy.get_queue_config_dumps()
         # get active queues
         activeQueues = dict()
         for queueName, queueConfig in iteritems(newQueueConfig):
             # get status
             if queueConfig.queueStatus is None and autoBlacklist:
                 queueConfig.queueStatus = resolver.get_queue_status(
                     queueName)
             # get dynamic information
             if 'DYNAMIC' in harvester_config.qconf.queueList:
                 # UPS queue
                 if resolver is not None and resolver.is_ups_queue(
                         queueName):
                     queueConfig.runMode = 'slave'
                     queueConfig.mapType = 'NoJob'
             # set online if undefined
             if queueConfig.queueStatus is None:
                 queueConfig.queueStatus = 'online'
             queueConfig.queueStatus = queueConfig.queueStatus.lower()
             # look for configID
             dumpSpec = QueueConfigDumpSpec()
             dumpSpec.queueName = queueName
             dumpSpec.set_data(vars(queueConfig))
             if dumpSpec.dumpUniqueName in queueConfigDumps:
                 dumpSpec = queueConfigDumps[dumpSpec.dumpUniqueName]
             else:
                 # add dump
                 dumpSpec.creationTime = datetime.datetime.utcnow()
                 dumpSpec.configID = self.dbProxy.get_next_seq_number(
                     'SEQ_configID')
                 tmpStat = self.dbProxy.add_queue_config_dump(dumpSpec)
                 if not tmpStat:
                     dumpSpec.configID = self.dbProxy.get_config_id_dump(
                         dumpSpec)
                     if dumpSpec.configID is None:
                         mainLog.error(
                             'failed to get configID for {0}'.format(
                                 dumpSpec.dumpUniqueName))
                         continue
                 queueConfigDumps[dumpSpec.dumpUniqueName] = dumpSpec
             queueConfig.configID = dumpSpec.configID
             # ignore offline
             if queueConfig.queueStatus == 'offline':
                 continue
             # filter for pilot version
             if hasattr(harvester_config.qconf, 'pilotVersion') and \
                 pandaQueueDict.get(queueConfig.siteName) is not None and \
                 pandaQueueDict.get(queueConfig.siteName).get('pilot_version') != str(harvester_config.qconf.pilotVersion):
                 continue
             if 'ALL' not in harvester_config.qconf.queueList and \
                     'DYNAMIC' not in harvester_config.qconf.queueList and \
                     queueName not in harvester_config.qconf.queueList:
                 continue
             activeQueues[queueName] = queueConfig
         self.queueConfig = newQueueConfig
         self.activeQueues = activeQueues
         newQueueConfigWithID = dict()
         for dumpSpec in queueConfigDumps.values():
             queueConfig = QueueConfig(dumpSpec.queueName)
             queueConfig.update_attributes(dumpSpec.data)
             queueConfig.configID = dumpSpec.configID
             newQueueConfigWithID[dumpSpec.configID] = queueConfig
         self.queueConfigWithID = newQueueConfigWithID
         self.lastUpdate = datetime.datetime.utcnow()
     # update database
     if self.toUpdateDB:
         self.dbProxy.fill_panda_queue_table(self.activeQueues.keys(),
                                             self,
                                             refill_table=refill_table)
         mainLog.debug('updated to DB')
     # done
     mainLog.debug('done')
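load_data builds each queue configuration by layering sources with a fixed precedence: a final template (remote template overridden by local template) as the base, then remote (RQ), dynamic (DQ) and local (LQ) queue entries applied in that order, with plugin-style dict sections merged key-wise instead of replaced. A toy reduction of that merge logic (names illustrative):

import copy

def merge_queue_config(template, layers, plugin_attrs=('submitter',)):
    # Start from the template, then apply layers in precedence order;
    # for plugin sections, update keys rather than overwrite the dict.
    cfg = copy.deepcopy(template)
    for layer in layers:
        for key, val in layer.items():
            if key in plugin_attrs and isinstance(cfg.get(key), dict) \
                    and isinstance(val, dict):
                cfg[key].update(copy.deepcopy(val))
            else:
                cfg[key] = copy.deepcopy(val)
    return cfg

template = {'mapType': 'Hybrid',
            'submitter': {'module': 'm', 'name': 'n', 'nProcesses': 4}}
remote_q = {'queueStatus': 'online'}
local_q = {'submitter': {'nProcesses': 8}}  # tweaks one plugin key only
print(merge_queue_config(template, [remote_q, local_q]))
# nProcesses becomes 8 while module/name survive from the template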
Example #8
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger,
                                   method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
        prod_source_label = harvester_queue_config.get_source_label(
            work_spec.jobType)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(
            harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file(
            'stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(
                work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(
                job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields,
                                                     job_pars_parsed)
            tmp_log.debug(
                'container_image: "{0}"; executable: "{1}"; args: "{2}"'.
                format(container_image, executable, args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(
                self.queueName)
            cert, use_secret = self._choose_proxy(work_spec,
                                                  is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = panda_queues_dict.get(self.queueName)['maxtime']
            except Exception as e:
                tmp_log.warning(
                    'Could not retrieve maxtime field for queue {0}'.format(
                        self.queueName))
                max_time = None

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
                yaml_content,
                work_spec,
                prod_source_label,
                container_image,
                executable,
                args,
                cert,
                cert_in_secret=use_secret,
                cpu_adjust_ratio=self.cpuAdjustRatio,
                memory_adjust_ratio=self.memoryAdjustRatio,
                max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            work_spec.batchID = yaml_content['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(
                work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
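_choose_proxy is not shown; the call above expects a (cert, use_secret) pair. A hedged sketch of what such a selector might do, assuming separate production and analysis proxies and that a Kubernetes secret path takes priority (all attribute names here are hypothetical):

def choose_proxy(work_spec, is_grandly_unified_queue,
                 proxy_secret_path=None, analysis_proxy_secret_path=None,
                 x509_user_proxy=None):
    # On a grandly unified queue, user jobs may need the analysis proxy;
    # otherwise use the production one. Prefer a secret path if set.
    want_analysis = is_grandly_unified_queue and \
        getattr(work_spec, 'jobType', None) == 'user'
    cert = analysis_proxy_secret_path if want_analysis else proxy_secret_path
    if cert:
        return cert, True          # proxy mounted from a k8s secret
    return x509_user_proxy, False  # fall back to a plain proxy file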
Example #9
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields, job_pars_parsed)
            tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable,
                                                                                          args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())

            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
            cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = this_panda_queue_dict['maxtime']
            except Exception as e:
                tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
                max_time = None

            associated_params_dict = {}
            for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
                if key in self._allowed_agis_attrs:
                    associated_params_dict[key] = val

            pilot_url = associated_params_dict.get('pilot_url')
            pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
            python_version = str(this_panda_queue_dict.get('python_version', '2'))

            # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
            if pilot_opt_dict is None:
                prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
                pilot_type = work_spec.pilotType
                pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
            else:
                prod_source_label = pilot_opt_dict['prod_source_label']
                pilot_type = pilot_opt_dict['pilot_type_opt']
                pilot_url_str = pilot_opt_dict['pilot_url_str']

            pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                           pilot_type, pilot_url_str,
                                                                           pilot_python_option,
                                                                           container_image, executable, args, cert,
                                                                           cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                           memory_adjust_ratio=self.memoryAdjustRatio,
                                                                           max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            # use the finalized yaml, which carries the generated job name
            work_spec.batchID = yaml_content_final['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
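
Like the other submitters in this collection, submit_k8s_worker reports per-worker success as a (bool, message) tuple. The sketch below is a minimal, self-contained illustration of that convention; WorkSpecStub and submit_stub are invented stand-ins, not the real harvester API.

# invented stand-ins for harvester's WorkSpec and a submitter method
class WorkSpecStub(object):
    def __init__(self, worker_id):
        self.workerID = worker_id
        self.batchID = None

def submit_stub(work_spec):
    # a successful submission assigns a batch ID and returns (True, '')
    work_spec.batchID = 'job-{0}'.format(work_spec.workerID)
    return True, ''

for ws in [WorkSpecStub(1), WorkSpecStub(2)]:
    ok, err_str = submit_stub(ws)
    if ok:
        print('worker {0} submitted with batchID={1}'.format(ws.workerID, ws.batchID))
    else:
        print('worker {0} failed: {1}'.format(ws.workerID, err_str))
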
Example #10
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                  method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()
        workSpec.creationTime = datetime.datetime.utcnow()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})

        unified_queue = queue_dict.get('capability', '') == 'ucore'
        # case of traditional (non-unified) queue: look at the queue configuration
        if not unified_queue:
            workSpec.nCore = queue_dict.get('corecount', 1) or 1
            workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1

        # case of unified queue: look at the resource type and queue configuration
        else:
            catchall = queue_dict.get('catchall', '')
            if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified',
                                                                       'Taiwan-LCG2-HPC_Unified', 'DESY-ZN_UCORE'):
                # temporary hack to debug killed workers in Taiwan queues
                site_corecount = queue_dict.get('corecount', 1) or 1
                site_maxrss = queue_dict.get('maxrss', 1) or 1

                # some cases need to overwrite those values
                if 'SCORE' in resource_type:
                    # the usual pilot streaming use case
                    workSpec.nCore = 1
                    workSpec.minRamCount = int(site_maxrss / site_corecount)
                else:
                    # default values
                    workSpec.nCore = site_corecount
                    workSpec.minRamCount = site_maxrss
            else:
                workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(resource_type,
                                                                                                    queue_dict)

        # parameters that are independent of traditional vs unified queues
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)
        walltimeLimit_default = getattr(queue_config, 'walltimeLimit', 0)

        if len(jobspec_list) > 0:
            # get info from jobs
            nCore = 0
            minRamCount = 0
            maxDiskCount = 0
            maxWalltime = 0
            ioIntensity = 0
            for jobSpec in jobspec_list:
                job_corecount, job_memory = self.get_job_core_and_memory(queue_dict, jobSpec)
                nCore += job_corecount
                minRamCount += job_memory
                try:
                    maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                try:
                    ioIntensity += jobSpec.jobParams['ioIntensity']
                except Exception:
                    pass
            try:
                # maxWallTime from AGIS or qconf, not trusting job currently
                maxWalltime = queue_dict.get('maxtime', walltimeLimit_default)
            except Exception:
                pass

            if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.nCore = nCore
            if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.minRamCount = minRamCount
            if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
                workSpec.maxDiskCount = maxDiskCount
            if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
                workSpec.maxWalltime = maxWalltime
            if ioIntensity > 0 and 'ioIntensity' in self.jobAttributesToUse:
                workSpec.ioIntensity = ioIntensity
            workSpec.pilotType = jobspec_list[0].get_pilot_type()
        else:
            # when no job
            # randomize pilot type with weighting
            workSpec.pilotType = random.choice(self.pilotTypeRandomList)
            if workSpec.pilotType in ['RC', 'ALRB', 'PT']:
                tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType))
        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
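
The resource-type fallback at the end of make_worker recurs verbatim across these examples. As a reading aid, here is the same decision isolated as a pure function; choose_resource_type is an invented name, not harvester code.

def choose_resource_type(resource_type, n_core):
    # honor an explicit resource type; otherwise fall back on the core count
    if resource_type and resource_type != 'ANY':
        return resource_type
    return 'SCORE' if n_core == 1 else 'MCORE'

assert choose_resource_type('MCORE', 8) == 'MCORE'
assert choose_resource_type('ANY', 1) == 'SCORE'
assert choose_resource_type(None, 8) == 'MCORE'
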
Example #11
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})

        unified_queue = 'unifiedPandaQueue' in queue_dict.get('catchall', '')
        # case of traditional (non-unified) queue: look at the queue configuration
        if not unified_queue:
            workSpec.nCore = queue_dict.get('corecount', 1) or 1
            workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1

        # case of unified queue: look at the resource type and queue configuration
        else:
            site_corecount = queue_dict.get('corecount', 1) or 1
            site_maxrss = queue_dict.get('maxrss', 1) or 1

            if 'SCORE' in resource_type:
                workSpec.nCore = 1
                workSpec.minRamCount = int(site_maxrss / site_corecount)
            else:
                workSpec.nCore = site_corecount
                workSpec.minRamCount = site_maxrss

        # parameters that are independent of traditional vs unified queues
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

        # get info from jobs
        if len(jobspec_list) > 0:
            nCore = 0
            minRamCount = 0
            maxDiskCount = 0
            maxWalltime = 0
            for jobSpec in jobspec_list:
                try:
                    nCore += jobSpec.jobParams['coreCount']
                except Exception:
                    nCore += 1
                try:
                    minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                try:
                    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                        if hasattr(queue_config, 'walltimeLimit'):
                            maxWalltime = max(int(queue_config.walltimeLimit),
                                              jobSpec.jobParams['maxWalltime'])
                        else:
                            maxWalltime = jobSpec.jobParams['maxWalltime']
                    else:
                        maxWalltime = queue_config.walltimeLimit
                except Exception:
                    pass
            if nCore > 0 and 'nCore' in self.jobAttributesToUse:
                workSpec.nCore = nCore
            if minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse:
                workSpec.minRamCount = minRamCount
            if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
                workSpec.maxDiskCount = maxDiskCount
            if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
                workSpec.maxWalltime = maxWalltime

        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
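
In the unified-queue branch above, a SCORE worker claims one core plus a per-core share of the site RAM limit, while anything else takes the full node allocation. A standalone sketch of that arithmetic, with an invented helper name and integer division assumed for the RAM share:

def ucore_requirements(resource_type, site_corecount, site_maxrss):
    # guard against zero/None values, as the examples do with 'or 1'
    site_corecount = site_corecount or 1
    site_maxrss = site_maxrss or 1
    if 'SCORE' in resource_type:
        # single-core worker: per-core share of the RAM limit
        return 1, site_maxrss // site_corecount
    # multi-core worker: whole node allocation
    return site_corecount, site_maxrss

assert ucore_requirements('SCORE', 8, 16000) == (1, 2000)
assert ucore_requirements('MCORE', 8, 16000) == (8, 16000)
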
    def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            else:
                pass

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        def _handle_one_worker(workspec):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_handle_one_worker')

            # get default information from queue info
            n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
            is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                               or this_panda_queue_dict.get('capability', '') == 'ucore'
            ce_info_dict = dict()
            batch_log_dict = dict()
            special_par = ''

            if self.useAtlasGridCE:
                # If ATLAS Grid CE mode used
                tmpLog.debug('Using ATLAS Grid CE mode...')
                queues_from_queue_list = this_panda_queue_dict.get('queues', [])
                special_par = this_panda_queue_dict.get('special_par', '')
                ce_auxilary_dict = {}
                for _queue_dict in queues_from_queue_list:
                    if not ( _queue_dict.get('ce_endpoint')
                            and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                            and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce']) ):
                        continue
                    ce_endpoint = _queue_dict.get('ce_endpoint')
                    if ( ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default' ):
                        pass
                    else:
                        ce_auxilary_dict[ce_endpoint] = _queue_dict
                # qualified CEs from AGIS info
                n_qualified_ce = len(ce_auxilary_dict)
                queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
                worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
                ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                        queue_status_dict=queue_status_dict,
                                                        worker_ce_stats_dict=worker_ce_stats_dict)
                # good CEs which can be submitted to, duplicate by weight
                good_ce_weighted_list = []
                for _ce_endpoint in ce_auxilary_dict.keys():
                    good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
                tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                        queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
                if len(good_ce_weighted_list) > 0:
                    ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
                else:
                    tmpLog.info('No good CE endpoint left. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

            # template for batch script
            with open(self.templateFile) as tmpFile:
                sdf_template = tmpFile.read()

            # get batch_log, stdout, stderr filename
            for _line in sdf_template.split('\n'):
                if _line.startswith('#'):
                    continue
                _match_batch_log = re.match('log = (.+)', _line)
                _match_stdout = re.match('output = (.+)', _line)
                _match_stderr = re.match('error = (.+)', _line)
                if _match_batch_log:
                    batch_log_value = _match_batch_log.group(1)
                    continue
                if _match_stdout:
                    stdout_value = _match_stdout.group(1)
                    continue
                if _match_stderr:
                    stderr_value = _match_stderr.group(1)
                    continue

            # get override requirements from the queue configuration
            try:
                n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
            except AttributeError:
                n_core_per_node = n_core_per_node_from_queue

            # URLs for log files
            if self.logBaseURL is not None:
                if workspec.batchID:
                    batchID = workspec.batchID
                    guess = False
                else:
                    batchID = ''
                    guess = True
                batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
                batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
                batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
                workspec.set_log_file('batch_log', batch_log)
                workspec.set_log_file('stdout', batch_stdout)
                workspec.set_log_file('stderr', batch_stderr)
                batch_log_dict['batch_log'] = batch_log
                batch_log_dict['batch_stdout'] = batch_stdout
                batch_log_dict['batch_stderr'] = batch_stderr
                batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                tmpLog.debug('Done set_log_file before submission')

            tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data = {'workspec': workspec,
                    'template': sdf_template,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': self.condorSchedd,
                    'condor_pool': self.condorPool,
                    }

            return data

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # exec with mcore
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retValList = thread_pool.map(submit_a_worker, dataIterator)
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple), zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
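
The CE selection above realizes the weighting by repeating each endpoint ce_weight_dict[endpoint] times in good_ce_weighted_list and drawing one entry at random. A minimal self-contained version of the same trick, with hypothetical endpoints and weights:

import random

def pick_weighted_ce(ce_weight_dict):
    # duplicate each endpoint by its integer weight, then draw uniformly
    weighted = []
    for endpoint, weight in ce_weight_dict.items():
        weighted.extend([endpoint] * weight)
    if weighted:
        return random.choice(weighted)
    # all weights zero: fall back to an arbitrary endpoint
    return random.choice(list(ce_weight_dict))

print(pick_weighted_ce({'ce1.example.org': 3, 'ce2.example.org': 1}))
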
Example #13
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = self.make_logger(_logger,
                                  'queue={0}'.format(queue_config.queueName),
                                  method_name='make_worker')

        tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

        workSpec = WorkSpec()
        workSpec.creationTime = datetime.datetime.utcnow()

        # get the queue configuration from the DB
        panda_queues_dict = PandaQueuesDict()
        queue_dict = panda_queues_dict.get(queue_config.queueName, {})

        unified_queue = 'unifiedPandaQueue' in queue_dict.get('catchall', '')\
                        or queue_dict.get('capability', '') == 'ucore'
        # case of traditional (non-unified) queue: look at the queue configuration
        if not unified_queue:
            workSpec.nCore = queue_dict.get('corecount', 1) or 1
            workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1

        # case of unified queue: look at the resource type and queue configuration
        else:

            if queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified',
                                          'Taiwan-LCG2-HPC_Unified'):
                # temporary hack to debug killed workers in Taiwan queues
                site_corecount = queue_dict.get('corecount', 1) or 1
                site_maxrss = queue_dict.get('maxrss', 1) or 1

                # some cases need to overwrite those values
                if 'SCORE' in resource_type:
                    # the usual pilot streaming use case
                    workSpec.nCore = 1
                    workSpec.minRamCount = int(site_maxrss / site_corecount)
                else:
                    # default values
                    workSpec.nCore = site_corecount
                    workSpec.minRamCount = site_maxrss
            else:
                workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(
                    resource_type, queue_dict)

        # parameters that are independent of traditional vs unified queues
        workSpec.maxWalltime = queue_dict.get('maxtime', 1)
        workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

        # get info from jobs
        if len(jobspec_list) > 0:
            nCore = 0
            minRamCount = 0
            maxDiskCount = 0
            maxWalltime = 0
            for jobSpec in jobspec_list:

                job_corecount, job_memory = self.get_job_core_and_memory(
                    queue_dict, jobSpec)
                nCore += job_corecount
                minRamCount += job_memory

                try:
                    maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass

                try:
                    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                        if hasattr(queue_config, 'walltimeLimit'):
                            maxWalltime = max(int(queue_config.walltimeLimit),
                                              jobSpec.jobParams['maxWalltime'])
                        else:
                            maxWalltime = jobSpec.jobParams['maxWalltime']
                    else:
                        maxWalltime = queue_config.walltimeLimit
                except Exception:
                    pass
            if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.nCore = nCore
            if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \
               or unified_queue:
                workSpec.minRamCount = minRamCount
            if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
                workSpec.maxDiskCount = maxDiskCount
            if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
                workSpec.maxWalltime = maxWalltime

        # TODO: this needs to be improved with real resource types
        if resource_type and resource_type != 'ANY':
            workSpec.resourceType = resource_type
        elif workSpec.nCore == 1:
            workSpec.resourceType = 'SCORE'
        else:
            workSpec.resourceType = 'MCORE'

        return workSpec
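
The maxWalltime reconciliation inside the job loop is easy to misread, so the sketch below restates it as a pure function. effective_max_walltime is an invented name, and queue_has_limit stands in for the hasattr check on queue_config:

def effective_max_walltime(job_walltime, queue_walltime_limit, queue_has_limit):
    # a job value of None/"NULL" falls back to the queue limit; otherwise the
    # larger of the two wins when the queue defines a limit of its own
    if job_walltime in (None, "NULL"):
        return queue_walltime_limit
    if queue_has_limit:
        return max(int(queue_walltime_limit), job_walltime)
    return job_walltime

assert effective_max_walltime("NULL", 86400, True) == 86400
assert effective_max_walltime(90000, 86400, True) == 90000
assert effective_max_walltime(3600, 86400, False) == 3600
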
Example #14
    def submit_workers(self, workspec_list):
        tmpLog = core_utils.make_logger(baseLogger,
                                        method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_PQ_from_PR(self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        def _handle_one_worker(workspec):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_handle_one_worker')

            # get default information from queue info
            n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
            ce_info_dict = dict()
            batch_log_dict = dict()
            special_par = ''

            if self.useAtlasGridCE:
                # If ATLAS Grid CE mode used
                tmpLog.debug('Using ATLAS Grid CE mode...')
                queues_from_queue_list = this_panda_queue_dict.get('queues', [])
                special_par = this_panda_queue_dict.get('special_par', '')
                ce_endpoint_from_queue = ''
                ce_flavour_str = ''
                ce_version_str = ''
                random.shuffle(queues_from_queue_list)
                for _queue_dict in queues_from_queue_list:
                    if _queue_dict.get('ce_endpoint') and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE':
                        ce_flavour_str = str(_queue_dict.get('ce_flavour', '')).lower()
                        ce_version_str = str(_queue_dict.get('ce_version', '')).lower()
                        if ce_flavour_str in set(['arc-ce', 'cream-ce', 'htcondor-ce']):
                            ce_info_dict = _queue_dict.copy()
                            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                            ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                            break
                        else:
                            ce_flavour_str = ''
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                    self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

            # template for batch script
            with open(self.templateFile) as tmpFile:
                sdf_template = tmpFile.read()

            # get batch_log, stdout, stderr filename
            for _line in sdf_template.split('\n'):
                if _line.startswith('#'):
                    continue
                _match_batch_log = re.match('log = (.+)', _line)
                _match_stdout = re.match('output = (.+)', _line)
                _match_stderr = re.match('error = (.+)', _line)
                if _match_batch_log:
                    batch_log_value = _match_batch_log.group(1)
                    continue
                if _match_stdout:
                    stdout_value = _match_stdout.group(1)
                    continue
                if _match_stderr:
                    stderr_value = _match_stderr.group(1)
                    continue

            # get override requirements from the queue configuration
            try:
                n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
            except AttributeError:
                n_core_per_node = n_core_per_node_from_queue

            # URLs for log files
            if self.logBaseURL is not None:
                if workspec.batchID:
                    batchID = workspec.batchID
                    guess = False
                else:
                    batchID = ''
                    guess = True
                batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=self.logDir, batchID=batchID, guess=guess)
                stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=self.logDir, batchID=batchID, guess=guess)
                stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=self.logDir, batchID=batchID, guess=guess)
                batch_log = '{0}/{1}'.format(self.logBaseURL, batch_log_filename)
                batch_stdout = '{0}/{1}'.format(self.logBaseURL, stdout_path_file_name)
                batch_stderr = '{0}/{1}'.format(self.logBaseURL, stderr_path_filename)
                workspec.set_log_file('batch_log', batch_log)
                workspec.set_log_file('stdout', batch_stdout)
                workspec.set_log_file('stderr', batch_stderr)
                batch_log_dict['batch_log'] = batch_log
                batch_log_dict['batch_stdout'] = batch_stdout
                batch_log_dict['batch_stderr'] = batch_stderr
                batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                tmpLog.debug('Done set_log_file')
                if not workspec.get_jobspec_list():
                    tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(workspec.workerID))
                else:
                    for jobSpec in workspec.get_jobspec_list():
                        # using batchLog and stdOut URL as pilotID and pilotLog
                        jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut'])
                        jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog'])
            tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data = {
                'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
            }

            return data

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # exec with mcore
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retValList = thread_pool.map(submit_a_worker, dataIterator)
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        retList = []
        for workspec, tmpVal in zip(workspec_list, retValList):
            retVal, tmpDict = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            retList.append(retVal)

        tmpLog.debug('done')

        return retList
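
Both Condor submitters recover the batch log, stdout and stderr paths by scanning the non-comment lines of the SDF template with re.match. A runnable miniature of that scan, with template content invented for illustration:

import re

sdf_template = '\n'.join(['# comments are skipped',
                          'log = grid.$(Cluster).$(Process).log',
                          'output = grid.$(Cluster).$(Process).out',
                          'error = grid.$(Cluster).$(Process).err'])

batch_log_value = stdout_value = stderr_value = None
for _line in sdf_template.split('\n'):
    if _line.startswith('#'):
        continue
    _match_batch_log = re.match('log = (.+)', _line)
    _match_stdout = re.match('output = (.+)', _line)
    _match_stderr = re.match('error = (.+)', _line)
    if _match_batch_log:
        batch_log_value = _match_batch_log.group(1)
    elif _match_stdout:
        stdout_value = _match_stdout.group(1)
    elif _match_stderr:
        stderr_value = _match_stderr.group(1)

print('{0} | {1} | {2}'.format(batch_log_value, stdout_value, stderr_value))
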
Example #15
    def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # whether to submit any worker
        to_submit_any = True

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        if self.condorSchedd is None or not self.useSpool:
            try:
                os.mkdir(log_subdir_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                else:
                    pass

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
        pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
        pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

        # get override requirements from the queue configuration
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # deal with Condor schedd and central managers; make a randomized list to choose from
        n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
        if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
            if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                orig_list = list(zip(self.condorSchedd, self.condorPool))
            else:
                orig_list = [ (_schedd, self.condorPool) for _schedd in self.condorSchedd ]
            if n_bulks < len(orig_list):
                schedd_pool_choice_list = random.sample(orig_list, n_bulks)
            else:
                schedd_pool_choice_list = orig_list
        else:
            schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

        # deal with CE
        special_par = ''
        ce_weighting = None
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not ( _queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce']) ):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if ( ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default' ):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            if n_qualified_ce > 0:
                # Get CE weighting
                tmpLog.debug('Get CE weighting')
                worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
                ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                        worker_ce_all_tuple=worker_ce_all_tuple)
                stats_weighting_display_str = _get_ce_stats_weighting_display(
                                                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
                tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
            else:
                tmpLog.error('No valid CE endpoint found')
                to_submit_any = False

        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_handle_one_worker')
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {'workspec': workspec,
                    'to_submit': to_submit}
            if to_submit:
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                        ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                    # gather the info of the chosen CE
                    ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                                'cream-ce': 8443,
                                'arc-ce': 2811,
                                'htcondor-ce': 9619,
                            }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                    tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                    if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                                                    ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                        self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
                else:
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    with open(self.templateFile) as tmpFile:
                        sdf_template_raw = tmpFile.read()
                except AttributeError:
                    tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                    # update the data dict itself; rebinding the local would leave to_submit True for the caller
                    data['to_submit'] = False
                    return data
                else:
                    # get batch_log, stdout, stderr filename, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                    # set submissionHost
                    if not condor_schedd and not condor_pool:
                        workspec.submissionHost = 'LOCAL'
                    else:
                        workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                    tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                                    lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                                    condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if log_base_url is not None:
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')
                # set data dict
                data.update({
                        'workspec': workspec,
                        'to_submit': to_submit,
                        'template': sdf_template,
                        'executable_file': self.executableFile,
                        'log_dir': self.logDir,
                        'log_subdir': log_subdir,
                        'n_core_per_node': n_core_per_node,
                        'panda_queue_name': panda_queue_name,
                        'x509_user_proxy': self.x509UserProxy,
                        'ce_info_dict': ce_info_dict,
                        'batch_log_dict': batch_log_dict,
                        'special_par': special_par,
                        'harvester_queue_config': harvester_queue_config,
                        'is_unified_queue': is_unified_queue,
                        'condor_schedd': condor_schedd,
                        'condor_pool': condor_pool,
                        'use_spool': self.useSpool,
                        'pilot_version': pilot_version_orig,
                        })
            return data

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # submit
        retValList = submit_bag_of_workers(list(dataIterator))
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple), zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
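
_div_round_up is used above to size the schedd bulks but is defined elsewhere in the module; a plausible ceiling-division sketch of it, together with the sampling it feeds (all values hypothetical):

import random

def _div_round_up(numerator, denominator):
    # ceiling division: how many bulks of size denominator cover numerator
    return (numerator + denominator - 1) // denominator

n_workers = 25
min_bulk = 10
schedd_pool_list = [('schedd1', 'pool1'), ('schedd2', 'pool2'), ('schedd3', 'pool3')]

n_bulks = _div_round_up(n_workers, min_bulk)  # 3 bulks for 25 workers
chosen = random.sample(schedd_pool_list, min(n_bulks, len(schedd_pool_list)))
print('{0} bulks -> {1}'.format(n_bulks, chosen))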