Example 1
def test():
    '''test submission'''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper

    import json

    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)

    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    sub = ARCSubmitter()
    print(sub.submit_workers([wspec]))
    print(wspec.batchID)
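The test function above assumes that QueueConfigMapper and ARCSubmitter are available in the enclosing module (it reads like an ad-hoc test inside the ARC submitter plugin); a minimal, hypothetical way to run it by hand would be:

if __name__ == '__main__':
    # run the ad-hoc submission test against the ARC-TEST queue
    test()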
Example 3
    def post_processing(self, workspec, jobspec_list, map_type):
        '''
        Fetch job output and process pilot info for sending in final heartbeat.
        The pilot pickle is loaded and some attributes corrected (schedulerid,
        pilotlog etc), then converted to dictionary and stored in
        workspec.workAttributes[pandaid]. If pilot pickle cannot be used,
        report ARC error in pilotErrorDiag and fill all possible attributes
        using ARC information.
        '''

        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
        job = workspec.workAttributes['arcjob']
        proxyrole = workspec.workAttributes['proxyrole']
        arcid = job['JobID']
        tmplog.info('Job id {0}'.format(arcid))

        if 'arcdownloadfiles' not in workspec.workAttributes:
            tmplog.error('No files to download')
            return

        # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
        # it means the job was cancelled by panda or otherwise forgotten
        if not jobspec_list:
            return

        # Set certificate
        userconfig = arc.UserConfig(self.cred_type)
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
            return

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
        logbaseurl = queueconfig.submitter.get('logBaseURL')
        logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
        logsubdir = workspec.workAttributes['logsubdir']
        pandaid = str(jobspec_list[0].PandaID)

        # Construct log path and url
        logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
        logdir = os.path.join(logbasedir, logsubdir)

        # post_processing is only called once, so no retries are done. But keep
        # the possibility here in case it changes
        (fetched, notfetched, notfetchedretry) = self._download_outputs(workspec.workAttributes['arcdownloadfiles'],
                                                                        logdir, arcid, pandaid, userconfig, tmplog)
        if arcid not in fetched:
            tmplog.warning("Could not get outputs of {0}".format(arcid))

        workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(job, pandaid, (arcid in fetched), logurl, tmplog)

        tmplog.debug("pilot info for {0}: {1}".format(pandaid, workspec.workAttributes[long(pandaid)]))
Example 4
    def post_processing(self, workspec, jobspec_list, map_type):
        '''
        Fetch job output and process pilot info for sending in final heartbeat.
        The pilot pickle is loaded and some attributes corrected (schedulerid,
        pilotlog etc), then converted to dictionary and stored in
        workspec.workAttributes[pandaid]. If pilot pickle cannot be used,
        report ARC error in pilotErrorDiag and fill all possible attributes
        using ARC information.
        '''

        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        tmplog.info('Job id {0}'.format(arcid))

        if 'arcdownloadfiles' not in workspec.workAttributes:
            tmplog.error('No files to download')
            return True

        # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
        # it means the job was cancelled by panda or otherwise forgotten
        if not jobspec_list:
            return True

        # Set certificate to use for interacting with ARC CE
        userconfig = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(userconfig, workspec, arcid, tmplog):
            return True

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
        logbaseurl = queueconfig.submitter.get('logBaseURL')
        logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
        logsubdir = workspec.workAttributes['logsubdir']
        pandaid = str(jobspec_list[0].PandaID)

        # Construct log path and url
        logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
        logdir = os.path.join(logbasedir, logsubdir)

        # post_processing is only called once, so no retries are done. But keep
        # the possibility here in case it changes
        (fetched, notfetched, notfetchedretry) = self._download_outputs(workspec.workAttributes['arcdownloadfiles'],
                                                                        logdir, arcid, pandaid, userconfig, tmplog)
        if arcid not in fetched:
            tmplog.warning("Could not get outputs of {0}".format(arcid))

        workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(job, pandaid, (arcid in fetched), logurl, tmplog)

        tmplog.debug("pilot info for {0}: {1}".format(pandaid, workspec.workAttributes[long(pandaid)]))
        return True
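Example 4 delegates the proxy handling to a _setup_proxy helper that is not included in this listing. A minimal sketch consistent with the inline logic of Example 3 (the actual method in the plugin may differ) could be:

    def _setup_proxy(self, userconfig, workspec, arcid, tmplog):
        '''Hypothetical helper: point the ARC UserConfig at the proxy matching this worker's role'''
        proxyrole = workspec.workAttributes['proxyrole']
        try:
            userconfig.ProxyPath(str(self.certs[proxyrole]))
        except Exception:
            tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
            return False
        return True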
Example 5
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:

            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                            method_name='submit_workers')

            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
            prodSourceLabel = queueconfig.get_source_label()

            # If jobSpec is defined we are in push mode, if not pull mode
            # Both assume one to one worker to job mapping
            jobSpec = workSpec.get_jobspec_list()
            if jobSpec:
                jobSpec = jobSpec[0]
                tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))

            desc = {}
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
            desc['siteName'] = workSpec.computingSite
            desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production']
            desc['sendhb'] = 0
            metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                        'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
            desc['metadata'] = json.dumps(metadata)

            if jobSpec:
                # push mode: aCT takes the url-encoded job description (like it gets from panda server)
                pandaid = jobSpec.PandaID
                actjobdesc = urllib.urlencode(jobSpec.jobParams)
            else:
                # pull mode: just set pandaid (to workerid) and prodsourcelabel
                pandaid = workSpec.workerID
                actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

            tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
            try:
                batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
            except Exception as e:
                result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
            else:
                tmpLog.info("aCT batch id {0}".format(batchid))
                workSpec.batchID = str(batchid)
                # Set log files in workSpec
                today = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
                workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
                workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
                workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
                result = (True, '')
            retList.append(result)

        return retList
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

        # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
        self.vm_to_worker_status = {
            'RUNNING': WorkSpec.ST_running,
            'TERMINATED': WorkSpec.ST_running,  # the VM is stopped, but has to be fully deleted
            'STOPPING': WorkSpec.ST_finished,
            'PROVISIONING': WorkSpec.ST_submitted,
            'STAGING': WorkSpec.ST_submitted
        }
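        # Illustrative use of the mapping above (assumed usage, not part of the original plugin):
        # a monitor would translate the instance status string returned by the GCE API into a
        # harvester worker status, with a defensive fallback, e.g.
        #   worker_status = self.vm_to_worker_status.get(vm_status, WorkSpec.ST_submitted)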
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
class GoogleSubmitter(PluginBase):
    """
    Plug-in for Google Cloud Engine VM submission. In this case the worker will abstract a VM running a job
    """
    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)

        self.queue_config_mapper = QueueConfigMapper()

    def submit_workers(self, work_spec_list):
        """
        :param work_spec_list: list of workers to submit
        :return:
        """

        tmp_log = self.make_logger(base_logger, method_name='submit_workers')
        tmp_log.debug('start nWorkers={0}'.format(len(work_spec_list)))

        ret_list = []
        if not work_spec_list:
            tmp_log.debug('empty work_spec_list')
            return ret_list

        # we assume all work_specs in the list belong to the same queue
        queue_config = self.queue_config_mapper.get_queue(
            work_spec_list[0].computingSite)

        # Create VMs in parallel
        # authentication issues when running the Cloud API in multiprocess
        # pool_size = min(len(work_spec_list), 10)
        # with Pool(pool_size) as pool:
        #    ret_val_list = pool.map(create_vm, work_spec_list, lock)

        ret_val_list = []
        for work_spec in work_spec_list:
            ret_val_list.append(create_vm(work_spec, queue_config))

        # Propagate changed attributes
        for work_spec, tmp_val in zip(work_spec_list, ret_val_list):
            ret_val, tmp_dict = tmp_val

            work_spec.set_attributes_with_dict(tmp_dict)
            work_spec.set_log_file(
                'batch_log', '{0}/{1}.log'.format(self.logBaseURL,
                                                  work_spec.batchID))
            work_spec.set_log_file(
                'stdout', '{0}/{1}.out'.format(self.logBaseURL,
                                               work_spec.batchID))
            work_spec.set_log_file(
                'stderr', '{0}/{1}.err'.format(self.logBaseURL,
                                               work_spec.batchID))
            ret_list.append(ret_val)

        tmp_log.debug('done')

        return ret_list
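submit_workers above delegates the actual VM creation to a create_vm function that is not part of this listing; a hypothetical stub showing only the return contract the loop relies on (a (success, message) tuple plus a dict of attributes pushed into the workspec) might look like:

def create_vm(work_spec, queue_config):
    '''Hypothetical stub: create a GCE instance for this worker and return
    ((succeeded, error_dialog), attribute_dict_for_the_workspec)'''
    vm_name = 'harvester-{0}'.format(work_spec.workerID)  # illustrative naming, not the real scheme
    # ... Google Compute Engine API calls would go here ...
    return (True, ''), {'batchID': vm_name}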
Example 9
class GoogleSweeper(PluginBase):
    """
    Sweeper with kill/clean-up functions for Google Compute Engine
    """
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def kill_worker(self, work_spec):
        """
        Sends the command to Google to destroy a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        try:
            vm_name = work_spec.batchID

            queue_config = self.queue_config_mapper.get_queue(
                work_spec.computingSite)
            try:
                zone = queue_config.zone
            except AttributeError:
                zone = ZONE

            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT,
                                       zone=zone,
                                       instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
            return True, ''
        except googleapiclient.errors.HttpError as e:
            if 'was not found' in e.content:
                # the VM was already killed or does not exist for any other reason
                message = 'VM {0} does not exist'.format(vm_name)
                base_logger.debug(message)
                return True, message
            else:
                # there was an issue killing the VM and it should be retried at another time
                return False, 'Problems killing the VM: {0}'.format(e)
        except Exception as e:
            return False, 'Problems killing the VM: {0}'.format(e)

    def sweep_worker(self, work_spec):
        """
        In the cloud, cleaning means destroying a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        return self.kill_worker(work_spec)
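An illustrative, by-hand invocation of the sweeper above (the sweeper is normally instantiated by the plugin factory and work_spec comes from the harvester database):

# illustrative only; in production the sweeper agent drives these calls
sweeper = GoogleSweeper()
succeeded, err_dialog = sweeper.kill_worker(work_spec)
if not succeeded:
    # False means the deletion failed and should be retried later
    print('kill_worker failed: {0}'.format(err_dialog))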
Example 10
    def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
        # initialize database and config
        self.singleMode = single_mode
        self.stopEvent = stop_event
        self.daemonMode = daemon_mode
        from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
        self.communicatorPool = CommunicatorPool()
        from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
        self.queueConfigMapper = QueueConfigMapper()
        from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
        dbProxy = DBProxy()
        dbProxy.make_tables(self.queueConfigMapper)
class GoogleSweeper(PluginBase):
    """
    Sweeper with kill/clean-up functions for Google Compute Engine
    """
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def kill_worker(self, work_spec):
        """
        Sends the command to Google to destroy a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """

        try:
            vm_name = work_spec.batchID

            queue_config = self.queue_config_mapper.get_queue(work_spec.computingSite)
            try:
                zone = queue_config.zone
            except AttributeError:
                zone = ZONE

            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
            return True, ''
        except googleapiclient.errors.HttpError as e:
            if 'was not found' in e.content:
                # the VM was already killed or does not exist for any other reason
                message = 'VM {0} does not exist'.format(vm_name)
                base_logger.debug(message)
                return True, message
            else:
                # there was an issue killing the VM and it should be retried at another time
                return False, 'Problems killing the VM: {0}'.format(e)
        except Exception as e:
            return False, 'Problems killing the VM: {0}'.format(e)

    def sweep_worker(self, work_spec):
        """
        In the cloud, cleaning means destroying a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        return self.kill_worker(work_spec)
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

        # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
        self.vm_to_worker_status = {
                                     'RUNNING': WorkSpec.ST_running,
                                     'TERMINATED': WorkSpec.ST_running, # the VM is stopped, but has to be fully deleted
                                     'STOPPING': WorkSpec.ST_finished,
                                     'PROVISIONING': WorkSpec.ST_submitted,
                                     'STAGING': WorkSpec.ST_submitted
                                     }
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
class GoogleSubmitter(PluginBase):
    """
    Plug-in for Google Cloud Engine VM submission. In this case the worker will abstract a VM running a job
    """

    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)

        self.queue_config_mapper = QueueConfigMapper()

    def submit_workers(self, work_spec_list):
        """
        :param work_spec_list: list of workers to submit
        :return:
        """

        tmp_log = self.make_logger(base_logger, method_name='submit_workers')
        tmp_log.debug('start nWorkers={0}'.format(len(work_spec_list)))

        ret_list = []
        if not work_spec_list:
            tmp_log.debug('empty work_spec_list')
            return ret_list

        # we assume all work_specs in the list belong to the same queue
        queue_config = self.queue_config_mapper.get_queue(work_spec_list[0].computingSite)

        # Create VMs in parallel
        # authentication issues when running the Cloud API in multiprocess
        # pool_size = min(len(work_spec_list), 10)
        # with Pool(pool_size) as pool:
        #    ret_val_list = pool.map(create_vm, work_spec_list, lock)

        ret_val_list = []
        for work_spec in work_spec_list:
            ret_val_list.append(create_vm(work_spec, queue_config))

        # Propagate changed attributes
        for work_spec, tmp_val in zip(work_spec_list, ret_val_list):
            ret_val, tmp_dict = tmp_val

            work_spec.set_attributes_with_dict(tmp_dict)
            work_spec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, work_spec.batchID))
            work_spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, work_spec.batchID))
            work_spec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, work_spec.batchID))
            ret_list.append(ret_val)

        tmp_log.debug('done')

        return ret_list
Example 15
    def __init__(self, pid_file, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()

        if pid_file is not None:
            self.pid_file = pid_file
        else:
            try:
                self.pid_file = harvester_config.service_monitor.pidfile
            except Exception:
                self.pid_file = None

        self.pid = self.get_master_pid()
        self.master_process = psutil.Process(self.pid)
        self.children = self.master_process.children(recursive=True)

        self.cpu_count = multiprocessing.cpu_count()
        self.queue_config_mapper = QueueConfigMapper()
        self.cred_manager = CredManager(self.queue_config_mapper,
                                        single_mode=True)
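The process handles collected above are presumably sampled later by the monitor; a hypothetical helper (not part of the original class) that aggregates resident memory of the master process and its children with psutil could look like:

    def get_total_rss_mb(self):
        '''Hypothetical helper: total RSS of the harvester master and its children, in MB'''
        total_rss = self.master_process.memory_info().rss
        for child in self.children:
            try:
                total_rss += child.memory_info().rss
            except psutil.NoSuchProcess:
                # a child may have exited since __init__ collected the handles
                continue
        return total_rss / float(1024 * 1024)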
Example 16
    def submit_workers(self, workspec_list):
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append(
                        (False, "No queue information for {0}".format(
                            jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    retlist.append((False, "No CEs defined for %{0}".format(
                        jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(
                    jobspec.computingSite)
                pandaqueues[jobspec.computingSite][
                    'truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join(
                    [logbaseurl, logsubdir,
                     '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(
                    jobspec.jobParams,
                    jobspec.computingSite,
                    pandaqueues[jobspec.computingSite],
                    logfileurl,
                    self.schedulerid,
                    osmap,
                    '/tmp',  # tmpdir, TODO common tmp dir
                    None,  #jobSpec.eventranges, # TODO event ranges
                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))

                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' % jobspec.jobParams[
                        'logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'

                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(
                    proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    tmplog.error(traceback.format_exc())
                    result = (False,
                              "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
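For context, the panda_queues.json cache consumed above maps each computingSite to its CE endpoints; a made-up fragment showing only the fields the loop reads (real entries contain many more keys):

# made-up fragment illustrating the structure read by the submission loop above
pandaqueues = {
    'ARC-TEST': {
        'queues': [
            {'ce_endpoint': 'arc-ce01.example.org', 'ce_queue_name': 'grid'},
            {'ce_endpoint': 'gsiftp://arc-ce02.example.org:2811/jobs', 'ce_queue_name': 'atlas'},
        ],
    },
}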
Example 17
    def check_status(self, jobspec):
        # make logger
        tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='check_status')
        tmpLog.debug('executing base check_status')
        tmpStat, tmpMsg = GlobusBulkStager.check_status(self, jobspec)
        tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
        if tmpStat is not True:
            return tmpStat, tmpMsg
        # get transfer groups
        groups = jobspec.get_groups_of_output_files()
        if len(groups) == 0:
            return tmpStat, tmpMsg
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
        # check queueConfig stager section to see if srcRSE is set
        if 'srcRSE' in queueConfig.stager:
            srcRSE = queueConfig.stager['srcRSE']
        else:
            srcRSE = None
            tmpLog.debug('Warning: srcRSE not defined in the stager section of the queue config file')
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
        # log source and destination RSEs and test that both are defined
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
        errStr = ''
        if srcRSE is None:
            errStr = 'Source RSE is not defined '
        if dstRSE is None:
            errStr = errStr + ' Destination RSE is not defined'
        if (srcRSE is None) or (dstRSE is None):
            tmpLog.error(errStr)
            return None, errStr
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        # create the Rucio Client
        try:
            rucioAPI = RucioClient()
        except Exception:
            core_utils.dump_error_message(tmpLog)
            # treat as a temporary error
            tmpStat = None
            tmpMsg = 'failed to create the Rucio client'
            return tmpStat, tmpMsg
        # loop over all transfers
        tmpStat = True
        tmpMsg = ''
        for transferID in groups:
            if transferID is None:
                continue
            datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID)
            datasetScope = 'transient'
            # lock
            have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120)
            if not have_db_lock:
                msgStr = 'escape since {0} is locked by another thread'.format(transferID)
                tmpLog.debug(msgStr)
                return None, msgStr
            # get transfer status
            groupStatus = self.dbInterface.get_file_group_status(transferID)
            if 'hopped' in groupStatus:
                # already succeeded
                pass
            elif 'failed' in groupStatus:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName)
            elif 'hopping' in groupStatus:
                # check rucio rule
                ruleStatus = 'FAILED'
                try:
                    tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName))
                    for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName):
                        if ruleInfo['rse_expression'] != dstRSE:
                            continue
                        ruleStatus = ruleInfo['state']
                        tmpLog.debug('got state={0}'.format(ruleStatus))
                        if ruleStatus == 'OK':
                            break
                except DataIdentifierNotFound:
                    tmpLog.error('dataset not found')
                except Exception:
                    core_utils.dump_error_message(tmpLog)
                    ruleStatus = None
                if ruleStatus in ['FAILED', 'CANCELED']:
                    # transfer failure
                    tmpStat = False
                    tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(datasetScope, datasetName, ruleStatus)
                    # update file group status
                    self.dbInterface.update_file_group_status(transferID, 'failed')
                elif ruleStatus == 'OK':
                    # update successful file group status
                    self.dbInterface.update_file_group_status(transferID, 'hopped')
                else:
                    # replicating or temporary error
                    tmpStat = None
                    tmpMsg = 'replicating or temporary error for {0}:{1}'.format(datasetScope, datasetName)
            else:
                # make rucio rule
                fileSpecs = self.dbInterface.get_files_with_group_id(transferID)
                fileList = []
                for fileSpec in fileSpecs:
                    tmpFile = dict()
                    tmpFile['scope'] = datasetScope
                    tmpFile['name'] = fileSpec.lfn
                    tmpFile['bytes'] = fileSpec.fsize
                    tmpFile['adler32'] = fileSpec.chksum
                    if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                        tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                    else :
                        tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                    tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                    fileList.append(tmpFile)
                    # get source RSE
                    if srcRSE is None and fileSpec.objstoreID is not None:
                        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                        srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                try:
                    # register dataset
                    tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                                 .format(datasetScope, datasetName,srcRSE,(30*24*60*60)))
                    try:
                        rucioAPI.add_dataset(datasetScope, datasetName,
                                             meta={'hidden': True},
                                             lifetime=30 * 24 * 60 * 60,
                                             rse=srcRSE
                                             )
                    except DataIdentifierAlreadyExists:
                        # ignore even if the dataset already exists
                        pass
                    except Exception:
                        errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope,
                                                                                        datasetName,
                                                                                        srcRSE)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg) 
                        raise
                        # return None,errMsg
                    # add files to dataset
                    #  add 500 files at a time
                    numfiles = len(fileList)
                    maxfiles = 500
                    numslices = numfiles // maxfiles
                    if (numfiles % maxfiles) > 0:
                        numslices = numslices + 1
                    start = 0
                    for i in range(numslices) :
                        try:
                            stop = start + maxfiles
                            if stop > numfiles :
                                stop = numfiles

                            rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                             'name': datasetName,
                                                             'dids': fileList[start:stop],
                                                             'rse': srcRSE}],
                                                           ignore_duplicate=True)
                            start = stop
                        except FileAlreadyExists:
                            # ignore if files already exist
                            pass
                        except Exception:
                            errMsg = 'Could not add files to DS - {0}:{1}  rse - {2} files - {3}'.format(datasetScope,
                                                                                                         datasetName,
                                                                                                         srcRSE,
                                                                                                         fileList)
                            core_utils.dump_error_message(tmpLog)
                            tmpLog.error(errMsg)
                            return None,errMsg
                    # add rule
                    try:
                        tmpDID = dict()
                        tmpDID['scope'] = datasetScope
                        tmpDID['name'] = datasetName
                        tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                               lifetime=30 * 24 * 60 * 60)
                        ruleIDs = tmpRet[0]
                        tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName,
                                                                                       str(ruleIDs)))
                    except DuplicateRule:
                        # ignore duplicated rule
                        tmpLog.debug('rule is already available')
                    except Exception:
                        errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.debug(errMsg)
                        #raise
                        return None,errMsg
                    # update file group status
                    self.dbInterface.update_file_group_status(transferID, 'hopping')
                except Exception:
                    core_utils.dump_error_message(tmpLog)
                    # treat as a temporary error
                    tmpStat = None
                    tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
            # release lock
            self.dbInterface.release_object_lock(transferID)
            # escape if already failed
            if tmpStat is False:
                break
        # all done
        if tmpStat is True:
            self.set_FileSpec_status(jobspec, 'finished')
        tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
        return tmpStat, tmpMsg
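The dataset registration above adds files in batches of at most 500; a small worked example of the slice arithmetic (numbers are illustrative):

numfiles, maxfiles = 1234, 500                 # illustrative counts
numslices = numfiles // maxfiles               # 2 full slices of 500
if numfiles % maxfiles > 0:
    numslices += 1                             # plus one partial slice -> 3
slices = [(i * maxfiles, min((i + 1) * maxfiles, numfiles)) for i in range(numslices)]
# slices == [(0, 500), (500, 1000), (1000, 1234)]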
Example 18
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields, job_pars_parsed)
            tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable,
                                                                                          args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())

            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
            cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = this_panda_queue_dict['maxtime']
            except Exception as e:
                tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
                max_time = None

            associated_params_dict = {}
            for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
                if key in self._allowed_agis_attrs:
                    associated_params_dict[key] = val

            pilot_url = associated_params_dict.get('pilot_url')
            pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
            python_version = str(this_panda_queue_dict.get('python_version', '2'))

            # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
            if pilot_opt_dict is None:
                prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
                pilot_type = work_spec.pilotType
                pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
            else:
                prod_source_label = pilot_opt_dict['prod_source_label']
                pilot_type = pilot_opt_dict['pilot_type_opt']
                pilot_url_str = pilot_opt_dict['pilot_url_str']

            pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                           pilot_type, pilot_url_str,
                                                                           pilot_python_option,
                                                                           container_image, executable, args, cert,
                                                                           cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                           memory_adjust_ratio=self.memoryAdjustRatio,
                                                                           max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            work_spec.batchID = yaml_content['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
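# Illustrative shape of the dict returned by get_complicated_pilot_options() for
# special pilot types, based on the keys read above (the values are made up):
#   {'prod_source_label': 'rc_test2',
#    'pilot_type_opt': 'RC',
#    'pilot_url_str': '--piloturl http://example.org/pilot/pilot3.tar.gz'}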
class MultiNodeWorkerMaker(BaseWorkerMaker):
    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return a string containing the body of the script for the scheduler: specific environment setup, executor with parameters
        exe_str = ""

        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')

        # prepare static environment
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(),  self.env.split(", ")))

        # prepare executor
        try:
            if self.executor == "aprun":  # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""

        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)

        return exe_str
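    # Illustrative value of exe_str built above for the "aprun" branch, assuming
    # env = "export FOO=1, export BAR=2", nJobsPerWorker = 16, nCorePerJob = 8
    # (all values made up):
    #
    #   export FOO=1
    #   export BAR=2
    #   aprun -n 16 -d 8 <pilot> <pilot_params>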

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit,
                                                                                  self.nNodes))

        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes,
                                                                                             workSpec.maxWalltime,
                                                                                             self.nJobsPerWorker))

        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resources and map them to the number of jobs
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes

        return nodes, walltime
    def check_status(self, jobspec):
        tmpStat = True
        tmpMsg = ''
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                                  method_name='check_status')
        tmpLog.debug('start')
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        # Get the files grouped by Rucio Rule ID 
        groups = jobspec.get_groups_of_output_files()
        if len(groups) == 0:
            tmpLog.debug('No Rucio Rules')
            return None,'No Rucio Rules'
        tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups),groups)) 
        
        try:
            rucioAPI = RucioClient()
        except:
            tmpLog.error('failed to get Rucio Client; try again later')
            return None, 'failed to get Rucio Client; try again later'

        # loop over the Rucio rules 
        for rucioRule in groups:
            if rucioRule is None:
                continue
            # lock
            have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
            if not have_db_lock:
                msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
                tmpLog.debug(msgStr)
                return None, msgStr
            # get transfer status
            groupStatus = self.dbInterface.get_file_group_status(rucioRule)
            tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule,groupStatus))
            if 'transferred' in groupStatus:
                # already succeeded - set the fileSpec status for these files 
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                pass
            elif 'failed' in groupStatus :
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule {0} already failed'.format(rucioRule)
            elif 'transferring' in groupStatus or 'pending' in groupStatus:
                # transfer started in Rucio; check its status
                try:
                    result = rucioAPI.get_replication_rule(rucioRule,False)
                    if result['state'] == "OK" :
                        # files transferred to nucleus
                        tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                        self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                        # set the fileSpec status for these files 
                        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                        self.set_FileSpec_status(jobspec,'finished')
                    elif result['state'] == "FAILED" :
                        # failed Rucio Transfer
                        tmpStat = False
                        tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                        tmpLog.debug(tmpMsg)
                        self.set_FileSpec_status(jobspec,'failed')
                    elif result['state'] == 'STUCK' :
                        tmpStat = None
                        tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                        tmpLog.debug(tmpMsg)
                except Exception:
                    tmpStat = None
                    tmpMsg = 'Could not get information for Rucio Rule {0}'.format(rucioRule)
                    tmpLog.error(tmpMsg)
            # release the lock
            if have_db_lock:
                tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
                release_db_lock = self.dbInterface.release_object_lock(rucioRule) 
                if release_db_lock:
                    tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                    have_db_lock = False 
                else:
                    msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                    tmpLog.error(msgStr)
                    return None, msgStr

        tmpLog.debug('stop')
        return tmpStat, tmpMsg
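
# --- Hedged illustration (not part of the original example) ---
# check_status() follows the three-valued convention visible above: it returns
# (True, msg) when the Rucio rules are done, (False, msg) on a hard failure and
# (None, msg) when the check should simply be retried later.  A caller could
# dispatch on the first element like this (the stager and jobspec objects are
# assumed, not taken from the original):
def poll_transfer(stager, jobspec):
    status, message = stager.check_status(jobspec)
    if status is True:
        print('transfers finished: %s' % message)
    elif status is False:
        print('transfers failed: %s' % message)
    else:  # None -> temporary condition, poll again on the next cycle
        print('retry later: %s' % message)
    return status
# --- end of illustration ---
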
    def check_stage_out_status(self, jobspec):
        tmpStat = True
        tmpMsg = ''
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                                  method_name='check_stage_out_status')
        tmpLog.debug('start')
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        # Get the files grouped by Rucio Rule ID 
        groups = jobspec.get_groups_of_output_files()
        if len(groups) == 0:
            tmpLog.debug('No Rucio Rules')
            return None,'No Rucio Rules'
        tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups),groups)) 
        
        try:
            rucioAPI = RucioClient()
        except:
            tmpLog.error('failed to get Rucio Client; try again later')
            return None, 'failed to get Rucio Client; try again later'

        # loop over the Rucio rules 
        for rucioRule in groups:
            if rucioRule is None:
                continue
            # lock
            have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
            if not have_db_lock:
                msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
                tmpLog.debug(msgStr)
                return None, msgStr
            # get transfer status
            groupStatus = self.dbInterface.get_file_group_status(rucioRule)
            tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule,groupStatus))
            if 'transferred' in groupStatus:
                # already succeeded - set the fileSpec status for these files 
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                pass
            elif 'failed' in groupStatus :
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule {0} already failed'.format(rucioRule)
            elif 'transferring' in groupStatus or 'pending' in groupStatus:
                # transfer started in Rucio; check its status
                try:
                    result = rucioAPI.get_replication_rule(rucioRule,False)
                    if result['state'] == "OK" :
                        # files transferred to nucleus
                        tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                        self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                        # set the fileSpec status for these files 
                        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                        self.set_FileSpec_status(jobspec,'finished')
                    elif result['state'] == "FAILED" :
                        # failed Rucio Transfer
                        tmpStat = False
                        tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                        tmpLog.debug(tmpMsg)
                        self.set_FileSpec_status(jobspec,'failed')
                    elif result['state'] == 'STUCK' :
                        tmpStat = None
                        tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                        tmpLog.debug(tmpMsg)
                except Exception:
                    tmpStat = None
                    tmpMsg = 'Could not get information for Rucio Rule {0}'.format(rucioRule)
                    tmpLog.error(tmpMsg)
            # release the lock
            if have_db_lock:
                tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
                release_db_lock = self.dbInterface.release_object_lock(rucioRule) 
                if release_db_lock:
                    tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                    have_db_lock = False 
                else:
                    msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                    tmpLog.error(msgStr)
                    return None, msgStr

        tmpLog.debug('stop')
        return tmpStat, tmpMsg
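
# --- Hedged illustration (not part of the original example) ---
# Both check_status() and check_stage_out_status() resolve the destination RSE
# for the job's nucleus from the cached panda_queues.json and then map that RSE
# to an object-store ID through agis_ddmendpoints.json.  With invented cache
# contents the lookup reduces to:
agis = {'CERN-PROD': {'atlas_site': 'CERN-PROD',
                      'astorages': {'pr': ['CERN-PROD_DATADISK']}}}
ddm = {'CERN-PROD_DATADISK': {'id': 42}}
nucleus = 'CERN-PROD'
dstRSE = [agis[x]['astorages']['pr'][0] for x in agis
          if agis[x]['atlas_site'] == nucleus][0]   # -> 'CERN-PROD_DATADISK'
objstoreID = ddm[dstRSE]['id']                      # -> 42
# --- end of illustration ---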
Example no. 22
0
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     self.pluginFactory = PluginFactory()
     self.queue_config_mapper = QueueConfigMapper()
     tmpLog = self.make_logger(baseLogger, method_name='__init__')
     tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)

        self.queue_config_mapper = QueueConfigMapper()
Example no. 24
0
    def check_status(self, jobspec):
        # make logger
        tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                                  method_name='check_status')
        tmpLog.debug('start')
        # show the dummy transfer id and set to a value with the PandaID if needed.
        tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id))
        if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') :
            old_dummy_transfer_id = self.dummy_transfer_id
            self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.PandaID)
            tmpLog.debug('Change self.dummy_transfer_id  from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id))
 
        # default return
        tmpRetVal = (True, '')
        # set flag if have db lock
        have_db_lock = False 
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # test we have a Globus Transfer Client
        if not self.tc :
            errStr = 'failed to get Globus Transfer Client'
            tmpLog.error(errStr)
            return False, errStr
        # set transferID to None
        transferID = None
        # get the scope of the log files
        outfileattrib = jobspec.get_output_file_attributes()
        scopeLog = 'xxxx'
        for key in outfileattrib.keys():
            if "log.tgz" in key :
                scopeLog = outfileattrib[key]['scope']
        # get transfer groups
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
        for dummy_transferID in groups:
            # skip if valid transfer ID not dummy one
            if validate_transferid(dummy_transferID) :
                continue
            # lock for 120 sec
            tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
            have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120)
            if not have_db_lock:
                # escape since locked by another thread
                msgStr = 'escape since locked by another thread'
                tmpLog.debug(msgStr)
                return None, msgStr
            # refresh group information since that could have been updated by another thread before getting the lock
            tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
            self.dbInterface.refresh_file_group_info(jobspec)
            # get transfer groups again with refreshed info
            tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()')
            groups = jobspec.get_groups_of_output_files()
            tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
            # the dummy transfer ID is still there
            if dummy_transferID in groups:
                groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
                # get files with the dummy transfer ID across jobs
                fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID)
                # submit a transfer if there are at least 10 files or the group was created more than 10 min ago
                msgStr = 'dummy_transferID = {0}  number of files = {1}'.format(dummy_transferID,len(fileSpecs))
                tmpLog.debug(msgStr)
                if len(fileSpecs) >= 10 or \
                        groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    tmpLog.debug('prepare to transfer files')
                    # submit transfer and get a real transfer ID
                    # set the Globus destination Endpoint id and path; will get them from AGIS eventually
                    #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                    self.srcEndpoint = queueConfig.stager['srcEndpoint']
                    self.Globus_srcPath = self.basePath
                    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                    self.dstEndpoint = queueConfig.stager['dstEndpoint']
                    # Test the endpoints and create the transfer data class 
                    errMsg = None
                    try:
                        # Test endpoints for activation
                        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
                        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
                        if tmpStatsrc and tmpStatdst:
                            errStr = 'source Endpoint and destination Endpoint activated'
                            tmpLog.debug(errStr)
                        else:
                            errMsg = ''
                            if not tmpStatsrc :
                                errMsg += ' source Endpoint not activated '
                            if not tmpStatdst :
                                errMsg += ' destination Endpoint not activated '
                            # release process lock
                            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                            self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                            if not self.have_db_lock:
                                errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                            tmpRetVal = (None,errMsg)
                            return tmpRetVal
                        # both endpoints activated now prepare to transfer data
                        tdata = None
                        tdata = TransferData(self.tc,
                                             self.srcEndpoint,
                                             self.dstEndpoint,
                                             sync_level="checksum")
                    except:
                        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (errStat, errMsg)
                        return tmpRetVal
                    # loop over all files
                    ifile = 0
                    for fileSpec in fileSpecs:
                        logfile = False
                        scope ='panda'
                        if fileSpec.scope is not None :
                            scope = fileSpec.scope
                        # for Yoda job set the scope to transient for non log files
                        if self.Yodajob :
                            scope = 'transient'
                        if fileSpec.fileType == "log" :
                            logfile = True
                            scope = scopeLog
                        # only log the first 25 files
                        if ifile < 25 :
                            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                            tmpLog.debug(msgStr)
                        if ifile == 25 :
                            msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                            tmpLog.debug(msgStr)
                        hash = hashlib.md5()
                        hash.update('%s:%s' % (scope, fileSpec.lfn))
                        hash_hex = hash.hexdigest()
                        correctedscope = "/".join(scope.split('.'))
                        srcURL = fileSpec.path
                        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                                   scope=correctedscope,
                                                                                   hash1=hash_hex[0:2],
                                                                                   hash2=hash_hex[2:4],
                                                                                   lfn=fileSpec.lfn)
                        if logfile :
                            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                        if ifile < 25 :
                            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                        # add files to transfer object - tdata
                        if os.access(srcURL, os.R_OK):
                            if ifile < 25 :
                                tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
                            tdata.add_item(srcURL,dstURL)
                        else:
                            errMsg = "source file {} does not exist".format(srcURL)
                            # release process lock
                            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                            if not release_db_lock:
                                errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                            tmpRetVal = (False,errMsg)
                            return tmpRetVal
                        ifile += 1
                    # submit transfer 
                    tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA'])))
                    try:
                        transfer_result = self.tc.submit_transfer(tdata)
                        # check status code and message
                        tmpLog.debug(str(transfer_result))
                        if transfer_result['code'] == "Accepted":
                            # succeeded
                            # set transfer ID which are used for later lookup
                            transferID = transfer_result['task_id']
                            tmpLog.debug('successfully submitted id={0}'.format(transferID))
                            # set status for files
                            self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                            msgStr = 'submitted transfer with ID={0}'.format(transferID)
                            tmpLog.debug(msgStr)
                        else:
                            # release process lock
                            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                            if not release_db_lock:
                                errMsg = 'Could not release DB lock for {}'.format(dummy_transferID)
                                tmpLog.error(errMsg)
                            tmpRetVal = (None, transfer_result['message'])
                            return tmpRetVal
                    except Exception as e:
                        errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        return errStat, errMsg
                else:
                    msgStr = 'wait until enough files are pooled'
                    tmpLog.debug(msgStr)
                # release the lock
                tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) 
                if release_db_lock:
                    tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                    have_db_lock = False
                else:
                    msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(msgStr)
                # return None to retry later
                return None, msgStr
            # release the db lock if needed
            if have_db_lock:
                tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) 
                if release_db_lock:
                    tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                    have_db_lock = False 
                else:
                    msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(msgStr)
                    return None, msgStr
        # check transfer with real transfer IDs
        # get transfer groups 
        tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
        tmpLog.debug('transfer groups any state - {0}'.format(groups))
        if len(groups) == 0:
            tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ")
            tmpLog.debug("check_status return status - True ")
            return True,''

        for transferID in groups:
            # allow only valid UUID
            if validate_transferid(transferID) :
                # get transfer task
                tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID)
                # return a temporary error when failed to get task
                if not tmpStat:
                    errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc),str(transferID))
                    tmpLog.error(errStr)
                    return None, errStr
                # return a temporary error when task is missing 
                if transferID not in transferTasks:
                    errStr = 'transfer task ID - {} is missing'.format(transferID)
                    tmpLog.error(errStr)
                    return None, errStr
                # succeeded in finding a transfer task by transferID
                if transferTasks[transferID]['status'] == 'SUCCEEDED':
                    tmpLog.debug('transfer task {} succeeded'.format(transferID))
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                    return True, ''
                # failed
                if transferTasks[transferID]['status'] == 'FAILED':
                    errStr = 'transfer task {} failed'.format(transferID)
                    tmpLog.error(errStr)
                    self.set_FileSpec_status(jobspec,'failed')
                    return False, errStr
                # another status
                tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status'])
                tmpLog.debug(tmpStr)
                return None, ''
        # end of loop over transfer groups
        tmpLog.debug('End of loop over transfer groups - ending check_status function')
        return None,'no valid transfer id found'
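
# --- Hedged illustration (not part of the original example) ---
# The destination URL assembled in check_status() above follows a deterministic
# "scope/xx/yy/lfn" layout where the two hash directories come from the md5 of
# "scope:lfn".  In isolation, and with a hypothetical endpoint argument, the
# same construction reads:
import hashlib

def build_dst_url(endpoint, scope, lfn):
    digest = hashlib.md5(('%s:%s' % (scope, lfn)).encode('utf-8')).hexdigest()
    corrected_scope = '/'.join(scope.split('.'))
    return '{endPoint}/{scope}/{hash1}/{hash2}/{lfn}'.format(
        endPoint=endpoint, scope=corrected_scope,
        hash1=digest[0:2], hash2=digest[2:4], lfn=lfn)
# --- end of illustration ---
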
class MultiNodeWorkerMaker(BaseWorkerMaker):
    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return a string containing the body of the script for the scheduler: specific environment setup plus the executor with its parameters
        exe_str = ""

        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')

        # prepare static environment
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(),  self.env.split(", ")))

        # prepare executor
        try:
            if self.executor == "aprun":  # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""

        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)

        return exe_str
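
# --- Hedged illustration (not part of the original example) ---
# A standalone reduction of the "aprun" branch of _get_executable() above; all
# values below are invented for the illustration:
env = "export ATHENA_PROC_NUMBER=8, export PILOT_DEBUG=1"
n_jobs_per_worker, n_core_per_job, pilot = 4, 8, "runpilot.sh"
env_str = "\n".join(s.strip() for s in env.split(", "))
exe_str = "aprun -n {0} -d {1} ".format(n_jobs_per_worker, n_core_per_job) + pilot
print("\n".join([env_str, exe_str]))
# export ATHENA_PROC_NUMBER=8
# export PILOT_DEBUG=1
# aprun -n 4 -d 8 runpilot.sh
# --- end of illustration ---
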

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit,
                                                                                  self.nNodes))

        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes,
                                                                                             workSpec.maxWalltime,
                                                                                             self.nJobsPerWorker))

        return workSpec
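
# --- Hedged illustration (not part of the original example) ---
# In the push case make_worker() simply sums the per-job memory and disk
# requests into the WorkSpec, skipping jobs that lack the field.  Reduced to
# plain dictionaries (the job parameters below are invented):
job_params_list = [{'minRamCount': 2000, 'maxDiskCount': 300},
                   {'minRamCount': 1500}]
min_ram = sum(j.get('minRamCount', 0) for j in job_params_list)    # 3500
max_disk = sum(j.get('maxDiskCount', 0) for j in job_params_list)  # 300
# --- end of illustration ---
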

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resources and map them to the number of jobs
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes

        return nodes, walltime
Example no. 26
0
    def submit_workers(self, workspec_list):
        retlist = []

        # Get queue info from DB
        pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
        if pandaqueues is None:
            raise Exception("Failed to get panda queue info from database")
        pandaqueues = pandaqueues.data

        osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
        if osmap is None:
            raise Exception("Failed to get Object Store info from database")
        osmap = osmap.data

        for workspec in workspec_list:

            arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
            tmplog = arclog.log

            # Assume for aCT that jobs are always pre-fetched (no late-binding)
            for jobspec in workspec.get_jobspec_list():

                tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

                if jobspec.computingSite not in pandaqueues:
                    retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                    continue

                # Get CEs from panda queue info
                # List of (endpoint, queue) tuples
                arcces = []
                for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                    ce_endpoint = endpoint['ce_endpoint']
                    if not re.search('://', ce_endpoint):
                        ce_endpoint = 'gsiftp://%s' % ce_endpoint
                    ce_queue = endpoint['ce_queue_name']
                    arcces.append((ce_endpoint, ce_queue))

                if not arcces:
                    retlist.append((False, "No CEs defined for %{0}".format(jobspec.computingSite)))
                    continue

                # Set true pilot or not
                queueconfigmapper = QueueConfigMapper()
                queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
                pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

                # Set log URL for GTAG env in job description
                logbaseurl = queueconfig.submitter.get('logBaseURL')
                logsubdir = self._set_logdir(jobspec.computingSite)
                logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

                tmplog.debug("Converting to ARC XRSL format")
                arcxrsl = ARCParser(jobspec.jobParams,
                                    jobspec.computingSite,
                                    pandaqueues[jobspec.computingSite],
                                    logfileurl,
                                    self.schedulerid,
                                    osmap,
                                    '/tmp', # tmpdir, TODO common tmp dir
                                    None, #jobSpec.eventranges, # TODO event ranges
                                    tmplog)
                arcxrsl.parse()
                xrsl = arcxrsl.getXrsl()
                tmplog.debug("ARC xrsl: {0}".format(xrsl))
                
                # Set the files to be downloaded at the end of the job
                downloadfiles = 'gmlog/errors'
                if 'logFile' in jobspec.jobParams:
                    downloadfiles += ';%s' %jobspec.jobParams['logFile'].replace('.tgz', '')
                if not pandaqueues[jobspec.computingSite]['truepilot']:
                    downloadfiles += ';jobSmallFiles.tgz'
                    
                # Set certificate
                userconfig = arc.UserConfig(self.cred_type)
                proxyrole = ''
                if jobspec.jobParams['prodSourceLabel'] == 'user':
                    userconfig.ProxyPath(str(self.certs['pilot']))
                    proxyrole = 'pilot'
                else:
                    userconfig.ProxyPath(str(self.certs['production']))
                    proxyrole = 'production'
                tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

                try:
                    tmplog.debug("Submission targets: {0}".format(arcces))
                    arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                    tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                    arc_utils.arcjob2workspec(arcjob, workspec)
                    workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                    workspec.workAttributes['proxyrole'] = proxyrole
                    workspec.workAttributes['logsubdir'] = logsubdir
                    workspec.batchID = arcjob.JobID
                    tmplog.debug(workspec.workAttributes)
                    result = (True, '')
                except Exception as exc:
                    tmplog.error(traceback.format_exc())
                    result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

                retlist.append(result)

        return retlist
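
# --- Hedged illustration (not part of the original example) ---
# submit_workers() above collects (endpoint, queue) pairs from the cached
# panda_queues.json entry and prepends a gsiftp:// scheme when the endpoint
# carries none.  With an invented queue entry the normalisation looks like:
import re

queues = [{'ce_endpoint': 'ce1.example.org:2811', 'ce_queue_name': 'atlas'},
          {'ce_endpoint': 'https://arc.example.org:443/arex', 'ce_queue_name': 'grid'}]
arcces = []
for endpoint in queues:
    ce_endpoint = endpoint['ce_endpoint']
    if not re.search('://', ce_endpoint):
        ce_endpoint = 'gsiftp://%s' % ce_endpoint
    arcces.append((ce_endpoint, endpoint['ce_queue_name']))
# arcces == [('gsiftp://ce1.example.org:2811', 'atlas'),
#            ('https://arc.example.org:443/arex', 'grid')]
# --- end of illustration ---
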
Example no. 27
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()

    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print ('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)

    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print ('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware,
                                                                                               options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)

    # get ssh parameters
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None

    privateKey = None
    passPhrase = None
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print ("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None

    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22

    # ssh
    sshTunnelPool.make_tunnel_server(sshHost, sshPort, remote_bind_port=middleware['remoteBindPort'],
                                     num_tunnels=1, ssh_username=sshUserName, ssh_password=sshPassword,
                                     private_key=privateKey, pass_phrase=passPhrase,
                                     jump_host=jumpHost, jump_port=jumpPort
                                     )
    ssh = sshTunnelPool.get_tunnel(sshHost, sshPort)[-1]
    return ssh
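
# --- Hedged illustration (not part of the original example) ---
# main() above reads the SSH parameters from the chosen middleware section of
# panda_queueconfig.json: remoteHost, sshUserName and remoteBindPort are
# required, everything else falls back to a default (port 22, key instead of
# password, no jump host).  A minimal, hypothetical section consistent with
# that logic:
middleware_example = {
    'remoteHost': 'login.example.org',
    'remotePort': 22,
    'sshUserName': 'harvester',
    'privateKey': '/home/harvester/.ssh/id_rsa',
    'passPhrase': None,
    'remoteBindPort': 25080,
    # optional: 'sshPassword', 'jumpHost', 'jumpPort'
}
# --- end of illustration ---
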
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     self.queue_config_mapper = QueueConfigMapper()
Example no. 29
0
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     self.pluginFactory = PluginFactory()
     self.queue_config_mapper = QueueConfigMapper()
     tmpLog = self.make_logger(baseLogger, method_name='__init__')
     tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
 def check_status(self, jobspec):
     # make logger
     tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                     method_name='check_status')
     tmpLog.debug('start')
     # default return
     tmpRetVal = (True, '')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
     # test we have a Globus Transfer Client
     if not self.tc :
         errStr = 'failed to get Globus Transfer Client'
         tmpLog.error(errStr)
         return False, errStr
     # set transferID to None
     transferID = None
     # get transfer groups
     groups = jobspec.get_groups_of_input_files(skip_ready=True)
     tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups))
     # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
     if self.dummy_transfer_id in groups:
         # lock for 120 sec
         if not self.have_db_lock :
             tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
             self.have_db_lock = self.dbInterface.get_object_lock(self.dummy_transfer_id, lock_interval=120)
         if not self.have_db_lock:
             # escape since locked by another thread
             msgStr = 'escape since locked by another thread'
             tmpLog.debug(msgStr)
             return None, msgStr
         # refresh group information since that could have been updated by another thread before getting the lock
         self.dbInterface.refresh_file_group_info(jobspec)
         # get transfer groups again with refreshed info
         groups = jobspec.get_groups_of_input_files(skip_ready=True)
         # the dummy transfer ID is still there
         if self.dummy_transfer_id in groups:
             groupUpdateTime = groups[self.dummy_transfer_id]['groupUpdateTime']
             # get files with the dummy transfer ID across jobs
             fileSpecs = self.dbInterface.get_files_with_group_id(self.dummy_transfer_id)
             # submit a transfer if there are at least 10 files or the group was created more than 10 min ago
             msgStr = 'self.dummy_transfer_id = {0}  number of files = {1}'.format(self.dummy_transfer_id,len(fileSpecs))
             tmpLog.debug(msgStr)
             if len(fileSpecs) >= 10 or \
                     groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                 tmpLog.debug('prepare to transfer files')
                 # submit transfer and get a real transfer ID
                  # set the Globus destination Endpoint id and path; will get them from AGIS eventually
                 from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                 queueConfigMapper = QueueConfigMapper()
                 queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
                 self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
                 self.srcEndpoint = queueConfig.preparator['srcEndpoint']
                 self.Globus_dstPath = self.basePath
                 #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
                 self.dstEndpoint = queueConfig.preparator['dstEndpoint']
                 # Test the endpoints and create the transfer data class 
                 errMsg = None
                 try:
                     # Test endpoints for activation
                     tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
                     tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
                     if tmpStatsrc and tmpStatdst:
                         errStr = 'source Endpoint and destination Endpoint activated'
                         tmpLog.debug(errStr)
                     else:
                         errMsg = ''
                         if not tmpStatsrc :
                             errMsg += ' source Endpoint not activated '
                         if not tmpStatdst :
                             errMsg += ' destination Endpoint not activated '
                         # release process lock
                         tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                         self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                         if not self.have_db_lock:
                             errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                         tmpLog.error(errMsg)
                         tmpRetVal = (None,errMsg)
                         return tmpRetVal
                     # both endpoints activated now prepare to transfer data
                     tdata = TransferData(self.tc,
                                          self.srcEndpoint,
                                          self.dstEndpoint,
                                          sync_level="checksum")
                 except:
                     errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                     # release process lock
                     tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                     self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                     if not self.have_db_lock:
                         errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                     tmpLog.error(errMsg)
                     tmpRetVal = (errStat, errMsg)
                     return tmpRetVal
                 # loop over all files
                 for fileSpec in fileSpecs:
                     attrs = jobspec.get_input_file_attributes()
                     msgStr = "len(jobSpec.get_input_file_attributes()) = {0} type - {1}".format(len(attrs),type(attrs))
                     tmpLog.debug(msgStr)
                     for key, value in attrs.iteritems():
                         msgStr = "input file attributes - {0} {1}".format(key,value)
                         tmpLog.debug(msgStr)
                     msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                     tmpLog.debug(msgStr)
                     scope = fileSpec.scope
                     hash = hashlib.md5()
                     hash.update('%s:%s' % (scope, fileSpec.lfn))
                     hash_hex = hash.hexdigest()
                     correctedscope = "/".join(scope.split('.'))
                     #srcURL = fileSpec.path
                     srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath,
                                                                                scope=correctedscope,
                                                                                hash1=hash_hex[0:2],
                                                                                hash2=hash_hex[2:4],
                                                                                lfn=fileSpec.lfn)
                     dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                                scope=correctedscope,
                                                                                hash1=hash_hex[0:2],
                                                                                hash2=hash_hex[2:4],
                                                                                lfn=fileSpec.lfn)
                     tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                     # add files to transfer object - tdata
                     tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
                     tdata.add_item(srcURL,dstURL)
                 # submit transfer 
                 try:
                     transfer_result = self.tc.submit_transfer(tdata)
                     # check status code and message
                     tmpLog.debug(str(transfer_result))
                     if transfer_result['code'] == "Accepted":
                         # succeeded
                         # set transfer ID which are used for later lookup
                         transferID = transfer_result['task_id']
                         tmpLog.debug('successfully submitted id={0}'.format(transferID))
                         # set status for files
                         self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                         msgStr = 'submitted transfer with ID={0}'.format(transferID)
                         tmpLog.debug(msgStr)
                     else:
                         # release process lock
                         tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                         self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                         if not self.have_db_lock:
                             errMsg = 'Could not release DB lock for {}'.format(self.dummy_transfer_id)
                             tmpLog.error(errMsg)
                         tmpRetVal = (None, transfer_result['message'])
                         return tmpRetVal
                 except Exception as e:
                     errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
                     # release process lock
                     tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                     self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                     if not self.have_db_lock:
                         errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                     tmpLog.error(errMsg)
                     return errStat, errMsg
             else:
                 msgStr = 'wait until enough files are pooled'
                 tmpLog.debug(msgStr)
             # release the lock
             tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
             self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id) 
             if not self.have_db_lock:
                 msgStr += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                 tmpLog.error(msgStr)
             # return None to retry later
             return None, msgStr
     # check transfer with real transfer IDs
     # get transfer groups 
     groups = jobspec.get_groups_of_input_files(skip_ready=True)
     for transferID in groups:
         if transferID != self.dummy_transfer_id :
             # get transfer task
             tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID)
             # return a temporary error when failed to get task
             if not tmpStat:
                 errStr = 'failed to get transfer task'
                 tmpLog.error(errStr)
                 return None, errStr
             # return a temporary error when task is missing 
             if transferID not in transferTasks:
                 errStr = 'transfer task ID - {} is missing'.format(transferID)
                 tmpLog.error(errStr)
                 return None, errStr
              # succeeded in finding a transfer task by transferID
             if transferTasks[transferID]['status'] == 'SUCCEEDED':
                 tmpLog.debug('transfer task {} succeeded'.format(transferID))
                 self.set_FileSpec_status(jobspec,'finished')
                 return True, ''
             # failed
             if transferTasks[transferID]['status'] == 'FAILED':
                 errStr = 'transfer task {} failed'.format(transferID)
                 tmpLog.error(errStr)
                 self.set_FileSpec_status(jobspec,'failed')
                 return False, errStr
             # another status
             tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status'])
             tmpLog.debug(tmpStr)
             return None, ''
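
# --- Hedged illustration (not part of the original example) ---
# Both Globus-based check_status() variants above only submit a real transfer
# once enough files have accumulated behind the dummy transfer ID: at least 10
# files, or a group older than 10 minutes.  The predicate in isolation:
import datetime

def ready_to_submit(file_specs, group_update_time,
                    min_files=10, max_age=datetime.timedelta(minutes=10)):
    return (len(file_specs) >= min_files or
            group_update_time < datetime.datetime.utcnow() - max_age)
# --- end of illustration ---
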
Example no. 31
0
 def trigger_preparation(self, jobspec):
     # get logger
     tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                               method_name='trigger_preparation')
     tmpLog.debug('start')               
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
     # test we have a Globus Transfer Client
     if not self.tc :
         errStr = 'failed to get Globus Transfer Client'
         tmpLog.error(errStr)
         return False, errStr
     # get label
     label = self.make_label(jobspec)
     tmpLog.debug('label={0}'.format(label))
     # get transfer tasks
     tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label)
     if not tmpStat:
         errStr = 'failed to get transfer tasks'
         tmpLog.error(errStr)
         return False, errStr
     # check if already queued
     if label in transferTasks:
         tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
         return True, ''
     # set the Globus destination endpoint ID and path; these will eventually come from AGIS
     from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
     queueConfigMapper = QueueConfigMapper()
     queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
     self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
     self.srcEndpoint = queueConfig.preparator['srcEndpoint']
     self.Globus_dstPath = self.basePath
     #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
     self.dstEndpoint = queueConfig.preparator['dstEndpoint']
     # get input files
     files = []
     lfns = []
     inFiles = jobspec.get_input_file_attributes(skip_ready=True)
     for inLFN, inFile in iteritems(inFiles):
         # set path to each file
         inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN)
         dstpath = inFile['path']
         # check if path exists if not create it.
         if not os.access(self.basePath, os.F_OK):
             os.makedirs(self.basePath)
         # create the file paths for the Globus source and destination endpoints 
         Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN)
         Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN)
         files.append({'scope': inFile['scope'],
                       'name': inLFN,
                       'Globus_dstPath': Globus_dstpath,
                       'Globus_srcPath': Globus_srcpath})
         lfns.append(inLFN)
     tmpLog.debug('files[] {0}'.format(files))
     try:
         # Test endpoints for activation
         tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
         tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
         if tmpStatsrc and tmpStatdst:
             errStr = 'source Endpoint and destination Endpoint activated'
             tmpLog.debug(errStr)
         else:
             errStr = ''
             if not tmpStatsrc :
                 errStr += ' source Endpoint not activated '
             if not tmpStatdst :
                 errStr += ' destination Endpoint not activated '
             tmpLog.error(errStr)
             return False,errStr
         # both endpoints activated now prepare to transfer data
         if len(files) > 0:
             tdata = TransferData(self.tc,
                                  self.srcEndpoint,
                                  self.dstEndpoint,
                                  label=label,
                                  sync_level="checksum")
             # loop over all input files and add 
             for myfile in files:
                 tdata.add_item(myfile['Globus_srcPath'],myfile['Globus_dstPath'])
             # submit
             transfer_result = self.tc.submit_transfer(tdata)
             # check status code and message
             tmpLog.debug(str(transfer_result))
             if transfer_result['code'] == "Accepted":
                 # succeeded
                 # set transfer ID which is used for later lookup
                 transferID = transfer_result['task_id']
                 jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                 tmpLog.debug('done')
                 return True,''
             else:
                 return False,transfer_result['message']
         # if no files to transfer return True
         return True, 'No files to transfer'
     except Exception:
         errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
         return errStat, errMsg
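The Globus submission path used above can be exercised on its own. The sketch below assumes an already-authenticated globus_sdk TransferClient (tc) and placeholder endpoint IDs; it mirrors the flow in trigger_preparation: build a TransferData with checksum syncing, add source/destination pairs, submit, and treat any code other than 'Accepted' as a failure.

from globus_sdk import TransferData

def submit_globus_transfer(tc, src_endpoint, dst_endpoint, file_pairs, label):
    # minimal sketch, not the harvester plugin itself
    tdata = TransferData(tc, src_endpoint, dst_endpoint,
                         label=label, sync_level="checksum")
    for src_path, dst_path in file_pairs:
        tdata.add_item(src_path, dst_path)
    transfer_result = tc.submit_transfer(tdata)
    if transfer_result['code'] == "Accepted":
        return True, transfer_result['task_id']
    return False, transfer_result['message']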
Example no. 32
import os
import sys
import atexit
import uuid
import random
import string

from pandaharvester.harvestercore.file_spec import FileSpec
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
from pandaharvester.harvestercore.plugin_factory import PluginFactory


file_prefix = 'panda.sgotest.'


def exit_func():
    for f in os.listdir('.'):
        if f.startswith(file_prefix):
            os.remove(f)


atexit.register(exit_func)

queueName = sys.argv[1]
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)

fileSpec = FileSpec()
fileSpec.fileType = 'output'
fileSpec.lfn = file_prefix + uuid.uuid4().hex + '.gz'
fileSpec.fileAttributes = {'guid': str(uuid.uuid4())}
fileSpec.checksum = '0d439274'
assFileSpec = FileSpec()
assFileSpec.lfn = file_prefix + uuid.uuid4().hex
assFileSpec.fileType = 'es_output'
assFileSpec.fsize = random.randint(10, 100)
assFileSpec.path = os.getcwd() + '/' + assFileSpec.lfn
oFile = open(assFileSpec.lfn, 'w')
oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize)))
oFile.close()
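The fragment above hard-codes an adler32-style checksum for the dummy output file. If one wanted to compute it from the generated content instead, a small sketch using zlib (my addition, not part of the original test) would be:

import zlib

def adler32_of_file(path):
    # stream the file and return the adler32 checksum as 8 lowercase hex digits,
    # the same form as the hard-coded fileSpec.checksum above
    value = 1
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            value = zlib.adler32(chunk, value)
    return '{0:08x}'.format(value & 0xffffffff)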
    def trigger_stage_out(self, jobspec):
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                                  method_name='trigger_stage_out')
        tmpLog.debug('start')
        # initialize some values
        tmpStat = None
        tmpMsg = ''
        srcRSE = None
        dstRSE = None
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID,str(uuid.uuid4()))
        datasetScope = 'transient'
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
        # check queueConfig stager section to see if srcRSE is set
        if 'srcRSE' in queueConfig.stager:
            srcRSE = queueConfig.stager['srcRSE']
        else:
            tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE))
            
        # loop over the output files and copy the files
        ifile = 0
        errors = []
        fileList = []
        lfns = []
        fileSpec_list = []
        fileSpec_list = jobspec.get_output_file_specs(skip_done=False)
        msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\
                 .format(len(fileSpec_list))
        tmpLog.debug(msgStr)
        for fileSpec in fileSpec_list:
           msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\
              .format(datasetScope,fileSpec.lfn,fileSpec.fsize,fileSpec.chksum)
           if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
              msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid'])
           tmpLog.debug(msgstr)


        #for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        for fileSpec in jobspec.get_output_file_specs(skip_done=False):
            scope ='panda'
            if fileSpec.scope is not None :
                scope = fileSpec.scope
            # for Yoda job set the scope to transient 
            if self.Yodajob :
                scope = 'transient'
            # only print to log file first 25 files
            if ifile < 25 :
                msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                tmpLog.debug(msgStr)
            if ifile == 25 :
                msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                tmpLog.debug(msgStr)
            hash = hashlib.md5()
            hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
            hash_hex = hash.hexdigest()
            correctedscope = "/".join(scope.split('.'))
            srcURL = fileSpec.path
            dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                       scope=correctedscope,
                                                                       hash1=hash_hex[0:2],
                                                                       hash2=hash_hex[2:4],
                                                                       lfn=fileSpec.lfn)
            if ifile < 25 :
                tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
            tmpFile = dict()
            # copy the source file from source to destination skip over if file already exists
            if os.path.exists(dstURL):
                tmpLog.debug('Already copied file {0}'.format(dstURL))
                # save for adding to rucio dataset
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else :
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                lfns.append(fileSpec.lfn)
                # get source RSE
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                    tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
            else :
                if os.path.exists(srcURL) :
                    # check if destination directory exists if not create it
                    dstDIR = os.path.dirname(dstURL)
                    try:
                        if not os.path.exists(dstDIR) :
                            os.makedirs(dstDIR)
                            mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP 
                            mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                            os.chmod(dstDIR,mode)
                        # copy the source file to destination file
                        shutil.copy2(srcURL, dstURL)
                        # save for adding to rucio dataset
                        tmpFile['scope'] = datasetScope
                        tmpFile['name'] = fileSpec.lfn
                        tmpFile['bytes'] = fileSpec.fsize
                        tmpFile['adler32'] = fileSpec.chksum
                        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                            tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                        else :
                            tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                        tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                        fileList.append(tmpFile)
                        lfns.append(fileSpec.lfn)
                        # get source RSE if not already set
                        if srcRSE is None and fileSpec.objstoreID is not None:
                            ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                            srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                            tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
                    except (IOError, os.error) as why:
                        errors.append((srcURL, dstURL, str(why)))
                else :
                    errors.append((srcURL, dstURL, 'Source file missing'))
            ifile += 1

        # test that srcRSE and dstRSE are defined
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE))
        errStr = '' 
        if srcRSE is None:
            errStr = 'Source RSE is not defined '
        if dstRSE is None:
            errStr = errStr + ' Destination RSE is not defined'
        if (srcRSE is None) or (dstRSE is None) :
           tmpLog.error(errStr)
           return None,errStr

        # test to see if there are any files to add to the dataset
        if len(fileList) == 0:
            errStr = 'There are no files to add to database'
            tmpLog.error(errStr)
            return None,errStr
        # print out the file list
        tmpLog.debug('fileList - {0}'.format(fileList))
        
        # create the dataset and add files to it and create a transfer rule
        try:
            # register dataset
            rucioAPI = RucioClient()
            tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                         .format(datasetScope, datasetName,srcRSE,(30*24*60*60)))
            try:
                rucioAPI.add_dataset(datasetScope, datasetName,
                                     meta={'hidden': True},
                                     lifetime=30 * 24 * 60 * 60,
                                     rse=srcRSE
                                     )
            except DataIdentifierAlreadyExists:
                # ignore even if the dataset already exists
                pass
            except Exception:
                errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope,
                                                                                datasetName,
                                                                                srcRSE)
                core_utils.dump_error_message(tmpLog)
                tmpLog.error(errMsg)
                return None,errMsg
            # add files to dataset
            #  add 500 files at a time
            numfiles = len(fileList)
            maxfiles = 500
            numslices = numfiles // maxfiles
            if (numfiles%maxfiles) > 0 :
               numslices = numslices + 1
            start = 0
            for i in range(numslices) :
               try:
                  stop = start + maxfiles
                  if stop > numfiles :
                     stop = numfiles

                  rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                   'name': datasetName,
                                                   'dids': fileList[start:stop],
                                                   'rse': srcRSE}],
                                                 ignore_duplicate=True)
                  start = stop
               except FileAlreadyExists:
                  # ignore if files already exist
                  pass
               except Exception:
                  errMsg = 'Could not add files to DS - {0}:{1}  rse - {2} files - {3}'.format(datasetScope,
                                                                                               datasetName,
                                                                                               srcRSE,
                                                                                               fileList)
                  core_utils.dump_error_message(tmpLog)
                  tmpLog.error(errMsg)
                  return None,errMsg
            # add rule
            try:
                tmpDID = dict()
                tmpDID['scope'] = datasetScope
                tmpDID['name'] = datasetName
                tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                       lifetime=30 * 24 * 60 * 60)
                ruleIDs = tmpRet[0]
                tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName,
                                                                               str(ruleIDs)))
                # group the output files together by the Rucio transfer rule
                jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns,'groupStatus': 'pending'}})
                msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs,lfns)
                tmpLog.debug(msgStr)
                tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
                tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,'transferring')
                tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
                tmpStat = True
                tmpMsg = 'created Rucio rule successfully'
            except DuplicateRule:
                # ignore duplicated rule
                tmpLog.debug('rule is already available')
            except Exception:
                errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                core_utils.dump_error_message(tmpLog)
                tmpLog.debug(errMsg)
                return None,errMsg
            # update file group status
            self.dbInterface.update_file_group_status(ruleIDs, 'transferring')
        except Exception:
            core_utils.dump_error_message(tmpLog)
            # treat as a temporary error
            tmpStat = None
            tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)

        #  Now test for any errors
        if errors:
            for error in errors:
                tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0],error[1],error[2]))
            raise shutil.Error(errors)
        # otherwise we are OK                            
        tmpLog.debug('stop')
        return tmpStat,tmpMsg
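The destination URL built above uses the rucio-style deterministic path layout: the first two and next two hex digits of md5('scope:lfn') become intermediate directories. A standalone sketch of that computation, assuming the same layout as the code above:

import hashlib

def rucio_style_path(endpoint, scope, lfn):
    # mirrors dstURL above: <endpoint>/<scope with '.' -> '/'>/<h[0:2]>/<h[2:4]>/<lfn>
    h = hashlib.md5(('%s:%s' % (scope, lfn)).encode('utf-8')).hexdigest()
    corrected_scope = '/'.join(scope.split('.'))
    return '{0}/{1}/{2}/{3}/{4}'.format(endpoint, corrected_scope, h[0:2], h[2:4], lfn)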
Example no. 34
 def trigger_preparation(self, jobspec):
     # get logger
     tmpLog = core_utils.make_logger(_logger,
                                     'PandaID={0}'.format(jobspec.PandaID),
                                     method_name='trigger_preparation')
     tmpLog.debug('start')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(
             jobspec.computingSite))
     # test we have a Globus Transfer Client
     if not self.tc:
         errStr = 'failed to get Globus Transfer Client'
         tmpLog.error(errStr)
         return False, errStr
     # get label
     label = self.make_label(jobspec)
     tmpLog.debug('label={0}'.format(label))
     # get transfer tasks
     tmpStat, transferTasks = globus_utils.get_transfer_tasks(
         tmpLog, self.tc, label)
     if not tmpStat:
         errStr = 'failed to get transfer tasks'
         tmpLog.error(errStr)
         return False, errStr
     # check if already queued
     if label in transferTasks:
         tmpLog.debug('skip since already queued with {0}'.format(
             str(transferTasks[label])))
         return True, ''
     # set the Globus destination endpoint ID and path; these will eventually come from AGIS
     from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
     queueConfigMapper = QueueConfigMapper()
     queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
     self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
     self.srcEndpoint = queueConfig.preparator['srcEndpoint']
     self.Globus_dstPath = self.basePath
     #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
     self.dstEndpoint = queueConfig.preparator['dstEndpoint']
     # get input files
     files = []
     lfns = []
     inFiles = jobspec.get_input_file_attributes(skip_ready=True)
     for inLFN, inFile in iteritems(inFiles):
         # set path to each file
         inFile['path'] = mover_utils.construct_file_path(
             self.basePath, inFile['scope'], inLFN)
         dstpath = inFile['path']
         # check if path exists if not create it.
         if not os.access(self.basePath, os.F_OK):
             os.makedirs(self.basePath)
         # create the file paths for the Globus source and destination endpoints
         Globus_srcpath = mover_utils.construct_file_path(
             self.Globus_srcPath, inFile['scope'], inLFN)
         Globus_dstpath = mover_utils.construct_file_path(
             self.Globus_dstPath, inFile['scope'], inLFN)
         files.append({
             'scope': inFile['scope'],
             'name': inLFN,
             'Globus_dstPath': Globus_dstpath,
             'Globus_srcPath': Globus_srcpath
         })
         lfns.append(inLFN)
     tmpLog.debug('files[] {0}'.format(files))
     try:
         # Test endpoints for activation
         tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
             tmpLog, self.tc, self.srcEndpoint)
         tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
             tmpLog, self.tc, self.dstEndpoint)
         if tmpStatsrc and tmpStatdst:
             errStr = 'source Endpoint and destination Endpoint activated'
             tmpLog.debug(errStr)
         else:
             errStr = ''
             if not tmpStatsrc:
                 errStr += ' source Endpoint not activated '
             if not tmpStatdst:
                 errStr += ' destination Endpoint not activated '
             tmpLog.error(errStr)
             return False, errStr
         # both endpoints activated now prepare to transfer data
         if len(files) > 0:
             tdata = TransferData(self.tc,
                                  self.srcEndpoint,
                                  self.dstEndpoint,
                                  label=label,
                                  sync_level="checksum")
             # loop over all input files and add
             for myfile in files:
                 tdata.add_item(myfile['Globus_srcPath'],
                                myfile['Globus_dstPath'])
             # submit
             transfer_result = self.tc.submit_transfer(tdata)
             # check status code and message
             tmpLog.debug(str(transfer_result))
             if transfer_result['code'] == "Accepted":
                 # succeeded
                 # set transfer ID which is used for later lookup
                 transferID = transfer_result['task_id']
                 jobspec.set_groups_to_files(
                     {transferID: {
                         'lfns': lfns,
                         'groupStatus': 'active'
                     }})
                 tmpLog.debug('done')
                 return True, ''
             else:
                 return False, transfer_result['message']
         # if no files to transfer return True
         return True, 'No files to transfer'
     except Exception:
         errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
         return errStat, errMsg
    def trigger_stage_out(self, jobspec):
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                                  method_name='trigger_stage_out')
        tmpLog.debug('start')
        # initialize some values
        tmpStat = None
        tmpMsg = ''
        srcRSE = None
        dstRSE = None
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID,str(uuid.uuid4()))
        datasetScope = 'transient'
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda" :
                self.Yodajob = True
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob :
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
        # check queueConfig stager section to see if srcRSE is set
        if 'srcRSE' in queueConfig.stager:
            srcRSE = queueConfig.stager['srcRSE']
        else:
            tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE))
            
        # loop over the output files and copy the files
        ifile = 0
        errors = []
        fileList = []
        lfns = []
        fileSpec_list = []
        fileSpec_list = jobspec.get_output_file_specs(skip_done=False)
        msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\
                 .format(len(fileSpec_list))
        tmpLog.debug(msgStr)
        for fileSpec in fileSpec_list:
           msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\
              .format(datasetScope,fileSpec.lfn,fileSpec.fsize,fileSpec.chksum)
           if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
              msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid'])
           tmpLog.debug(msgstr)


        #for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        for fileSpec in jobspec.get_output_file_specs(skip_done=False):
            scope ='panda'
            if fileSpec.scope is not None :
                scope = fileSpec.scope
            # for Yoda job set the scope to transient 
            if self.Yodajob :
                scope = 'transient'
            # only print to log file first 25 files
            if ifile < 25 :
                msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                tmpLog.debug(msgStr)
            if ifile == 25 :
                msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                tmpLog.debug(msgStr)
            hash = hashlib.md5()
            hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
            hash_hex = hash.hexdigest()
            correctedscope = "/".join(scope.split('.'))
            srcURL = fileSpec.path
            dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                       scope=correctedscope,
                                                                       hash1=hash_hex[0:2],
                                                                       hash2=hash_hex[2:4],
                                                                       lfn=fileSpec.lfn)
            if ifile < 25 :
                tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
            tmpFile = dict()
            # copy the source file from source to destination skip over if file already exists
            if os.path.exists(dstURL):
                tmpLog.debug('Already copied file {0}'.format(dstURL))
                # save for adding to rucio dataset
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else :
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                lfns.append(fileSpec.lfn)
                # get source RSE
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                    tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
            else :
                if os.path.exists(srcURL) :
                    # check if destination directory exists if not create it
                    dstDIR = os.path.dirname(dstURL)
                    try:
                        if not os.path.exists(dstDIR) :
                            os.makedirs(dstDIR)
                            mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP 
                            mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                            os.chmod(dstDIR,mode)
                        # copy the source file to destination file
                        shutil.copy2(srcURL, dstURL)
                        # save for adding to rucio dataset
                        tmpFile['scope'] = datasetScope
                        tmpFile['name'] = fileSpec.lfn
                        tmpFile['bytes'] = fileSpec.fsize
                        tmpFile['adler32'] = fileSpec.chksum
                        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                            tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                        else :
                            tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                        tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                        fileList.append(tmpFile)
                        lfns.append(fileSpec.lfn)
                        # get source RSE if not already set
                        if srcRSE is None and fileSpec.objstoreID is not None:
                            ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                            srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                            tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
                    except (IOError, os.error) as why:
                        errors.append((srcURL, dstURL, str(why)))
                else :
                    errors.append((srcURL, dstURL, 'Source file missing'))
            ifile += 1

        # test that srcRSE and dstRSE are defined
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE))
        errStr = '' 
        if srcRSE is None:
            errStr = 'Source RSE is not defined '
        if dstRSE is None:
            errStr = errStr + ' Destination RSE is not defined'
        if (srcRSE is None) or (dstRSE is None) :
           tmpLog.error(errStr)
           return None,errStr

        # test to see if there are any files to add to the dataset
        if len(fileList) == 0:
            errStr = 'There are no files to add to database'
            tmpLog.error(errStr)
            return None,errStr
        # print out the file list
        tmpLog.debug('fileList - {0}'.format(fileList))
        
        # create the dataset and add files to it and create a transfer rule
        try:
            # register dataset
            rucioAPI = RucioClient()
            tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                         .format(datasetScope, datasetName,srcRSE,(30*24*60*60)))
            try:
                rucioAPI.add_dataset(datasetScope, datasetName,
                                     meta={'hidden': True},
                                     lifetime=30 * 24 * 60 * 60,
                                     rse=srcRSE
                                     )
            except DataIdentifierAlreadyExists:
                # ignore even if the dataset already exists
                pass
            except Exception:
                errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope,
                                                                                datasetName,
                                                                                srcRSE)
                core_utils.dump_error_message(tmpLog)
                tmpLog.error(errMsg)
                return None,errMsg
            # add files to dataset
            #  add 500 files at a time
            numfiles = len(fileList)
            maxfiles = 500
            numslices = numfiles // maxfiles
            if (numfiles%maxfiles) > 0 :
               numslices = numslices + 1
            start = 0
            for i in range(numslices) :
               try:
                  stop = start + maxfiles
                  if stop > numfiles :
                     stop = numfiles

                  rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                   'name': datasetName,
                                                   'dids': fileList[start:stop],
                                                   'rse': srcRSE}],
                                                 ignore_duplicate=True)
                  start = stop
               except FileAlreadyExists:
                  # ignore if files already exist
                  pass
               except Exception:
                  errMsg = 'Could not add files to DS - {0}:{1}  rse - {2} files - {3}'.format(datasetScope,
                                                                                               datasetName,
                                                                                               srcRSE,
                                                                                               fileList)
                  core_utils.dump_error_message(tmpLog)
                  tmpLog.error(errMsg)
                  return None,errMsg
            # add rule
            try:
                tmpDID = dict()
                tmpDID['scope'] = datasetScope
                tmpDID['name'] = datasetName
                tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                       lifetime=30 * 24 * 60 * 60)
                ruleIDs = tmpRet[0]
                tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName,
                                                                               str(ruleIDs)))
                # group the output files together by the Rucio transfer rule
                jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns,'groupStatus': 'pending'}})
                msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs,lfns)
                tmpLog.debug(msgStr)
                tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
                tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,'transferring')
                tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
                tmpStat = True
                tmpMsg = 'created Rucio rule successfully'
            except DuplicateRule:
                # ignore duplicated rule
                tmpLog.debug('rule is already available')
            except Exception:
                errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                core_utils.dump_error_message(tmpLog)
                tmpLog.debug(errMsg)
                return None,errMsg
            # update file group status
            self.dbInterface.update_file_group_status(ruleIDs, 'transferring')
        except Exception:
            core_utils.dump_error_message(tmpLog)
            # treat as a temporary error
            tmpStat = None
            tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)

        #  Now test for any errors
        if errors:
            for error in errors:
                tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0],error[1],error[2]))
            raise shutil.Error(errors)
        # otherwise we are OK                            
        tmpLog.debug('stop')
        return tmpStat,tmpMsg
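The dataset registration above attaches files in slices of at most 500. A generic sketch of that batching step (the helper name is mine; each slice would become the 'dids' list of one attachment, exactly as in the add_files_to_datasets call above):

def iter_slices(items, max_per_slice=500):
    # yield consecutive slices of at most max_per_slice items
    for start in range(0, len(items), max_per_slice):
        yield items[start:start + max_per_slice]

# usage mirroring the code above:
# for dids in iter_slices(fileList):
#     rucioAPI.add_files_to_datasets([{'scope': datasetScope, 'name': datasetName,
#                                      'dids': dids, 'rse': srcRSE}],
#                                    ignore_duplicate=True)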
    def check_stage_out_status(self, jobspec):
        # make logger
        tmpLog = self.make_logger(_logger,
                                  'PandaID={0} ThreadID={1}'.format(
                                      jobspec.PandaID,
                                      threading.current_thread().ident),
                                  method_name='check_stage_out_status')
        tmpLog.debug('start')
        # show the dummy transfer id and set to a value with the PandaID if needed.
        tmpLog.debug('self.dummy_transfer_id = {}'.format(
            self.dummy_transfer_id))
        if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,
                                                      'XXXX'):
            old_dummy_transfer_id = self.dummy_transfer_id
            self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,
                                                      jobspec.PandaID)
            tmpLog.debug(
                'Change self.dummy_transfer_id  from {0} to {1}'.format(
                    old_dummy_transfer_id, self.dummy_transfer_id))

        # default return
        tmpRetVal = (True, '')
        # set flag if have db lock
        have_db_lock = False
        # check that jobspec.computingSite is defined
        if jobspec.computingSite is None:
            # not found
            tmpLog.error('jobspec.computingSite is not defined')
            return False, 'jobspec.computingSite is not defined'
        else:
            tmpLog.debug('jobspec.computingSite : {0}'.format(
                jobspec.computingSite))
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda":
                self.Yodajob = True
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
        self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
        if self.Yodajob:
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug(
                'Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'
                .format(jobspec.PandaID, self.objstoreID, self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(
                jobspec.PandaID, self.objstoreID))
        # test we have a Globus Transfer Client
        if not self.tc:
            errStr = 'failed to get Globus Transfer Client'
            tmpLog.error(errStr)
            return False, errStr
        # set transferID to None
        transferID = None
        # get the scope of the log files
        outfileattrib = jobspec.get_output_file_attributes()
        scopeLog = 'xxxx'
        for key in outfileattrib.keys():
            if "log.tgz" in key:
                scopeLog = outfileattrib[key]['scope']
        # get transfer groups
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug(
            'jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
        for dummy_transferID in groups:
            # skip if valid transfer ID not dummy one
            if validate_transferid(dummy_transferID):
                continue
            # lock for 120 sec
            tmpLog.debug(
                'attempt to set DB lock for self.id - {0} dummy_transferID - {1}'
                .format(self.id, dummy_transferID))
            have_db_lock = self.dbInterface.get_object_lock(dummy_transferID,
                                                            lock_interval=120)
            if not have_db_lock:
                # escape since locked by another thread
                msgStr = 'escape since locked by another thread'
                tmpLog.debug(msgStr)
                return None, msgStr
            # refresh group information since that could have been updated by another thread before getting the lock
            tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
            self.dbInterface.refresh_file_group_info(jobspec)
            # get transfer groups again with refreshed info
            tmpLog.debug(
                'After db refresh call groups=jobspec.get_groups_of_output_files()'
            )
            groups = jobspec.get_groups_of_output_files()
            tmpLog.debug(
                'jobspec.get_groups_of_output_files() = : {0}'.format(groups))
            # the dummy transfer ID is still there
            if dummy_transferID in groups:
                groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
                # get files with the dummy transfer ID across jobs
                fileSpecs = self.dbInterface.get_files_with_group_id(
                    dummy_transferID)
                # submit transfer if there are more than 10 files or the group was made before more than 10 min
                msgStr = 'dummy_transferID = {0}  number of files = {1}'.format(
                    dummy_transferID, len(fileSpecs))
                tmpLog.debug(msgStr)
                if len(fileSpecs) >= 10 or \
                        groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    tmpLog.debug('prepare to transfer files')
                    # submit transfer and get a real transfer ID
                    # set the Globus destination Endpoint id and path will get them from Agis eventually
                    #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                    self.srcEndpoint = queueConfig.stager['srcEndpoint']
                    self.Globus_srcPath = self.basePath
                    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                    self.dstEndpoint = queueConfig.stager['dstEndpoint']
                    # Test the endpoints and create the transfer data class
                    errMsg = None
                    try:
                        # Test endpoints for activation
                        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
                            tmpLog, self.tc, self.srcEndpoint)
                        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
                            tmpLog, self.tc, self.dstEndpoint)
                        if tmpStatsrc and tmpStatdst:
                            errStr = 'source Endpoint and destination Endpoint activated'
                            tmpLog.debug(errStr)
                        else:
                            errMsg = ''
                            if not tmpStatsrc:
                                errMsg += ' source Endpoint not activated '
                            if not tmpStatdst:
                                errMsg += ' destination Endpoint not activated '
                            # release process lock
                            tmpLog.debug(
                                'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                                .format(self.id, dummy_transferID))
                            self.have_db_lock = self.dbInterface.release_object_lock(
                                dummy_transferID)
                            if not self.have_db_lock:
                                errMsg += ' - Could not release DB lock for {}'.format(
                                    dummy_transferID)
                            tmpLog.error(errMsg)
                            tmpRetVal = (None, errMsg)
                            return tmpRetVal
                        # both endpoints activated now prepare to transfer data
                        tdata = None
                        tdata = TransferData(self.tc,
                                             self.srcEndpoint,
                                             self.dstEndpoint,
                                             sync_level="checksum")
                    except Exception:
                        errStat, errMsg = globus_utils.handle_globus_exception(
                            tmpLog)
                        # release process lock
                        tmpLog.debug(
                            'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                            .format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(
                            dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(
                                dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (errStat, errMsg)
                        return tmpRetVal
                    # loop over all files
                    ifile = 0
                    for fileSpec in fileSpecs:
                        logfile = False
                        scope = 'panda'
                        if fileSpec.scope is not None:
                            scope = fileSpec.scope
                        # for Yoda job set the scope to transient for non log files
                        if self.Yodajob:
                            scope = 'transient'
                        if fileSpec.fileType == "log":
                            logfile = True
                            scope = scopeLog
                        # only print to log file first 25 files
                        if ifile < 25:
                            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(
                                fileSpec.lfn, fileSpec.scope)
                            tmpLog.debug(msgStr)
                        if ifile == 25:
                            msgStr = "printed first 25 files skipping the rest".format(
                                fileSpec.lfn, fileSpec.scope)
                            tmpLog.debug(msgStr)
                        hash = hashlib.md5()
                        hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
                        hash_hex = hash.hexdigest()
                        correctedscope = "/".join(scope.split('.'))
                        srcURL = fileSpec.path
                        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
                            endPoint=self.Globus_dstPath,
                            scope=correctedscope,
                            hash1=hash_hex[0:2],
                            hash2=hash_hex[2:4],
                            lfn=fileSpec.lfn)
                        if logfile:
                            tmpLog.debug('src={srcURL} dst={dstURL}'.format(
                                srcURL=srcURL, dstURL=dstURL))
                        if ifile < 25:
                            tmpLog.debug('src={srcURL} dst={dstURL}'.format(
                                srcURL=srcURL, dstURL=dstURL))
                        # add files to transfer object - tdata
                        if os.access(srcURL, os.R_OK):
                            if ifile < 25:
                                tmpLog.debug("tdata.add_item({},{})".format(
                                    srcURL, dstURL))
                            tdata.add_item(srcURL, dstURL)
                        else:
                            errMsg = "source file {} does not exist".format(
                                srcURL)
                            # release process lock
                            tmpLog.debug(
                                'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                                .format(self.id, dummy_transferID))
                            release_db_lock = self.dbInterface.release_object_lock(
                                dummy_transferID)
                            if not release_db_lock:
                                errMsg += ' - Could not release DB lock for {}'.format(
                                    dummy_transferID)
                            tmpLog.error(errMsg)
                            tmpRetVal = (False, errMsg)
                            return tmpRetVal
                        ifile += 1
                    # submit transfer
                    tmpLog.debug('Number of files to transfer - {}'.format(
                        len(tdata['DATA'])))
                    try:
                        transfer_result = self.tc.submit_transfer(tdata)
                        # check status code and message
                        tmpLog.debug(str(transfer_result))
                        if transfer_result['code'] == "Accepted":
                            # succeeded
                            # set transfer ID which are used for later lookup
                            transferID = transfer_result['task_id']
                            tmpLog.debug(
                                'successfully submitted id={0}'.format(
                                    transferID))
                            # set status for files
                            self.dbInterface.set_file_group(
                                fileSpecs, transferID, 'running')
                            msgStr = 'submitted transfer with ID={0}'.format(
                                transferID)
                            tmpLog.debug(msgStr)
                        else:
                            # release process lock
                            tmpLog.debug(
                                'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                                .format(self.id, dummy_transferID))
                            release_db_lock = self.dbInterface.release_object_lock(
                                dummy_transferID)
                            if not release_db_lock:
                                errMsg = 'Could not release DB lock for {}'.format(
                                    dummy_transferID)
                                tmpLog.error(errMsg)
                            tmpRetVal = (None, transfer_result['message'])
                            return tmpRetVal
                    except Exception as e:
                        errStat, errMsg = globus_utils.handle_globus_exception(
                            tmpLog)
                        # release process lock
                        tmpLog.debug(
                            'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                            .format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(
                            dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(
                                dummy_transferID)
                        tmpLog.error(errMsg)
                        return errStat, errMsg
                else:
                    msgStr = 'wait until enough files are pooled'
                    tmpLog.debug(msgStr)
                # release the lock
                tmpLog.debug(
                    'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                    .format(self.id, dummy_transferID))
                release_db_lock = self.dbInterface.release_object_lock(
                    dummy_transferID)
                if release_db_lock:
                    tmpLog.debug(
                        'released DB lock for self.id - {0} dummy_transferID - {1}'
                        .format(self.id, dummy_transferID))
                    have_db_lock = False
                else:
                    msgStr += ' - Could not release DB lock for {}'.format(
                        dummy_transferID)
                    tmpLog.error(msgStr)
                # return None to retry later
                return None, msgStr
            # release the db lock if needed
            if have_db_lock:
                tmpLog.debug(
                    'attempt to release DB lock for self.id - {0} dummy_transferID - {1}'
                    .format(self.id, dummy_transferID))
                release_db_lock = self.dbInterface.release_object_lock(
                    dummy_transferID)
                if release_db_lock:
                    tmpLog.debug(
                        'released DB lock for self.id - {0} dummy_transferID - {1}'
                        .format(self.id, dummy_transferID))
                    have_db_lock = False
                else:
                    msgStr += ' - Could not release DB lock for {}'.format(
                        dummy_transferID)
                    tmpLog.error(msgStr)
                    return None, msgStr
        # check transfer with real transfer IDs
        # get transfer groups
        tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
        tmpLog.debug('transfer groups any state - {0}'.format(groups))
        if len(groups) == 0:
            tmpLog.debug(
                "jobspec.get_groups_of_output_files() returned no files "
            )
            tmpLog.debug("check_stage_out_status return status - True ")
            return True, ''

        for transferID in groups:
            # allow only valid UUID
            if validate_transferid(transferID):
                # get transfer task
                tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(
                    tmpLog, self.tc, transferID)
                # return a temporary error when failed to get task
                if not tmpStat:
                    errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (
                        str(self.tc), str(transferID))
                    tmpLog.error(errStr)
                    return None, errStr
                # return a temporary error when task is missing
                if transferID not in transferTasks:
                    errStr = 'transfer task ID - {} is missing'.format(
                        transferID)
                    tmpLog.error(errStr)
                    return None, errStr
                # succeeded in finding a transfer task by transferID
                if transferTasks[transferID]['status'] == 'SUCCEEDED':
                    tmpLog.debug(
                        'transfer task {} succeeded'.format(transferID))
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID,
                                                 self.pathConvention)
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                    return True, ''
                # failed
                if transferTasks[transferID]['status'] == 'FAILED':
                    errStr = 'transfer task {} failed'.format(transferID)
                    tmpLog.error(errStr)
                    self.set_FileSpec_status(jobspec, 'failed')
                    return False, errStr
                # another status
                tmpStr = 'transfer task {0} status: {1}'.format(
                    transferID, transferTasks[transferID]['status'])
                tmpLog.debug(tmpStr)
                return None, ''
        # end of loop over transfer groups
        tmpLog.debug(
            'End of loop over transfers groups - ending check_stage_out_status function'
        )
        return None, 'no valid transfer id found'
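The loop above only queries Globus for transfer IDs that pass validate_transferid, a helper that is not shown in this snippet. A minimal sketch of such a guard, assuming Globus transfer task IDs are plain UUID strings (the body here is a guess, not the original helper):

import uuid

def validate_transferid(transfer_id):
    # Hypothetical stand-in for the helper used above: accept only values
    # that parse as a UUID, so dummy placeholder IDs are skipped.
    try:
        uuid.UUID(str(transfer_id))
    except ValueError:
        return False
    return True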
Esempio n. 37
0
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:

            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                            method_name='submit_workers')

            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
            prodSourceLabel = queueconfig.get_source_label()

            # If jobSpec is defined we are in push mode, if not pull mode
            # Both assume one to one worker to job mapping
            jobSpec = workSpec.get_jobspec_list()
            if jobSpec:
                jobSpec = jobSpec[0]
                tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))
                # Unified queues: take prodsourcelabel from job
                prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel)

            desc = {}
            # If we need to prefetch events, set aCT status waiting.
            # feed_events in act_messenger will fill events and release the job
            if queueconfig.prefetchEvents:
                desc['pandastatus'] = 'waiting'
                desc['actpandastatus'] = 'waiting'
                desc['arcjobid'] = -1 # dummy id to prevent submission
            else:
                desc['pandastatus'] = 'sent'
                desc['actpandastatus'] = 'sent'
            desc['siteName'] = workSpec.computingSite
            desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production']
            desc['prodSourceLabel'] = prodSourceLabel
            desc['sendhb'] = 0
            metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                        'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
            desc['metadata'] = json.dumps(metadata)

            if jobSpec:
                # push mode: aCT takes the url-encoded job description (like it gets from panda server)
                pandaid = jobSpec.PandaID
                actjobdesc = urllib.parse.urlencode(jobSpec.jobParams)
            else:
                # pull mode: just set pandaid (to workerid) and prodsourcelabel
                pandaid = workSpec.workerID
                actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

            tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
            try:
                batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
            except Exception as e:
                result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
            else:
                tmpLog.info("aCT batch id {0}".format(batchid))
                workSpec.batchID = str(batchid)
                workSpec.submissionHost = self.hostname
                workSpec.nativeStatus = desc['actpandastatus']
                # Set log files in workSpec
                today = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
                workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
                workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
                workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
                workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
                result = (True, '')
            retList.append(result)

        return retList
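The pull-mode branch above builds the minimal job description by hand with string formatting. For illustration only, the same string can be produced with urllib.parse.urlencode, matching the push-mode branch; the values below are made up:

import urllib.parse

# made-up worker ID and source label, for illustration only
pandaid = 12345
prodSourceLabel = 'managed'
actjobdesc = urllib.parse.urlencode({'PandaID': pandaid,
                                     'prodSourceLabel': prodSourceLabel})
print(actjobdesc)  # PandaID=12345&prodSourceLabel=managed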
Esempio n. 38
0
 def trigger_stage_out(self, jobspec):
     # make logger
     tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                               method_name='trigger_stage_out')
     tmpLog.debug('start')
     # default return
     tmpRetVal = (True, '')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
     # test we have a Globus Transfer Client
     if not self.tc :
         errStr = 'failed to get Globus Transfer Client'
         tmpLog.error(errStr)
         return False, errStr
     # get label
     label = self.make_label(jobspec)
     tmpLog.debug('label={0}'.format(label))
     # get transfer tasks
     tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label)
     if not tmpStat:
         errStr = 'failed to get transfer tasks'
         tmpLog.error(errStr)
         return False, errStr
     # check if already queued
     if label in transferTasks:
         tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
         return True, ''
     # set the Globus destination Endpoint id and path; will get them from AGIS eventually
     from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
     queueConfigMapper = QueueConfigMapper()
     queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
     #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
     self.srcEndpoint = queueConfig.stager['srcEndpoint']
     self.Globus_srcPath = self.basePath
     self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
     self.dstEndpoint = queueConfig.stager['dstEndpoint']
     # Test the endpoints and create the transfer data class 
     errMsg = None
     try:
         # Test endpoints for activation
         tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
         tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
         if tmpStatsrc and tmpStatdst:
             errStr = 'source Endpoint and destination Endpoint activated'
             tmpLog.debug(errStr)
         else:
             errMsg = ''
             if not tmpStatsrc :
                 errMsg += ' source Endpoint not activated '
             if not tmpStatdst :
                 errMsg += ' destination Endpoint not activated '
             tmpLog.error(errMsg)
             tmpRetVal = (False,errMsg)
             return tmpRetVal
         # both endpoints activated now prepare to transfer data
         tdata = TransferData(self.tc,
                              self.srcEndpoint,
                              self.dstEndpoint,
                              label=label,
                              sync_level="checksum")
     except Exception:
         errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
         tmpRetVal = (errStat, errMsg)
         return tmpRetVal
     # loop over all files
     fileAttrs = jobspec.get_output_file_attributes()
     lfns = []
     for fileSpec in jobspec.outFiles:
         scope = fileAttrs[fileSpec.lfn]['scope']
         hash = hashlib.md5()
         hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
         hash_hex = hash.hexdigest()
         correctedscope = "/".join(scope.split('.'))
         srcURL = fileSpec.path
         dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                    scope=correctedscope,
                                                                    hash1=hash_hex[0:2],
                                                                    hash2=hash_hex[2:4],
                                                                    lfn=fileSpec.lfn)
         tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
         # add files to transfer object - tdata
         if os.access(srcURL, os.R_OK):
             tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
             tdata.add_item(srcURL,dstURL)
             lfns.append(fileSpec.lfn)
         else:
             errMsg = "source file {} does not exist".format(srcURL)
             tmpLog.error(errMsg)
             tmpRetVal = (False,errMsg)
             return tmpRetVal
     # submit transfer 
     try:
         transfer_result = self.tc.submit_transfer(tdata)
         # check status code and message
         tmpLog.debug(str(transfer_result))
         if transfer_result['code'] == "Accepted":
             # succeeded
             # set transfer ID which is used for later lookup
             transferID = transfer_result['task_id']
             tmpLog.debug('successfully submitted id={0}'.format(transferID))
             jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
             # record the transfer ID in each file's attributes
             for fileSpec in jobspec.outFiles:
                 if fileSpec.fileAttributes is None:
                     fileSpec.fileAttributes = {}
                 fileSpec.fileAttributes['transferID'] = transferID
         else:
             tmpRetVal = (False, transfer_result['message'])
     except Exception as e:
         errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
         if errMsg is None:
             errtype, errvalue = sys.exc_info()[:2]
             errMsg = "{0} {1}".format(errtype.__name__, errvalue)
         tmpRetVal = (errStat,errMsg)
     # return
     tmpLog.debug('done')
     return tmpRetVal
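The destination URL above follows the deterministic scope/XX/YY/lfn layout derived from md5('scope:lfn'). A standalone sketch of that path computation, with made-up endpoint, scope and LFN values:

import hashlib

def rucio_style_path(endpoint, scope, lfn):
    # md5 of "scope:lfn"; the first two hex pairs pick the subdirectories
    hash_hex = hashlib.md5(('%s:%s' % (scope, lfn)).encode('utf-8')).hexdigest()
    corrected_scope = '/'.join(scope.split('.'))
    return '{0}/{1}/{2}/{3}/{4}'.format(endpoint, corrected_scope,
                                        hash_hex[0:2], hash_hex[2:4], lfn)

# made-up values, for illustration only
print(rucio_style_path('/dest/base', 'mc16_13TeV', 'HITS.11364822._128373.pool.root.1'))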
Esempio n. 39
0
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger,
                                   method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
        prod_source_label = harvester_queue_config.get_source_label(
            work_spec.jobType)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(
            harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file(
            'stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(
                work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(
                job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields,
                                                     job_pars_parsed)
            tmp_log.debug(
                'container_image: "{0}"; executable: "{1}"; args: "{2}"'.
                format(container_image, executable, args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(
                self.queueName)
            cert, use_secret = self._choose_proxy(work_spec,
                                                  is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = panda_queues_dict.get(self.queueName)['maxtime']
            except Exception as e:
                tmp_log.warning(
                    'Could not retrieve maxtime field for queue {0}'.format(
                        self.queueName))
                max_time = None

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
                yaml_content,
                work_spec,
                prod_source_label,
                container_image,
                executable,
                args,
                cert,
                cert_in_secret=use_secret,
                cpu_adjust_ratio=self.cpuAdjustRatio,
                memory_adjust_ratio=self.memoryAdjustRatio,
                max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            work_spec.batchID = yaml_content['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(
                work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
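The walltime lookup wraps the dictionary access in a try/except because either the queue entry or its maxtime field may be absent. A minimal sketch of the same fallback with plain dict lookups, using a made-up dictionary in place of PandaQueuesDict:

# made-up queue dictionary standing in for PandaQueuesDict
panda_queues_dict = {'SOME_QUEUE': {'maxtime': 40638}}

queue_info = panda_queues_dict.get('ANOTHER_QUEUE') or {}
max_time = queue_info.get('maxtime')  # None when the queue or the field is missing
print(max_time)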
Esempio n. 40
0
class cpCompasStagerHPC(BaseStager):
    # constructor
    def __init__(self, **kwarg):
        BaseStager.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()
        
    # check status
    def check_stage_out_status(self, jobspec):
        """Check the status of stage-out procedure. If staging-out is done synchronously in trigger_stage_out
        this method should always return True.
        Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives
        a list of FileSpecs not yet done.
        FileSpec.attemptNr shows how many times the transfer was checked for the file.
        If the file was successfully transferred, status should be set to 'finished'.
        Or 'failed', if the file failed to be transferred. Once files are set to 'finished' or 'failed',
        jobspec.get_outfile_specs(skip_done=False) ignores them.

        :param jobspec: job specifications
        :type jobspec: JobSpec
        :return: A tuple of return code (True: transfer success, False: fatal transfer failure,
                 None: on-going or temporary failure) and error dialog
        :rtype: (bool, string)
        """
        
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='check_stage_out_status')
        tmpLog.debug('start')
        
#         for fileSpec in jobspec.get_output_file_specs(skip_done=True):
#             fileSpec.status = 'finished'
        return True, ''

    # trigger stage out
    def trigger_stage_out(self, jobspec):
        """Trigger the stage-out procedure for the job.
        Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives
        a list of FileSpecs not yet done.
        FileSpec.attemptNr shows how many times transfer was tried for the file so far.

        :param jobspec: job specifications
        :type jobspec: JobSpec
        :return: A tuple of return code (True: success, False: fatal failure, None: temporary failure)
                 and error dialog
        :rtype: (bool, string)
        """
        
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='trigger_stage_out')
        tmpLog.debug('start')
        allChecked = True
        ErrMsg = 'These files failed to upload: '
        
        tmpLog.debug('Getting seprodpath from queue_config')
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        
        tmpLog.debug('Requesting full spec of the job {0}' . format(jobspec.PandaID))
        proxy = DBProxy()
        jobSpec_full = proxy.get_job(jobspec.PandaID)
        
        for fileSpec in jobspec.get_output_file_specs(skip_done=True):
            destination = queue_config.seprodpath
            filename = fileSpec.lfn
            
            se_path = ''
            sw_path = ''
            prod_name = ''
            prodSlt = ''
            TMPMDSTFILE = ''
            TMPHISTFILE = ''
            EVTDUMPFILE = ''
            MERGEDMDSTFILE = ''
            MERGEDHISTFILE = ''
            MERGEDDUMPFILE = ''
            
            if not ".log.tgz" in fileSpec.lfn:
                tmpLog.debug('Getting sw path, name and hist filename from jobPars')
                sw_prefix, sw_path, prod_name, prodSlt, TMPMDSTFILE, TMPHISTFILE, EVTDUMPFILE, MERGEDMDSTFILE, MERGEDHISTFILE, MERGEDDUMPFILE, PRODSOFT, MCGENFILEOUT = self.getSWPathAndNameAndFilename(jobSpec_full.jobParams['jobPars'])
                
                tmpLog.debug('sw_prefix: {0}' . format(sw_prefix))
                tmpLog.debug('sw_path: {0}' . format(sw_path))
                tmpLog.debug('prod_name: {0}' . format(prod_name))
                tmpLog.debug('prodSlt: {0}' . format(prodSlt))
                tmpLog.debug('TMPMDSTFILE: {0}' . format(TMPMDSTFILE))
                tmpLog.debug('TMPHISTFILE: {0}' . format(TMPHISTFILE))
                tmpLog.debug('EVTDUMPFILE: {0}' . format(EVTDUMPFILE))
                tmpLog.debug('MERGEDMDSTFILE: {0}' . format(MERGEDMDSTFILE))
                tmpLog.debug('MERGEDHISTFILE: {0}' . format(MERGEDHISTFILE))
                tmpLog.debug('MERGEDDUMPFILE: {0}' . format(MERGEDDUMPFILE))
                tmpLog.debug('PRODSOFT: {0}' . format(PRODSOFT))
                tmpLog.debug('MCGENFILEOUT: {0}' . format(MCGENFILEOUT))
                           
                # prod
                if fileSpec.lfn == TMPMDSTFILE :
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mDST.chunks'
                if fileSpec.lfn == TMPHISTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/TRAFDIC'
                if fileSpec.lfn == "testevtdump.raw":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/evtdump/slot' + prodSlt
                    filename = EVTDUMPFILE
                if fileSpec.lfn == "payload_stdout.out.gz":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                    filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stdout.gz')
                if fileSpec.lfn == "payload_stderr.out.gz":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                    filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stderr.gz')
                                
                # merge
                if fileSpec.lfn == MERGEDMDSTFILE :
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mDST'
                if fileSpec.lfn == MERGEDHISTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/histos'
                if fileSpec.lfn == MERGEDDUMPFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mergedDump/slot' + prodSlt
                
                # mc generation
                if fileSpec.lfn == MCGENFILEOUT:
                    se_path = sw_prefix + '/mc/' + sw_path + PRODSOFT + '/mcgen'
                    filename = MCGENFILEOUT
                                
                destination = se_path
                
            surl = "{0}/{1}" . format(destination, filename)
            dst_gpfn = "{0}/{1}" . format(destination, filename)
            lfcdir = destination
            
            tmpLog.debug('fileSpec.path = {0}' . format(fileSpec.path))
            tmpLog.debug('SURL = {0}' . format(surl))
            tmpLog.debug('dst_gpfn = {0}' . format(dst_gpfn))
            tmpLog.debug('lfcdir = {0}' . format(lfcdir))
            
            tmpLog.debug('Create if does not exist {0}' . format(lfcdir))
            if not os.path.exists(lfcdir):
                os.makedirs(lfcdir)
            
            tmpLog.debug('Copy {0} to {1}' . format(fileSpec.path, dst_gpfn))
            shutil.copyfile(fileSpec.path, dst_gpfn)
            if os.path.exists(dst_gpfn):
                fileSpec.status = 'finished'
            else:
                fileSpec.status = 'failed'
                allChecked = False
                ErrMsg += '{0} ' . format(fileSpec.lfn)
            
            # force update
            fileSpec.force_update('status')
            
            tmpLog.debug('Status of file {0} is {1}' . format(fileSpec.path, fileSpec.status))
            
        del jobSpec_full
        
        tmpLog.debug('done')
        
        if allChecked:
            return True, ''
        else:
            return False, ErrMsg

    def getSWPathAndNameAndFilename(self, jobPars):
        """ Get COMPASS_SW_PATH and COMPASS_PROD_NAME from JobPars """
        
        a = jobPars.find('COMPASS_SW_PREFIX')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        sw_prefix = d[d.find('=') + 1:]
        
        a = jobPars.find('COMPASS_SW_PATH')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        sw_path = d[d.find('=') + 1:]

        a = jobPars.find('COMPASS_PROD_NAME')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        prod_name = d[d.find('=') + 1:]
        
        a = jobPars.find('prodSlt')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        prodSlt = d[d.find('=') + 1:]
        
        a = jobPars.find('TMPMDSTFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        TMPMDSTFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('TMPHISTFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        TMPHISTFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('EVTDUMPFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        EVTDUMPFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('MERGEDMDSTFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        MERGEDMDSTFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('MERGEDHISTFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        MERGEDHISTFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('MERGEDDUMPFILE')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        MERGEDDUMPFILE = d[d.find('=') + 1:]
        
        a = jobPars.find('PRODSOFT')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        PRODSOFT = d[d.find('=') + 1:]
        
        a = jobPars.find('MCGENFILEOUT')
        b = jobPars[a:]
        c = b.find(';')
        d = b[:c]
        MCGENFILEOUT = d[d.find('=') + 1:]
        
        return sw_prefix, sw_path, prod_name, prodSlt, TMPMDSTFILE, TMPHISTFILE, EVTDUMPFILE, MERGEDMDSTFILE, MERGEDHISTFILE, MERGEDDUMPFILE, PRODSOFT, MCGENFILEOUT

    # zip output files
    def zip_output(self, jobspec):
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='zip_output')
        return self.simple_zip_output(jobspec, tmpLog)

    # asynchronous zip output
    def async_zip_output(self, jobspec):
        return True, ''

    # post zipping
    def post_zip_output(self, jobspec):
        return True, ''
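getSWPathAndNameAndFilename repeats the same find/slice sequence for every key. A hedged alternative sketch, assuming jobPars is a semicolon-separated list of KEY=VALUE tokens as the parsing above implies (the helper name and sample string are made up):

def extract_jobpars_values(jobPars, keys):
    # Sketch only: collect requested KEY=VALUE tokens from a ';'-separated string
    values = {}
    for token in jobPars.split(';'):
        if '=' not in token:
            continue
        key, _, value = token.partition('=')
        key = key.strip()
        if key in keys:
            values[key] = value
    return values

# made-up jobPars fragment, for illustration only
pars = 'COMPASS_SW_PREFIX=/eos/experiment;COMPASS_SW_PATH=cdr/2017;prodSlt=5'
print(extract_jobpars_values(pars, ['COMPASS_SW_PREFIX', 'prodSlt']))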
Esempio n. 41
0
def qconf_refresh(arguments):
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    qcm = QueueConfigMapper()
    qcm.lastUpdate = None
    qcm.load_data()
Esempio n. 42
0
 def __init__(self, **kwarg):
     BaseStager.__init__(self, **kwarg)
     self.queue_config_mapper = QueueConfigMapper()
Esempio n. 43
0
def main():
    logging.basicConfig()

    parser = argparse.ArgumentParser()
    parser.add_argument('--remoteDir', action='store', dest='remoteDir', default='harvester',
                        help='directory on the remote target machine where harvester is installed')
    parser.add_argument('--remoteBuildDir', action='store', dest='remoteBuildDir', default='harvester_build',
                        help='directory on the remote target machine where harvester is built')
    parser.add_argument('--remotePythonSetup', action='store', dest='remotePythonSetup', default='',
                        help='python setup on remote target machine')
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()

    # remove ~/ which doesn't work with sftp
    options.remoteDir = re.sub('^~/', '', options.remoteDir)
    options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir)

    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print ('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)

    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print ('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware,
                                                                                               options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)

    # get ssh parameters
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None

    privateKey = None
    passPhrase = None
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print ("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None

    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22

    # ssh
    sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey,
                                    jumpHost, jumpPort)

    # get remote python version
    exec_out = sshClient.exec_command(
        ';'.join([options.remotePythonSetup,
                  """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """])
        )
    remotePythonVer = exec_out[1].read().rstrip()
    sshClient.close()
    print ('remote python version : {0}'.format(remotePythonVer))

    # make tmp dir
    with TemporaryDirectory() as tmpDir:
        harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git"

        # get all dependencies
        print ("getting dependencies")
        p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit),
                             stdout=subprocess.PIPE,
                             shell=True)
        stdout, stderr = p.communicate()
        packages = []
        for line in stdout.split('\n'):
            if line.startswith('Successfully downloaded'):
                packages = line.split()[2:]
        packages.append(harvesterGit)
        packages.append('pip')
        packages.remove('pandaharvester')

        # download packages
        print ("pip download to {0}".format(tmpDir))
        for package in packages:
            print ("getting {0}".format(package))
            ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer,
                                                                                                  tmpDir, package),
                                  shell=True)
            if ret != 0:
                print ("ERROR: failed to download {0}".format(package))
                sys.exit(1)

        # sftp
        sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey,
                                        jumpHost, jumpPort)
        try:
            sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(options.remoteBuildDir))
        except Exception:
            pass
        sftp = sshClient.open_sftp()
        for name in os.listdir(tmpDir):
            path = os.path.join(tmpDir, name)
            if os.path.isdir(path):
                continue
            remotePath = os.path.join(options.remoteBuildDir, name)
            print ("copy {0} to {1}".format(name, remotePath))
            sftp.put(path, remotePath)

        # install
        print ("install harvester")
        buildDir = options.remoteBuildDir
        if not buildDir.startswith('/'):
            buildDir = '~/' + buildDir
        exec_out = sshClient.exec_command(
            ';'.join([options.remotePythonSetup,
                      'cd {0}'.format(options.remoteDir),
                      'pip install pip pandaharvester --no-index --find-links {0}'.format(buildDir)])
                      )
        print (exec_out[1].read())
        print (exec_out[2].read())
        sshClient.close()
Esempio n. 44
0
class SAGAMonitor(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.
  
        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            time.sleep(10)
            return self.check_workers(workspec_list)
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID, harvester_job_state, worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished, sagadateformat_str)

                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:  # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                                workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))

                    if worker.state == saga.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time,
                                                                                                         self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                            # proper processing of jobs for worker will be required, to avoid 'fake' fails

                except saga.SagaException as ex:
                    tmpLog.info('An exception occurred while retrieving worker information {0}'.format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'finished' is not a proper state in this case; 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                        workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)

            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))

        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))

        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        batchjob_info = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
        if batchjob_info:
            tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
            harvester_job_state = batchjob_info['status']
            nativeexitcode = batchjob_info['nativeExitCode']
            nativestatus = batchjob_info['nativeStatus']
            diagmessage = batchjob_info['nativeExitMsg']
            if batchjob_info['start_time']:
                starttime = batchjob_info['start_time']
            if batchjob_info['finish_time']:
                endtime = batchjob_info['finish_time']

        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
Esempio n. 45
0
class SAGAMonitor(PluginBase):
    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.
  
        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = rs.job.Service(self.adaptor)
        except rs.SagaException as ex:
            time.sleep(10)
            return self.check_workers(workspec_list)
        
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID, harvester_job_state, worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.utcfromtimestamp(worker.created)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.utcfromtimestamp(worker.started)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.utcfromtimestamp(worker.finished)

                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status, workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:  # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                                workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                            
#                            jsonFilePath = os.path.join(workSpec.get_access_point(), harvester_config.payload_interaction.killWorkerFile)
#                            tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath))
#                            try:
#                                os.utime(jsonFilePath, None)
#                            except OSError:
#                                open(jsonFilePath, 'a').close()
                                
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))

                    if worker.state == rs.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time,
                                                                                                         self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                            # proper processing of jobs for worker will be required, to avoid 'fake' fails
                    
                    if worker.state == rs.job.RUNNING:
                        tmpLog.info("Going to check that all jobs of the worker are in the final status.")
                        dbProxy = DBProxy()
                        job_spec_list = dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, only_running=False, slim=False)
                         
                        allFinal = True
                        for job_spec in job_spec_list:
                            if not job_spec.is_final_status():
                                allFinal = False
                                tmpLog.info("Not all jobs are in the final status, skip till the next monitoring cycle.")
                                break
                         
                        if allFinal:
                            tmpLog.info("All jobs are in the final status, going to cancel the worker.")
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = 0
                            cur_time = datetime.utcnow()
                            workSpec.endTime = cur_time
                            jsonFilePath = os.path.join(workSpec.get_access_point(), harvester_config.payload_interaction.killWorkerFile)
                            tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath))
                            try:
                                os.utime(jsonFilePath, None)
                            except OSError:
                                open(jsonFilePath, 'a').close()
                                 
                            workSpec.set_status(workSpec.ST_finished)
                            harvester_job_state = workSpec.ST_finished
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                        
                except rs.SagaException as ex:
                    tmpLog.info('An exception occurred while retrieving worker information {0}'.format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'finished' is not a proper state in this case; 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                        workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)

            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))

        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))

        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        batchjob_info = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
        if batchjob_info:
            tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
            harvester_job_state = batchjob_info['status']
            nativeexitcode = batchjob_info['nativeExitCode']
            nativestatus = batchjob_info['nativeStatus']
            diagmessage = batchjob_info['nativeExitMsg']
            if batchjob_info['start_time']:
                starttime = batchjob_info['start_time']
            if batchjob_info['finish_time']:
                endtime = batchjob_info['finish_time']

        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
Esempio n. 46
0
    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)

        self.queue_config_mapper = QueueConfigMapper()
Esempio n. 47
0
                              timeout=self.__worker_update_timeout)
            tmp_log.debug('worker update for {0} ended with {1} {2}'.format(
                batch_id, r.status_code, r.text))

            end_time = time.time()
            tmp_log.debug('done (took {0})'.format(end_time - start_time))
        except:
            tmp_log.error('Excepted with: {0}'.format(traceback.format_exc()))


if __name__ == "__main__":
    """
    Quick tests
    """
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queue_config_mapper = QueueConfigMapper()

    apfmon = Apfmon(queue_config_mapper)
    apfmon.create_factory()
    apfmon.create_labels()

    worker_a = WorkSpec()
    worker_a.batchID = 1
    worker_a.computingSite = 'CERN-PROD-DEV_UCORE'
    worker_a.computingElement = 'bla1'
    worker_a.workAttributes = {
        "batchLog":
        "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log",
        "stdErr":
        "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err",
        "stdOut":
Esempio n. 48
0
 def trigger_stage_out(self, jobspec):
     # make logger
     tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                               method_name='trigger_stage_out')
     tmpLog.debug('start')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
     # get the queueConfig and corresponding objStoreID_ES
     queueConfigMapper = QueueConfigMapper()
     queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
     # write to debug log queueConfig.stager
     tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager))
     # check queueConfig stager section to see if jobtype is set
     if 'jobtype' in queueConfig.stager:
         if queueConfig.stager['jobtype'] == "Yoda" :
             self.Yodajob = True
     # set the location of the files in fileSpec.objstoreID
     # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json 
     self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
     if self.Yodajob :
         self.pathConvention = int(queueConfig.stager['pathConvention'])
         tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
     else:
         self.pathConvention = None
         tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
     self.RSE_dstpath = queueConfig.stager['RSE_dstPath']
     # loop over the output files and copy the files
     ifile = 0
     errors = []
     for fileSpec in jobspec.get_output_file_specs(skip_done=True):
         scope ='panda'
         if fileSpec.scope is not None :
             scope = fileSpec.scope
         # for Yoda job set the scope to transient 
         if self.Yodajob :
             scope = 'transient'
         # only print to log file first 25 files
         if ifile < 25 :
             msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
             tmpLog.debug(msgStr)
         if ifile == 25 :
             msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
             tmpLog.debug(msgStr)
         hash = hashlib.md5()
         hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
         hash_hex = hash.hexdigest()
         correctedscope = "/".join(scope.split('.'))
         srcURL = fileSpec.path
         dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                    scope=correctedscope,
                                                                    hash1=hash_hex[0:2],
                                                                    hash2=hash_hex[2:4],
                                                                    lfn=fileSpec.lfn)
         if ifile < 25 :
             tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
         # copy the source file from source to destination skip over if file already exists
         if os.path.exists(dstURL):
             tmpLog.debug('Already copied file {0}'.format(dstURL))
             # Set the file spec status
             if self.changeFileStatusOnSuccess:
                 fileSpec.status = 'finished'
         else :
             if os.path.exists(srcURL) :
                 # check if destination directory exists if not create it
                 dstDIR = os.path.dirname(dstURL)
                 try:
                     if not os.path.exists(dstDIR) :
                         os.makedirs(dstDIR)
                         mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP 
                         mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                         os.chmod(dstDIR,mode)
                     # copy the source file to destination file
                     shutil.copy2(srcURL, dstURL)
                     # Set the file spec status
                     if self.changeFileStatusOnSuccess:
                         self.set_FileSpec_status(jobspec, 'finished')
                 except (IOError, os.error) as why:
                     errors.append((srcURL, dstURL, str(why)))
             else :
                 errors.append((srcURL, dstURL, 'Source file missing'))
         ifile += 1
     #  Now test for any errors
     if errors:
         for error in errors:
             tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0],error[1],error[2]))
          raise shutil.Error(errors)
     # otherwise we are OK                            
     tmpLog.debug('stop')
     return True, ''
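
The destination URL built above hashes "scope:lfn" with md5, uses the first two pairs of hex digits as subdirectories under the RSE base path, and turns dots in the scope into path separators. A minimal standalone sketch of that naming scheme (the endpoint and file name are made up for illustration):

import hashlib

def build_dst_url(endpoint, scope, lfn):
    # same scheme as trigger_stage_out above: md5("scope:lfn") provides two
    # levels of subdirectories, and dots in the scope become "/"
    hash_hex = hashlib.md5(('%s:%s' % (scope, lfn)).encode('utf-8')).hexdigest()
    corrected_scope = '/'.join(scope.split('.'))
    return '{endpoint}/{scope}/{hash1}/{hash2}/{lfn}'.format(
        endpoint=endpoint, scope=corrected_scope,
        hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=lfn)

# hypothetical values, for illustration only
print(build_dst_url('/data/rse', 'mc16_13TeV', 'HITS.11364822._128373.pool.root.1'))
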
Example n. 49
0
import logging
import pprint
import sys

from future.utils import iteritems  # assumed helper; six.iteritems would also work here

from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
from pandaharvester.harvestercore.db_proxy import DBProxy

for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

pp = pprint.PrettyPrinter(indent=4)

queueConfigMapper = QueueConfigMapper()

proxy = DBProxy()

sqlJ = "SELECT * FROM job_table"

resultsJobcur = proxy.execute(sqlJ)
resultsJob = resultsJobcur.fetchall()
proxy.commit()

sqlF = "SELECT * FROM file_table"

resultsFilescur = proxy.execute(sqlF)
resultsFiles = resultsFilescur.fetchall()
proxy.commit()
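
As a minimal follow-up sketch (reusing the resultsJob, resultsFiles and pp objects created above, and assuming nothing about the harvester table schema), the fetched rows can simply be dumped for inspection:

# dump the raw rows fetched from job_table and file_table
print('job_table rows: {0}'.format(len(resultsJob)))
for row in resultsJob:
    pp.pprint(row)

print('file_table rows: {0}'.format(len(resultsFiles)))
for row in resultsFiles:
    pp.pprint(row)
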
class GoogleMonitor(PluginBase):
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

        # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
        self.vm_to_worker_status = {
                                     'RUNNING': WorkSpec.ST_running,
                                     'TERMINATED': WorkSpec.ST_running, # the VM is stopped, but has to be fully deleted
                                     'STOPPING': WorkSpec.ST_finished,
                                     'PROVISIONING': WorkSpec.ST_submitted,
                                     'STAGING': WorkSpec.ST_submitted
                                     }

    def list_vms(self, zone):
        """
        List the status of the running VMs
        :return:
        """

        try:
            result = compute.instances().list(project=PROJECT, zone=zone).execute()

            try:
                vm_instances = result['items']
            except KeyError:
                # there are no VMs running
                return [], {}

            # make a list with the VM names
            vm_names = [vm_instance['name'] for vm_instance in vm_instances]

            # make a dictionary so we can retrieve a VM by its name
            vm_name_to_status = {}
            for vm_instance in vm_instances:
                vm_name_to_status[vm_instance['name']] = vm_instance['status']

            return vm_names, vm_name_to_status

        except Exception:
            return None, None

    def kill_worker(self, vm_name, zone):
        """
        Sends the command to Google to destroy a VM
        """

        try:
            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
        except Exception as e:
            base_logger.error('Problems killing the VM: {0}'.format(e))

    def check_workers(self, workers):
        """
        This method takes a list of WorkSpecs as input and returns a list of worker statuses.
        The Nth element in the returned list corresponds to the status of the Nth WorkSpec in the given list.

        :param workers: a list of WorkSpec instances
        :return: A tuple containing the return code (True for success, False otherwise) and a list of worker's statuses
        :rtype: (bool, [string,])
        """

        if not workers:
            return False, 'Empty workers list received'

        # we assume all work specs in the list belong to the same queue, which is currently the case
        queue_config = self.queue_config_mapper.get_queue(workers[0].computingSite)
        try:
            zone = queue_config.zone
        except AttributeError:
            zone = ZONE

        # running instances
        vm_names, vm_name_to_status = self.list_vms(zone)
        if vm_names is None and vm_name_to_status is None:
            error_string = 'Could not list the VMs'
            base_logger.error(error_string)
            return False, error_string

        # extract the list of batch IDs
        batch_IDs = [str(x.batchID) for x in workers]
        base_logger.debug('Batch IDs: {0}'.format(batch_IDs))

        ret_list = []
        for batch_ID in batch_IDs:
            tmp_log = self.make_logger(base_logger, 'batch ID={0}'.format(batch_ID), method_name='check_workers')

            if batch_ID not in vm_names:
                new_status = WorkSpec.ST_finished
                message = 'VM not found'
            else:
                try:
                    new_status = self.vm_to_worker_status[vm_name_to_status[batch_ID]]
                    message = 'VM status returned by GCE API'

                    # Preemptible VMs: GCE terminates a VM, but a stopped VM with its disk is left and needs to be
                    # explicitly deleted
                    if vm_name_to_status[batch_ID] == 'TERMINATED':
                        self.kill_worker(batch_ID, zone)

                except KeyError:
                    new_status = WorkSpec.ST_missed
                    message = 'Unknown status to Harvester: {0}'.format(vm_name_to_status[batch_ID])

            tmp_log.debug('new_status={0}'.format(new_status))
            ret_list.append((new_status, message))

        base_logger.debug('ret_list: {0}'.format(ret_list))
        return True, ret_list
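
The status translation inside check_workers can be exercised in isolation. The sketch below is a simplified stand-in, not the plugin's API: plain strings replace the WorkSpec.ST_* constants and the VM inventory is invented.

vm_to_worker_status = {'RUNNING': 'running', 'TERMINATED': 'running',
                       'STOPPING': 'finished', 'PROVISIONING': 'submitted',
                       'STAGING': 'submitted'}

def map_worker_status(batch_id, vm_name_to_status):
    # same decision flow as check_workers above, without WorkSpec or logging
    if batch_id not in vm_name_to_status:
        return 'finished', 'VM not found'
    vm_status = vm_name_to_status[batch_id]
    try:
        return vm_to_worker_status[vm_status], 'VM status returned by GCE API'
    except KeyError:
        return 'missed', 'Unknown status to Harvester: {0}'.format(vm_status)

# hypothetical VM inventory
print(map_worker_status('harvester-vm-123', {'harvester-vm-123': 'RUNNING'}))
print(map_worker_status('harvester-vm-456', {}))
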
Example n. 51
0
    def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # whether to submit any worker
        to_submit_any = True

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        if self.condorSchedd is None or not self.useSpool:
            try:
                os.mkdir(log_subdir_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                else:
                    pass

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(
                self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(
                self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get(
            'corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = this_panda_queue_dict.get('capability',
                                                     '') == 'ucore'

        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # deal with CE
        special_par = ''
        ce_weighting = None
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint') and str(
                        _queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower()
                        in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name',
                                                '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            if n_qualified_ce > 0:
                # Get CE weighting
                tmpLog.debug('Get CE weighting')
                worker_ce_all_tuple = self.get_ce_statistics(
                    self.queueName, nWorkers)
                ce_weighting = _get_ce_weighting(
                    ce_endpoint_list=list(ce_auxilary_dict.keys()),
                    worker_ce_all_tuple=worker_ce_all_tuple)
                stats_weighting_display_str = _get_ce_stats_weighting_display(
                    ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
                tmpLog.debug('CE stats and weighting: {0}'.format(
                    stats_weighting_display_str))
            else:
                tmpLog.error('No valid CE endpoint found')
                to_submit_any = False

        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workspec.workerID),
                                            method_name='_handle_one_worker')
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {
                'workspec': workspec,
                'to_submit': to_submit,
            }
            if to_submit:
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info(
                            'Problem choosing CE with weighting. Choose an arbitrary CE endpoint'
                        )
                        ce_info_dict = random.choice(
                            list(ce_auxilary_dict.values())).copy()
                    # get info about the chosen CE
                    ce_endpoint_from_queue = ce_info_dict.get(
                        'ce_endpoint', '')
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour',
                                                          '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version',
                                                          '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(
                        ':\w*', '', ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(
                                ce_endpoint_from_queue, default_port)
                    tmpLog.debug(
                        'For site {0} got CE endpoint: "{1}", flavour: "{2}"'.
                        format(self.queueName, ce_endpoint_from_queue,
                               ce_flavour_str))
                    if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}.sdf'.format(
                            ce_flavour_str=ce_flavour_str)
                        self.templateFile = os.path.join(
                            self.CEtemplateDir, sdf_template_filename)
                else:
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(
                                self.ceHostname,
                                list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint,
                                          list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict[
                                    'ce_endpoint'] = random.choice(
                                        list(
                                            zip(self.ceHostname,
                                                self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(
                                    self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    tmpFile = open(self.templateFile)
                    sdf_template_raw = tmpFile.read()
                    tmpFile.close()
                except AttributeError:
                    tmpLog.error(
                        'No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found'
                    )
                    to_submit = False
                    return data
                else:
                    # get batch_log, stdout, stderr filenames, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    if isinstance(self.condorSchedd,
                                  list) and len(self.condorSchedd) > 0:
                        if isinstance(self.condorPool,
                                      list) and len(self.condorPool) > 0:
                            condor_schedd, condor_pool = random.choice(
                                list(zip(self.condorSchedd, self.condorPool)))
                        else:
                            condor_schedd = random.choice(self.condorSchedd)
                            condor_pool = self.condorPool
                    else:
                        condor_schedd = self.condorSchedd
                        condor_pool = self.condorPool
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(
                            r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                            lambda matchobj: matchobj.group(1)
                            if matchobj.group(1) else '', condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]',
                                              schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if log_base_url is not None:
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(
                            value_str=batch_log_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(
                            value_str=stdout_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stderr_path_filename = parse_batch_job_filename(
                            value_str=stderr_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes[
                            'stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')
                # set data dict
                data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                })
            return data

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(
                baseLogger,
                'workerID={0}'.format(workspec.workerID),
                method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # exec with mcore
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retValList = thread_pool.map(submit_a_worker, dataIterator)
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(
                lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
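
The '[ScheddHostname]' handling above first reduces the schedd string to a bare hostname (dropping an optional user@ prefix and :port suffix) and then substitutes it into logBaseURL. A small self-contained sketch with invented values:

import re

condor_schedd = 'submitter@aipanda024.cern.ch:9618'             # hypothetical schedd string
log_base_url_template = 'https://[ScheddHostname]/condor_logs'  # hypothetical logBaseURL

schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                         lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                         condor_schedd)
log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, log_base_url_template)
print(log_base_url)  # https://aipanda024.cern.ch/condor_logs
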
Example n. 52
0
 def trigger_stage_out(self, jobspec):
     # make logger
     tmpLog = self.make_logger(baseLogger,
                               'PandaID={0} ThreadID={1}'.format(
                                   jobspec.PandaID,
                                   threading.current_thread().ident),
                               method_name='trigger_stage_out')
     tmpLog.debug('start')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(
             jobspec.computingSite))
     # get the queueConfig and corresponding objStoreID_ES
     queueConfigMapper = QueueConfigMapper()
     queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
     # write to debug log queueConfig.stager
     tmpLog.debug(
         'jobspec.computingSite - {0} queueConfig.stager {1}'.format(
             jobspec.computingSite, queueConfig.stager))
     # check queueConfig stager section to see if jobtype is set
     if 'jobtype' in queueConfig.stager:
         if queueConfig.stager['jobtype'] == "Yoda":
             self.Yodajob = True
     # set the location of the files in fileSpec.objstoreID
     # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
     self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
     if self.Yodajob:
         self.pathConvention = int(queueConfig.stager['pathConvention'])
         tmpLog.debug(
             'Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'
             .format(jobspec.PandaID, self.objstoreID, self.pathConvention))
     else:
         self.pathConvention = None
         tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(
             jobspec.PandaID, self.objstoreID))
      self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
     # loop over the output files and copy the files
     ifile = 0
     errors = []
     for fileSpec in jobspec.get_output_file_specs(skip_done=True):
         scope = 'panda'
         if fileSpec.scope is not None:
             scope = fileSpec.scope
         # for Yoda job set the scope to transient
         if self.Yodajob:
             scope = 'transient'
         # only print to log file first 25 files
         if ifile < 25:
             msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(
                 fileSpec.lfn, fileSpec.scope)
             tmpLog.debug(msgStr)
         if ifile == 25:
             msgStr = "printed first 25 files skipping the rest".format(
                 fileSpec.lfn, fileSpec.scope)
             tmpLog.debug(msgStr)
         hash = hashlib.md5()
          hash.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
         hash_hex = hash.hexdigest()
         correctedscope = "/".join(scope.split('.'))
         srcURL = fileSpec.path
         dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
             endPoint=self.RSE_dstPath,
             scope=correctedscope,
             hash1=hash_hex[0:2],
             hash2=hash_hex[2:4],
             lfn=fileSpec.lfn)
         if ifile < 25:
             tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL,
                                                             dstURL=dstURL))
         # copy the source file from source to destination skip over if file already exists
         if os.path.exists(dstURL):
             tmpLog.debug('Already copied file {0}'.format(dstURL))
             # Set the file spec status
             if self.changeFileStatusOnSuccess:
                 fileSpec.status = 'finished'
         else:
             if os.path.exists(srcURL):
                 # check if destination directory exists if not create it
                 dstDIR = os.path.dirname(dstURL)
                 try:
                     if not os.path.exists(dstDIR):
                         os.makedirs(dstDIR)
                         mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                         mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                         os.chmod(dstDIR, mode)
                     # copy the source file to destination file
                     shutil.copy2(srcURL, dstURL)
                     # Set the file spec status
                     if self.changeFileStatusOnSuccess:
                         self.set_FileSpec_status(jobspec, 'finished')
                 except (IOError, os.error) as why:
                     errors.append((srcURL, dstURL, str(why)))
             else:
                 errors.append((srcURL, dstURL, 'Source file missing'))
         ifile += 1
     #  Now test for any errors
     if errors:
         for error in errors:
             tmpLog.debug(
                 'copy error source {0} destination {1} Reason {2}'.format(
                     error[0], error[1], error[2]))
          raise shutil.Error(errors)
     # otherwise we are OK
     tmpLog.debug('stop')
     return True, ''
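
For reference, the permission bits assembled above for a newly created destination directory add up to a single octal mode (group-writable with the setgid bit). A quick standalone check:

import stat

mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
        | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
        | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID)
print(oct(mode))        # 0o2775, i.e. rwxrwsr-x
assert mode == 0o2775
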
Example n. 53
0
fork_child_pid = os.fork()
if fork_child_pid != 0:
    signal_utils.set_suicide_handler(None)
    os.wait()
else:

    if len(sys.argv) not in (2, 4):
        print("Wrong number of parameters. You can either:")
        print("  - specify the queue name")
        print(
            "  - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)"
        )
        sys.exit(0)

    queueName = sys.argv[1]
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(queueName)

    if queueConfig.prodSourceLabel in ('user', 'managed'):
        jobType = queueConfig.prodSourceLabel
    else:
        jobType = 'managed'  # default, can be overwritten by parameters

    resourceType = 'SCORE'  # default, can be overwritten by parameters

    if len(sys.argv) == 4:
        # jobType should be 'managed' or 'user'. If not specified will default to a production job
        if sys.argv[2] in ('user', 'managed'):
            jobType = sys.argv[2]
        else:
            print('value for jobType not valid, defaulted to {0}'.format(jobType))
    def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # whether to submit any worker
        to_submit_any = True

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        if self.condorSchedd is None or not self.useSpool:
            try:
                os.mkdir(log_subdir_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
                else:
                    pass

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
        pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
        pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # deal with Condor schedd and central managers; make a randomized list to choose from
        n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
        if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
            if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                orig_list = list(zip(self.condorSchedd, self.condorPool))
            else:
                orig_list = [ (_schedd, self.condorPool) for _schedd in self.condorSchedd ]
            if n_bulks < len(orig_list):
                schedd_pool_choice_list = random.sample(orig_list, n_bulks)
            else:
                schedd_pool_choice_list = orig_list
        else:
            schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

        # deal with CE
        special_par = ''
        ce_weighting = None
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not ( _queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce']) ):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if ( ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default' ):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            if n_qualified_ce > 0:
                # Get CE weighting
                tmpLog.debug('Get CE weighting')
                worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
                ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                        worker_ce_all_tuple=worker_ce_all_tuple)
                stats_weighting_display_str = _get_ce_stats_weighting_display(
                                                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
                tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
            else:
                tmpLog.error('No valid CE endpoint found')
                to_submit_any = False



        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_handle_one_worker')
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {'workspec': workspec,
                    'to_submit': to_submit,}
            if to_submit:
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                        ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                    # get info about the chosen CE
                    ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(':\w*', '',  ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                                'cream-ce': 8443,
                                'arc-ce': 2811,
                                'htcondor-ce': 9619,
                            }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                    tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                    if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                                                    ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                        self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
                else:
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    tmpFile = open(self.templateFile)
                    sdf_template_raw = tmpFile.read()
                    tmpFile.close()
                except AttributeError:
                    tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                    to_submit = False
                    return data
                else:
                    # get batch_log, stdout, stderr filenames, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                    # set submissionHost
                    if not condor_schedd and not condor_pool:
                        workspec.submissionHost = 'LOCAL'
                    else:
                        workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                    tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                                    lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                                    condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if log_base_url is not None:
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')
                # set data dict
                data.update({
                        'workspec': workspec,
                        'to_submit': to_submit,
                        'template': sdf_template,
                        'executable_file': self.executableFile,
                        'log_dir': self.logDir,
                        'log_subdir': log_subdir,
                        'n_core_per_node': n_core_per_node,
                        'panda_queue_name': panda_queue_name,
                        'x509_user_proxy': self.x509UserProxy,
                        'ce_info_dict': ce_info_dict,
                        'batch_log_dict': batch_log_dict,
                        'special_par': special_par,
                        'harvester_queue_config': harvester_queue_config,
                        'is_unified_queue': is_unified_queue,
                        'condor_schedd': condor_schedd,
                        'condor_pool': condor_pool,
                        'use_spool': self.useSpool,
                        'pilot_version': pilot_version_orig,
                        })
            return data

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                            method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # submit
        retValList = submit_bag_of_workers(list(dataIterator))
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple), zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
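
The schedd/pool selection in this variant caps how many (schedd, pool) pairs are drawn per submission cycle. A minimal sketch, assuming a plain ceiling division in place of the _div_round_up helper (which is not shown in the snippet) and invented schedd and pool names:

import random

def div_round_up(a, b):
    # ceiling division; stands in for the _div_round_up helper used above
    return -(-a // b)

condor_schedds = ['schedd01.cern.ch', 'schedd02.cern.ch', 'schedd03.cern.ch']  # hypothetical
condor_pools = ['pool01.cern.ch', 'pool02.cern.ch', 'pool03.cern.ch']          # hypothetical
n_workers = 40
min_bulk_to_randomized_schedd = 20

n_bulks = div_round_up(n_workers, min_bulk_to_randomized_schedd)   # 2 bulks for 40 workers
orig_list = list(zip(condor_schedds, condor_pools))
if n_bulks < len(orig_list):
    schedd_pool_choice_list = random.sample(orig_list, n_bulks)
else:
    schedd_pool_choice_list = orig_list
print(schedd_pool_choice_list)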