Example #1
def diracAPI_interactive(connection_attempts=5):
    '''
    Run an interactive server within the DIRAC environment.
    '''
    import os
    import sys
    import time
    import inspect
    import traceback
    from GangaDirac.Lib.Server.InspectionClient import runClient
    serverpath = os.path.join(
        os.path.dirname(inspect.getsourcefile(runClient)),
        'InspectionServer.py')
    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    getQueues().add(
        execute("execfile('%s')" % serverpath, timeout=None, shell=False))

    #time.sleep(1)
    sys.stdout.write(
        "\nType 'q' or 'Q' or 'exit' or 'exit()' to quit but NOT ctrl-D")
    i = 0
    excpt = None
    while i < connection_attempts:
        try:
            runClient()
            break
        except:
            if i == (connection_attempts - 1):
                excpt = traceback.format_exc()
        finally:
            i += 1
    return excpt
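
The loop above retries runClient() a fixed number of times and only keeps the traceback from the final failure. A minimal stand-alone sketch of that retry-and-capture pattern, with a hypothetical connect() callable standing in for runClient() (not part of the original code):

import traceback

def retry_with_last_traceback(connect, attempts=5):
    """Call connect() up to `attempts` times; return None on success,
    otherwise the formatted traceback of the last failed attempt."""
    last_traceback = None
    for i in range(attempts):
        try:
            connect()
            return None  # connected successfully
        except Exception:  # the original uses a bare except; Exception is enough for this sketch
            if i == attempts - 1:
                last_traceback = traceback.format_exc()
    return last_traceback

# Usage sketch: a connect() that always fails, so the final traceback is returned.
print(retry_with_last_traceback(lambda: 1 / 0, attempts=2))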
Example #2
    def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
        """
        Method used to requeue jobs which are in a finalised state of some form (finished/failed/etc.)
        Args:
            requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised
            finalised_statuses (dict): Dict mapping Dirac statuses to the corresponding Ganga statuses after finalisation
        """

        # requeue existing completed job
        for j in requeue_jobs:
            if j.been_queued:
                continue

            if monitoring_component:
                if monitoring_component.should_stop():
                    break
            if not configDirac['serializeBackend']:
                getQueues()._monitoring_threadpool.add_function(
                    DiracBase.job_finalisation,
                    args=(j, finalised_statuses[j.backend.status]),
                    priority=5,
                    name="Job %s Finalizing" % j.fqid)
                j.been_queued = True
            else:
                DiracBase.job_finalisation(
                    j, finalised_statuses[j.backend.status])
Example #3
    def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses):
        """
        Method used to requeue jobs which are in a finalised state of some form (finished/failed/etc.)
        Args:
            requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised
            finalised_statuses (dict): Dict mapping Dirac statuses to the corresponding Ganga statuses after finalisation
        """

        from Ganga.Core import monitoring_component

        # requeue existing completed job
        for j in requeue_jobs:
            if j.been_queued:
                continue

            if monitoring_component:
                if monitoring_component.should_stop():
                    break
            if not configDirac['serializeBackend']:
                getQueues()._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                           args=(j, finalised_statuses[j.backend.status]),
                                                           priority=5, name="Job %s Finalizing" % j.fqid)
                j.been_queued = True
            else:
                DiracBase.job_finalisation(j, finalised_statuses[j.backend.status])
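
Examples #2 and #3 hand each job's finalisation to a worker thread pool unless configDirac['serializeBackend'] forces serial execution. A minimal sketch of that "parallel unless serialised" pattern using only the standard library; finalise() and the job/status values are stand-ins for DiracBase.job_finalisation and the Ganga job objects:

from concurrent.futures import ThreadPoolExecutor

def finalise(job, status):
    # Stand-in for DiracBase.job_finalisation
    print("finalising job %s -> %s" % (job, status))

def requeue_finished_jobs(jobs, statuses, serialize=False):
    queued = set()
    if serialize:
        for j in jobs:
            finalise(j, statuses[j])
        return queued
    with ThreadPoolExecutor(max_workers=4) as pool:
        for j in jobs:
            if j in queued:
                continue  # mirrors the been_queued guard
            pool.submit(finalise, j, statuses[j])
            queued.add(j)
    return queued

requeue_finished_jobs(["1.0", "1.1"], {"1.0": "Done", "1.1": "Failed"})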
Example #4
def diracAPI_interactive(connection_attempts=5):
    '''
    Run an interactive server within the DIRAC environment.
    '''
    import os
    import sys
    import time
    import inspect
    import traceback
    from GangaDirac.Lib.Server.InspectionClient import runClient
    serverpath = os.path.join(os.path.dirname(inspect.getsourcefile(runClient)), 'InspectionServer.py')
    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    getQueues().add(execute("execfile('%s')" % serverpath, timeout=None, shell=False))

    #time.sleep(1)
    sys.stdout.write( "\nType 'q' or 'Q' or 'exit' or 'exit()' to quit but NOT ctrl-D")
    i = 0
    excpt = None
    while i < connection_attempts:
        try:
            runClient()
            break
        except:
            if i == (connection_attempts - 1):
                excpt = traceback.format_exc()
        finally:
            i += 1
    return excpt
Example #5
def calculateSiteSEMapping(file_replicas, wanted_common_site, uniqueSE,
                           site_to_SE_mapping, SE_to_site_mapping):

    SE_dict = dict()
    maps_size = 0
    found = []

    # First find the SE for each site
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for replica in repz:
            sitez.add(replica)
            if not replica in found:

                getQueues()._monitoring_threadpool.add_function(
                    addToMapping, (str(replica), site_to_SE_mapping))

                maps_size = maps_size + 1
                found.append(replica)

        SE_dict[lfn] = sitez

    # Doing this in parallel so wait for it to finish
    while len(site_to_SE_mapping) != maps_size:
        time.sleep(0.1)

    # Now calculate the 'inverse' dictionary of site for each SE
    for _SE, _sites in site_to_SE_mapping.iteritems():
        for site_i in _sites:
            if site_i not in SE_to_site_mapping:
                SE_to_site_mapping[site_i] = set([])
            if _SE not in SE_to_site_mapping[site_i]:
                SE_to_site_mapping[site_i].add(_SE)

    # These can be used to select the sites which know of a given SE,
    # or vice versa

    # Now let's generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    allSubSets = []
    allChosenSets = {}

    site_dict = {}
    for _lfn, sites in SE_dict.iteritems():
        site_dict[_lfn] = set([])
        for _site in sites:
            for _SE in site_to_SE_mapping[_site]:
                site_dict[_lfn].add(_SE)

    # Now select a set of sites to use as a seed for constructing a subset of
    # LFNs
    for lfn in site_dict.keys():
        allChosenSets[lfn] = generate_site_selection(site_dict[lfn],
                                                     wanted_common_site,
                                                     uniqueSE,
                                                     site_to_SE_mapping,
                                                     SE_to_site_mapping)

    return site_dict, allSubSets, allChosenSets
Example #6
def calculateSiteSEMapping(file_replicas, wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping):

    SE_dict = dict()
    maps_size = 0
    found = []

    # First find the SE for each site
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for replica in repz:
            sitez.add(replica)
            if not replica in found:

                getQueues()._monitoring_threadpool.add_function(addToMapping, (str(replica), site_to_SE_mapping))

                maps_size = maps_size + 1
                found.append(replica)

        SE_dict[lfn] = sitez

    # Doing this in parallel so wait for it to finish
    while len(site_to_SE_mapping) != maps_size:
        time.sleep(0.1)

    # Now calculate the 'inverse' dictionary of site for each SE
    for _SE, _sites in site_to_SE_mapping.iteritems():
        for site_i in _sites:
            if site_i not in SE_to_site_mapping:
                SE_to_site_mapping[site_i] = set([])
            if _SE not in SE_to_site_mapping[site_i]:
                SE_to_site_mapping[site_i].add(_SE)

    # These can be used to select the sites which know of a given SE,
    # or vice versa

    # Now let's generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    allSubSets = []
    allChosenSets = {}

    site_dict = {}
    for _lfn, sites in SE_dict.iteritems():
        site_dict[_lfn] = set([])
        for _site in sites:
            for _SE in site_to_SE_mapping[_site]:
                site_dict[_lfn].add(_SE)

    # Now select a set of sites to use as a seed for constructing a subset of
    # LFNs
    for lfn in site_dict.keys():
        allChosenSets[lfn] = generate_site_selection(
            site_dict[lfn], wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)

    return site_dict, allSubSets, allChosenSets
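
The loop that fills SE_to_site_mapping is an inversion of a dict of key to set-of-values. A minimal generic sketch of that inversion step (the names and sample values below are made up, not the actual Ganga/DIRAC objects):

def invert_mapping(forward):
    """Invert a dict of key -> set(values) into value -> set(keys)."""
    inverse = {}
    for key, values in forward.items():
        for value in values:
            inverse.setdefault(value, set()).add(key)
    return inverse

# Usage sketch: 'v1' ends up mapped to {'KEY-A', 'KEY-B'} and 'v2' to {'KEY-B'}
print(invert_mapping({'KEY-A': {'v1'}, 'KEY-B': {'v1', 'v2'}}))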
Example #7
 def minorResubmit(self, job):
     """perform just a minor resubmit"""
     try:
         trf = self._getParent()
     except Exception as err:
         logger.debug("GetParent exception!\n%s" % str(err))
         trf = None
     if trf is not None and trf.submit_with_threads:
         addInfoString( self, "Attempting job re-submission with queues..." )
         from Ganga.Core.GangaThread.WorkerThreads import getQueues
         getQueues().add(job.resubmit)
     else:
         addInfoString( self, "Attempting job re-submission..." )
         job.resubmit()
Example #8
 def minorResubmit(self, job):
     """perform just a minor resubmit"""
     try:
         trf = self._getParent()
     except Exception as err:
         logger.debug("GetParent exception!\n%s" % str(err))
         trf = None
     if trf is not None and trf.submit_with_threads:
         addInfoString(self, "Attempting job re-submission with queues...")
         from Ganga.Core.GangaThread.WorkerThreads import getQueues
         getQueues().add(job.resubmit)
     else:
         addInfoString(self, "Attempting job re-submission...")
         job.resubmit()
Example #9
def lookUpLFNReplicas(inputs, allLFNData):
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFNs, 'LFN_parallel_limit' at a time, so as not to overload the
    # server, and give some feedback while this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        getQueues()._monitoring_threadpool.add_function(getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so let's protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    return allLFNs, LFNdict
Example #10
def lookUpLFNReplicas(inputs, ignoremissing):
    """
    This method launches several worker threads to collect the replica information for all LFNs
    given as inputs, storing it in allLFNData, and then checks for LFNs which could not be resolved.
    Args:
        inputs (list): List of input DiracFile objects whose replicas are to be looked up
        ignoremissing (bool): If False, raise a SplittingError when any LFN cannot be resolved
    Returns:
        allLFNs (list): List of all of the LFNs in the inputs
        LFNdict (dict): Dict of LFN to DiracFile
        bad_lfns (list): List of LFNs for which no replica information could be found
    """
    allLFNData = {}
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFNs, 'LFN_parallel_limit' at a time, so as not to overload the
    # server, and give some feedback while this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        getQueues()._monitoring_threadpool.add_function(getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so let's protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData)

    # Check if we have any bad lfns
    if bad_lfns and ignoremissing is False:
        logger.error("Errors found getting LFNs:\n%s" % str(bad_lfns))
        raise SplittingError("Error trying to split dataset with invalid LFN and ignoremissing = False")

    return allLFNs, LFNdict, bad_lfns
Example #11
def lookUpLFNReplicas(inputs, ignoremissing):
    """
    This method launches several worker threads to collect the replica information for all LFNs
    given as inputs, storing it in allLFNData, and then checks for LFNs which could not be resolved.
    Args:
        inputs (list): List of input DiracFile objects whose replicas are to be looked up
        ignoremissing (bool): If False, raise a SplittingError when any LFN cannot be resolved
    Returns:
        allLFNs (list): List of all of the LFNs in the inputs
        LFNdict (dict): Dict of LFN to DiracFile
        bad_lfns (list): List of LFNs for which no replica information could be found
    """
    allLFNData = {}
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFNs, 'LFN_parallel_limit' at a time, so as not to overload the
    # server, and give some feedback while this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        getQueues()._monitoring_threadpool.add_function(getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so let's protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData)

    # Check if we have any bad lfns
    if bad_lfns and ignoremissing is False:
        logger.error("Errors found getting LFNs:\n%s" % str(bad_lfns))
        raise SplittingError("Error trying to split dataset with invalid LFN and ignoremissing = False")

    return allLFNs, LFNdict, bad_lfns
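
lookUpLFNReplicas fans the replica queries out in batches of 'LFN_parallel_limit' LFNs and then waits until every batch has reported back into allLFNData. A minimal sketch of the same batch-and-collect idea using concurrent.futures instead of Ganga's queues; lookup_batch() is a stand-in for getLFNReplicas and the batch size is an assumption:

import math
from concurrent.futures import ThreadPoolExecutor, as_completed

LFN_PARALLEL_LIMIT = 250.0  # assumed batch size, mirroring 'LFN_parallel_limit'

def lookup_batch(lfns, index):
    # Stand-in for getLFNReplicas: resolve one slice of the full LFN list
    start = int(index * LFN_PARALLEL_LIMIT)
    end = int((index + 1) * LFN_PARALLEL_LIMIT)
    return {lfn: ['SOME-SE'] for lfn in lfns[start:end]}

def lookup_all(all_lfns):
    n_batches = int(math.ceil(len(all_lfns) / LFN_PARALLEL_LIMIT))
    all_lfn_data = {}
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(lookup_batch, all_lfns, i) for i in range(n_batches)]
        for future in as_completed(futures):  # blocks until each batch is done, so no sleep loop
            all_lfn_data.update(future.result())
    return all_lfn_data

print(len(lookup_all(['/some/fake/lfn_%d' % i for i in range(600)])))  # 600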
Example #12
def lookUpLFNReplicas(inputs, allLFNData):
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFNs, 'LFN_parallel_limit' at a time, so as not to overload the
    # server, and give some feedback while this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        getQueues()._monitoring_threadpool.add_function(
            getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(
            math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so let's protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    return allLFNs, LFNdict
Example #13
    def checkForSubmission(self):
        """Check if this unit should submit a job"""

        # check the delay
        if time.time() < self.start_time:
            return False

        # check if we already have a job
        if len(self.active_job_ids) != 0:
            return False

        # if we're using threads, check the max number
        from Ganga.Core.GangaThread.WorkerThreads import getQueues
        if self._getParent().submit_with_threads and getQueues().totalNumUserThreads() > self._getParent().max_active_threads:
            return False

        return True
Example #14
    def checkForSubmission(self):
        """Check if this unit should submit a job"""

        # check the delay
        if time.time() < self.start_time:
            return False

        # check if we already have a job
        if len(self.active_job_ids) != 0:
            return False

        # if we're using threads, check the max number
        from Ganga.Core.GangaThread.WorkerThreads import getQueues
        if self._getParent().submit_with_threads and getQueues(
        ).totalNumUserThreads() > self._getParent().max_active_threads:
            return False

        return True
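
checkForSubmission throttles submission by comparing the number of user threads on the queues with the transform's max_active_threads. A minimal stand-alone sketch of those three checks; get_active_threads() stands in for getQueues().totalNumUserThreads() and is not part of the original code:

def should_submit(start_time, active_job_ids, submit_with_threads,
                  max_active_threads, get_active_threads, now):
    """Return True only if the chain delay has passed, no job is already
    attached to the unit and the thread pool is not saturated."""
    if now < start_time:
        return False  # still inside the chain delay
    if active_job_ids:
        return False  # a job is already active for this unit
    if submit_with_threads and get_active_threads() > max_active_threads:
        return False  # too many queued submissions already
    return True

# Usage sketch
print(should_submit(0, [], True, 10, lambda: 3, now=100))   # True
print(should_submit(0, [], True, 10, lambda: 42, now=100))  # False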
Example #15
def postBootstrapHook():
    configDirac = Ganga.Utility.Config.getConfig('DIRAC')
    configOutput = Ganga.Utility.Config.getConfig('Output')
    configPoll = Ganga.Utility.Config.getConfig('PollThread')
    configProxy = Ganga.Utility.Config.getConfig('defaults_DiracProxy')

    configDirac.setSessionValue('DiracEnvJSON',
                                os.environ['GANGADIRACENVIRONMENT'])
    configDirac.setSessionValue('userVO', 'lhcb')
    configDirac.setSessionValue('allDiracSE', [
        'CERN-USER', 'CNAF-USER', 'GRIDKA-USER', 'IN2P3-USER', 'SARA-USER',
        'PIC-USER', 'RAL-USER'
    ])
    configDirac.setSessionValue('noInputDataBannedSites', [])
    configDirac.setSessionValue('RequireDefaultSE', False)
    configDirac.setSessionValue('proxyInitCmd', 'lhcb-proxy-init')
    configDirac.setSessionValue('proxyInfoCmd', 'lhcb-proxy-info')

    configOutput.setSessionValue('FailJobIfNoOutputMatched', 'False')

    configPoll.setSessionValue('autoCheckCredentials', False)

    configProxy.setSessionValue('group', 'lhcb_user')
    configProxy.setSessionValue('encodeDefaultProxyFileName', False)

    # This is being dropped from 6.1.0 due to causing some bug in loading large numbers of jobs
    #
    # This will be nice to re-add once there is lazy loading support passed to the display for the 'jobs' command 09/2015 rcurrie
    #
    #from Ganga.GPIDev.Lib.Registry.JobRegistry import config as display_config
    #display_config.setSessionValue( 'jobs_columns', ('fqid', 'status', 'name', 'subjobs', 'application', 'backend', 'backend.actualCE', 'backend.extraInfo', 'comment') )
    #display_config.setSessionValue( 'jobs_columns_functions', {'comment': 'lambda j: j.comment', 'backend.extraInfo': 'lambda j : j.backend.extraInfo ', 'subjobs': 'lambda j: len(j.subjobs)', 'backend.actualCE': 'lambda j:j.backend.actualCE', 'application': 'lambda j: j.application._name', 'backend': 'lambda j:j.backend._name'} )
    #display_config.setSessionValue('jobs_columns_width', {'fqid': 8, 'status': 10, 'name': 10, 'application': 15, 'backend.extraInfo': 30, 'subjobs': 8, 'backend.actualCE': 17, 'comment': 20, 'backend': 15} )

    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    queue = getQueues()
    if queue is not None:
        queue.add(updateCreds)
    else:
        updateCreds()
Example #16
def postBootstrapHook():
    configDirac = Ganga.Utility.Config.getConfig('DIRAC')
    configOutput = Ganga.Utility.Config.getConfig('Output')
    configPoll = Ganga.Utility.Config.getConfig('PollThread')
    configProxy = Ganga.Utility.Config.getConfig('defaults_DiracProxy')

    configDirac.setSessionValue('DiracEnvJSON', os.environ['GANGADIRACENVIRONMENT'])
    configDirac.setSessionValue('userVO', 'lhcb')
    configDirac.setSessionValue('allDiracSE', ['CERN-USER', 'CNAF-USER', 'GRIDKA-USER', 'IN2P3-USER', 'SARA-USER', 'PIC-USER', 'RAL-USER'])
    configDirac.setSessionValue('noInputDataBannedSites', [])
    configDirac.setSessionValue('RequireDefaultSE', False)
    configDirac.setSessionValue('proxyInitCmd', 'lhcb-proxy-init')
    configDirac.setSessionValue('proxyInfoCmd', 'lhcb-proxy-info')

    configOutput.setSessionValue('FailJobIfNoOutputMatched', 'False')

    configPoll.setSessionValue('autoCheckCredentials', False)

    configProxy.setSessionValue('group', 'lhcb_user')
    configProxy.setSessionValue('encodeDefaultProxyFileName', False)

# This is being dropped from 6.1.0 due to causing some bug in loading large numbers of jobs
#
# This will be nice to re-add once there is lazy loading support passed to the display for the 'jobs' command 09/2015 rcurrie
#
#from Ganga.GPIDev.Lib.Registry.JobRegistry import config as display_config
#display_config.setSessionValue( 'jobs_columns', ('fqid', 'status', 'name', 'subjobs', 'application', 'backend', 'backend.actualCE', 'backend.extraInfo', 'comment') )
#display_config.setSessionValue( 'jobs_columns_functions', {'comment': 'lambda j: j.comment', 'backend.extraInfo': 'lambda j : j.backend.extraInfo ', 'subjobs': 'lambda j: len(j.subjobs)', 'backend.actualCE': 'lambda j:j.backend.actualCE', 'application': 'lambda j: j.application._name', 'backend': 'lambda j:j.backend._name'} )
#display_config.setSessionValue('jobs_columns_width', {'fqid': 8, 'status': 10, 'name': 10, 'application': 15, 'backend.extraInfo': 30, 'subjobs': 8, 'backend.actualCE': 17, 'comment': 20, 'backend': 15} )

    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    queue = getQueues()
    if queue is not None:
        queue.add(updateCreds)
    else:
        updateCreds()
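
The hook ends with a "queue it if the queues exist, otherwise run it inline" fallback for updateCreds. A minimal sketch of that fallback in isolation; FakeQueue and the callables are stand-ins, with the queue assumed to expose an add() method as getQueues() does above:

def run_or_queue(task, get_queues):
    """Queue `task` when worker queues are available, else run it synchronously."""
    queue = get_queues()
    if queue is not None:
        queue.add(task)
        return 'queued'
    task()
    return 'ran inline'

class FakeQueue(object):
    def add(self, task):
        task()

print(run_or_queue(lambda: None, lambda: FakeQueue()))  # 'queued'
print(run_or_queue(lambda: None, lambda: None))         # 'ran inline'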
Example #17
    def update(self):
        """Update the unit and (re)submit jobs as required"""

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString( self, "Creating Job..." )
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString( self, "Attempting job submission with queues..." )
                    from Ganga.Core.GangaThread.WorkerThreads import getQueues
                    getQueues().add(j.submit)
                else:
                    addInfoString( self, "Attempting job submission..." )
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString( self, "Failed Job Submission")
                addInfoString( self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error("Too many resubmits (%i). Deactivating unit." % (
                        self.minor_resub_count + self.major_resub_count))
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % ( self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count)
                        addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count)
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString( self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString( self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
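
The failed/killed branch above applies three resubmission limits (overall, minor and major) before choosing between a minor and a major resubmit. A simplified stand-alone restatement of just the limit bookkeeping; it omits the checkMajorResubmit(job) test and uses plain arguments instead of the unit/transform attributes:

def resubmit_decision(minor_count, major_count, run_limit,
                      minor_run_limit, major_run_limit, rebroker_on_fail):
    """Return 'deactivate', 'major' or 'minor' following the limit checks in update()."""
    rebroker = False
    if minor_count + major_count > run_limit - 1:
        return 'deactivate'  # too many resubmits overall
    if minor_count > minor_run_limit - 1:
        if not rebroker_on_fail:
            return 'deactivate'  # too many minor resubmits
        rebroker = True  # switch to a major resubmit instead
    if major_count > major_run_limit - 1:
        return 'deactivate'  # too many major resubmits
    return 'major' if rebroker else 'minor'

print(resubmit_decision(1, 0, 8, 3, 3, False))  # 'minor'
print(resubmit_decision(3, 0, 8, 3, 3, True))   # 'major'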
Example #18
def GangaDiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """

    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    from Ganga.Core.GangaThread.WorkerThreads import getQueues

    for i in inputs:
        #logging.debug( "getting metadata: %s" % str(i.lfn) )
        getQueues().add(i.getReplicas)

    logger.info("Requesting LFN replica info")

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    all_lfns = [i.locations for i in inputs]
    while [] in all_lfns:
        import time
        time.sleep(0.5)
        all_lfns = [i.locations for i in inputs]

    logger.info("Got replicas")

    for i in inputs:
        file_replicas[i.lfn] = i.locations
        #logger.info( "%s" % str( i.accessURL() ) )

    logger.debug("found all replicas")

    super_dict = dict()
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for i in repz:
            # print i
            sitez.add(i)
        super_dict[lfn] = sitez

    allSubSets = []
    allChosenSets = {}

    logger.info("Determining overlap")

    import random
    for i in super_dict.keys():

        # Randomly select 2 SEs as the starting point for splitting jobs
        if len(super_dict[i]) > 2:
            req_sitez = set([])
            chosen = random.sample(super_dict[i], 2)
            for s in chosen:
                req_sitez.add(s)
        # Keep the 2 or fewer SEs as the SEs of choice
        else:
            req_sitez = set([])
            for s in super_dict[i]:
                req_sitez.add(s)

        allChosenSets[i] = req_sitez

    logger.debug("Found all SE in use")

    Tier1Sites = set([])

    for i in super_dict.keys():

        req_sitez = allChosenSets[i]
        _this_subset = []

        # Starting with i, populate subset with LFNs which have an
        # overlap of at least 2 SE

        for k in super_dict.keys():
            if req_sitez.issubset(super_dict[k]):
                if len(_this_subset) >= filesPerJob:
                    break
                _this_subset.append(str(k))
                super_dict.pop(k)

        if len(_this_subset) > 0:
            allSubSets.append(_this_subset)

    split_files = allSubSets

    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info( "Split Files: %s" % str(split_files) )

    for dataset in split_files:
        yield dataset
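
GangaDiracSplitter greedily builds each subset by picking a seed of (up to) two SEs for an LFN and pulling in every remaining LFN whose replica sites contain that seed, capped at filesPerJob. A minimal stand-alone sketch of that greedy grouping with made-up SE names; it is a simplification of the idea, not the exact splitter logic:

import random

def group_by_common_sites(lfn_sites, files_per_job, seed_size=2):
    """Greedily group LFNs whose replica sites all contain a randomly chosen seed."""
    remaining = dict(lfn_sites)
    subsets = []
    while remaining:
        lfn, sites = next(iter(remaining.items()))
        seed = set(random.sample(sorted(sites), min(seed_size, len(sites))))
        subset = []
        for other, other_sites in list(remaining.items()):
            if seed.issubset(other_sites):
                if len(subset) >= files_per_job:
                    break
                subset.append(other)
                del remaining[other]
        if subset:
            subsets.append(subset)
        else:
            del remaining[lfn]  # nothing matched this seed; drop the LFN to guarantee progress
    return subsets

print(group_by_common_sites({'/a': {'SE1', 'SE2'}, '/b': {'SE1', 'SE2', 'SE3'}, '/c': {'SE3'}},
                            files_per_job=2))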
Example #19
def diracAPI_async(cmd, timeout=120):
    '''
    Execute DIRAC API commands from within Ganga.
    '''
    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    return getQueues().add(execute(cmd, timeout=timeout))
Example #20
    def updateMonitoringInformation(_jobs):
        """Check the status of jobs and retrieve output sandboxes"""
        # Only those jobs in 'submitted' or 'running' are passed in here for checking.
        # If, however, they have already completed in Dirac they may have been put on the queue
        # for processing from last time. These should be put back on the queue without
        # querying Dirac again. Their signature is status = 'running' with job.backend.status
        # already set to Done or Failed etc.

        jobs = [stripProxy(j) for j in _jobs]

        logger = getLogger()

        # make sure proxy is valid
        if not _proxyValid():
            if DiracBase.dirac_monitoring_is_active:
                logger.warning('DIRAC monitoring inactive (no valid proxy found).')
                DiracBase.dirac_monitoring_is_active = False
            return
        else:
            DiracBase.dirac_monitoring_is_active = True

        # Remove from consideration any jobs already in the queue. Checking this non-persisted attribute
        # is better than querying the queue as we can't tell if a job has just been taken off the queue and is being processed.
        # Also, by not being persistent, this attribute automatically allows queued jobs from the last session to be considered
        # for requeueing
        interesting_jobs = [j for j in jobs if not j.been_queued]
        # Statuses that correspond to a Ganga 'completed' or 'failed' (see DiracCommands.status(id));
        # if the backend status is one of these then the job should be on the queue
        queueable_dirac_statuses = configDirac['queueable_dirac_statuses']

        monitor_jobs = [j for j in interesting_jobs if j.backend.status not in queueable_dirac_statuses]
        requeue_jobs = [j for j in interesting_jobs if j.backend.status in queueable_dirac_statuses]

        logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
        logger.debug('Monitor jobs    : ' + repr([j.fqid for j in monitor_jobs]))
        logger.debug('Requeue jobs    : ' + repr([j.fqid for j in requeue_jobs]))

        from Ganga.Core.GangaThread.WorkerThreads import getQueues

        from Ganga.Core import monitoring_component

        # requeue existing completed job
        for j in requeue_jobs:
            if j.been_queued:
                continue

            if monitoring_component:
                if monitoring_component.should_stop():
                    break
            getQueues()._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                       args=(j, queueable_dirac_statuses[j.backend.status]),
                                                       priority=5, name="Job %s Finalizing" % j.fqid)
            j.been_queued = True

        # now that can submit in non_blocking mode, can see jobs in submitting
        # that have yet to be assigned an id so ignore them
        # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE
        # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT
        # dirac_job_ids    = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ]
        # Correction this did become a problem for a crashed session during
        # submit, see #104454
        dead_jobs = (j for j in monitor_jobs if j.backend.id is None)
        for d in dead_jobs:
            d.updateStatus('failed')
            if d.master is not None:
                d.master.updateMasterJobStatus()

        ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None]
        dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None]

        logger.debug("GangaStatus: %s" % str(ganga_job_status))
        logger.debug("diracJobIDs: %s" % str(dirac_job_ids))

        if not dirac_job_ids:
            ## Nothing to do here, stop bugging DIRAC about it!
            ## Everything beyond this point in the function depends on having some ids; no ids means we can stop.
            return

        statusmapping = configDirac['statusmapping']

        result = execute('status(%s, %s)' %( str(dirac_job_ids), repr(statusmapping)))

        if len(result) != len(ganga_job_status):
            logger.warning('Dirac monitoring failed for %s, result = %s' % (
                str(dirac_job_ids), str(result)))
            return


        thread_handled_states = ['completed', 'failed']
        for job, state, old_state in zip(monitor_jobs, result, ganga_job_status):
            if monitoring_component:
                if monitoring_component.should_stop():
                    break

            if job.been_queued:
                continue

            job.backend.statusInfo = state[0]
            job.backend.status = state[1]
            job.backend.actualCE = state[2]
            updated_dirac_status = state[3]
            try:
                job.backend.extraInfo = state[4]
            except Exception as err:
                logger.debug("exception: %s" % str(err))
                pass
            logger.debug('Job status vector  : ' + job.fqid + ' : ' + repr(state))

            # Is this really catching a real problem?
            if job.status != old_state:
                logger.warning('User changed Ganga job status from %s -> %s' % (str(old_state), job.status))
                continue
            ####################

            if updated_dirac_status == job.status:
                continue

            if updated_dirac_status in thread_handled_states:
                if job.status != 'running':
                    DiracBase._getStateTime(job, 'running')
                    if job.status in ['removed', 'killed']:
                        continue
                    if (job.master and job.master.status in ['removed', 'killed']):
                        continue  # user changed it under us
                    job.updateStatus('running')
                    if job.master:
                        job.master.updateMasterJobStatus()

                if job.been_queued:
                    continue

                getQueues()._monitoring_threadpool.add_function(DiracBase.job_finalisation,
                                                           args=(job, updated_dirac_status),
                                                           priority=5, name="Job %s Finalizing" % job.fqid)
                job.been_queued = True

            else:
                DiracBase._getStateTime(job, updated_dirac_status)
                if job.status in ['removed', 'killed']:
                    continue
                if (job.master and job.master.status in ['removed', 'killed']):
                    continue  # user changed it under us
                job.updateStatus(updated_dirac_status)
                if job.master:
                    job.master.updateMasterJobStatus()
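
The monitoring loop first partitions the not-yet-queued jobs into those whose backend status still needs a DIRAC query (monitor_jobs) and those that only need finalisation (requeue_jobs). A minimal sketch of that partition with stand-in job objects carrying just the two attributes the split relies on:

def partition_jobs(jobs, queueable_dirac_statuses):
    """Split jobs into (monitor, requeue) by whether the backend status is already final."""
    interesting = [j for j in jobs if not j.been_queued]
    monitor = [j for j in interesting if j.backend_status not in queueable_dirac_statuses]
    requeue = [j for j in interesting if j.backend_status in queueable_dirac_statuses]
    return monitor, requeue

class FakeJob(object):
    def __init__(self, status, been_queued=False):
        self.backend_status = status
        self.been_queued = been_queued

monitor, requeue = partition_jobs(
    [FakeJob('Running'), FakeJob('Done'), FakeJob('Failed', been_queued=True)],
    queueable_dirac_statuses={'Done': 'completed', 'Failed': 'failed'})
print(len(monitor), len(requeue))  # 1 1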
Example #21
def diracAPI_async(cmd, timeout=120):
    '''
    Execute DIRAC API commands from within Ganga.
    '''
    from Ganga.Core.GangaThread.WorkerThreads import getQueues
    return getQueues().add(execute(cmd, timeout=timeout))
Example #22
    def master_updateMonitoringInformation(jobs):
        """ Update monitoring information for  jobs: jobs is a list of
        jobs  in   this  backend  which   require  monitoring  (either
        'submitted' or 'running' state).  The jobs list never contains
        the subjobs.

        The default implementation  iterates  over subjobs and calls
        updateMonitoringInformation().
        """

        from Ganga.Core import monitoring_component
        was_monitoring_running = monitoring_component and monitoring_component.isEnabled(False)

        logger.debug("Running Monitoring for Jobs: %s" % [j.getFQID('.') for j in jobs])

        ## Only process 10 files from the backend at once
        #blocks_of_size = 10
        poll_config = getConfig('PollThread')
        try:
            blocks_of_size = poll_config['numParallelJobs']
        except Exception as err:
            logger.debug("Problem with PollThread Config, defaulting to block size of 5 in master_updateMon...")
            logger.debug("Error: %s" % err)
            blocks_of_size = 5
        ## Separate different backends implicitly
        simple_jobs = {}

        multiThreadMon = poll_config['enable_multiThreadMon']

        # FIXME Add some check for (sub)jobs which are in a transient state but
        # are not locked by an active session of ganga

        queues = getQueues()

        for j in jobs:
            ## All subjobs should have same backend
            if len(j.subjobs) > 0:
                #logger.info("Looking for sj")
                monitorable_subjob_ids = []

                if isType(j.subjobs, SubJobXMLList):
                    cache = j.subjobs.getAllCachedData()
                    for sj_id in range(0,len(j.subjobs)):
                        if cache[sj_id]['status'] in ['submitted', 'running']:
                            if j.subjobs.isLoaded(sj_id):
                                ## SJ may have changed from cache in memory
                                this_sj = j.subjobs(sj_id)
                                if this_sj.status in ['submitted', 'running']:
                                    monitorable_subjob_ids.append(sj_id)
                            else:
                                monitorable_subjob_ids.append(sj_id)
                else:
                    for sj in j.subjobs:
                        if sj.status in ['submitted', 'running']:
                            monitorable_subjob_ids.append(sj.id)

                #logger.info('Monitoring subjobs: %s', monitorable_subjob_ids)

                if not monitorable_subjob_ids:
                    continue

                #logger.info("Dividing")

                monitorable_blocks = []
                temp_block = []

                for this_sj_id in monitorable_subjob_ids:
                    temp_block.append(this_sj_id)
                    if len(temp_block) == blocks_of_size:
                        monitorable_blocks.append(temp_block)
                        temp_block = []

                if temp_block:
                    monitorable_blocks.append(temp_block)
                    temp_block = []

                for this_block in monitorable_blocks:

                    # If the monitoring function was running at the start of the function but has since stopped, break.
                    if was_monitoring_running and monitoring_component and not monitoring_component.isEnabled(False) or not monitoring_component:
                        break

                    try:
                        subjobs_to_monitor = []
                        for sj_id in this_block:
                            subjobs_to_monitor.append(j.subjobs[sj_id])
                        if multiThreadMon:
                            if queues.totalNumIntThreads() < getConfig("Queues")['NumWorkerThreads']:
                                queues._addSystem(j.backend.updateMonitoringInformation, args=(subjobs_to_monitor,), name="Backend Monitor")
                        else:
                            j.backend.updateMonitoringInformation(subjobs_to_monitor)
                    except Exception as err:
                        logger.error("Monitoring Error: %s" % err)

                j.updateMasterJobStatus()

            else:
                backend_name = getName(j.backend)
                if backend_name not in simple_jobs:
                    simple_jobs[backend_name] = []
                simple_jobs[backend_name].append(j)

        if len(simple_jobs) > 0:
            for this_backend in simple_jobs.keys():
                logger.debug('Monitoring jobs: %s', repr([jj._repr() for jj in simple_jobs[this_backend]]))
                if multiThreadMon:
                    if queues.totalNumIntThreads() < getConfig("Queues")['NumWorkerThreads']:
                        queues._addSystem(stripProxy(simple_jobs[this_backend][0].backend).updateMonitoringInformation,
                                          args=(simple_jobs[this_backend],), name="Backend Monitor")
                else:
                    stripProxy(simple_jobs[this_backend][0].backend).updateMonitoringInformation(simple_jobs[this_backend])

        logger.debug("Finished Monitoring request")

        if not multiThreadMon:
            return

        loop = True
        while loop:
            for stat in queues._monitoring_threadpool.worker_status():
                loop = False
                if stat[0] is not None and stat[0].startswith("Backend Monitor"):
                    loop = True
                    break
            time.sleep(1.)
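
master_updateMonitoringInformation chops the monitorable subjob ids into blocks of blocks_of_size before handing each block to the backend monitoring call. A minimal sketch of just that chunking step, mirroring the temp_block/monitorable_blocks loop above:

def chunk_ids(ids, block_size):
    """Split a list of subjob ids into blocks of at most block_size."""
    blocks = []
    block = []
    for this_id in ids:
        block.append(this_id)
        if len(block) == block_size:
            blocks.append(block)
            block = []
    if block:
        blocks.append(block)  # keep the final, partially filled block
    return blocks

print(chunk_ids(list(range(7)), 3))  # [[0, 1, 2], [3, 4, 5], [6]]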
Example #23
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False, parallel_submit=False):
        """  Submit   the  master  job  and  all   its  subjobs.   The
        masterjobconfig  is  shared,  individual  subjob  configs  are
        defined  in  subjobconfigs.   Submission  of  individual  jobs
        (not-split) also  always goes via  this method.  In  that case
        the subjobconfigs contains just one element - the job itself.

        The default  implementation of  this method emulates  the bulk
        submission  calling  a submit()  method  on individual  subjob
        objects.  If submission  of any of the subjobs  fails then the
        whole   process  is  aborted   with  IncompleteJobSubmissionError
        exception. The subjobs which  have already been submitted stay
        submitted.

        The default implementation does not process the masterjobconfig.
        Therefore this method may be overridden in the derived class
        in the following way:

        def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
           ... 
           do_some_processing_of(masterjobconfig)
           ...
           return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_going)


        Implementation note: we set keep_going to be optional in the
        signature of IBackend.master_submit() to allow the existing
        backend implementations which do not support keep_going=True
        and which at some point may call IBackend.master_submit() to
        work without change. It may sometimes be non-trivial to enable
        support for keep_going=True in some backends, even if they
        finally call IBackend.master_submit(). Therefore it is left to
        the decision of the backend developer to explicitly enable
        support for the keep_going flag.

        """
        from Ganga.Utility.logging import log_user_exception

        logger.debug("SubJobConfigs: %s" % len(subjobconfigs))
        logger.debug("rjobs: %s" % len(rjobs))
        assert(implies(rjobs, len(subjobconfigs) == len(rjobs)))

        incomplete = 0
        incomplete_subjobs = []

        def handleError(x):
            if keep_going:
                incomplete_subjobs.append(fqid)
                return False
            else:
                if incomplete:
                    raise x
                else:
                    return True

        master_input_sandbox = self.master_prepare(masterjobconfig)
        # Shall we submit in parallel
        if parallel_submit:

            from Ganga.Core.GangaThread.WorkerThreads import getQueues

            threads_before = getQueues().totalNumIntThreads()

            for sc, sj in zip(subjobconfigs, rjobs):

                b = sj.backend

                # Must check for credentials here as we cannot handle missing credentials on Queues by design!
                if hasattr(b, 'credential_requirements') and b.credential_requirements is not None:
                    from Ganga.GPIDev.Credentials.CredentialStore import credential_store
                    try:
                        cred = credential_store[b.credential_requirements]
                    except GangaKeyError:
                        credential_store.create(b.credential_requirements)

                fqid = sj.getFQID('.')
                # FIXME would be nice to move this to the internal threads not user ones
                getQueues()._monitoring_threadpool.add_function(self._parallel_submit, (b, sj, sc, master_input_sandbox, fqid, logger), callback_func = self._successfulSubmit, callback_args = (sj, incomplete_subjobs))

            def subjob_status_check(rjobs):
                has_submitted = True
                for sj in rjobs:
                    if sj.status not in ["submitted","failed","completed","running","completing"] and sj.getFQID('.') not in incomplete_subjobs:
                        has_submitted = False
                        break
                return has_submitted

            while not subjob_status_check(rjobs):
                import time
                time.sleep(1.)

            if incomplete_subjobs:
                raise IncompleteJobSubmissionError(
                    incomplete_subjobs, 'submission failed for subjobs %s' % incomplete_subjobs)
            return 1

        # Alternatively submit sequentially
        for sc, sj in zip(subjobconfigs, rjobs):

            fqid = sj.getFQID('.')
            logger.info("submitting job %s to %s backend", fqid, getName(sj.backend))
            try:
                b = stripProxy(sj.backend)
                sj.updateStatus('submitting')
                if b.submit(sc, master_input_sandbox):
                    sj.updateStatus('submitted')
                    # sj._commit() # PENDING: TEMPORARY DISABLED
                    incomplete = 1
                    stripProxy(sj.info).increment()
                else:
                    if handleError(IncompleteJobSubmissionError(fqid, 'submission failed')):
                        raise IncompleteJobSubmissionError(fqid, 'submission failed')
            except Exception as x:
                sj.updateStatus('new')
                if isType(x, GangaException):
                    logger.error("%s" % x)
                    log_user_exception(logger, debug=True)
                else:
                    log_user_exception(logger, debug=False)
                raise IncompleteJobSubmissionError(fqid, 'submission failed')

        return 1
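
The parallel branch of master_submit queues every subjob submission and then polls subjob_status_check until each subjob has either settled or been recorded as incomplete. A minimal sketch of that polling loop with stand-in state sources; get_states() and the sample states are assumptions, not the Ganga job API:

import time

SETTLED = ('submitted', 'failed', 'completed', 'running', 'completing')

def wait_until_settled(get_states, incomplete, poll=0.1):
    """Poll get_states() until every subjob is settled or flagged incomplete."""
    while True:
        states = get_states()
        if all(s in SETTLED or sjid in incomplete for sjid, s in states.items()):
            return states
        time.sleep(poll)

# Usage sketch: the state flips to 'submitted' after a couple of polls.
calls = {'n': 0}
def fake_states():
    calls['n'] += 1
    return {'0.0': 'submitted' if calls['n'] > 2 else 'submitting'}

print(wait_until_settled(fake_states, incomplete=set()))  # {'0.0': 'submitted'}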
Example #24
    def update(self):
        """Update the unit and (re)submit jobs as required"""

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString(self, "Creating Job...")
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString(self,
                                  "Attempting job submission with queues...")
                    from Ganga.Core.GangaThread.WorkerThreads import getQueues
                    getQueues().add(j.submit)
                else:
                    addInfoString(self, "Attempting job submission...")
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString(self, "Failed Job Submission")
                addInfoString(self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error(
                        "Too many resubmits (%i). Deactivating unit." %
                        (self.minor_resub_count + self.major_resub_count))
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit."
                            % self.minor_resub_count)
                        addInfoString(
                            self,
                            "Deactivating unit. Too many resubmits (%i)" %
                            (self.minor_resub_count + self.major_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." %
                        self.major_resub_count)
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString(self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString(self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
Example #25
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig, keep_going=False, parallel_submit=False):
        """  Submit   the  master  job  and  all   its  subjobs.   The
        masterjobconfig  is  shared,  individual  subjob  configs  are
        defined  in  subjobconfigs.   Submission  of  individual  jobs
        (not-split) also  always goes via  this method.  In  that case
        the subjobconfigs contains just one element - the job itself.

        The default  implementation of  this method emulates  the bulk
        submission  calling  a submit()  method  on individual  subjob
        objects.  If submission  of any of the subjobs  fails then the
        whole   process  is  aborted   with  IncompleteJobSubmissionError
        exception. The subjobs which  have already been submitted stay
        submitted.

        The default implementation does not process the masterjobconfig.
        Therefore this method may be overridden in the derived class
        in the following way:

        def master_submit(self,masterjobconfig,subjobconfigs,keep_going):
           ... 
           do_some_processing_of(masterjobconfig)
           ...
           return IBackend.master_submit(self,subjobconfigs,masterjobconfig,keep_going)


        Implementation note: we set keep_going to be optional in the
        signature of IBackend.master_submit() to allow the existing
        backend implementations which do not support keep_going=True
        and which at some point may call IBackend.master_submit() to
        work without change. It may sometimes be non-trivial to enable
        support for keep_going=True in some backends, even if they
        finally call IBackend.master_submit(). Therefore it is left to
        the decision of the backend developer to explicitly enable
        support for the keep_going flag.

        """
        from Ganga.Core import IncompleteJobSubmissionError, GangaException
        from Ganga.Utility.logging import log_user_exception

        logger.debug("SubJobConfigs: %s" % len(subjobconfigs))
        logger.debug("rjobs: %s" % len(rjobs))
        assert implies(rjobs, len(subjobconfigs) == len(rjobs))

        incomplete = 0
        incomplete_subjobs = []

        def handleError(x):
            if keep_going:
                incomplete_subjobs.append(fqid)
                return False
            else:
                if incomplete:
                    raise x
                else:
                    return True

        master_input_sandbox = self.master_prepare(masterjobconfig)

        if parallel_submit:

            from Ganga.Core.GangaThread.WorkerThreads import getQueues

            threads_before = getQueues().totalNumIntThreads()

            for sc, sj in zip(subjobconfigs, rjobs):

                fqid = sj.getFQID(".")
                b = sj.backend
                # FIXME would be nice to move this to the internal threads not user ones
                # from Ganga.GPIDev.Base.Proxy import stripProxy
                getQueues()._monitoring_threadpool.add_function(
                    self._parallel_submit, (b, sj, sc, master_input_sandbox, fqid, logger)
                )

            def subjob_status_check(rjobs):
                has_submitted = True
                for sj in rjobs:
                    if sj.status not in ["submitted", "failed", "completed", "running", "completing"]:
                        has_submitted = False
                        break
                return has_submitted

            # Poll until every subjob has reached a post-submission state.
            import time
            while not subjob_status_check(rjobs):
                time.sleep(1.0)

            for i in rjobs:
                if i.status in ["new", "failed"]:
                    return 0
            return 1

        for sc, sj in zip(subjobconfigs, rjobs):

            fqid = sj.getFQID(".")
            logger.info("submitting job %s to %s backend", fqid, getName(sj.backend))
            try:
                b = stripProxy(sj.backend)
                sj.updateStatus("submitting")
                if b.submit(sc, master_input_sandbox):
                    sj.updateStatus("submitted")
                    # sj._commit() # PENDING: TEMPORARY DISABLED
                    incomplete = 1
                    stripProxy(sj.info).increment()
                else:
                    if handleError(IncompleteJobSubmissionError(fqid, "submission failed")):
                        return 0
            except Exception as x:
                # sj.updateStatus('new')
                if isType(x, GangaException):
                    logger.error("%s" % x)
                    log_user_exception(logger, debug=True)
                else:
                    log_user_exception(logger, debug=False)
                if handleError(IncompleteJobSubmissionError(fqid, str(x))):
                    return 0

        if incomplete_subjobs:
            raise IncompleteJobSubmissionError(incomplete_subjobs, "submission failed")

        return 1
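
The docstring above describes how a derived backend can pre-process the masterjobconfig before falling back to the default bulk emulation. A minimal, self-contained sketch of that override pattern follows; BaseBackend stands in for IBackend, and do_some_processing_of() is a placeholder rather than a real Ganga API.

# Hypothetical sketch of the override pattern from the docstring above.
class BaseBackend(object):

    def master_submit(self, rjobs, subjobconfigs, masterjobconfig,
                      keep_going=False, parallel_submit=False):
        # Stand-in for the bulk-emulation loop shown above.
        print("bulk submitting %d subjob(s)" % len(rjobs))
        return 1


def do_some_processing_of(masterjobconfig):
    # Placeholder for backend-specific handling of the shared master config.
    print("processing master config: %s" % masterjobconfig)


class MyBackend(BaseBackend):

    def master_submit(self, rjobs, subjobconfigs, masterjobconfig,
                      keep_going=False, parallel_submit=False):
        do_some_processing_of(masterjobconfig)
        # Delegate to the default implementation, which (in the real IBackend)
        # submits each subjob in turn and raises IncompleteJobSubmissionError
        # on failure unless keep_going is True.
        return BaseBackend.master_submit(self, rjobs, subjobconfigs,
                                         masterjobconfig, keep_going)


# Usage:
# MyBackend().master_submit(['sj0'], ['cfg0'], 'master-cfg')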
def calculateSiteSEMapping(file_replicas, uniqueSE, site_to_SE_mapping, SE_to_site_mapping, bannedSites, ignoremissing):
    """
    If uniqueSE:
        This constructs 2 dicts which allow mapping between SEs and sites via a key/value lookup.
        The information is gathered by looping through the LFN replicas at the given sites and SEs.
    else:
        Don't construct a site<->SE mapping as it's not needed

    This returns a dict of the sites at which each LFN is accessible.

    Args:
        file_replicas (dict): This is the dictionary of LFN replicas with LFN as the key
        site_to_SE_mapping (dict): Dict which has sites as keys and SE as values
        SE_to_site_mapping (dict): Dict which has sites as values and SE as keys
        bannedSites (list) : List which has the sites banned by the job
        ignoremissing (bool) : Bool for whether to continue if an LFN has no available SEs

    Returns:
        site_dict (dict): Dict of {'LFN':set([sites]), ...}
    """

    SE_dict = dict()
    maps_size = 0
    found = []

    logger.info("Calculating site<->SE Mapping")

    # First find the SE for each site
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        if uniqueSE:
            for replica in repz:
                sitez.add(replica)
                if replica not in found:
                    getQueues()._monitoring_threadpool.add_function(addToMapping, (str(replica), site_to_SE_mapping))

                    maps_size = maps_size + 1
                    found.append(replica)

        SE_dict[lfn] = sitez

    # Doing this in parallel so wait for it to finish
    while len(site_to_SE_mapping) != maps_size:
        time.sleep(0.1)

    # Drop banned sites; iterate over copies since we modify the lists and the
    # dict while filtering.
    for iSE in list(site_to_SE_mapping.keys()):
        for site in list(site_to_SE_mapping[iSE]):
            if site in bannedSites:
                site_to_SE_mapping[iSE].remove(site)
        if not site_to_SE_mapping[iSE]:
            del site_to_SE_mapping[iSE]

    if uniqueSE:
        # Now calculate the 'inverse' dictionary of site for each SE
        for _SE, _sites in site_to_SE_mapping.iteritems():
            for site_i in _sites:
                if site_i not in SE_to_site_mapping:
                    SE_to_site_mapping[site_i] = set([])
                if _SE not in SE_to_site_mapping[site_i]:
                    SE_to_site_mapping[site_i].add(_SE)

    # These can be used to select the sites which know of a given SE,
    # or vice versa

    # Now let's generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict = {}
    for _lfn, sites in SE_dict.iteritems():
        site_dict[_lfn] = set([])
        for _site in sites:
            if _site in site_to_SE_mapping:
                for _SE in site_to_SE_mapping[_site]:
                    site_dict[_lfn].add(_SE)
        if site_dict[_lfn] == set([]) and not ignoremissing:
            raise SplitterError('LFN %s has no site available and ignoremissing = false! Perhaps you have banned too many sites.' % str(_lfn))
        elif site_dict[_lfn] == set([]) and ignoremissing:
            logger.warning('LFN %s has no site available and ignoremissing = true! Removing this LFN from the dataset!' % str(_lfn))
            del site_dict[_lfn]

    if site_dict == {}:
        raise SplitterError('There are no LFNs in the dataset - perhaps you have banned too many sites.')
    else:
        return site_dict
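
Stripped of the DIRAC calls and the worker-thread bookkeeping, the core of the function above is: drop banned locations, then keep each LFN only if it still has at least one usable location, honouring ignoremissing. A simplified, self-contained sketch with made-up LFN and SE names follows; it deliberately collapses the site/SE distinction and raises a plain RuntimeError where the real code raises SplitterError.

# Toy data; purely illustrative, no DIRAC catalogue lookups.
file_replicas = {
    'LFN:/toy/file_A': ['SE.ONE', 'SE.TWO'],
    'LFN:/toy/file_B': ['SE.TWO'],
}
banned = ['SE.TWO']
ignoremissing = True

site_dict = {}
for lfn, replicas in file_replicas.items():
    allowed = set(se for se in replicas if se not in banned)
    if not allowed and not ignoremissing:
        # Stand-in for SplitterError in the real code.
        raise RuntimeError('LFN %s has no location available' % lfn)
    elif not allowed:
        # With ignoremissing the LFN is simply dropped from the dataset.
        continue
    site_dict[lfn] = allowed

# site_dict now only contains file_A, mapped to set(['SE.ONE'])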
def calculateSiteSEMapping(file_replicas, uniqueSE, site_to_SE_mapping, SE_to_site_mapping):
    """
    If uniqueSE:
        This constructs 2 dicts which allow mapping between SEs and sites via a key/value lookup.
        The information is gathered by looping through the LFN replicas at the given sites and SEs.
    else:
        Don't construct a site<->SE mapping as it's not needed

    This returns a dict of the sites at which each LFN is accessible.

    Args:
        file_replicas (dict): This is the dictionary of LFN replicas with LFN as the key
        site_to_SE_mapping (dict): Dict which has sites as keys and SE as values
        SE_to_site_mapping (dict): Dict which has sites as values and SE as keys

    Returns:
        site_dict (dict): Dict of {'LFN':set([sites]), ...}
    """

    SE_dict = dict()
    maps_size = 0
    found = []

    logger.info("Calculating site<->SE Mapping")

    # First find the SE for each site
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        if uniqueSE:
            for replica in repz:
                sitez.add(replica)
                if replica not in found:

                    getQueues()._monitoring_threadpool.add_function(addToMapping, (str(replica), site_to_SE_mapping))

                    maps_size = maps_size + 1
                    found.append(replica)

        SE_dict[lfn] = sitez

    # Doing this in parallel so wait for it to finish
    while len(site_to_SE_mapping) != maps_size:
        time.sleep(0.1)

    if uniqueSE:
        # Now calculate the 'inverse' dictionary of site for each SE
        for _SE, _sites in site_to_SE_mapping.iteritems():
            for site_i in _sites:
                if site_i not in SE_to_site_mapping:
                    SE_to_site_mapping[site_i] = set([])
                if _SE not in SE_to_site_mapping[site_i]:
                    SE_to_site_mapping[site_i].add(_SE)

    # These can be used to select the sites which know of a given SE,
    # or vice versa

    # Now let's generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict = {}
    for _lfn, sites in SE_dict.iteritems():
        site_dict[_lfn] = set([])
        for _site in sites:
            for _SE in site_to_SE_mapping[_site]:
                site_dict[_lfn].add(_SE)

    return site_dict
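
Both versions share the 'inverse dictionary' step. Viewed on its own it is simply the inversion of a mapping of key -> set(values) into value -> set(keys); a standalone sketch with illustrative names:

# Invert key -> set(values) into value -> set(keys); names are illustrative.
forward = {
    'CERN-DST': set(['LCG.CERN.ch']),
    'RAL-DST': set(['LCG.RAL.uk', 'LCG.CERN.ch']),
}

inverse = {}
for key, values in forward.items():
    for value in values:
        inverse.setdefault(value, set()).add(key)

# inverse == {'LCG.CERN.ch': set(['CERN-DST', 'RAL-DST']),
#             'LCG.RAL.uk': set(['RAL-DST'])}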
Example #28
0
def GangaDiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs.
    """

    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata"
        )

    file_replicas = {}

    from Ganga.Core.GangaThread.WorkerThreads import getQueues

    for i in inputs:
        #logging.debug( "getting metadata: %s" % str(i.lfn) )
        getQueues().add(i.getReplicas)

    logger.info("Requesting LFN replica info")

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    all_lfns = [i.locations for i in inputs]
    import time
    while [] in all_lfns:
        time.sleep(0.5)
        all_lfns = [i.locations for i in inputs]

    logger.info("Got replicas")

    for i in inputs:
        file_replicas[i.lfn] = i.locations
        #logger.info( "%s" % str( i.accessURL() ) )

    logger.debug("found all replicas")

    super_dict = dict()
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for i in repz:
            # print i
            sitez.add(i)
        super_dict[lfn] = sitez

    allSubSets = []
    allChosenSets = {}

    logger.info("Determining overlap")

    import random
    for i in super_dict.keys():

        # Randomly select 2 SEs as the starting point for splitting jobs
        if len(super_dict[i]) > 2:
            req_sitez = set([])
            chosen = random.sample(super_dict[i], 2)
            for s in chosen:
                req_sitez.add(s)
        # Keep the 2 or fewer SEs as the SEs of choice
        else:
            req_sitez = set([])
            for s in super_dict[i]:
                req_sitez.add(s)

        allChosenSets[i] = req_sitez

    logger.debug("Found all SE in use")

    Tier1Sites = set([])

    for i in super_dict.keys():

        req_sitez = allChosenSets[i]
        _this_subset = []

        # Starting with i, populate subset with LFNs which have an
        # overlap of at least 2 SE

        for k in super_dict.keys():
            if req_sitez.issubset(super_dict[k]):
                if len(_this_subset) >= filesPerJob:
                    break
                _this_subset.append(str(k))
                super_dict.pop(k)

        if len(_this_subset) > 0:
            allSubSets.append(_this_subset)

    split_files = allSubSets

    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info( "Split Files: %s" % str(split_files) )

    for dataset in split_files:
        yield dataset
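
The grouping stage of the splitter above can be read in isolation: for each LFN pick (up to) two of its SEs, then greedily pack LFNs whose replica sets contain that pair into subsets of at most filesPerJob. A simplified, self-contained sketch with toy replica data follows; it tidies up the bookkeeping but keeps the same greedy idea.

import random

# Toy replica table: LFN -> set of SEs holding a replica.  Purely illustrative.
replicas = {
    'lfn_1': set(['SE.A', 'SE.B', 'SE.C']),
    'lfn_2': set(['SE.A', 'SE.B']),
    'lfn_3': set(['SE.B', 'SE.C']),
    'lfn_4': set(['SE.A', 'SE.B', 'SE.C']),
}
filesPerJob = 2

# For each LFN pick (up to) two of its SEs as the required overlap.
chosen = {}
for lfn, ses in replicas.items():
    chosen[lfn] = set(random.sample(list(ses), min(2, len(ses))))

# Greedily pack LFNs whose replica set contains the chosen pair.
subsets = []
remaining = dict(replicas)
for lfn in list(replicas.keys()):
    if lfn not in remaining:
        continue
    required = chosen[lfn]
    subset = []
    for other in list(remaining.keys()):
        if required.issubset(remaining[other]):
            if len(subset) >= filesPerJob:
                break
            subset.append(other)
            del remaining[other]
    if subset:
        subsets.append(subset)

print(subsets)  # e.g. [['lfn_1', 'lfn_2'], ['lfn_3'], ['lfn_4']]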