def getCondorStatusConstrained(collector_names, type_constraint, constraint=None,
                               format_list=None, subsystem_name=None):
    """Query a list of collectors and gather the matching classads.

    For each collector, the type_constraint is AND-ed with the optional
    extra constraint and the query is performed.  Collectors that fail
    to answer are logged and skipped; collectors that return no matching
    classads are omitted from the result.

    @param collector_names: iterable of collector pool names (None allowed)
    @param type_constraint: base constraint string selecting the classad type
    @param constraint: optional additional constraint, AND-ed in if not None
    @param format_list: attribute/format list passed to the query
    @param subsystem_name: subsystem to query

    @return: dict mapping collector name to its loaded CondorStatus object
    """
    results = {}
    for pool in collector_names:
        # Start from a copy of the type constraint; AND in the extra one
        query_constraint = type_constraint[0:]
        if constraint is not None:
            query_constraint = "(%s) && (%s)" % (query_constraint, constraint)

        try:
            pool_status = condorMonitor.CondorStatus(subsystem_name=subsystem_name,
                                                     pool_name=pool)
            pool_status.load(query_constraint, format_list)
        except condorMonitor.QueryError:
            if pool is not None:
                msg = "Condor Error. Failed to talk to collector %s: " % pool
            else:
                msg = "Condor Error. Failed to talk to collector: "
            logSupport.log.exception(msg)
            # If collector not found it is equivalent to no classads
            continue
        except RuntimeError:
            logSupport.log.exception("Runtime error. Failed to talk to collector: ")
            continue
        except Exception:
            logSupport.log.exception("Unknown error. Failed to talk to collector: ")
            continue

        # Keep only collectors that actually returned classads
        if len(pool_status.fetchStored()) > 0:
            results[pool] = pool_status

    return results
# Example #2
def get_globals_classads(factory_collector=glideFactoryInterface.DEFAULT_VAL):
    """Fetch all glideclientglobal classads from the factory collector.

    @param factory_collector: collector to query; the DEFAULT_VAL sentinel
        resolves it from the global factory configuration

    @return: dictionary of stored classads, as returned by fetchStored()
    """
    if factory_collector == glideFactoryInterface.DEFAULT_VAL:
        factory_collector = glideFactoryInterface.factoryConfig.factory_collector

    globals_constraint = '(GlideinMyType=?="glideclientglobal")'

    status = condorMonitor.CondorStatus("any", pool_name=factory_collector)
    # important, this dictates what gets submitted
    status.require_integrity(True)
    status.load(globals_constraint)
    return status.fetchStored()
# Example #3
def getMonitorVMStatus(pool_name, monitorVM):
    """Return the classad of the named monitoring slot.

    @param pool_name: collector pool to query
    @param monitorVM: name of the monitoring slot

    @raise RuntimeError: if the slot is not found in the pool
    @return: dictionary of the slot's attributes
    """
    monitor_attrs = [('IS_MONITOR_VM', 'b'),
                     ('HAS_MONITOR_VM', 'b'), ('State', 's'),
                     ('Activity', 's'), ('vm2_State', 's'),
                     ('vm2_Activity', 's'),
                     ('GLEXEC_STARTER', 'b'),
                     ('USES_MONITOR_STARTD', 'b'),
                     ('GLEXEC_JOB', 'b')]
    collector = condorMonitor.CondorStatus(pool_name=pool_name)
    slots = collector.fetch(constraint='(Name=="%s")' % monitorVM,
                            format_list=monitor_attrs)
    if monitorVM not in slots:
        raise RuntimeError("Monitor slot %s does not exist!" % monitorVM)

    return slots[monitorVM]
def get_production_ress_entries(server, ref_dict_list):
    """Return the entry names whose reference is a ReSS resource in Production.

    Queries the ReSS collector for resources with a defined contact string
    and a Production status, then selects the entries whose 'ref' matches
    one of those resources.

    @param server: name of the ReSS collector pool to query
    @param ref_dict_list: list of dicts, each with 'ref' and 'entry_name' keys

    @return: list of entry names whose 'ref' is currently in Production
    """
    condor_obj = condorMonitor.CondorStatus(pool_name=server)
    condor_obj.load(
        constraint=
        '(GlueCEInfoContactString=!=UNDEFINED)&&(GlueCEStateStatus=?="Production")',
        format_list=[])
    # frozenset gives O(1) membership tests; .keys() is a plain list in
    # Python 2, which would make the loop below O(n*m)
    condor_refs = frozenset(condor_obj.fetchStored().keys())

    return [el['entry_name'] for el in ref_dict_list
            if el['ref'] in condor_refs]
# Example #5
  def get_ress_data(self):
    """Query the ReSS collector and return the stored classad data.

    Validates the configured ress_host URL, sets the condor binary paths,
    then loads classads matching the VO constraint from the ReSS host.

    NOTE(review): common.logerr presumably aborts or raises on error; if it
    ever returned normally, condor_data below could be unbound -- confirm.
    """
    common.logit("ReSS host: %s" % self.glidein.ress_host())
    #-- validate host ---
    if not common.url_is_valid(self.glidein.ress_host()):
      common.logerr("ReSS server (%s) in ress_host option is not valid or inaccssible." % self.glidein.ress_host())

    condor_sbin = "%s/sbin" % self.wms.condor_location()
    condor_bin  = "%s/bin" % self.wms.condor_location()
    condorExe.set_path(condor_bin, condor_sbin)
    #-- get gatekeeper data from ReSS --
    common.logit("Supported VOs: %s" % self.glidein.entry_vos())
    constraint = self.glidein.ress_vo_constraint()
    common.logit("Constraints: %s" % constraint)
    condor_obj=condorMonitor.CondorStatus(pool_name=self.glidein.ress_host())
    try:
      condor_obj.load(constraint=constraint)
      condor_data=condor_obj.fetchStored()
    except Exception as e:
      common.logerr(e)
    del condor_obj
    return condor_data
# Example #6
def getMonitorVM(pool_name, jobVM):
    """Return the name of the monitoring slot associated with a job slot.

    Looks up the job slot in the pool and validates that it supports
    monitoring and is not itself a monitoring slot.

    @param pool_name: collector pool to query
    @param jobVM: name of the slot the job claims to run on

    @raise RuntimeError: if the slot is missing or fails any monitoring check
    @return: value of the slot's Monitoring_Name attribute
    """
    collector = condorMonitor.CondorStatus(pool_name=pool_name)
    slots = collector.fetch(constraint='(Name=="%s")' % jobVM,
                            format_list=[('IS_MONITOR_VM', 'b'),
                                         ('HAS_MONITOR_VM', 'b'),
                                         ('Monitoring_Name', 's')])
    if jobVM not in slots:
        raise RuntimeError("Job claims it runs on %s, but cannot find it!" %
                           jobVM)

    slot_ad = slots[jobVM]

    # Both monitoring attributes must be published at all
    if ('HAS_MONITOR_VM' not in slot_ad) or ('IS_MONITOR_VM' not in slot_ad):
        raise RuntimeError("Slot %s does not support monitoring!" % jobVM)
    # The job slot must advertise monitoring support...
    if slot_ad['HAS_MONITOR_VM'] != True:
        raise RuntimeError(
            "Slot %s does not support monitoring! HAS_MONITOR_VM not True." %
            jobVM)
    # ...and must not itself be the monitoring slot
    if slot_ad['IS_MONITOR_VM'] != False:
        raise RuntimeError(
            "Slot %s is a monitoring slot itself! Cannot monitor." % jobVM)
    if 'Monitoring_Name' not in slot_ad:
        raise RuntimeError("Slot %s does not publish the monitoring slot!" %
                           jobVM)

    return slot_ad['Monitoring_Name']
# Example #7
def findWork(factory_name, glidein_name, entry_name, supported_signtypes,
             pub_key_obj=None, additional_constraints=None,
             factory_collector=DEFAULT_VAL):
    """
    Find request classAds that have my (factory, glidein name, entry name) and create the dictionary of work request information.

    @type factory_name: string
    @param factory_name: name of the factory
    @type glidein_name: string
    @param glidein_name: name of the glidein instance
    @type entry_name: string
    @param entry_name: name of the factory entry
    @type supported_signtypes: list
    @param supported_signtypes: only support one kind of signtype, 'sha1', default is None
    @type pub_key_obj: string
    @param pub_key_obj: only support 'RSA'
    @type additional_constraints: string
    @param additional_constraints: any additional constraints to include for querying the WMS collector, default is None

    @type factory_collector: string or None
    @param factory_collector: the collector to query, special value 'default' will get it from the global config

    @return: dictionary, each key is the name of a frontend.  Each value has a 'requests' and a 'params' key.  Both refer to classAd dictionaries.
    """

    global factoryConfig
    logSupport.log.debug("Querying collector for requests")

    # Resolve the special 'default' collector from the global configuration
    if factory_collector==DEFAULT_VAL:
        factory_collector=factoryConfig.factory_collector

    status_constraint = '(GlideinMyType=?="%s") && (ReqGlidein=?="%s@%s@%s")' % (factoryConfig.client_id, entry_name, glidein_name, factory_name)

    # Only consider clients that advertise a signtype we can verify
    if supported_signtypes is not None:
        status_constraint += ' && stringListMember(%s%s,"%s")' % (factoryConfig.client_web_prefix, factoryConfig.client_web_signtype_suffix, string.join(supported_signtypes, ","))

    if additional_constraints is not None:
        status_constraint = "((%s)&&(%s))" % (status_constraint, additional_constraints)

    status = condorMonitor.CondorStatus(subsystem_name="any", pool_name=factory_collector)
    status.require_integrity(True) #important, this dictates what gets submitted
    status.glidein_name = glidein_name
    status.entry_name = entry_name

    # serialize access to the Collector accross all the processes
    # these is a single Collector anyhow
    lock_fname=os.path.join(factoryConfig.lock_dir, "gfi_status.lock")
    if not os.path.exists(lock_fname): #create a lock file if needed
        try:
            fd=open(lock_fname, "w")
            fd.close()
        except:
            # could be a race condition
            pass

    # Hold an exclusive flock around the query; always release and close
    fd=open(lock_fname, "r+")
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            status.load(status_constraint)
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        fd.close()

    data = status.fetchStored()

    # Attributes that are never copied into the per-frontend dictionaries
    reserved_names = ("ReqName", "ReqGlidein", "ClientName", "FrontendName", "GroupName", "ReqPubKeyID", "ReqEncKeyCode", "ReqEncIdentity", "AuthenticatedIdentity")

    out = {}

    # copy over requests and parameters
    for k in data.keys():
        kel = data[k]
        el = {"requests":{}, "web":{}, "params":{}, "params_decrypted":{}, "monitor":{}, "internals":{}}
        # Group prefixed attributes into their category, stripping the prefix
        for (key, prefix) in (("requests", factoryConfig.client_req_prefix),
                             ("web", factoryConfig.client_web_prefix),
                             ("params", factoryConfig.glidein_param_prefix),
                             ("monitor", factoryConfig.glidein_monitor_prefix)):
            plen = len(prefix)
            for attr in kel.keys():
                if attr in reserved_names:
                    continue # skip reserved names
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = kel[attr]
        # Recover the symmetric key used to decrypt the encrypted attributes
        if pub_key_obj is not None:
            if 'ReqPubKeyID' in kel:
                try:
                    sym_key_obj = pub_key_obj.extract_sym_key(kel['ReqEncKeyCode'])
                except:
                    continue # bad key, ignore entry
            else:
                sym_key_obj = None # no key used, will not decrypt
        else:
            sym_key_obj = None # have no key, will not decrypt

        if sym_key_obj is not None:
            # this is verifying that the identity that the client claims to be is the identity that Condor thinks it is
            try:
                enc_identity = sym_key_obj.decrypt_hex(kel['ReqEncIdentity'])
            except:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity, could not decode. Skipping for security reasons." % k)
                continue # corrupted classad
            if enc_identity != kel['AuthenticatedIdentity']:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity(%s!=%s). Skipping for security reasons." % (k, enc_identity, kel['AuthenticatedIdentity']))
                continue # uh oh... either the client is misconfigured, or someone is trying to cheat


        invalid_classad = False
        # Decrypt the encrypted parameters (single-element tuple kept for
        # symmetry with the plaintext-prefix loop above)
        for (key, prefix) in (("params_decrypted", factoryConfig.encrypted_param_prefix),):
            plen = len(prefix)
            for attr in kel.keys():
                if attr in reserved_names:
                    continue # skip reserved names
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = None # define it even if I don't understand the content
                    if sym_key_obj is not None:
                        try:
                            el[key][attr[plen:]] = sym_key_obj.decrypt_hex(kel[attr])
                        except:
                            invalid_classad = True
                            break  # I don't understand it -> invalid
        if invalid_classad:
            logSupport.log.warning("At least one of the encrypted parameters for client %s cannot be decoded. Skipping for security reasons." % k)
            continue  # need to go this way as I may have problems in an inner loop

        # Keep a few bookkeeping attributes verbatim under 'internals'
        for attr in kel.keys():
            if attr in ("ClientName", "FrontendName", "GroupName", "ReqName", "LastHeardFrom", "ReqPubKeyID", "AuthenticatedIdentity"):
                el["internals"][attr] = kel[attr]

        out[k] = el

    return out
# Example #8
def findGroupWork(factory_name, glidein_name, entry_names, supported_signtypes,
                  pub_key_obj=None, additional_constraints=None,
                  factory_collector=DEFAULT_VAL):
    """
    Find request classAds that have my (factory, glidein name, entries) and
    create the dictionary of dictionary of work request information.
    Example: work[entry_name][frontend] = {'params':'value', 'requests':'value}

    @type factory_name: string
    @param factory_name: name of the factory

    @type glidein_name: string
    @param glidein_name: name of the glidein instance

    @type entry_names: list
    @param entry_names: list of factory entry names

    @type supported_signtypes: list
    @param supported_signtypes: only support one kind of signtype, 'sha1', default is None

    @type pub_key_obj: string
    @param pub_key_obj: only support 'RSA', defaults to None

    @type additional_constraints: string
    @param additional_constraints: any additional constraints to include for querying the WMS collector, default is None

    @type factory_collector: string or None
    @param factory_collector: the collector to query, special value 'default' will get it from the global config

    @rtype: dict
    @return: Dictionary of work to perform. Return format is work[entry_name][frontend] = {'params':'value', 'requests':'value}
    """

    global factoryConfig

    # Resolve the special 'default' collector from the global configuration
    if factory_collector==DEFAULT_VAL:
        factory_collector=factoryConfig.factory_collector

    # Build a comma-separated list of entry@glidein@factory identifiers
    req_glideins = ''
    for entry in entry_names:
        req_glideins = '%s@%s@%s,%s' % (entry, glidein_name,
                                        factory_name, req_glideins)
    # Strip off leading & trailing comma
    req_glideins = req_glideins.strip(',')

    status_constraint='(GlideinMyType=?="%s") && (stringListMember(ReqGlidein,"%s")=?=True)' % (factoryConfig.client_id, req_glideins)

    # Only consider clients that advertise a signtype we can verify
    if (supported_signtypes is not None):
        status_constraint += ' && stringListMember(%s%s,"%s")' % \
            (factoryConfig.client_web_prefix,
             factoryConfig.client_web_signtype_suffix,
             string.join(supported_signtypes, ","))

    if (pub_key_obj is not None):
        # Get only classads that have my key or no key at all
        # Any other key will not work
        status_constraint += ' && (((ReqPubKeyID=?="%s") && (ReqEncKeyCode=!=Undefined) && (ReqEncIdentity=!=Undefined)) || (ReqPubKeyID=?=Undefined))' % pub_key_obj.get_pub_key_id()

    if (additional_constraints is not None):
        status_constraint = "(%s)&&(%s)" % (status_constraint,
                                            additional_constraints)

    status = condorMonitor.CondorStatus(subsystem_name="any", pool_name=factory_collector)
    # Important, this dictates what gets submitted
    status.require_integrity(True)
    status.glidein_name = glidein_name

    # Serialize access to the Collector accross all the processes
    # these is a single Collector anyhow
    lock_fname = os.path.join(factoryConfig.lock_dir, "gfi_status.lock")
    if not os.path.exists(lock_fname):
        # Create a lock file if needed
        try:
            fd = open(lock_fname, "w")
            fd.close()
        except:
            # could be a race condition
            pass

    # Hold an exclusive flock around the query; always release and close
    fd = open(lock_fname, "r+")

    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            status.load(status_constraint)
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        fd.close()

    data = status.fetchStored()

    # Attributes that are never copied into the per-frontend dictionaries
    reserved_names = ("ReqName", "ReqGlidein", "ClientName", "FrontendName",
                      "GroupName", "ReqPubKeyID", "ReqEncKeyCode",
                      "ReqEncIdentity", "AuthenticatedIdentity")

    # Output is now in the format of
    # out[entry_name][frontend]
    out = {}

    # Copy over requests and parameters

    for k in data:
        kel = data[k]
        el = {"requests":{}, "web":{}, "params":{},
              "params_decrypted":{}, "monitor":{}, "internals":{}}

        # Group prefixed attributes into their category, stripping the prefix
        for (key, prefix) in (("requests", factoryConfig.client_req_prefix),
                             ("web", factoryConfig.client_web_prefix),
                             ("params", factoryConfig.glidein_param_prefix),
                             ("monitor", factoryConfig.glidein_monitor_prefix)):
            plen = len(prefix)
            for attr in kel:
                if attr in reserved_names:
                    # Skip reserved names
                    continue
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = kel[attr]

        # sym_key_obj will stay None if
        # 1) extract_sym_key throws exception
        # 2) kel does not contain 'ReqPubKeyID'
        # 3) pub_key_obj is None and there is no key to decrypt
        sym_key_obj = None
        if (pub_key_obj is not None) and ('ReqPubKeyID' in kel):
            try:
                sym_key_obj = pub_key_obj.extract_sym_key(kel['ReqEncKeyCode'])
            except:
                continue

        if (sym_key_obj is not None):
            # Verify that the identity the client claims to be is the
            # identity that Condor thinks it is
            try:
                enc_identity = sym_key_obj.decrypt_hex(kel['ReqEncIdentity'])
            except:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity, could not decode. Skipping for security reasons." % k)
                continue # Corrupted classad
            if enc_identity != kel['AuthenticatedIdentity']:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity(%s!=%s). Skipping for security reasons." % (k, enc_identity,
                                            kel['AuthenticatedIdentity']))
                # Either the client is misconfigured or someone is cheating
                continue

        invalid_classad = False

        for (key, prefix) in (("params_decrypted", factoryConfig.encrypted_param_prefix),):
            # TODO: useless for, only one element
            plen = len(prefix)
            for attr in kel:
                if attr in reserved_names:
                    # Skip reserved names
                    continue
                if attr[:plen] == prefix:
                    # Define it even if I don't understand the content
                    el[key][attr[plen:]] = None
                    if sym_key_obj is not None :
                        try:
                            el[key][attr[plen:]] = sym_key_obj.decrypt_hex(kel[attr])
                        except:
                            # I don't understand it -> invalid
                            invalid_classad = True
                            break

        # Continue if I have problems in an inner loop
        if invalid_classad:
            logSupport.log.warning("At least one of the encrypted parameters for client %s cannot be decoded. Skipping for security reasons."%k)
            continue

        # Keep a few bookkeeping attributes verbatim under 'internals'
        for attr in kel:
            if attr in ("ClientName", "FrontendName", "GroupName", "ReqName",
                        "LastHeardFrom", "ReqPubKeyID", "AuthenticatedIdentity"):
                el["internals"][attr] = kel[attr]

        out[k] = el

    # Regroup the flat frontend dictionary by entry name
    return workGroupByEntries(out)
# Example #9
def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript,
          frontendDescript, entries, restart_attempts, restart_interval):
    """
    Spawn and keep track of the entry processes. Restart them if required.
    Advertise glidefactoryglobal classad every iteration

    @type sleep_time: long
    @param sleep_time: Delay between every iteration
    @type advertize_rate: long
    @param advertize_rate: Rate at which entries advertise their classads
    @type startup_dir: String
    @param startup_dir: Path to glideinsubmit directory
    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: Factory config's glidein description object
    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: Factory config's frontend description object
    @type entries: list
    @param entries: Sorted list of entry names
    @type restart_interval: long
    @param restart_interval: Allowed restart interval in second
    @type restart_attempts: long
    @param restart_attempts: Number of allowed restart attempts in the interval
    """

    global STARTUP_DIR
    # Maps group number -> subprocess.Popen handle of the entry group process
    childs = {}

    # Number of glideFactoryEntry processes to spawn and directly relates to
    # number of concurrent condor_status processess
    #
    # NOTE: If number of entries gets too big, we may excede the shell args
    #       limit. If that becomes an issue, move the logic to identify the
    #       entries to serve to the group itself.
    #
    # Each process will handle multiple entries split as follows
    #   - Sort the entries alphabetically. Already done
    #   - Divide the list into equal chunks as possible
    #   - Last chunk may get fewer entries
    entry_process_count = 1


    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    # After this time the old public key is removed (see the security
    # comment in the main loop below)
    oldkey_eoltime = starttime + oldkey_gracetime

    # Maps group number -> list of recent start times, used by
    # is_crashing_often to decide whether to give up on a group
    childs_uptime={}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Available Entries: %s" % entries)

    group_size = long(math.ceil(float(len(entries))/entry_process_count))
    entry_groups = entry_grouper(group_size, entries)

    def _set_rlimit(soft_l=None, hard_l=None):
        #set new hard and soft open file limits
        #if setting limits fails or no input parameters use inherited limits
        #from parent process 
        #nb 1.  it is possible to raise limits 
        #up to [hard_l,hard_l] but once lowered they cannot be raised
        #nb 2. it may be better just to omit calling this function at
        #all from subprocess - in which case it inherits limits from
        #parent process

        lim =  resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft_l is not None or hard_l is not None:
            if not hard_l:
                hard_l = soft_l
            if not soft_l:
                soft_l=hard_l
            try:    
                new_lim = [soft_l, hard_l]
                resource.setrlimit(resource.RLIMIT_NOFILE, new_lim)
            except:
                resource.setrlimit(resource.RLIMIT_NOFILE, lim)



    try:
        # Spawn one glideFactoryEntryGroup.py subprocess per entry group
        for group in range(len(entry_groups)):
            entry_names = string.join(entry_groups[group], ':')
            logSupport.log.info("Starting EntryGroup %s: %s" % \
                (group, entry_groups[group]))

            # Converted to using the subprocess module
            command_list = [sys.executable,
                            os.path.join(STARTUP_DIR,
                                         "glideFactoryEntryGroup.py"),
                            str(os.getpid()),
                            str(sleep_time),
                            str(advertize_rate),
                            startup_dir,
                            entry_names,
                            str(group)]
            childs[group] = subprocess.Popen(command_list, shell=False,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             close_fds=True,
                                             preexec_fn=_set_rlimit)

            # Get the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[group] = list()
            childs_uptime[group].insert(0, time.time())

        logSupport.log.info("EntryGroup startup times: %s" % childs_uptime)

        for group in childs:
            # set it in non blocking mode
            # since we will run for a long time, we do not want to block
            for fd in (childs[group].stdout.fileno(),
                       childs[group].stderr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        # If RemoveOldCredFreq < 0, do not do credential cleanup.
        curr_time = 0  # To ensure curr_time is always initialized
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 60 * 60 * 24

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(cred_base_user, "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.DirCleanupCredentials(
                    cred_user_instance_dirname,
                    "(credential_*)", remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        iteration_basetime = time.time()
        # Main supervision loop; only exits via an exception (e.g. a signal
        # or a group crashing too often), handled by the finally block below
        while True:
            # retrieves WebMonitoringURL from glideclient classAd
            iteration_timecheck = time.time()
            iteration_timediff = iteration_timecheck - iteration_basetime

            if iteration_timediff >= 3600:  # every hour
                iteration_basetime = time.time()  # reset the start time
                fronmonpath = os.path.join(startup_dir, "monitor", "frontendmonitorlink.txt")
                fronmonconstraint = '(MyType=="glideclient")'
                fronmonformat_list = [('WebMonitoringURL', 's'), ('FrontendName', 's')]
                fronmonstatus = condorMonitor.CondorStatus(subsystem_name="any")
                fronmondata = fronmonstatus.fetch(constraint=fronmonconstraint, format_list=fronmonformat_list)
                fronmon_list_names = fronmondata.keys()
                if fronmon_list_names is not None:
                    urlset = set()
                    if os.path.exists(fronmonpath):
                        os.remove(fronmonpath)
                    # NOTE(review): the file is reopened in 'w' mode per pair,
                    # so only the last (frontend, URL) pair survives -- confirm
                    # whether append mode was intended
                    for frontend_entry in fronmon_list_names:
                        fronmonelement = fronmondata[frontend_entry]
                        fronmonurl = fronmonelement['WebMonitoringURL'].encode('utf-8')
                        fronmonfrt = fronmonelement['FrontendName'].encode('utf-8')
                        if (fronmonfrt, fronmonurl) not in urlset:
                            urlset.add((fronmonfrt, fronmonurl))
                            with open(fronmonpath, 'w') as fronmonf:
                                fronmonf.write("%s, %s" % (fronmonfrt, fronmonurl))

            # Record the iteration start time
            iteration_stime = time.time()

            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry
            # to pick up the old key again when factory auto restarts it.
            if time.time() > oldkey_eoltime and glideinDescript.data['OldPubKeyObj'] is not None:
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except:
                    # Do not crash if delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol
            # Affects Corral Frontend which only supports the v3+ protocol.
            # IF freq < zero, do not do cleanup.
            if int(glideinDescript.data['RemoveOldCredFreq']) > 0 and curr_time >= update_time:
                logSupport.log.info("Checking credentials for cleanup")

                # Query queue for glideins. Don't remove proxies in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except:
                    logSupport.log.exception("Unable to cleanup old credentials")

                update_time = curr_time + remove_old_cred_freq

            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)

            # Read in the frontend globals classad
            # Do this first so that the credentials are immediately
            # available when the Entries startup
            classads = {}
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.error("Error occurred retrieving globals classad -- is Condor running?")

            for classad_key in classads:
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad,
                                                           glideinDescript,
                                                           frontendDescript)
                except:
                    logSupport.log.exception("Error occurred processing the globals classads: ")


            logSupport.log.info("Checking EntryGroups %s" % childs.keys())
            for group in childs:
                entry_names = string.join(entry_groups[group], ':')
                child = childs[group]

                # empty stdout and stderr
                try:
                    tempOut = child.stdout.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("EntryGroup %s STDOUT: %s" % (group, tempOut))
                except IOError:
                    pass # ignore
                try:
                    tempErr = child.stderr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("EntryGroup %s STDERR: %s" % (group, tempErr))
                except IOError:
                    pass  # ignore

                # look for exited child
                if child.poll():
                    # the child exited
                    logSupport.log.warning("EntryGroup %s exited. Checking if it should be restarted." % (group))
                    tempOut = child.stdout.readlines()
                    tempErr = child.stderr.readlines()

                    if is_crashing_often(childs_uptime[group],
                                         restart_interval, restart_attempts):
                        del childs[group]
                        raise RuntimeError("EntryGroup '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (group, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting EntryGroup %s." % (group))
                        del childs[group]

                        command_list = [sys.executable,
                                        os.path.join(STARTUP_DIR,
                                                     "glideFactoryEntryGroup.py"),
                                        str(os.getpid()),
                                        str(sleep_time),
                                        str(advertize_rate),
                                        startup_dir,
                                        entry_names,
                                        str(group)]
                        childs[group] = subprocess.Popen(command_list,
                                                         shell=False,
                                                         stdout=subprocess.PIPE,
                                                         stderr=subprocess.PIPE,
                                                         close_fds=True,
                                                         preexec_fn=_set_rlimit)

                        # Keep a bounded history of restart times for
                        # is_crashing_often
                        if len(childs_uptime[group]) == restart_attempts:
                            childs_uptime[group].pop(0)
                        childs_uptime[group].append(time.time())
                        # Re-apply non-blocking mode to the new child's pipes
                        for fd in (childs[group].stdout.fileno(),
                                   childs[group].stderr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        logSupport.log.warning("EntryGroup startup/restart times: %s" % (childs_uptime,))

            # Aggregate Monitoring data periodically
            logSupport.log.info("Aggregate monitoring data")
            stats = aggregate_stats(factory_downtimes.checkDowntime())
            save_stats(stats, os.path.join(startup_dir, glideFactoryConfig.factoryConfig.aggregated_stats_file))

            # Aggregate job data periodically
            if glideinDescript.data.get('AdvertisePilotAccounting', False) in ['True', '1']:   # data attributes are strings
                logSupport.log.info("Starting updating job classads")
                update_classads()
                logSupport.log.info("Finishing updating job classads")

            # Advertise the global classad with the factory keys and Factory statistics
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'],
                    glideFactoryLib.factoryConfig.supported_signtypes,
                    glideinDescript.data['PubKeyObj']
                    )
            except Exception as e:
                logSupport.log.exception("Error advertising global classads: %s" % e)

            cleanupSupport.cleaners.cleanup()

            # Sleep for the remainder of the iteration period, if any
            iteration_etime = time.time()
            iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
            if iteration_sleep_time < 0:
                iteration_sleep_time = 0
            logSupport.log.info("Sleep %s secs" % iteration_sleep_time)
            time.sleep(iteration_sleep_time)

        # end while 1:

    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # if anything goes wrong, hardkill the rest
                for group in childs:
                    logSupport.log.info("Hard killing EntryGroup %s" % group)
                    try:
                        os.kill(childs[group].pid, signal.SIGKILL)
                    except OSError:
                        pass # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All EntryGroups should be terminated")
# Example #10
def main():
    """Query a collector for glidein slots and print a per-slot listing
    followed by summary counts.

    Command-line options (parsed by get_opts) select the collector pool, an
    optional extra constraint, which attribute columns to display
    (gatekeeper, glidein cluster, gLExec, benchmark figures), whether monitor
    VMs are included, whether only totals are printed, and whether the
    summary is grouped per entry (default) or per site.
    """
    opts = get_opts()

    pool_name = opts.pool_name
    constraint = opts.constraint
    want_gk = opts.want_gk
    want_gc = opts.want_gc
    want_monitor = opts.want_monitor
    want_bench = opts.want_bench
    want_glexec = opts.want_glexec
    total_only = opts.total_only
    summarize = 'entry'
    if opts.summarize_site:
        # BUGFIX: was 'size', which never matched the 'site' comparisons
        # below, so the site-summarization option silently did nothing.
        summarize = 'site'

    # Unless monitor VMs were explicitly requested, filter them out.
    if not want_monitor:
        if constraint is None:
            constraint = 'IS_MONITOR_VM =!= TRUE'
        else:
            constraint = '(%s) && (IS_MONITOR_VM =!= TRUE)' % constraint

    # Base set of classad attributes always fetched and displayed.
    format_list = [('Machine', 's'), ('State', 's'), ('Activity', 's'),
                   ('GLIDEIN_Site', 's'), ('GLIDEIN_Factory', 's'),
                   ('GLIDEIN_Name', 's'), ('GLIDEIN_Entry_Name', 's'),
                   ('EnteredCurrentActivity', 'i')]
    attrs = [
        'State', 'Activity', 'GLIDEIN_Site', 'GLIDEIN_Factory', 'GLIDEIN_Name',
        'GLIDEIN_Entry_Name', 'EnteredCurrentActivity'
    ]

    # Optional attribute groups, enabled by command-line flags.
    if want_gk:
        format_list.append(('GLIDEIN_Gatekeeper', 's'))
        format_list.append(('GLIDEIN_GridType', 's'))
        attrs.append('GLIDEIN_Gatekeeper')
        attrs.append('GLIDEIN_GridType')

    if want_gc:
        format_list.append(('GLIDEIN_ClusterId', 'i'))
        format_list.append(('GLIDEIN_ProcId', 'i'))
        format_list.append(('GLIDEIN_Schedd', 's'))
        attrs.append('GLIDEIN_ClusterId')
        attrs.append('GLIDEIN_ProcId')
        attrs.append('GLIDEIN_Schedd')

    if want_glexec:
        format_list.append(('GLEXEC_STARTER', 'b'))
        format_list.append(('GLEXEC_JOB', 'b'))
        attrs.append('GLEXEC_STARTER')
        attrs.append('GLEXEC_JOB')

    if want_bench:
        format_list.append(('KFlops', 'i'))
        format_list.append(('Mips', 'i'))
        attrs.append('KFlops')
        attrs.append('Mips')

    # Query the collector.
    cs = condorMonitor.CondorStatus(pool_name=pool_name)
    cs.load(constraint=constraint, format_list=format_list)

    # NOTE(review): 'data' is kept global, presumably because the sort
    # comparators (machine_cmp & co.) reference it -- confirm before changing.
    global data
    data = cs.stored_data
    keys = data.keys()

    keys.sort(machine_cmp)

    counts_header = ('Total', 'Owner', 'Claimed/Busy', 'Claimed/Retiring',
                     'Claimed/Other', 'Unclaimed', 'Matched', 'Other')

    if want_bench:
        counts_header += ('GFlops', '  GIPS')

    # Build the per-slot printf-style mask to match the enabled columns.
    print_mask = "%-39s %-9s"
    if want_gk:
        print_mask += " %-5s %-43s"
    print_mask += " %-19s %-19s"
    if want_gc:
        print_mask += " %-39s %-14s"
    if want_glexec:
        print_mask += " %-7s"
    if want_bench:
        print_mask += " %-5s %-5s"
    print_mask += " %-9s %-8s %-10s"

    header = ('Name', 'Site')
    if want_gk:
        header += ('Grid', 'Gatekeeper')
    header += ('Factory', 'Entry')
    if want_gc:
        header += ('GlideSchedd', 'GlideCluster')
    if want_glexec:
        header += ('gLExec', )
    if want_bench:
        header += ('MFlop', 'Mips')
    header += ('State', 'Activity', 'ActvtyTime')

    if not total_only:
        print()
        print(print_mask % header)
        print()

    counts = {'Total': {}}
    for c in counts_header:
        counts['Total'][c] = 0

    for vm_name in keys:
        el = data[vm_name]

        # cel holds every requested attribute, defaulting to '???' when the
        # classad did not provide it.
        cel = {}
        for a in attrs:
            if a in el:
                cel[a] = el[a]
            else:
                cel[a] = '???'
        if cel['EnteredCurrentActivity'] != '???':
            cel['EnteredCurrentActivity'] = fmt_time(
                long(cel['EnteredCurrentActivity']))

        state = cel['State']
        activity = cel['Activity']

        if 'KFlops' in el:
            gflops = (el['KFlops'] * 1.e-6)
            mflops_str = "%i" % (el['KFlops'] / 1000)
        else:
            # BUGFIX: was 'mflops = 0.0', leaving gflops undefined and
            # crashing the benchmark accumulation below.
            gflops = 0.0
            mflops_str = "???"

        if 'Mips' in el:
            gips = el['Mips'] * 1.e-3
            mips_str = el['Mips']
        else:
            # BUGFIX: was 'mips = 0.0', leaving gips undefined and crashing
            # the benchmark accumulation below.
            gips = 0.0
            mips_str = "???"

        # Key used for the summary grouping: site name, or entry@name@factory.
        if summarize == 'site':
            sum_str = cel['GLIDEIN_Site']
        else:
            sum_str = "%s@%s@%s" % (cel['GLIDEIN_Entry_Name'],
                                    cel['GLIDEIN_Name'],
                                    cel['GLIDEIN_Factory'])
        if sum_str not in counts:
            counts[sum_str] = {}
            for c in counts_header:
                counts[sum_str][c] = 0

        # Update both the grand total and this slot's summary bucket.
        for t in ('Total', sum_str):
            ct = counts[t]
            ct['Total'] += 1
            if state in ('Owner', 'Unclaimed', 'Matched'):
                ct[state] += 1
            elif state == 'Claimed':
                if activity in ('Busy', 'Retiring'):
                    ct['%s/%s' % (state, activity)] += 1
                else:
                    ct['Claimed/Other'] += 1
            else:
                ct['Other'] += 1
            if want_bench:
                ct['GFlops'] += gflops
                ct['  GIPS'] += gips

        if not total_only:
            print_arr = (vm_name, cel['GLIDEIN_Site'])
            if want_gk:
                print_arr += (cel['GLIDEIN_GridType'],
                              cel['GLIDEIN_Gatekeeper'])
            print_arr += ("%s@%s" %
                          (cel['GLIDEIN_Name'], cel['GLIDEIN_Factory']),
                          cel['GLIDEIN_Entry_Name'])
            if want_gc:
                print_arr += (
                    cel['GLIDEIN_Schedd'], "%i.%i" %
                    (cel['GLIDEIN_ClusterId'], cel['GLIDEIN_ProcId']))
            if want_glexec:
                glexec_str = 'None'
                if 'GLEXEC_JOB' in el and el['GLEXEC_JOB']:
                    glexec_str = 'Job'
                elif 'GLEXEC_STARTER' in el and el['GLEXEC_STARTER']:
                    glexec_str = 'Starter'
                print_arr += (glexec_str, )
            if want_bench:
                print_arr += (mflops_str, mips_str)
            print_arr += (state, activity, cel['EnteredCurrentActivity'])

            print(print_mask % print_arr)

    print()

    # Summary table: right-align each numeric column to its header width.
    count_print_mask = "%39s"
    for c in counts_header:
        count_print_mask += " %%%is" % len(c)
    print(count_print_mask % (('', ) + counts_header))

    ckeys = counts.keys()

    if summarize == 'site':
        ckeys.sort(ltotal_cmp)
    else:  # default is entry
        ckeys.sort(entry_cmp)

    if len(ckeys) > 1:
        print()  # put a space before the entry names

    count_print_val = None
    for t in ckeys:
        if t == 'Total':
            print()  # put an empty line before Total
            count_print_val = [t]
        else:
            count_print_val = ['']
        for c in counts_header:
            count_print_val.append(int(counts[t][c]))

        print(count_print_mask % tuple(count_print_val))

    print()
# ----- Exemple #11 -----
    def go_request_glideins(self):
        ilog('Entered go_request_glideins.')
        from glideinwms.frontend import glideinFrontendInterface
        from glideinwms.lib import condorMonitor, condorExe, pubCrypto
        from glideinwms.frontend.glideinFrontendPlugins import proxy_plugins, createCredentialList
        # query job collector
        ilog('Checking the condor pool.')
        try:
            pool_status = condorMonitor.CondorStatus()
            pool_status.load(
                '(IS_MONITOR_VM=!=True)&&(%s)' % self.glidekeeper_constraint,
                [('State', 's')])
            running_glideins = len(pool_status.fetchStored())
            del pool_status
            self.running_glideins = running_glideins
            ilog('Found %d glideins in the pool.' % running_glideins)
        except:
            self.errors.append((time.time(), "condor_status failed"))
            return

        # query WMS collector
        ilog('Checking factory glideins.')
        glidein_dict = {}
        for factory_pool in self.factory_pools:
            factory_pool_node = factory_pool[0]
            factory_identity = factory_pool[1]
            try:
                if self.proxy_data != None:
                    full_constraint = self.factory_constraint + ' && (PubKeyType=?="RSA") && (GlideinAllowx509_Proxy=!=False)'
                else:
                    full_constraint = self.factory_constraint + ' && (GlideinRequirex509_Proxy=!=True)'
                ilog(
                    'Running findGlideins with these params: \n\tpool: %s\n\tident: %s\n\tsigtype: %s\n\tconstraints: %s'
                    % (
                        str(factory_pool_node), str(None),
                        str(self.signature_type), str(full_constraint)
                        #str(self.proxy_data!=None),
                        #str(True)
                    ))
                factory_glidein_dict = glideinFrontendInterface.findGlideins(
                    factory_pool_node,
                    None,  #factory_identity, #TODO: How do we authenticate with the factory? 
                    self.signature_type,
                    full_constraint
                    #self.proxy_data!=None,
                    #get_only_matching=True
                )
            except RuntimeError, e:
                factory_glidein_dict = {
                }  # in case of error, treat as there is nothing there
                ilog('Error from findGlideins: %s' % str(e))
            ilog('Found %d possible in factory_pool %s' %
                 (len(factory_glidein_dict.keys()), dbgp(factory_pool)))

            for glidename in factory_glidein_dict.keys():
                ilog('Now testing glidein with name %s' % glidename)
                glidein_el = factory_glidein_dict[glidename]
                ilog('Glidein stats: \n\n %s \n\n' % dbgp(glidein_el))
                if not glidein_el['attrs'].has_key(
                        'PubKeyType'):  # no pub key at all, skip
                    ilog('%s has no PubKeyType -- skipping.' % glidename)
                    continue
                elif glidein_el['attrs'][
                        'PubKeyType'] == 'RSA':  # only trust RSA for now
                    try:
                        # augment
                        glidein_el['attrs']['PubKeyObj'] = pubCrypto.PubRSAKey(
                            str(
                                re.sub(r"\\+n", r"\n",
                                       glidein_el['attrs']['PubKeyValue'])))
                        # and add
                        glidein_dict[(factory_pool_node,
                                      glidename)] = glidein_el
                        ilog('Adding %s to glidein_dict' % glidename)
                    except RuntimeError, e:
                        ilog('Hit error when adding %s to glidein_dict:\n%s' %
                             (glidename, str(e)))
                        continue  # skip
                    except:
# ----- Exemple #12 -----
                    factory_pool_node, self.client_name)
            except RuntimeError, e:
                self.errors.append(
                    (time.time(), "Deadvertizing failed: %s" % e))
            except:
                tb = traceback.format_exception(sys.exc_info()[0],
                                                sys.exc_info()[1],
                                                sys.exc_info()[2])
                self.errors.append(
                    (time.time(),
                     "Deadvertizing failed: %s" % string.join(tb, '')))

        # Stop all the glideins I can see
        ilog('Getting glidein pool status data.')
        try:
            pool_status = condorMonitor.CondorStatus()
            pool_status.load(self.glidekeeper_constraint,
                             [('GLIDEIN_COLLECTOR_NAME', 's'),
                              ('GLIDEIN_MASTER_NAME', 's'),
                              ('MyAddress', 's')])
            pool_data = pool_status.fetchStored()
        except:
            self.errors.append((time.time(), "condor_status failed"))

        for k in pool_data.keys():
            el = pool_data[k]
            ilog('Now killing pool with data: (%s -> %s)' %
                 (dbgp(k), dbgp(el)))
            try:

                condorExe.exe_cmd("../sbin/condor_off",
# ----- Exemple #13 -----
def query_ress(ress_source, vo=''):
    """
    Queries the specified RESS url source for information about the sites.

    @param ress_source: hostname of the RESS collector to query
    @param vo: if non-empty, restrict results to classads advertising
               support for this VO (via GlueCEAccessControlBaseRule)

    Returns dictionary with RESS entries, keyed by the classad Name
    (condor_id).  An entry is created for each classad (a site can be listed
    multiple times).

    Can raise error (e.g. on DNS failure or collector query failure).
    """

    # TODO - there are multiple classads for an entry for each cluster/vo/etc.  Currently only the common information in all the classads for
    # a site is used (gatekeeper, site and queue names) but if VO specific information is included in the future, this will require more
    # complicated logic for building the entries dictionary

    ress_constraint = '(GlueCEInfoContactString=!=UNDEFINED)'
    if vo != '':
        ress_constraint = '(GlueCEInfoContactString=!=UNDEFINED)&&(StringlistMember("VO:%s",GlueCEAccessControlBaseRule))' % vo

    # NOTE(review): the resolved IP was never used; the lookup is kept as a
    # presumed fail-fast check that the RESS host resolves -- confirm intent.
    socket.gethostbyname(ress_source)

    # Get RESS info
    condor_obj = condorMonitor.CondorStatus(pool_name=ress_source)
    format_list = [('GlueCEInfoContactString', 's'), ('GlueCEName', 's'), ('GlueSiteName', 's'), ('GlueCEInfoJobManager', 's'), ('GlueCEUniqueID', 's'), ('GlueCEPolicyMaxObtainableWallClockTime', 'i'), ('GlueCEStateStatus', 's')]
    condor_data = condor_obj.fetch(constraint=ress_constraint, format_list=format_list)

    ress_entries = {}

    for condor_id in condor_data.keys():
        # Condor id is the value in the Name attribute of the classad.  The same entry may have multiple Names and therefore classads but each
        # will have a unique Name/condor_id
        condor_el = condor_data[condor_id]

        gatekeeper_name = condor_el['GlueCEInfoContactString'].encode('utf-8')
        queue_name = condor_el['GlueCEName'].encode('utf-8')
        site_name = condor_el['GlueSiteName'].encode('utf-8')

        # Determine rsl by jobmanager
        # OSG only supports gt2 (gt5 in near future?), do not need to create other rsl strings to support other grid types like cream
        if condor_el['GlueCEInfoJobManager'].encode('utf-8') == "condor":
            rsl = ""
        else:
            rsl = '(queue=%s)(jobtype=single)' % queue_name

        glue_id = condor_el['GlueCEUniqueID'].encode('utf-8')

        # Wall clock time is advertised in minutes; cap at 48 hours and
        # default missing/zero values to 36 hours.
        wall_clocktime = int(condor_el['GlueCEPolicyMaxObtainableWallClockTime'])
        if (wall_clocktime / 60) > 48:
            wall_clocktime = 48 * 60
        if wall_clocktime == 0:
            wall_clocktime = 36 * 60

        # TODO what to do with this?  New file of disabled entries?
        ce_status = condor_el['GlueCEStateStatus'].encode('utf-8')

        # Because RESS is specific to OSG, can default all entries to these values
        glexec_bin = "OSG"
        work_dir = 'OSG'

        # Could not find support for non-gt2 sites so defaulting gridtype to gt2.  Even if there are some sites, the overwhelming
        # majority is gt2.  May need to check GlueCEInfoGRAMVersion when sites start moving to gram5 (does gwms support gt5 yet?)
        gridtype = 'gt2'

        entry = {'site_name': site_name + '_' + queue_name,
                 'gridtype': gridtype,
                 'gatekeeper': gatekeeper_name,
                 'rsl': rsl,
                 'wall_clocktime': wall_clocktime / 60,
                 'ref_id': condor_id,
                 'ce_status': ce_status,
                 'glexec_bin': glexec_bin,
                 'work_dir': work_dir,
                 'source': ress_source,
                 'source_type': 'RESS',
                 'GlueCEUniqueID': glue_id}
        ress_entries[condor_id] = entry

    return ress_entries