def find_and_perform_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)

    # TODO: If we return here check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger not cleanup
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset means size the worker pool from currently free memory,
        # using ENTRY_MEM_REQ_BYTES as the per-child memory estimate
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name, forked_check_and_perform_work,
                           factory_in_downtime, entry, work)
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            # Fixed typo in log message: "anyt" -> "any"
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript, frontendDescript, group_name, my_entries):
    """
    Iterate over set of tasks until its time to quit or die.
    The main "worker" function for the Factory Entry Group.

    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    # Flag for the first pass: an exception in the first iteration is fatal
    # (re-raised below); later iterations only log and continue
    is_first=1
    count=0;

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    while 1:
        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if its time to invalidate factory's old key
        if ( (time.time() > oldkey_eoltime) and
             (glideinDescript.data['OldPubKeyObj'] is not None) ):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info("Old key was valid from %s to %s ie grace of ~%s sec" % (starttime,oldkey_eoltime,oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count==0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" % cpuCount)
                # Round-robin slice: entry cpu, cpu+cpuCount, cpu+2*cpuCount, ...
                # (Python 2: dict.values() returns a sliceable list)
                entrylists = [my_entries.values()[cpu::cpuCount] for cpu in xrange(cpuCount)]

                # Fork's keyed by cpu number. Actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r,w = os.pipe()
                    # Avoid the child inheriting the parent's signal handlers
                    # across the fork; parent re-registers right after
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry object in form of dict
                        # return_dict[entry.name][entry.getState()]
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning("Error writing stats for entry '%s'" % (entry.name))
                                entry.log.exception("Error writing stats for entry '%s': " % (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception("Error writing pickled state for entry '%s': " % (entry.name))

                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info("Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception("Error processing response from one or more children after write stats")

                logSupport.roll_all_logs()

                # Merge the state each child computed back into the parent's
                # entry objects, keyed by cpu then by entry name
                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # if not the first pass, just warn
                logSupport.log.exception("Exception occurred: ")

        cleanupSupport.cleaners.wait_for_cleanup()

        # Sleep out the remainder of the iteration period, if any
        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count+1) % advertize_rate
        is_first = 0
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over set of tasks until its time to quit or die.
    The main "worker" function for the Factory Entry Group.

    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In first iteration; a failure here is fatal (re-raised)
    count = 0

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:
        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if its time to invalidate factory's old key
        if ((time.time() > oldkey_eoltime) and
                (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info(
                "Old key was valid from %s to %s ie grace of ~%s sec" %
                (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(
                    glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" %
                                    cpuCount)
                # Round-robin partition of the entries over the worker CPUs
                # (Python 2: dict.values() returns a sliceable list)
                entrylists = [
                    my_entries.values()[cpu::cpuCount]
                    for cpu in xrange(cpuCount)
                ]

                # Fork's keyed by cpu number. Actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    # Detach signal handlers around fork; parent re-registers
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry object in form of dict
                        # return_dict[entry.name][entry.getState()]
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning(
                                    "Error writing stats for entry '%s'" %
                                    (entry.name))
                                entry.log.exception(
                                    "Error writing stats for entry '%s': " %
                                    (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception(
                                "Error writing pickled state for entry '%s': "
                                % (entry.name))

                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info(
                        "Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception(
                        "Error processing response from one or more children after write stats"
                    )

                logSupport.roll_all_logs()

                # Merge each child's computed state back into the parent's
                # entry objects (keyed by cpu, then by entry name)
                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(
                            post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception(
                    "Exception occurred in the main loop of Factory Group %s: "
                    % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        # Sleep out whatever remains of the iteration period, if anything
        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # Entering following iterations
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type do_advertize: boolean
    @param do_advertize: True if monitoring info should be updated even when
           there is no work, so stats for idle entries are still evaluated

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry)
           keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    # work includes all entries, empty value for entries w/ no work to do
    # to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)
    # Request from a Frontend group to an entry
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        if do_advertize:
            # Intentional fall-through: keep going so monitoring info for
            # the (idle) entries is still updated below
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug(
            "EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits."
        )

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming each
        # child needs ENTRY_MEM_REQ_BYTES
        logSupport.log.debug(
            "Setting parallel_workers limit dynamically based on the available free memory"
        )
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" %
                         parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    # TODO: #22163, change in 3.5 coordinate w/ find_work():
    # change so that only the entries w/ work to do are returned in 'work'
    # currently work contains all entries
    # cleanup is still done correctly, handled also in the entries w/o work function (forked as single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent, forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load
    # Since glideins only decrease for entries not receiving requests, a more efficient way
    # could be to advertise entries that had non 0 # of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats, factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])
    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    logSupport.log.debug(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {
                'work_done': post_work_info[entry]['work_done']
            }
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug(
                "No work found for entry %s from any frontends" % entry)

    # Merge back the state computed by the single "entries without work"
    # child, if it ran; 'entries' is iterated as (name, state) pairs
    if 'GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and len(
            post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0:
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK'][
                'entries']:
            (my_entries[entry]).setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )
        logSupport.log.warning(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )

    return groupwork_done
# Hard kill myself. Don't want any cleanup, since I was created # just for doing check and perform work for each entry finally: # Exit, immediately. Don't want any cleanup, since I was created # just for doing check and perform work for each entry os._exit(0) # Gather info from rest of the entries try: post_work_info_subset = fetch_fork_result_list(pipe_ids) post_work_info.update(post_work_info_subset) except RuntimeError: # Expect all errors logged already work_info_read_err = True logSupport.roll_all_logs() # Gather results from the forked children logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.") logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.") for entry in my_entries: # Update the entry object from the post_work_info if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)): groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']} (my_entries[entry]).setState(post_work_info[entry]) else: logSupport.log.debug("No work found for entry %s from anyt frontends" % entry) if work_info_read_err:
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type do_advertize: boolean
    @param do_advertize: True if monitoring info should be updated even when
           there is no work, so stats for idle entries are still evaluated

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry) keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    # work includes all entries, empty value for entries w/ no work to do
    # to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)
    # Request from a Frontend group to an entry
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        if do_advertize:
            # Intentional fall-through so monitoring info for the (idle)
            # entries is still updated below
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming
        # ENTRY_MEM_REQ_BYTES per forked child
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    # TODO: #22163, change in 3.5 coordinate w/ find_work():
    # change so that only the entries w/ work to do are returned in 'work'
    # currently work contains all entries
    # cleanup is still done correctly, handled also in the entries w/o work function (forked as single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent, forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load
    # Since glideins only decrease for entries not receiving requests, a more efficient way
    # could be to advertise entries that had non 0 # of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats, factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])
    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    # Merge back the state computed by the single "entries without work"
    # child, if it ran; 'entries' is iterated as (name, state) pairs
    if 'GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and len(post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0:
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']:
            (my_entries[entry]).setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
def find_and_perform_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)

    # TODO: If we return here check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger not cleanup
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug(
            "EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits."
        )

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming each
        # forked child needs ENTRY_MEM_REQ_BYTES
        logSupport.log.debug(
            "Setting parallel_workers limit dynamically based on the available free memory"
        )
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" %
                         parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name, forked_check_and_perform_work,
                           factory_in_downtime, entry, work)
    try:
        # Time the whole bounded fork-and-collect cycle for the log messages
        t_begin = time.time()
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    logSupport.log.debug(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {
                'work_done': post_work_info[entry]['work_done']
            }
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug(
                "No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )
        logSupport.log.warning(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )

    return groupwork_done