def find_and_perform_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)

    # TODO: If we return here check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger not cleanup
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset means size the worker pool from currently free memory,
        # using ENTRY_MEM_REQ_BYTES as the per-child memory estimate
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name, forked_check_and_perform_work,
                           factory_in_downtime, entry, work)
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            # Fixed typo in log message: "anyt" -> "any"
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript, frontendDescript, group_name, my_entries):
    """
    Iterate over set of tasks until its time to quit or die.
    The main "worker" function for the Factory Entry Group.

    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    # Flag for the first pass: an exception in the first iteration is fatal
    # (re-raised below); later iterations only log and continue
    is_first=1
    count=0;

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    while 1:
        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if its time to invalidate factory's old key
        if ( (time.time() > oldkey_eoltime) and
             (glideinDescript.data['OldPubKeyObj'] is not None) ):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info("Old key was valid from %s to %s ie grace of ~%s sec" % (starttime,oldkey_eoltime,oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count==0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" % cpuCount)
                # Round-robin slice: entry cpu, cpu+cpuCount, cpu+2*cpuCount, ...
                # (Python 2: dict.values() returns a sliceable list)
                entrylists = [my_entries.values()[cpu::cpuCount] for cpu in xrange(cpuCount)]

                # Fork's keyed by cpu number. Actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r,w = os.pipe()
                    # Avoid the child inheriting the parent's signal handlers
                    # across the fork; parent re-registers right after
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry object in form of dict
                        # return_dict[entry.name][entry.getState()]
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning("Error writing stats for entry '%s'" % (entry.name))
                                entry.log.exception("Error writing stats for entry '%s': " % (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception("Error writing pickled state for entry '%s': " % (entry.name))

                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info("Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception("Error processing response from one or more children after write stats")

                logSupport.roll_all_logs()

                # Merge the state each child computed back into the parent's
                # entry objects, keyed by cpu then by entry name
                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # if not the first pass, just warn
                logSupport.log.exception("Exception occurred: ")

        cleanupSupport.cleaners.wait_for_cleanup()

        # Sleep out the remainder of the iteration period, if any
        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count+1) % advertize_rate
        is_first = 0
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over set of tasks until its time to quit or die.
    The main "worker" function for the Factory Entry Group.

    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In first iteration; a failure here is fatal (re-raised)
    count = 0

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:
        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if its time to invalidate factory's old key
        if ((time.time() > oldkey_eoltime) and
                (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info(
                "Old key was valid from %s to %s ie grace of ~%s sec" %
                (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(
                    glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" %
                                    cpuCount)
                # Round-robin partition of the entries over the worker CPUs
                # (Python 2: dict.values() returns a sliceable list)
                entrylists = [
                    my_entries.values()[cpu::cpuCount]
                    for cpu in xrange(cpuCount)
                ]

                # Fork's keyed by cpu number. Actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    # Detach signal handlers around fork; parent re-registers
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry object in form of dict
                        # return_dict[entry.name][entry.getState()]
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning(
                                    "Error writing stats for entry '%s'" %
                                    (entry.name))
                                entry.log.exception(
                                    "Error writing stats for entry '%s': " %
                                    (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception(
                                "Error writing pickled state for entry '%s': "
                                % (entry.name))

                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info(
                        "Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception(
                        "Error processing response from one or more children after write stats"
                    )

                logSupport.roll_all_logs()

                # Merge each child's computed state back into the parent's
                # entry objects (keyed by cpu, then by entry name)
                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(
                            post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception(
                    "Exception occurred in the main loop of Factory Group %s: "
                    % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        # Sleep out whatever remains of the iteration period, if anything
        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # Entering following iterations
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type do_advertize: boolean
    @param do_advertize: True if monitoring info should be updated even when
           there is no work, so stats for idle entries are still evaluated

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry)
           keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    # work includes all entries, empty value for entries w/ no work to do
    # to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)
    # Request from a Frontend group to an entry
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        if do_advertize:
            # Intentional fall-through: keep going so monitoring info for
            # the (idle) entries is still updated below
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug(
            "EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits."
        )

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming each
        # child needs ENTRY_MEM_REQ_BYTES
        logSupport.log.debug(
            "Setting parallel_workers limit dynamically based on the available free memory"
        )
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" %
                         parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    # TODO: #22163, change in 3.5 coordinate w/ find_work():
    # change so that only the entries w/ work to do are returned in 'work'
    # currently work contains all entries
    # cleanup is still done correctly, handled also in the entries w/o work function (forked as single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent, forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load
    # Since glideins only decrease for entries not receiving requests, a more efficient way
    # could be to advertise entries that had non 0 # of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats, factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])
    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    logSupport.log.debug(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {
                'work_done': post_work_info[entry]['work_done']
            }
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug(
                "No work found for entry %s from any frontends" % entry)

    # Merge back the state computed by the single "entries without work"
    # child, if it ran; 'entries' is iterated as (name, state) pairs
    if 'GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and len(
            post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0:
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK'][
                'entries']:
            (my_entries[entry]).setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )
        logSupport.log.warning(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )

    return groupwork_done
# Hard kill myself. Don't want any cleanup, since I was created # just for doing check and perform work for each entry finally: # Exit, immediately. Don't want any cleanup, since I was created # just for doing check and perform work for each entry os._exit(0) # Gather info from rest of the entries try: post_work_info_subset = fetch_fork_result_list(pipe_ids) post_work_info.update(post_work_info_subset) except RuntimeError: # Expect all errors logged already work_info_read_err = True logSupport.roll_all_logs() # Gather results from the forked children logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.") logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.") for entry in my_entries: # Update the entry object from the post_work_info if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)): groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']} (my_entries[entry]).setState(post_work_info[entry]) else: logSupport.log.debug("No work found for entry %s from anyt frontends" % entry) if work_info_read_err:
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type do_advertize: boolean
    @param do_advertize: True if monitoring info should be updated even when
           there is no work, so stats for idle entries are still evaluated

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry) keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    # work includes all entries, empty value for entries w/ no work to do
    # to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)
    # Request from a Frontend group to an entry
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        if do_advertize:
            # Intentional fall-through so monitoring info for the (idle)
            # entries is still updated below
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming
        # ENTRY_MEM_REQ_BYTES per forked child
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    # TODO: #22163, change in 3.5 coordinate w/ find_work():
    # change so that only the entries w/ work to do are returned in 'work'
    # currently work contains all entries
    # cleanup is still done correctly, handled also in the entries w/o work function (forked as single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent, forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load
    # Since glideins only decrease for entries not receiving requests, a more efficient way
    # could be to advertise entries that had non 0 # of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats, factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])
    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    # Merge back the state computed by the single "entries without work"
    # child, if it ran; 'entries' is iterated as (name, state) pairs
    if 'GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and len(post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0:
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']:
            (my_entries[entry]).setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
def find_and_perform_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and requests glideins. If an entry is in downtime,
    requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
           security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)

    # TODO: If we return here check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger not cleanup
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug(
            "EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits."
        )

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        # Zero/unset: size the worker pool from free memory, assuming each
        # forked child needs ENTRY_MEM_REQ_BYTES
        logSupport.log.debug(
            "Setting parallel_workers limit dynamically based on the available free memory"
        )
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" %
                         parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name, forked_check_and_perform_work,
                           factory_in_downtime, entry, work)
    try:
        # Time the whole bounded fork-and-collect cycle for the log messages
        t_begin = time.time()
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()

    # Gather results from the forked children
    logSupport.log.info(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)
    logSupport.log.debug(
        "All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry."
        % t_end)

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if ((entry in post_work_info) and (len(post_work_info[entry]) > 0)):
            groupwork_done[entry] = {
                'work_done': post_work_info[entry]['work_done']
            }
            (my_entries[entry]).setState(post_work_info[entry])
        else:
            logSupport.log.debug(
                "No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )
        logSupport.log.warning(
            "Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated"
        )

    return groupwork_done