def find_and_perform_work(factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of glideins requested is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
        security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)

    # TODO: If we return here, check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger, not cleanup
    work_count = get_work_count(work)
    if work_count == 0:
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3 GB for the system, the max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 260 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name,
                           forked_check_and_perform_work,
                           factory_in_downtime, entry, work)
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True

    logSupport.roll_all_logs()
    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated. Loading post work state for the entry.")

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if (entry in post_work_info) and (len(post_work_info[entry]) > 0):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            my_entries[entry].setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
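# Illustrative sketch (not part of the factory code): the fork/collect
# pattern find_and_perform_work() relies on, reduced to a self-contained
# example. Assumptions: ForkManager is importable from glideinwms.lib.fork
# (module path assumed); toy_worker is a hypothetical stand-in for
# forked_check_and_perform_work; log_progress=False avoids needing the
# factory logging setup.
import os

from glideinwms.lib.fork import ForkManager


def toy_worker(entry_name, payload):
    # Runs in a forked child; the return value is serialized back to the
    # parent and collected keyed by the fork key passed to add_fork()
    return {'entry': entry_name, 'work_done': len(payload), 'pid': os.getpid()}


fm = ForkManager()
for name, payload in {'entry_a': [1, 2], 'entry_b': [3]}.items():
    fm.add_fork(name, toy_worker, name, payload)

# At most 2 children alive at a time, mirroring the parallel_workers bound
# computed in find_and_perform_work()
results = fm.bounded_fork_and_collect(2, log_progress=False)
# results is keyed by fork key, e.g.
# {'entry_a': {'entry': 'entry_a', 'work_done': 2, ...}, 'entry_b': {...}}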
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of glideins requested is zero.

    @type do_advertize: boolean
    @param do_advertize: True if glidefactory classads should be advertised;
        when True, monitoring info is updated even for entries with no work

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
        security classes, and usernames

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry)
        keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry.
    # work includes all entries, with an empty value for entries w/ no work
    # to do, to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)

    # Requests from a Frontend group to an entry
    work_count = get_work_count(work)
    if work_count == 0:
        logSupport.log.info("No work found")
        if do_advertize:
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3 GB for the system, the max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 260 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    # TODO: #22163, change in 3.5, coordinate w/ find_work():
    #       change find_work() so that only the entries w/ work to do are
    #       returned in 'work'; currently work contains all entries.
    #       Cleanup is still done correctly, handled also in the
    #       entries-without-work function (forked as a single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent,
                               forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load.
    #       Since glideins only decrease for entries not receiving requests,
    #       a more efficient way could be to advertise only the entries that
    #       had a non-zero number of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats,
                           factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])

    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()
    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if (entry in post_work_info) and (len(post_work_info[entry]) > 0):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            my_entries[entry].setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if ('GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and
            len(post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0):
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']:
            my_entries[entry].setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
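# Illustrative sketch (not part of the factory code): the dynamic
# parallel_workers computation above, factored into a standalone helper.
# The 50 MB value is a stand-in taken from the comments; the real
# ENTRY_MEM_REQ_BYTES constant is defined elsewhere in the module, and the
# os.sysconf() keys below are POSIX/Linux-specific.
import os

ENTRY_MEM_REQ_BYTES = 50 * 1024 * 1024  # ~50 MB per forked child (assumed)


def dynamic_worker_limit(configured=0):
    """Return the configured limit if positive, else derive one from free RAM."""
    if configured > 0:
        return configured
    free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
    # Never fork fewer than one child at a time
    return max(1, int(free_mem / float(ENTRY_MEM_REQ_BYTES)))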
class TestForkManager(unittest.TestCase):

    def setUp(self):
        global_log_setup()
        self.fork_manager = ForkManager()
        self.default_forks = 100
        self.default_sleep = 5

    def tearDown(self):
        global_log_cleanup()

    def load_forks(self, num_forks=None, sleep_val=None):
        if not num_forks:
            num_forks = self.default_forks
        if not sleep_val:
            sleep_val = self.default_sleep
        expected = {}
        for i in range(num_forks):
            expected[i] = str(sleep_val)
            self.fork_manager.add_fork(i, sleep_fn, sleep_val)
        return expected

    def test___init__(self):
        self.assertTrue(isinstance(self.fork_manager, ForkManager))

    def test_add_fork_and_len(self):
        num_forks = 10
        self.load_forks(num_forks)
        self.assertEqual(num_forks, len(self.fork_manager))

    def test_fork_and_collect(self):
        expected = self.load_forks()
        results = self.fork_manager.fork_and_collect()
        self.assertEqual(expected, results)

    def test_fork_and_wait(self):
        self.load_forks()
        results = self.fork_manager.fork_and_wait()
        self.assertEqual(None, results)

    def test_bounded_fork_and_collect_use_epoll(self):
        #
        # The following 3 tests must be run in order,
        # which may be an artifact of different test runners.
        #
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertFalse("'module' object has no attribute 'epoll'" in log_contents)
        self.assertFalse("'module' object has no attribute 'poll'" in log_contents)

    def test_bounded_fork_and_collect_use_poll(self):
        #
        # Delete select.epoll so the resulting AttributeError forces
        # select.poll to be used.
        #
        del select.epoll
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertTrue("'module' object has no attribute 'epoll'" in log_contents)
        self.assertFalse("'module' object has no attribute 'poll'" in log_contents)

    def test_bounded_fork_and_collect_use_select(self):
        #
        # Delete select.poll as well, forcing plain select to be used.
        # Depends on select.epoll having been removed by the previous test.
        #
        del select.poll
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertTrue("'module' object has no attribute 'epoll'" in log_contents)
        self.assertTrue("'module' object has no attribute 'poll'" in log_contents)
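# Illustrative sketch (not part of the test module): the three
# bounded_fork_and_collect tests above must run in epoll -> poll -> select
# order because they progressively delete attributes from the select module.
# Building the suite explicitly guarantees that order regardless of the
# test runner:
import unittest


def ordered_bounded_fork_suite():
    names = ['test_bounded_fork_and_collect_use_epoll',
             'test_bounded_fork_and_collect_use_poll',
             'test_bounded_fork_and_collect_use_select']
    return unittest.TestSuite(TestForkManager(name) for name in names)


if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(ordered_bounded_fork_suite())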
def find_and_perform_work(do_advertize, factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of glideins requested is zero.

    @type do_advertize: boolean
    @param do_advertize: True if glidefactory classads should be advertised;
        when True, monitoring info is updated even for entries with no work

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
        security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects (glideFactoryEntry.Entry)
        keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry.
    # work includes all entries, with an empty value for entries w/ no work
    # to do, to allow cleanup, ... (remove held glideins, ...)
    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)

    # Requests from a Frontend group to an entry
    work_count = get_work_count(work)
    if work_count == 0:
        logSupport.log.info("No work found")
        if do_advertize:
            logSupport.log.info("Continuing to update monitoring info")
        else:
            return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3 GB for the system, the max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 260 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    # TODO: #22163, change in 3.5, coordinate w/ find_work():
    #       change find_work() so that only the entries w/ work to do are
    #       returned in 'work'; currently work contains all entries.
    #       Cleanup is still done correctly, handled also in the
    #       entries-without-work function (forked as a single function)
    entries_without_work = []
    for ent in my_entries:
        if work.get(ent):
            entry = my_entries[ent]  # ent is the entry.name
            forkm_obj.add_fork(ent,
                               forked_check_and_perform_work,
                               factory_in_downtime, entry, work[ent])
        else:
            entries_without_work.append(ent)
    # Evaluate stats for entries without work only if these will be advertised
    # TODO: #22163, check if this is causing too much load.
    #       Since glideins only decrease for entries not receiving requests,
    #       a more efficient way could be to advertise only the entries that
    #       had a non-zero number of glideins at the previous round
    if do_advertize and len(entries_without_work) > 0:
        forkm_obj.add_fork('GWMS_ENTRIES_WITHOUT_WORK',
                           forked_update_entries_stats,
                           factory_in_downtime,
                           [my_entries[i] for i in entries_without_work])

    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()
    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if (entry in post_work_info) and (len(post_work_info[entry]) > 0):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            my_entries[entry].setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if ('GWMS_ENTRIES_WITHOUT_WORK' in post_work_info and
            len(post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']) > 0):
        for entry, entry_state in post_work_info['GWMS_ENTRIES_WITHOUT_WORK']['entries']:
            my_entries[entry].setState(entry_state)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
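# Illustrative sketch (not a documented API): the shape of post_work_info
# that the collection loop above expects, inferred from the consuming code.
example_post_work_info = {
    'entry_a': {
        # one dict per entry that had work; 'work_done' is copied into
        # groupwork_done and the whole dict is passed to Entry.setState()
        'work_done': 3,
    },
    'GWMS_ENTRIES_WITHOUT_WORK': {
        # produced by forked_update_entries_stats(); 'entries' is iterated
        # as (entry_name, entry_state) pairs fed to Entry.setState()
        'entries': [('entry_b', {}), ('entry_c', {})],
    },
}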
class TestForkManager(unittest.TestCase):

    def setUp(self):
        global_log_setup()
        self.fork_manager = ForkManager()
        self.default_forks = 100
        self.default_sleep = 5

    def tearDown(self):
        global_log_cleanup()

    def load_forks(self, num_forks=None, sleep_val=None):
        if not num_forks:
            num_forks = self.default_forks
        if not sleep_val:
            sleep_val = self.default_sleep
        expected = {}
        for i in range(num_forks):
            expected[i] = str(sleep_val)
            self.fork_manager.add_fork(i, sleep_fn, sleep_val)
        return expected

    def test___init__(self):
        self.assertTrue(isinstance(self.fork_manager, ForkManager))

    def test_add_fork_and_len(self):
        num_forks = 10
        self.load_forks(num_forks)
        self.assertEqual(num_forks, len(self.fork_manager))

    def test_fork_and_collect(self):
        expected = self.load_forks()
        results = self.fork_manager.fork_and_collect()
        self.assertEqual(expected, results)

    def test_fork_and_wait(self):
        self.load_forks()
        results = self.fork_manager.fork_and_wait()
        self.assertEqual(None, results)

    def test_bounded_fork_and_collect_use_epoll(self):
        #
        # The following 3 tests must be run in order,
        # which may be an artifact of different test runners.
        #
        if platform.system() != 'Linux':
            return
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertFalse("'module' object has no attribute 'epoll'" in log_contents)
        self.assertFalse("'module' object has no attribute 'poll'" in log_contents)

    def test_bounded_fork_and_collect_use_poll(self):
        #
        # Delete select.epoll so the resulting AttributeError forces
        # select.poll to be used.
        #
        if platform.system() != 'Linux':
            return
        del select.epoll
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertTrue("'module' object has no attribute 'epoll'" in log_contents)
        self.assertFalse("'module' object has no attribute 'poll'" in log_contents)

    def test_bounded_fork_and_collect_use_select(self):
        #
        # Delete select.poll as well, forcing plain select to be used.
        # Depends on select.epoll being removed by the previous test.
        #
        if platform.system() == 'Linux':
            del select.poll
        expected = self.load_forks()
        results = self.fork_manager.bounded_fork_and_collect(max_forks=50,
                                                             log_progress=True,
                                                             sleep_time=0.1)
        self.assertEqual(expected, results)
        with open(LOG_FILE, 'r') as fd:
            log_contents = fd.read()
        self.assertTrue("Active forks =" in log_contents)
        self.assertTrue("Forks to finish =" in log_contents)
        self.assertTrue("'module' object has no attribute 'epoll'" in log_contents)
        self.assertTrue("'module' object has no attribute 'poll'" in log_contents)
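# Illustrative sketch (hypothetical refactor, not part of the code above):
# the cross-test ordering dependence could be removed by snapshotting and
# restoring the select attributes around each test, e.g. with a mixin:
import select
import unittest


class SelectPatchingMixin(unittest.TestCase):

    def setUp(self):
        super(SelectPatchingMixin, self).setUp()
        # Snapshot whatever polling primitives this platform provides
        self._saved_select = dict((name, getattr(select, name))
                                  for name in ('epoll', 'poll')
                                  if hasattr(select, name))

    def tearDown(self):
        # Restore anything a test deleted from the select module
        for name, obj in self._saved_select.items():
            setattr(select, name, obj)
        super(SelectPatchingMixin, self).tearDown()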
def find_and_perform_work(factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of glideins requested is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
        security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry
    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)

    # TODO: If we return here, check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger, not cleanup
    work_count = get_work_count(work)
    if work_count == 0:
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3 GB for the system, the max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 260 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)

    forkm_obj = ForkManager()
    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    for ent in work:
        entry = my_entries[ent]
        forkm_obj.add_fork(entry.name,
                           forked_check_and_perform_work,
                           factory_in_downtime, entry, work)

    t_begin = time.time()
    try:
        post_work_info = forkm_obj.bounded_fork_and_collect(parallel_workers)
        t_end = time.time() - t_begin
    except RuntimeError:
        # Expect all errors logged already
        work_info_read_err = True
        t_end = time.time() - t_begin

    logSupport.roll_all_logs()
    # Gather results from the forked children
    logSupport.log.info("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)
    logSupport.log.debug("All children forked for glideFactoryEntry.check_and_perform_work terminated - took %s seconds. Loading post work state for the entry." % t_end)

    for entry in my_entries:
        # Update the entry object from the post_work_info
        if (entry in post_work_info) and (len(post_work_info[entry]) > 0):
            groupwork_done[entry] = {'work_done': post_work_info[entry]['work_done']}
            my_entries[entry].setState(post_work_info[entry])
        else:
            logSupport.log.debug("No work found for entry %s from any frontends" % entry)

    if work_info_read_err:
        logSupport.log.debug("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")
        logSupport.log.warning("Unable to process response from one or more children for check_and_perform_work. One or more forked processes may have failed and may not have client_stats updated")

    return groupwork_done
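# Illustrative sketch (an equivalent alternative, not the code above): the
# timing around bounded_fork_and_collect() computes the elapsed time on both
# the success and the RuntimeError paths; try/finally keeps the measurement
# in one place. forkm_obj is assumed to be a ForkManager as above.
import time


def timed_bounded_collect(forkm_obj, parallel_workers):
    """Return (results, elapsed_seconds, read_error)."""
    results = {}
    read_err = False
    t_begin = time.time()
    try:
        results = forkm_obj.bounded_fork_and_collect(parallel_workers)
    except RuntimeError:
        # Children's errors are expected to have been logged already
        read_err = True
    finally:
        t_end = time.time() - t_begin
    return results, t_end, read_err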