def spawn_cleanup(work_dir, frontendDescript, groups, frontend_name, ha_mode):
    """Deadvertise this frontend's classads on shutdown.

    Best-effort cleanup: first invalidates the glidefrontendmonitor classad
    matching this frontend name and HA mode, then spawns one
    ``glideinFrontendElement.py ... deadvertise`` subprocess per group and
    waits for each to finish.  Errors are logged (or ignored) rather than
    propagated, so shutdown always proceeds.

    Args:
        work_dir (str): frontend working directory.
        frontendDescript: frontend description object (used to set the
            HTCondor environment for the invalidation command).
        groups (list): group names to deadvertise.
        frontend_name (str): value matched against GlideFrontendName.
        ha_mode (str): value matched against GlideFrontendHAMode.
    """
    global STARTUP_DIR

    # Invalidate glidefrontendmonitor classad
    try:
        set_frontend_htcondor_env(work_dir, frontendDescript)
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser()
        constraint = '(GlideFrontendName=="%s")&&(GlideFrontendHAMode=?="%s")' % (
            frontend_name, ha_mode)
        fm_advertiser.invalidateConstrainedClassads(constraint)
    # BUG FIX: was a bare "except:", which also swallows SystemExit and
    # KeyboardInterrupt and can block a clean shutdown.  "except Exception"
    # keeps the best-effort behavior while letting stop signals propagate.
    except Exception:
        # Do not fail in case of errors.
        logSupport.log.warning("Failed to deadvertise glidefrontendmonitor classad")

    for group_name in groups:
        try:
            command_list = [sys.executable,
                            os.path.join(STARTUP_DIR, "glideinFrontendElement.py"),
                            str(os.getpid()),
                            work_dir,
                            group_name,
                            "deadvertise"]
            #logSupport.log.debug("Command list: %s" % command_list)
            child = subprocess.Popen(command_list, shell=False,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

            # set it in non blocking mode, so poll_group_process can drain
            # the pipes without stalling
            for fd in (child.stdout.fileno(), child.stderr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

            while poll_group_process(group_name, child) is None:
                # None means "still alive"
                time.sleep(0.01)
        # BUG FIX: narrowed from bare "except:" (see above) — still never
        # fails on cleanup, but no longer eats interpreter-exit exceptions.
        except Exception:
            # never fail on cleanup
            pass
def spawn_iteration(work_dir, frontendDescript, groups, max_active,
                    failure_dict, max_failures, action):
    """Run one frontend iteration across all groups.

    Spawns one subprocess per group (at most ``max_active`` concurrently),
    polls them to completion, then aggregates monitoring data and advertises
    the glidefrontendmonitor classad to the user pool.  Any group still
    running when an error escapes is hard-killed in the ``finally`` clause.

    Args:
        work_dir (str): frontend working directory.
        frontendDescript: frontend description object; ``data['FrontendName']``
            is used for the monitor classad.
        groups (list): group names to process.
        max_active (int): maximum number of concurrently spawned groups.
        failure_dict (dict): group_name -> failure-counter object supporting
            ``add_failure()`` and ``count_failures()``.
        max_failures (int): per-group recent-failure threshold; exceeding it
            aborts the iteration.
        action (str): action passed through to ``spawn_group``.

    Returns:
        list: ``(group_name, elapsed_seconds)`` tuples, one per group.

    Raises:
        RuntimeError: if any group exceeded ``max_failures`` recent failures.
    """
    childs = {}
    for group_name in groups:
        childs[group_name] = {'state': 'queued'}

    active_groups = 0
    groups_tofinish = len(groups)
    max_num_failures = 0

    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]['state'] == 'spawned':
                    group_rc = poll_group_process(group_name,
                                                  childs[group_name]['data'])
                    if group_rc is not None:  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]['state'] = 'finished'
                        else:
                            childs[group_name]['state'] = 'failed'
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures, num_failures)
                            logSupport.log.warning("Group %s terminated with exit code %i (%i recent failure)" % (group_name, group_rc, num_failures))
                        childs[group_name]['end_time'] = time.time()
                        servicePerformance.endPerfMetricEvent(
                            'frontend', 'group_%s_iteration' % group_name)
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups < max_active:  # can spawn more
                    if childs[group_name]['state'] == 'queued':
                        childs[group_name]['data'] = spawn_group(
                            work_dir, group_name, action)
                        childs[group_name]['state'] = 'spawned'
                        childs[group_name]['start_time'] = time.time()
                        servicePerformance.startPerfMetricEvent(
                            'frontend', 'group_%s_iteration' % group_name)
                        active_groups += 1
                        done_something = True
                else:
                    break

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly?
        # see above
        servicePerformance.startPerfMetricEvent('frontend', 'aggregate_stats')
        stats = aggregate_stats()
        servicePerformance.endPerfMetricEvent('frontend', 'aggregate_stats')
        #logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(
            multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi)
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(
            frontendDescript.data['FrontendName'])
        fm_classad.setFrontendDetails(
            frontendDescript.data['FrontendName'], ','.join(groups),
            glideinFrontendLib.getHAMode(frontendDescript.data))
        try:
            idle_jobs = {
                'Total': stats['total']['Jobs']['Idle'],
                '600': stats['total']['Jobs']['OldIdle'],
                '3600': stats['total']['Jobs']['Idle_3600'],
            }
        except KeyError as err:
            idle_jobs = {'Total': 0, '600': 0, '3600': 0}
            # BUG FIX: Python 3 exceptions have no ".message" attribute, so
            # the original "err.message" raised AttributeError inside this
            # handler.  Using the exception itself formats the missing key.
            logSupport.log.error("Error in RRD Database. Setting idle_jobs[%s] Failed. Reconfig the frontend with -fix_rrd to fix this error" % (err,))

        fm_classad.setIdleJobCount(idle_jobs)
        fm_classad.setPerfMetrics(servicePerformance.getPerfMetric('frontend'))
        # Gather performance stats from history file of each group
        for group_name in groups:
            gname = 'group_%s' % group_name
            try:
                history_obj = glideinFrontendConfig.HistoryFile(
                    work_dir, group_name, True, dict)
                pfm = servicePerformance.getPerfMetric(gname)
                pfm.metric = history_obj['perf_metrics'].metric
                fm_classad.setPerfMetrics(
                    servicePerformance.getPerfMetric(gname))
            # BUG FIX: narrowed from bare "except:" so stop signals are not
            # swallowed; still best-effort for this non-critical step.
            except Exception:
                pass  # Do not fail for non-critical actions

        fm_advertiser.addClassad(fm_classad.adParams['Name'], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info("Advertising %i %s classad(s) to the user pool" % (
            len(fm_advertiser.classads), fm_advertiser.adType))
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
            logSupport.log.info("Done advertising %s classad(s) to the user pool" % fm_advertiser.adType)
        except condorExe.ExeError:
            logSupport.log.error("Exception occurred trying to advertise %s classad(s) to the user pool" % fm_advertiser.adType)
        except:
            # Rethrow any other exception including stop signal
            # (bare except is deliberate here: it immediately re-raises)
            raise
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting" % (max_num_failures, max_failures))
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]['state'] == 'spawned':
                logSupport.log.info("Hard killing group %s" % group_name)
                servicePerformance.endPerfMetricEvent(
                    'frontend', 'group_%s_iteration' % group_name)
                try:
                    os.kill(childs[group_name]['data'].pid, signal.SIGKILL)
                except OSError:
                    pass  # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name,
                        childs[group_name]['end_time'] - childs[group_name]['start_time']))

    return timings