Example #1
0
def spawn_cleanup(work_dir, frontendDescript, groups, frontend_name, ha_mode):
    """Best-effort shutdown cleanup: deadvertise the frontend's classads.

    First invalidates the glidefrontendmonitor classad matching this
    frontend name/HA mode in the user pool, then spawns one
    glideinFrontendElement.py subprocess per group in "deadvertise" mode
    and waits for each to finish.  All failures are tolerated: cleanup
    must never abort the shutdown path.

    Args:
        work_dir: frontend working directory, passed to the helpers and
            the per-group child processes.
        frontendDescript: frontend description object used to set up the
            HTCondor environment.
        groups: iterable of group names to deadvertise.
        frontend_name: value matched against GlideFrontendName in the
            invalidation constraint.
        ha_mode: value matched against GlideFrontendHAMode.
    """
    global STARTUP_DIR

    # Invalidate glidefrontendmonitor classad
    try:
        set_frontend_htcondor_env(work_dir, frontendDescript)
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser()
        constraint = '(GlideFrontendName=="%s")&&(GlideFrontendHAMode=?="%s")' % (
            frontend_name, ha_mode)
        fm_advertiser.invalidateConstrainedClassads(constraint)
    except Exception:
        # Do not fail in case of errors, but do not swallow
        # SystemExit/KeyboardInterrupt (the previous bare except did).
        logSupport.log.warning(
            "Failed to deadvertise glidefrontendmonitor classad")

    for group_name in groups:
        try:
            command_list = [
                sys.executable,
                os.path.join(STARTUP_DIR, "glideinFrontendElement.py"),
                str(os.getpid()), work_dir, group_name, "deadvertise"
            ]
            #logSupport.log.debug("Command list: %s" % command_list)
            child = subprocess.Popen(command_list,
                                     shell=False,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

            # set the child's pipes in non-blocking mode so polling
            # the process cannot stall on a full pipe buffer
            for fd in (child.stdout.fileno(), child.stderr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

            while poll_group_process(group_name, child) is None:
                # None means "still alive"
                time.sleep(0.01)
        except Exception:
            # never fail on cleanup
            pass
Example #2
0
def spawn_iteration(work_dir, frontendDescript, groups, max_active,
                    failure_dict, max_failures, action):
    """Run one frontend iteration: spawn all group elements and advertise stats.

    Spawns one glideinFrontendElement child process per group (at most
    max_active concurrently), polls them to completion while recording
    per-group failures, then aggregates monitoring data and advertises a
    glidefrontendmonitor classad to the user pool.  On exit (normal or
    exceptional) any still-running children are hard-killed.

    Args:
        work_dir: frontend working directory.
        frontendDescript: frontend description object (provides
            FrontendName and HA-mode data).
        groups: iterable of group names to run.
        max_active: maximum number of concurrently spawned group processes.
        failure_dict: dict mapping group name to a failure-tracking object
            with add_failure()/count_failures(); mutated in place.
        max_failures: per-group recent-failure limit; exceeding it raises.
        action: action string forwarded to spawn_group for each child.

    Returns:
        list of (group_name, elapsed_seconds) tuples, one per group.

    Raises:
        RuntimeError: if any group's recent failure count exceeds
            max_failures.
    """
    childs = {}

    for group_name in groups:
        childs[group_name] = {'state': 'queued'}

    active_groups = 0
    groups_tofinish = len(groups)

    max_num_failures = 0
    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]['state'] == 'spawned':
                    group_rc = poll_group_process(group_name,
                                                  childs[group_name]['data'])
                    if group_rc is not None:  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]['state'] = 'finished'
                        else:
                            childs[group_name]['state'] = 'failed'
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures,
                                                   num_failures)
                            logSupport.log.warning("Group %s terminated with exit code %i (%i recent failure)" % (group_name, group_rc, num_failures))
                        childs[group_name]['end_time'] = time.time()
                        servicePerformance.endPerfMetricEvent(
                            'frontend', 'group_%s_iteration'%group_name)
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups < max_active: # can spawn more
                    if childs[group_name]['state'] == 'queued':
                        childs[group_name]['data'] = spawn_group(work_dir, group_name, action)
                        childs[group_name]['state'] = 'spawned'
                        childs[group_name]['start_time'] = time.time()
                        servicePerformance.startPerfMetricEvent(
                            'frontend', 'group_%s_iteration'%group_name)
                        active_groups += 1
                        done_something = True
                else:
                    break

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly?  see above
        servicePerformance.startPerfMetricEvent('frontend', 'aggregate_stats')
        stats = aggregate_stats()
        servicePerformance.endPerfMetricEvent('frontend', 'aggregate_stats')
        #logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi)
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(
                         frontendDescript.data['FrontendName'])
        fm_classad.setFrontendDetails(
            frontendDescript.data['FrontendName'], ','.join(groups),
            glideinFrontendLib.getHAMode(frontendDescript.data))
        try:
            idle_jobs = {
                'Total': stats['total']['Jobs']['Idle'],
                '600': stats['total']['Jobs']['OldIdle'],
                '3600': stats['total']['Jobs']['Idle_3600'],
            }
        except KeyError as err:
            idle_jobs = {'Total': 0, '600': 0, '3600': 0}
            # NOTE: err.message does not exist in Python 3; format the
            # exception itself (prints the missing key).
            logSupport.log.error("Error in RRD Database. Setting idle_jobs[%s] Failed. Reconfig the frontend with -fix_rrd to fix this error" % (err,))

        fm_classad.setIdleJobCount(idle_jobs)
        fm_classad.setPerfMetrics(servicePerformance.getPerfMetric('frontend'))
        # Gather performance stats from history file of each group
        for group_name in groups:
            gname = 'group_%s' % group_name
            try:
                history_obj = glideinFrontendConfig.HistoryFile(
                    work_dir, group_name, True, dict)
                pfm = servicePerformance.getPerfMetric(gname)
                pfm.metric = history_obj['perf_metrics'].metric

                fm_classad.setPerfMetrics(
                    servicePerformance.getPerfMetric(gname))
            except Exception:
                pass # Do not fail for non-critical actions

        fm_advertiser.addClassad(fm_classad.adParams['Name'], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info("Advertising %i %s classad(s) to the user pool" % (len(fm_advertiser.classads), fm_advertiser.adType))
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
            logSupport.log.info("Done advertising %s classad(s) to the user pool" % fm_advertiser.adType)
        except condorExe.ExeError:
            logSupport.log.error("Exception occurred trying to advertise %s classad(s) to the user pool" % fm_advertiser.adType)
        except:
            # Rethrow any other exception including stop signal
            raise
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting"%(max_num_failures, max_failures))
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]['state']=='spawned':
                logSupport.log.info("Hard killing group %s" % group_name)
                servicePerformance.endPerfMetricEvent(
                    'frontend', 'group_%s_iteration'%group_name)
                try:
                    os.kill(childs[group_name]['data'].pid, signal.SIGKILL)
                except OSError:
                    pass # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name, childs[group_name]['end_time']-childs[group_name]['start_time']))
    return timings