Example #1
0
def spawn(sleep_time, advertize_rate, work_dir, frontendDescript,
          groups, max_parallel_workers, restart_interval, restart_attempts):
    """Spawn and supervise the frontend group workers until terminated.

    Runs forever (the service exits on signal only), driving
    spawn_iteration() in a loop.  Supports HA master/slave operation:
    a slave hibernates while the master frontend is alive and takes
    over when the master disappears.

    Args:
        sleep_time: target seconds per iteration; sleeps the remainder
        advertize_rate: unused here; kept for interface compatibility
        work_dir: frontend working directory
        frontendDescript: frontend description object (.data dict)
        groups: list of group names to process
        max_parallel_workers: max groups processed concurrently
        restart_interval: window (sec) used by each group's FailureCounter
        restart_attempts: max recent failures tolerated per iteration
    """
    num_groups = len(groups)

    # TODO: Get the ha_check_interval from the config
    ha = glideinFrontendLib.getHASettings(frontendDescript.data)
    ha_check_interval = glideinFrontendLib.getHACheckInterval(frontendDescript.data)
    mode = glideinFrontendLib.getHAMode(frontendDescript.data)
    master_frontend_name = ''
    if mode == 'slave':
        master_frontend_name = ha.get('ha_frontends')[0].get('frontend_name')

    active = (mode == 'master')
    hibernate = shouldHibernate(frontendDescript, work_dir, ha, mode, groups)

    logSupport.log.info('Frontend started with mode = %s' % mode)
    try:

        # Service will exit on signal only.
        # This infinite loop is for the slave to go back into hibernation
        # once the master becomes alive.
        # Master never loops infinitely here, but instead it does in
        # the inner loop while(mode=='master') ...
        while True:

            while hibernate:
                # If I am slave enter hibernation cycle while Master is alive
                logSupport.log.info('Master Frontend %s is online. Hibernating.' % master_frontend_name)
                time.sleep(ha_check_interval)
                hibernate = shouldHibernate(frontendDescript, work_dir,
                                            ha, mode, groups)

            # We broke out of hibernation cycle
            # Either Master has disappeared or I am the Master
            if mode == 'slave':
                logSupport.log.info("Master frontend %s is offline. Activating slave frontend." % master_frontend_name)
                active = True

            # Fresh failure counters each time we (re)activate
            failure_dict = {}
            for group in groups:
                failure_dict[group] = FailureCounter(group, restart_interval)

            while ((mode == 'master') or ((mode == 'slave') and active)):
                servicePerformance.startPerfMetricEvent('frontend', 'iteration')
                timings = spawn_iteration(work_dir, frontendDescript, groups,
                                          max_parallel_workers, failure_dict,
                                          restart_attempts, "run")
                servicePerformance.endPerfMetricEvent('frontend', 'iteration')
                elapsed_time = servicePerformance.getPerfMetricEventLifetime('frontend', 'iteration')
                if elapsed_time < sleep_time:
                    real_sleep_time = sleep_time - elapsed_time
                    logSupport.log.info("Sleep %.1f sec" % real_sleep_time)
                    time.sleep(real_sleep_time)
                else:
                    logSupport.log.info("No sleeping this loop, took %.1f sec > %.1f sec" % (elapsed_time, sleep_time))

                # order the groups by walltime, longest walltime first
                # BUGFIX: was a cmp-style comparator (-cmp(x[1], y[1]))
                # passed positionally; cmp()/comparator sorting was removed
                # in Python 3.  A key function with reverse=True is the
                # equivalent (stable) descending sort.
                timings.sort(key=lambda el: el[1], reverse=True)
                # recreate the groups list, with new ordering
                groups = [el[0] for el in timings]
                assert num_groups == len(groups), "Something went wrong, number of groups changed"

                if mode == 'slave':
                    # If we are slave, check if master is back and if so
                    # deadvertise my classads and hibernate
                    hibernate = shouldHibernate(frontendDescript, work_dir,
                                                ha, mode, groups)

                    if hibernate:
                        active = False
                        logSupport.log.info("Master frontend %s is back online" % master_frontend_name)
                        logSupport.log.info("Deadvertize my ads and enter hibernation cycle")
                        spawn_cleanup(work_dir, frontendDescript, groups,
                                      frontendDescript.data['FrontendName'],
                                      mode)
                    else:
                        logSupport.log.info("Master frontend %s is still offline" % master_frontend_name)

    finally:
        # We have been asked to terminate
        logSupport.log.info("Deadvertize my ads")
        spawn_cleanup(work_dir, frontendDescript, groups,
                      frontendDescript.data['FrontendName'], mode)
Example #2
0
def spawn_iteration(work_dir, frontendDescript, groups, max_active,
                    failure_dict, max_failures, action):
    """Run one frontend iteration: process all groups (at most max_active
    in parallel), aggregate monitoring data, and advertise the
    glidefrontendmonitor classad to the user pool.

    Args:
        work_dir: frontend working directory
        frontendDescript: frontend description object (.data dict)
        groups: list of group names to process
        max_active: max number of group processes running at once
        failure_dict: group name -> FailureCounter, updated on failures
        max_failures: abort if any group exceeds this many recent failures
        action: action string passed to each group worker (e.g. "run")

    Returns:
        list of (group_name, walltime_sec) tuples, one per group

    Raises:
        RuntimeError: if some group failed more than max_failures times
    """
    childs = {}

    for group_name in groups:
        childs[group_name] = {'state': 'queued'}

    active_groups = 0
    groups_tofinish = len(groups)

    max_num_failures = 0
    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]['state'] == 'spawned':
                    group_rc = poll_group_process(group_name,
                                                  childs[group_name]['data'])
                    if group_rc is not None:  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]['state'] = 'finished'
                        else:
                            childs[group_name]['state'] = 'failed'
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures,
                                                   num_failures)
                            logSupport.log.warning("Group %s terminated with exit code %i (%i recent failure)" % (group_name, group_rc, num_failures))
                        childs[group_name]['end_time'] = time.time()
                        servicePerformance.endPerfMetricEvent(
                            'frontend', 'group_%s_iteration' % group_name)
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups >= max_active:
                    break  # already at the concurrency limit
                if childs[group_name]['state'] == 'queued':
                    childs[group_name]['data'] = spawn_group(work_dir, group_name, action)
                    childs[group_name]['state'] = 'spawned'
                    childs[group_name]['start_time'] = time.time()
                    servicePerformance.startPerfMetricEvent(
                        'frontend', 'group_%s_iteration' % group_name)
                    active_groups += 1
                    done_something = True

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly?  see above
        servicePerformance.startPerfMetricEvent('frontend', 'aggregate_stats')
        stats = aggregate_stats()
        servicePerformance.endPerfMetricEvent('frontend', 'aggregate_stats')
        #logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi)
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(
                         frontendDescript.data['FrontendName'])
        fm_classad.setFrontendDetails(
            frontendDescript.data['FrontendName'], ','.join(groups),
            glideinFrontendLib.getHAMode(frontendDescript.data))
        try:
            idle_jobs = {
                'Total': stats['total']['Jobs']['Idle'],
                '600': stats['total']['Jobs']['OldIdle'],
                '3600': stats['total']['Jobs']['Idle_3600'],
            }
        except KeyError as err:
            idle_jobs = {'Total': 0, '600': 0, '3600': 0}
            # BUGFIX: err.message does not exist in Python 3;
            # for a KeyError, err.args[0] carries the same missing key
            logSupport.log.error("Error in RRD Database. Setting idle_jobs[%s] Failed. Reconfig the frontend with -fix_rrd to fix this error" % (err.args[0],))

        fm_classad.setIdleJobCount(idle_jobs)
        fm_classad.setPerfMetrics(servicePerformance.getPerfMetric('frontend'))
        # Gather performance stats from history file of each group
        for group_name in groups:
            gname = 'group_%s' % group_name
            try:
                history_obj = glideinFrontendConfig.HistoryFile(
                    work_dir, group_name, True, dict)
                pfm = servicePerformance.getPerfMetric(gname)
                pfm.metric = history_obj['perf_metrics'].metric

                fm_classad.setPerfMetrics(
                    servicePerformance.getPerfMetric(gname))
            except Exception:
                # BUGFIX: was a bare except, which also swallowed
                # SystemExit/KeyboardInterrupt; still best-effort only —
                # do not fail for non-critical actions
                pass

        fm_advertiser.addClassad(fm_classad.adParams['Name'], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info("Advertising %i %s classad(s) to the user pool" % (len(fm_advertiser.classads), fm_advertiser.adType))
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
            logSupport.log.info("Done advertising %s classad(s) to the user pool" % fm_advertiser.adType)
        except condorExe.ExeError:
            logSupport.log.error("Exception occurred trying to advertise %s classad(s) to the user pool" % fm_advertiser.adType)
        except:
            # Rethrow any other exception including stop signal
            raise
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting" % (max_num_failures, max_failures))
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]['state'] == 'spawned':
                logSupport.log.info("Hard killing group %s" % group_name)
                servicePerformance.endPerfMetricEvent(
                    'frontend', 'group_%s_iteration' % group_name)
                try:
                    os.kill(childs[group_name]['data'].pid, signal.SIGKILL)
                except OSError:
                    pass  # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name, childs[group_name]['end_time'] - childs[group_name]['start_time']))
    return timings
Example #3
0
def spawn_iteration(work_dir, frontendDescript, groups, max_active,
                    failure_dict, max_failures, action):
    """Run one frontend iteration: process all groups (at most max_active
    in parallel), aggregate monitoring data, and advertise the
    glidefrontendmonitor classad to the user pool.

    Args:
        work_dir: frontend working directory
        frontendDescript: frontend description object (.data dict)
        groups: list of group names to process
        max_active: max number of group processes running at once
        failure_dict: group name -> FailureCounter, updated on failures
        max_failures: abort if any group exceeds this many recent failures
        action: action string passed to each group worker (e.g. "run")

    Returns:
        list of (group_name, walltime_sec) tuples, one per group

    Raises:
        RuntimeError: if some group failed more than max_failures times
    """
    childs = {}

    for group_name in groups:
        childs[group_name] = {'state': 'queued'}

    active_groups = 0
    groups_tofinish = len(groups)

    max_num_failures = 0
    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]['state'] == 'spawned':
                    group_rc = poll_group_process(group_name,
                                                  childs[group_name]['data'])
                    if group_rc is not None:  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]['state'] = 'finished'
                        else:
                            childs[group_name]['state'] = 'failed'
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures,
                                                   num_failures)
                            logSupport.log.warning("Group %s terminated with exit code %i (%i recent failure)" % (group_name, group_rc, num_failures))
                        childs[group_name]['end_time'] = time.time()
                        servicePerformance.endPerfMetricEvent(
                            'frontend', 'group_%s_iteration' % group_name)
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups >= max_active:
                    break  # already at the concurrency limit
                if childs[group_name]['state'] == 'queued':
                    childs[group_name]['data'] = spawn_group(work_dir, group_name, action)
                    childs[group_name]['state'] = 'spawned'
                    childs[group_name]['start_time'] = time.time()
                    servicePerformance.startPerfMetricEvent(
                        'frontend', 'group_%s_iteration' % group_name)
                    active_groups += 1
                    done_something = True

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly?  see above
        servicePerformance.startPerfMetricEvent('frontend', 'aggregate_stats')
        stats = aggregate_stats()
        servicePerformance.endPerfMetricEvent('frontend', 'aggregate_stats')
        #logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi)
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(
                         frontendDescript.data['FrontendName'])
        fm_classad.setFrontendDetails(
            frontendDescript.data['FrontendName'], ','.join(groups),
            glideinFrontendLib.getHAMode(frontendDescript.data))
        try:
            idle_jobs = {
                'Total': stats['total']['Jobs']['Idle'],
                '600': stats['total']['Jobs']['OldIdle'],
                '3600': stats['total']['Jobs']['Idle_3600'],
            }
        except KeyError as err:
            idle_jobs = {'Total': 0, '600': 0, '3600': 0}
            # BUGFIX: err.message does not exist in Python 3;
            # for a KeyError, err.args[0] carries the same missing key
            logSupport.log.error("Error in RRD Database. Setting idle_jobs[%s] Failed. Reconfig the frontend with -fix_rrd to fix this error" % (err.args[0],))

        fm_classad.setIdleJobCount(idle_jobs)
        fm_classad.setPerfMetrics(servicePerformance.getPerfMetric('frontend'))
        # Gather performance stats from history file of each group
        for group_name in groups:
            gname = 'group_%s' % group_name
            try:
                history_obj = glideinFrontendConfig.HistoryFile(
                    work_dir, group_name, True, dict)
                pfm = servicePerformance.getPerfMetric(gname)
                pfm.metric = history_obj['perf_metrics'].metric

                fm_classad.setPerfMetrics(
                    servicePerformance.getPerfMetric(gname))
            except Exception:
                # BUGFIX: was a bare except, which also swallowed
                # SystemExit/KeyboardInterrupt; still best-effort only —
                # do not fail for non-critical actions
                pass

        fm_advertiser.addClassad(fm_classad.adParams['Name'], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info("Advertising %i %s classad(s) to the user pool" % (len(fm_advertiser.classads), fm_advertiser.adType))
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
            logSupport.log.info("Done advertising %s classad(s) to the user pool" % fm_advertiser.adType)
        except condorExe.ExeError:
            logSupport.log.error("Exception occurred trying to advertise %s classad(s) to the user pool" % fm_advertiser.adType)
        except:
            # Rethrow any other exception including stop signal
            raise
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting" % (max_num_failures, max_failures))
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]['state'] == 'spawned':
                logSupport.log.info("Hard killing group %s" % group_name)
                servicePerformance.endPerfMetricEvent(
                    'frontend', 'group_%s_iteration' % group_name)
                try:
                    os.kill(childs[group_name]['data'].pid, signal.SIGKILL)
                except OSError:
                    pass  # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name, childs[group_name]['end_time'] - childs[group_name]['start_time']))
    return timings
Example #4
0
def spawn(sleep_time, advertize_rate, work_dir, frontendDescript,
          groups, max_parallel_workers, restart_interval, restart_attempts):
    """Spawn and supervise the frontend group workers until terminated.

    Runs forever (the service exits on signal only), driving
    spawn_iteration() in a loop.  Supports HA master/slave operation:
    a slave hibernates while the master frontend is alive and takes
    over when the master disappears.

    Args:
        sleep_time: target seconds per iteration; sleeps the remainder
        advertize_rate: unused here; kept for interface compatibility
        work_dir: frontend working directory
        frontendDescript: frontend description object (.data dict)
        groups: list of group names to process
        max_parallel_workers: max groups processed concurrently
        restart_interval: window (sec) used by each group's FailureCounter
        restart_attempts: max recent failures tolerated per iteration
    """
    num_groups = len(groups)

    # TODO: Get the ha_check_interval from the config
    ha = glideinFrontendLib.getHASettings(frontendDescript.data)
    ha_check_interval = glideinFrontendLib.getHACheckInterval(frontendDescript.data)
    mode = glideinFrontendLib.getHAMode(frontendDescript.data)
    master_frontend_name = ''
    if mode == 'slave':
        master_frontend_name = ha.get('ha_frontends')[0].get('frontend_name')

    active = (mode == 'master')
    hibernate = shouldHibernate(frontendDescript, work_dir, ha, mode, groups)

    logSupport.log.info('Frontend started with mode = %s' % mode)
    try:

        # Service will exit on signal only.
        # This infinite loop is for the slave to go back into hibernation
        # once the master becomes alive.
        # Master never loops infinitely here, but instead it does in
        # the inner loop while(mode=='master') ...
        while True:

            while hibernate:
                # If I am slave enter hibernation cycle while Master is alive
                logSupport.log.info('Master Frontend %s is online. Hibernating.' % master_frontend_name)
                time.sleep(ha_check_interval)
                hibernate = shouldHibernate(frontendDescript, work_dir,
                                            ha, mode, groups)

            # We broke out of hibernation cycle
            # Either Master has disappeared or I am the Master
            if mode == 'slave':
                logSupport.log.info("Master frontend %s is offline. Activating slave frontend." % master_frontend_name)
                active = True

            # Fresh failure counters each time we (re)activate
            failure_dict = {}
            for group in groups:
                failure_dict[group] = FailureCounter(group, restart_interval)

            while ((mode == 'master') or ((mode == 'slave') and active)):
                servicePerformance.startPerfMetricEvent('frontend', 'iteration')
                timings = spawn_iteration(work_dir, frontendDescript, groups,
                                          max_parallel_workers, failure_dict,
                                          restart_attempts, "run")
                servicePerformance.endPerfMetricEvent('frontend', 'iteration')
                elapsed_time = servicePerformance.getPerfMetricEventLifetime('frontend', 'iteration')
                if elapsed_time < sleep_time:
                    real_sleep_time = sleep_time - elapsed_time
                    logSupport.log.info("Sleep %.1f sec" % real_sleep_time)
                    time.sleep(real_sleep_time)
                else:
                    logSupport.log.info("No sleeping this loop, took %.1f sec > %.1f sec" % (elapsed_time, sleep_time))

                # order the groups by walltime, longest walltime first
                # BUGFIX: was a cmp-style comparator (-cmp(x[1], y[1]))
                # passed positionally; cmp()/comparator sorting was removed
                # in Python 3.  A key function with reverse=True is the
                # equivalent (stable) descending sort.
                timings.sort(key=lambda el: el[1], reverse=True)
                # recreate the groups list, with new ordering
                groups = [el[0] for el in timings]
                assert num_groups == len(groups), "Something went wrong, number of groups changed"

                if mode == 'slave':
                    # If we are slave, check if master is back and if so
                    # deadvertise my classads and hibernate
                    hibernate = shouldHibernate(frontendDescript, work_dir,
                                                ha, mode, groups)

                    if hibernate:
                        active = False
                        logSupport.log.info("Master frontend %s is back online" % master_frontend_name)
                        logSupport.log.info("Deadvertize my ads and enter hibernation cycle")
                        spawn_cleanup(work_dir, frontendDescript, groups,
                                      frontendDescript.data['FrontendName'],
                                      mode)
                    else:
                        logSupport.log.info("Master frontend %s is still offline" % master_frontend_name)

    finally:
        # We have been asked to terminate
        logSupport.log.info("Deadvertize my ads")
        spawn_cleanup(work_dir, frontendDescript, groups,
                      frontendDescript.data['FrontendName'], mode)
Example #5
0
def spawn_iteration(work_dir, frontendDescript, groups, max_active, failure_dict, max_failures, action):
    """Run one frontend iteration: process all groups (at most max_active
    in parallel), then aggregate monitoring data and advertise the
    glidefrontendmonitor classad to the user pool.

    Args:
        work_dir: frontend working directory
        frontendDescript: frontend description object (.data dict)
        groups: list of group names to process
        max_active: max number of group processes running at once
        failure_dict: group name -> FailureCounter, updated on failures
        max_failures: abort if any group exceeds this many recent failures
        action: action string passed to each group worker (e.g. "run")

    Returns:
        list of (group_name, walltime_sec) tuples, one per group

    Raises:
        RuntimeError: if some group failed more than max_failures times
    """
    childs = {}

    for group_name in groups:
        childs[group_name] = {"state": "queued"}

    active_groups = 0
    groups_tofinish = len(groups)

    max_num_failures = 0
    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]["state"] == "spawned":
                    group_rc = poll_group_process(group_name, childs[group_name]["data"])
                    if group_rc is not None:  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]["state"] = "finished"
                        else:
                            childs[group_name]["state"] = "failed"
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures, num_failures)
                            logSupport.log.warning(
                                "Group %s terminated with exit code %i (%i recent failure)"
                                % (group_name, group_rc, num_failures)
                            )
                        childs[group_name]["end_time"] = time.time()
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups >= max_active:
                    break  # already at the concurrency limit
                if childs[group_name]["state"] == "queued":
                    childs[group_name]["data"] = spawn_group(work_dir, group_name, action)
                    childs[group_name]["state"] = "spawned"
                    childs[group_name]["start_time"] = time.time()
                    active_groups += 1
                    done_something = True

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly?  see above
        stats = aggregate_stats()
        # logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(
            multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi
        )
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(frontendDescript.data["FrontendName"])
        fm_classad.setFrontendDetails(
            frontendDescript.data["FrontendName"], ",".join(groups), glideinFrontendLib.getHAMode(frontendDescript.data)
        )
        idle_jobs = {"Total": stats["total"]["Jobs"]["Idle"]}

        fm_classad.setIdleJobCount(idle_jobs)
        fm_advertiser.addClassad(fm_classad.adParams["Name"], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info(
            "Advertising %i glidefrontendmonitor classad to the user pool" % len(fm_advertiser.classads)
        )
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Done advertising")
        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting" % (max_num_failures, max_failures))
            # BUGFIX: "raise RuntimeError, msg" is Python-2-only syntax
            # (a SyntaxError in Python 3); call form is valid in both
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]["state"] == "spawned":
                logSupport.log.info("Hard killing group %s" % group_name)
                try:
                    os.kill(childs[group_name]["data"].pid, signal.SIGKILL)
                except OSError:
                    pass  # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name, childs[group_name]["end_time"] - childs[group_name]["start_time"]))
    return timings