def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to
    remove a node from the cluster.
    """
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        return
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)" %
                 self.min_nodes)
        return
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    for node in remove_nodes:
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" %
                      node.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (node.alias, node.id, node.dns_name))
        try:
            self._cluster.remove_node(node)
            self.__last_cluster_mod_time = utils.get_utc_now()
        except Exception:
            log.error("Failed to remove node %s" % node.alias,
                      exc_info=True)
def execute(self, args):
    instance_types = ', '.join(sorted(static.INSTANCE_TYPES.keys()))
    if len(args) != 1:
        self.parser.error('please provide an instance type (options: %s)' %
                          instance_types)
    if self.opts.classic and self.opts.vpc:
        self.parser.error("options -c and -v cannot be specified at "
                          "the same time")
    instance_type = args[0]
    if instance_type not in static.INSTANCE_TYPES:
        self.parser.error('invalid instance type. possible options: %s' %
                          instance_types)
    start = self.opts.start_time
    end = self.opts.end_time
    if self.opts.days_ago:
        if self.opts.start_time:
            self.parser.error("options -d and -s cannot be specified at "
                              "the same time")
        if self.opts.end_time:
            end_tup = utils.iso_to_datetime_tuple(self.opts.end_time)
        else:
            end_tup = utils.get_utc_now()
        start = utils.datetime_tuple_to_iso(
            end_tup - timedelta(days=self.opts.days_ago))
    browser_cmd = self.cfg.globals.get("web_browser")
    self.ec2.get_spot_history(instance_type, start, end,
                              zone=self.opts.zone, plot=self.opts.plot,
                              plot_web_browser=browser_cmd,
                              vpc=self.opts.vpc,
                              classic=self.opts.classic)
def run(self):
    """
    As soon as a new node is ready, run the add plugins commands over it.
    """
    interval = self.cluster.refresh_interval
    log.info("Waiting for one of the new nodes to be up "
             "(updating every {}s)".format(interval))
    while True:
        self.ready_instances = []
        self.stream_unpropagated_spots()
        self.stream_spots()
        self.stream_unpropagated_instances()
        self.stream_update_nrm()
        self.stream_instances()
        self.stream_manage_reboots()
        self.stream_ready_instances()
        if any([self.unpropagated_spots, self.spots,
                self.unpropagated_instances, self.instances]):
            if self.ready_instances:
                # ready_instances means nodes were added, that took
                # time so we should loop again now
                continue
            log.info("{} Sleeping for {} seconds".format(
                utils.get_utc_now(), interval))
            time.sleep(interval)
        else:
            break
def __init__(self, interval=60, max_nodes=None, wait_time=900,
             add_pi=1, kill_after=45, stab=180, lookback_win=3,
             min_nodes=None, kill_cluster=False, plot_stats=False,
             plot_output_dir=None, dump_stats=False, stats_file=None):
    self._cluster = None
    self._keep_polling = True
    self._visualizer = None
    self.__last_cluster_mod_time = utils.get_utc_now()
    self.stat = SGEStats()
    self.polling_interval = interval
    self.kill_after = kill_after
    self.longest_allowed_queue_time = wait_time
    self.add_nodes_per_iteration = add_pi
    self.stabilization_time = stab
    self.lookback_window = lookback_win
    self.kill_cluster = kill_cluster
    self.max_nodes = max_nodes
    self.min_nodes = min_nodes
    self.dump_stats = dump_stats
    self.stats_file = stats_file
    self.plot_stats = plot_stats
    self.plot_output_dir = plot_output_dir
    if plot_stats:
        assert self.visualizer is not None
def get_all_stats(self):
    now = utils.get_utc_now()
    bits = []
    # first field is the time
    bits.append(now)
    # second field is the number of hosts
    bits.append(self.count_hosts())
    # third field is # of running jobs
    bits.append(len(self.get_running_jobs()))
    # fourth field is # of queued jobs
    bits.append(len(self.get_queued_jobs()))
    # fifth field is total # slots
    bits.append(self.count_total_slots())
    # sixth field is average job duration
    bits.append(self.avg_job_duration())
    # seventh field is average job wait time
    bits.append(self.avg_wait_time())
    # last field is array of loads for hosts
    arr = self.get_loads()
    # arr may be empty if there are no exec hosts
    if arr:
        load_sum = float(reduce(self._add, arr))
        avg_load = load_sum / len(arr)
    else:
        avg_load = 0.0
    bits.append(avg_load)
    return bits
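# Illustrative helper (not part of the original source): get_all_stats()
# above returns a positional list that the balancer's run() loop later dumps
# via write_stats_to_csv(). The field names below are inferred from the
# comments in get_all_stats(); STATS_FIELDS and stats_row_as_dict are
# hypothetical names introduced only for this sketch.
STATS_FIELDS = ["time", "hosts", "running_jobs", "queued_jobs",
                "total_slots", "avg_job_duration", "avg_wait_time",
                "avg_load"]


def stats_row_as_dict(bits):
    """Label one row returned by SGEStats.get_all_stats()."""
    return dict(zip(STATS_FIELDS, bits))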
def __init__(self, remote_tzinfo=None):
    self.jobstat_cachesize = 200
    self.hosts = []
    self.jobs = []
    self.queues = {}
    self.jobstats = self.jobstat_cachesize * [None]
    self.max_job_id = 0
    self.remote_tzinfo = remote_tzinfo or utils.get_utc_now().tzinfo
def has_cluster_stabilized(self):
    now = utils.get_utc_now()
    # note: timedelta.seconds only counts the sub-day remainder, so this
    # assumes the cluster was last modified less than a day ago
    elapsed = (now - self.__last_cluster_mod_time).seconds
    is_stabilized = elapsed >= self.stabilization_time
    if not is_stabilized:
        log.info("Cluster was modified less than %d seconds ago" %
                 self.stabilization_time)
        log.info("Waiting for cluster to stabilize...")
    return is_stabilized
def __init__(self, interval=60, max_nodes=None, wait_time=900,
             add_pi=1, kill_after=45, stab=180, lookback_win=3,
             min_nodes=None, kill_cluster=False, plot_stats=False,
             plot_output_dir=None, dump_stats=False, stats_file=None,
             reboot_interval=10, n_reboot_restart=False, ignore_grp=False,
             instance_type=None, spot_bid=None):
    self._cluster = None
    self._keep_polling = True
    self._visualizer = None
    self._stat = None
    self.__last_cluster_mod_time = utils.get_utc_now()
    self.polling_interval = interval
    self.kill_after = kill_after
    self.longest_allowed_queue_time = wait_time
    self.add_nodes_per_iteration = add_pi
    self.stabilization_time = stab
    self.lookback_window = lookback_win
    self.kill_cluster = kill_cluster
    self.max_nodes = max_nodes
    self.min_nodes = min_nodes
    self.dump_stats = dump_stats
    self.stats_file = stats_file
    self.plot_stats = plot_stats
    self.plot_output_dir = plot_output_dir
    if plot_stats:
        assert self.visualizer is not None
    if ignore_grp:
        self._placement_group = False
    else:
        self._placement_group = None
    self.reboot_interval = reboot_interval
    self.n_reboot_restart = n_reboot_restart
    self._instance_type = instance_type
    self._spot_bid = spot_bid
def check(self):
    """
    Manages the reboot/restart/terminate (when spot) of a node.
    Returns True if the node is still alive, False otherwise.
    """
    log.debug("{} next reboot {}".format(self.node.alias,
                                         self._next_reboot))
    log.debug("{} next restart {}".format(self.node.alias,
                                          self._next_restart))
    if self.node.is_impaired():
        log.info("{} is impaired".format(self.node.alias))
        rez = self.handle_reboot()
        log.debug("{} next restart {}".format(self.node.alias,
                                              self._next_restart))
        return rez
    if utils.get_utc_now() > self._next_reboot:
        return self.handle_reboot()
    return True
def __init__(self, interval=60, max_nodes=None, wait_time=900,
             add_pi=1, kill_after=45, stab=180, lookback_win=3,
             min_nodes=None, kill_cluster=False, plot_stats=False,
             plot_output_dir=None, dump_stats=False, stats_file=None):
    self._cluster = None
    self._keep_polling = True
    self._visualizer = None
    self._stat = None
    self.__last_cluster_mod_time = utils.get_utc_now()
    self.polling_interval = interval
    self.kill_after = kill_after
    self.longest_allowed_queue_time = wait_time
    self.add_nodes_per_iteration = add_pi
    self.stabilization_time = stab
    self.lookback_window = lookback_win
    self.kill_cluster = kill_cluster
    self.max_nodes = max_nodes
    self.min_nodes = min_nodes
    self.dump_stats = dump_stats
    self.stats_file = stats_file
    self.plot_stats = plot_stats
    self.plot_output_dir = plot_output_dir
    if plot_stats:
        assert self.visualizer is not None
def test_qacct_parser(self):
    stat = sge.SGEStats()
    now = utils.get_utc_now()
    self.jobstats = stat.parse_qacct(sge_balancer.qacct_txt, now)
    assert stat.avg_job_duration() == 90
    assert stat.avg_wait_time() == 263
def _set_next_reboot(self):
    self._next_reboot = utils.get_utc_now() + \
        datetime.timedelta(minutes=self.reboot_interval)
def execute(self, args):
    instance_types = ', '.join(static.INSTANCE_TYPES.keys())
    zone = None
    instance_type = None
    if self.opts.cluster_name:
        cl = self.cm.get_cluster(self.opts.cluster_name,
                                 require_keys=False)
        instance_type = cl.node_instance_type
        zone = cl.nodes[0].placement
        self.log.info("Cluster zone: " + zone)
        self.log.info("Cluster node instance type: " + instance_type)
    if self.opts.zone:
        if zone:
            self.log.info("You specified a zone and a cluster to get the "
                          "zone from. Using the cluster zone.")
        else:
            zone = self.opts.zone
            self.log.info("Specified zone: " + zone)
    if instance_type:
        if len(args) == 1:
            self.log.info("You provided an instance type and a cluster to "
                          "get the instance type from. Using the cluster "
                          "instance type.")
    elif len(args) != 1:
        self.parser.error('please provide an instance type (options: %s)' %
                          instance_types)
    else:
        instance_type = args[0]
        self.log.info("Specified instance type: " + instance_type)
        if instance_type not in static.INSTANCE_TYPES:
            self.parser.error('invalid instance type. possible options: %s'
                              % instance_types)
    if self.opts.classic and self.opts.vpc:
        self.parser.error("options -c and -v cannot be specified at "
                          "the same time")
    start = self.opts.start_time
    end = self.opts.end_time
    if self.opts.days_ago:
        if self.opts.start_time:
            self.parser.error("options -d and -s cannot be specified at "
                              "the same time")
        if self.opts.end_time:
            end_tup = utils.iso_to_datetime_tuple(self.opts.end_time)
        else:
            end_tup = utils.get_utc_now()
        start = utils.datetime_tuple_to_iso(
            end_tup - timedelta(days=self.opts.days_ago))
    browser_cmd = self.cfg.globals.get("web_browser")
    self.ec2.get_spot_history(instance_type, start, end,
                              zone=zone, plot=self.opts.plot,
                              plot_web_browser=browser_cmd,
                              vpc=self.opts.vpc,
                              classic=self.opts.classic)
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return False
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return False
    total_slots = self.stat.count_total_slots()
    if not self.has_cluster_stabilized() and total_slots > 0:
        return False
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add < 1:
        return False
    log.warn("Adding %d nodes at %s" %
             (need_to_add, str(utils.get_utc_now())))
    try:
        self._cluster.add_nodes(need_to_add,
                                reboot_interval=self.reboot_interval,
                                n_reboot_restart=self.n_reboot_restart,
                                placement_group=self._placement_group,
                                spot_bid=self._spot_bid,
                                instance_type=self._instance_type)
        if num_nodes < len(self._cluster.nodes):
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        else:
            log.info("No nodes were successfully added.")
    except ThreadPoolException as tpe:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
        log.error("Individual errors follow")
        for exc in tpe.exceptions:
            print exc[1]
    except Exception:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
    return True
def run(self, cluster):
    """
    This function will loop indefinitely, using SGELoadBalancer.get_stats()
    to get the cluster's status. It looks at the job queue and tries to
    decide whether to add or remove a node. It should later look at job
    durations (currently doesn't)
    """
    self._cluster = cluster
    if self.max_nodes is None:
        self.max_nodes = cluster.cluster_size
    if self.min_nodes is None:
        self.min_nodes = 1
    if self.kill_cluster:
        self.min_nodes = 0
    if self.min_nodes > self.max_nodes:
        raise exception.BaseException(
            "min_nodes cannot be greater than max_nodes")
    use_default_stats_file = self.dump_stats and not self.stats_file
    use_default_plots_dir = self.plot_stats and not self.plot_output_dir
    if use_default_stats_file or use_default_plots_dir:
        self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
    if not self.stats_file:
        self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
    if not self.plot_output_dir:
        self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    if self.dump_stats:
        if os.path.isdir(self.stats_file):
            raise exception.BaseException("stats file destination '%s' is"
                                          " a directory" % self.stats_file)
        sfdir = os.path.dirname(os.path.abspath(self.stats_file))
        self._validate_dir(sfdir, msg_prefix="stats file destination")
    if self.plot_stats:
        if os.path.isfile(self.plot_output_dir):
            raise exception.BaseException("plot output destination '%s' "
                                          "is a file" %
                                          self.plot_output_dir)
        self._validate_dir(self.plot_output_dir,
                           msg_prefix="plot output destination")
    raw = dict(__raw__=True)
    log.info("Starting load balancer (Use ctrl-c to exit)")
    log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
    log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
    log.info("Cluster growth rate: %d nodes/iteration\n" %
             self.add_nodes_per_iteration, extra=raw)
    if self.dump_stats:
        log.info("Writing stats to file: %s" % self.stats_file)
    if self.plot_stats:
        log.info("Plotting stats to directory: %s" % self.plot_output_dir)
    while self._keep_polling:
        cluster.recover(reboot_interval=self.reboot_interval,
                        n_reboot_restart=self.n_reboot_restart)
        cluster.clean()
        if not cluster.is_cluster_up():
            log.info("Waiting for all nodes to come up...")
            time.sleep(self.polling_interval)
            continue
        self.get_stats()
        log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
        log.info("Execution slots: %d" % self.stat.count_total_slots(),
                 extra=raw)
        log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                 extra=raw)
        oldest_queued_job_age = self.stat.oldest_queued_job_age()
        if oldest_queued_job_age:
            log.info("Oldest queued job: %s" % oldest_queued_job_age,
                     extra=raw)
        log.info("Avg job duration: %d secs" %
                 self.stat.avg_job_duration(), extra=raw)
        log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                 extra=raw)
        log.info("Last cluster modification time: %s" %
                 self.__last_cluster_mod_time.isoformat(),
                 extra=dict(__raw__=True))
        # evaluate if nodes need to be added
        skip_sleep = self._eval_add_node()
        # evaluate if nodes need to be removed
        self._eval_remove_node()
        if self.dump_stats or self.plot_stats:
            self.stat.write_stats_to_csv(self.stats_file)
        # call the visualizer
        if self.plot_stats:
            try:
                self.visualizer.graph_all()
            except IOError, e:
                raise exception.BaseException(str(e))
        # evaluate if cluster should be terminated
        if self.kill_cluster:
            if self._eval_terminate_cluster():
                log.info("Terminating cluster and exiting...")
                return self._cluster.terminate_cluster()
        if not skip_sleep:
            log.info("Sleeping...(looping again in %d secs)\n" %
                     self.polling_interval)
            log.info("Sleeping, it's " + utils.get_utc_now().isoformat())
            time.sleep(self.polling_interval)
            log.info("Waking up, it's " + utils.get_utc_now().isoformat())