Beispiel #1
0
 def _eval_add_node(self):
     """
     Inspect the current state of the SGE queue and add nodes to the
     cluster when they are needed.

     Nodes are requested when the cluster is below ``min_nodes``, when it
     has no slots at all, or when the queued jobs need more slots than are
     free and the oldest queued job has waited longer than
     ``longest_allowed_queue_time`` seconds. The request is capped by
     ``add_nodes_per_iteration`` and by the room left under ``max_nodes``.

     Nothing is returned; a failure while adding nodes is logged and
     swallowed.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     # An unstable cluster is only waited on when it already has slots;
     # with zero slots a node is added regardless.
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum(int(j['slots']) for j in running_jobs)
     qw_slots = sum(int(j['slots']) for j in queued_jobs)
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         # timedelta.seconds only holds the sub-day remainder, so whole
         # days must be added back or a job queued for >24h looks young.
         age_seconds = age_delta.days * 86400 + age_delta.seconds
         if age_seconds > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (age_seconds, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 # Floor division can yield 0 when the queued slots fit
                 # on less than one host; a job is overdue, so request
                 # at least one node.
                 need_to_add = max(1, qw_slots // slots_per_host)
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Beispiel #2
0
 def _eval_add_node(self):
     """
     Look at the SGE queue and grow the cluster if the workload calls for
     it.

     A node request is made in three situations: the cluster has fewer
     than ``min_nodes`` nodes, the cluster reports zero slots, or queued
     jobs need more slots than are currently free while the oldest queued
     job has been waiting more than ``longest_allowed_queue_time``
     seconds. The number requested never exceeds
     ``add_nodes_per_iteration`` nor the headroom below ``max_nodes``.

     This method returns None on every path; errors raised while adding
     nodes are logged rather than propagated.
     """
     num_nodes = len(self._cluster.nodes)
     if num_nodes >= self.max_nodes:
         log.info("Not adding nodes: already at or above maximum (%d)" %
                  self.max_nodes)
         return
     queued_jobs = self.stat.get_queued_jobs()
     if not queued_jobs and num_nodes >= self.min_nodes:
         log.info("Not adding nodes: at or above minimum nodes "
                  "and no queued jobs...")
         return
     total_slots = self.stat.count_total_slots()
     # Only hold off for stabilization when some slots already exist.
     if not self.has_cluster_stabilized() and total_slots > 0:
         return
     running_jobs = self.stat.get_running_jobs()
     used_slots = sum(int(j['slots']) for j in running_jobs)
     qw_slots = sum(int(j['slots']) for j in queued_jobs)
     slots_per_host = self.stat.slots_per_host()
     avail_slots = total_slots - used_slots
     need_to_add = 0
     if num_nodes < self.min_nodes:
         log.info("Adding node: below minimum (%d)" % self.min_nodes)
         need_to_add = self.min_nodes - num_nodes
     elif total_slots == 0:
         # no slots, add one now
         need_to_add = 1
     elif qw_slots > avail_slots:
         log.info("Queued jobs need more slots (%d) than available (%d)" %
                  (qw_slots, avail_slots))
         oldest_job_dt = self.stat.oldest_queued_job_age()
         now = self.get_remote_time()
         age_delta = now - oldest_job_dt
         # Use the full duration: timedelta.seconds alone discards the
         # ``days`` component and wraps around every 24 hours.
         waited = age_delta.days * 86400 + age_delta.seconds
         if waited > self.longest_allowed_queue_time:
             log.info("A job has been waiting for %d seconds "
                      "longer than max: %d" %
                      (waited, self.longest_allowed_queue_time))
             if slots_per_host != 0:
                 # An overdue job always warrants at least one node, even
                 # when the queued slots amount to less than a full host.
                 need_to_add = max(1, qw_slots // slots_per_host)
             else:
                 need_to_add = 1
         else:
             log.info("No queued jobs older than %d seconds" %
                      self.longest_allowed_queue_time)
     max_add = self.max_nodes - len(self._cluster.running_nodes)
     need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
     if need_to_add > 0:
         log.warn("Adding %d nodes at %s" %
                  (need_to_add, str(utils.get_utc_now())))
         try:
             self._cluster.add_nodes(need_to_add)
             self.__last_cluster_mod_time = utils.get_utc_now()
             log.info("Done adding nodes at %s" %
                      str(self.__last_cluster_mod_time))
         except Exception:
             log.error("Failed to add new host", exc_info=True)
Beispiel #3
0
 def _eval_remove_node(self):
     """
     Decide from the SGE stats whether nodes should leave the cluster and
     remove the ones that were selected.

     Nothing is done while jobs are queued, while the cluster has not yet
     stabilized, or when the node count is at or below ``min_nodes``.
     Removal failures are logged and do not stop the loop.
     """
     if len(self.stat.get_queued_jobs()) != 0:
         return
     if not self.has_cluster_stabilized():
         return
     node_count = len(self._cluster.nodes)
     if node_count <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)"
                  % self.min_nodes)
         return
     # Never shrink below the configured minimum.
     removable = node_count - self.min_nodes
     log.info("Looking for nodes to remove...")
     candidates = self._find_nodes_for_removal(max_remove=removable)
     if not candidates:
         log.info("No nodes can be removed at this time")
     for node in candidates:
         if node.update() == "running":
             log.warn("Removing %s: %s (%s)" %
                      (node.alias, node.id, node.dns_name))
             try:
                 self._cluster.remove_node(node)
                 self.__last_cluster_mod_time = utils.get_utc_now()
             except Exception:
                 log.error("Failed to remove node %s" % node.alias,
                           exc_info=True)
         else:
             log.error("Node %s is already dead - not removing" %
                       node.alias)
Beispiel #4
0
 def execute(self, args):
     """
     Validate the command line and call ``self.ec2.get_spot_history``.

     ``args`` must hold exactly one instance type that is a key of
     ``static.INSTANCE_TYPES``. ``opts.classic`` and ``opts.vpc`` are
     mutually exclusive, as are ``opts.days_ago`` and ``opts.start_time``.
     With ``opts.days_ago`` the start time is derived by going that many
     days back from the end time (or from ``utils.get_utc_now()`` when no
     end time was given). Problems are reported via ``self.parser.error``.
     """
     opts = self.opts
     known_types = static.INSTANCE_TYPES
     instance_types = ', '.join(sorted(known_types.keys()))
     if len(args) != 1:
         self.parser.error('please provide an instance type (options: %s)' %
                           instance_types)
     if opts.classic and opts.vpc:
         self.parser.error("options -c and -v cannot be specified at "
                           "the same time")
     instance_type = args[0]
     if instance_type not in known_types:
         self.parser.error('invalid instance type. possible options: %s' %
                           instance_types)
     start, end = opts.start_time, opts.end_time
     if opts.days_ago:
         if start:
             self.parser.error("options -d and -s cannot be specified at "
                               "the same time")
         if end:
             end_tup = utils.iso_to_datetime_tuple(end)
         else:
             end_tup = utils.get_utc_now()
         window = timedelta(days=opts.days_ago)
         start = utils.datetime_tuple_to_iso(end_tup - window)
     browser_cmd = self.cfg.globals.get("web_browser")
     self.ec2.get_spot_history(
         instance_type, start, end,
         zone=opts.zone,
         plot=opts.plot,
         plot_web_browser=browser_cmd,
         vpc=opts.vpc,
         classic=opts.classic)
Beispiel #5
0
 def _eval_remove_node(self):
     """
     Use the SGE stats to decide whether nodes can be taken out of the
     cluster, then remove each selected node that is still running.

     Returns early (None) when jobs are queued, when the cluster has not
     stabilized, or when the cluster is already at or below ``min_nodes``.
     """
     queued = self.stat.get_queued_jobs()
     if len(queued) != 0 or not self.has_cluster_stabilized():
         return
     total = len(self._cluster.nodes)
     if total <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)" %
                  self.min_nodes)
         return
     spare = total - self.min_nodes
     log.info("Looking for nodes to remove...")
     to_remove = self._find_nodes_for_removal(max_remove=spare)
     if not to_remove:
         log.info("No nodes can be removed at this time")
     for node in to_remove:
         state = node.update()
         if state != "running":
             log.error("Node %s is already dead - not removing" %
                       node.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (node.alias, node.id, node.dns_name))
         try:
             self._cluster.remove_node(node)
             # Record the change so stabilization checks start over.
             self.__last_cluster_mod_time = utils.get_utc_now()
         except Exception:
             log.error("Failed to remove node %s" % node.alias,
                       exc_info=True)
Beispiel #6
0
    def run(self):
        """
        Poll until no spots or instances remain to be processed.

        Each pass resets ``ready_instances`` and runs the ``stream_*``
        steps in a fixed order. The loop ends once the unpropagated and
        propagated spot/instance collections are all empty. When a pass
        produced ready instances the next pass starts immediately;
        otherwise the loop sleeps for the cluster's refresh interval.
        """
        interval = self.cluster.refresh_interval
        log.info("Waiting for one of the new nodes to be up "
                 "(updating every {}s)".format(interval))

        while True:
            self.ready_instances = []
            self.stream_unpropagated_spots()
            self.stream_spots()
            self.stream_unpropagated_instances()
            self.stream_update_nrm()
            self.stream_instances()
            self.stream_manage_reboots()
            self.stream_ready_instances()

            still_pending = any((
                self.unpropagated_spots,
                self.spots,
                self.unpropagated_instances,
                self.instances,
            ))
            if not still_pending:
                break
            if self.ready_instances:
                # ready_instances means nodes were added, that took
                # time so we should loop again now
                continue
            log.info("{} Sleeping for {} seconds".format(
                utils.get_utc_now(), interval))
            time.sleep(interval)
Beispiel #7
0
 def __init__(self,
              interval=60,
              max_nodes=None,
              wait_time=900,
              add_pi=1,
              kill_after=45,
              stab=180,
              lookback_win=3,
              min_nodes=None,
              kill_cluster=False,
              plot_stats=False,
              plot_output_dir=None,
              dump_stats=False,
              stats_file=None):
     """
     Store the configuration on the instance and reset runtime state.

     Several keywords are kept under longer attribute names:
     ``interval`` -> ``polling_interval``, ``wait_time`` ->
     ``longest_allowed_queue_time``, ``add_pi`` ->
     ``add_nodes_per_iteration``, ``stab`` -> ``stabilization_time`` and
     ``lookback_win`` -> ``lookback_window``. The remaining keywords are
     stored under their own names.
     """
     # Runtime state
     self._cluster = None
     self._visualizer = None
     self._keep_polling = True
     self.__last_cluster_mod_time = utils.get_utc_now()
     self.stat = SGEStats()
     # Cluster size limits
     self.min_nodes = min_nodes
     self.max_nodes = max_nodes
     # Timing and scaling knobs
     self.polling_interval = interval
     self.longest_allowed_queue_time = wait_time
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.kill_cluster = kill_cluster
     # Stats dumping and plotting
     self.dump_stats = dump_stats
     self.stats_file = stats_file
     self.plot_output_dir = plot_output_dir
     self.plot_stats = plot_stats
     if plot_stats:
         # ``visualizer`` is defined outside this block; it is read here
         # once every other attribute has been set.
         assert self.visualizer is not None
Beispiel #8
0
 def execute(self, args):
     """
     Check the options and positional argument, then request spot history
     through ``self.ec2.get_spot_history``.

     Exactly one positional argument is expected: an instance type listed
     in ``static.INSTANCE_TYPES``. ``-c``/``-v`` cannot be combined, and
     neither can ``-d``/``-s``. When ``opts.days_ago`` is set, the start
     time becomes the end time (or ``utils.get_utc_now()`` if no end time
     is given) minus that many days. Errors go to ``self.parser.error``.
     """
     options = self.opts
     instance_types = ', '.join(sorted(static.INSTANCE_TYPES.keys()))
     if len(args) != 1:
         self.parser.error(
             'please provide an instance type (options: %s)' %
             instance_types)
     if options.classic and options.vpc:
         self.parser.error("options -c and -v cannot be specified at "
                           "the same time")
     instance_type = args[0]
     if instance_type not in static.INSTANCE_TYPES:
         self.parser.error('invalid instance type. possible options: %s' %
                           instance_types)
     start = options.start_time
     end = options.end_time
     if options.days_ago:
         if options.start_time:
             self.parser.error("options -d and -s cannot be specified at "
                               "the same time")
         if options.end_time:
             reference = utils.iso_to_datetime_tuple(options.end_time)
         else:
             reference = utils.get_utc_now()
         lookback = timedelta(days=options.days_ago)
         start = utils.datetime_tuple_to_iso(reference - lookback)
     browser_cmd = self.cfg.globals.get("web_browser")
     history_kwargs = dict(zone=options.zone,
                           plot=options.plot,
                           plot_web_browser=browser_cmd,
                           vpc=options.vpc,
                           classic=options.classic)
     self.ec2.get_spot_history(instance_type, start, end, **history_kwargs)
Beispiel #9
0
 def get_all_stats(self):
     """
     Collect a snapshot of the current statistics as a flat list.

     Fields, in order: the time from ``utils.get_utc_now()``, number of
     hosts, number of running jobs, number of queued jobs, total slots,
     average job duration, average job wait time, and the average of the
     host loads (0.0 when there are no loads).
     """
     bits = [
         utils.get_utc_now(),
         self.count_hosts(),
         len(self.get_running_jobs()),
         len(self.get_queued_jobs()),
         self.count_total_slots(),
         self.avg_job_duration(),
         self.avg_wait_time(),
     ]
     loads = self.get_loads()
     # loads may be empty if there are no exec hosts
     if loads:
         avg_load = float(reduce(self._add, loads)) / len(loads)
     else:
         avg_load = 0.0
     bits.append(avg_load)
     return bits
Beispiel #10
0
 def get_all_stats(self):
     """
     Return the current statistics as a list of eight fields.

     The list holds, in this order: timestamp (``utils.get_utc_now()``),
     host count, running job count, queued job count, total slot count,
     average job duration, average wait time and average host load. The
     average load is 0.0 when ``get_loads()`` yields nothing.
     """
     timestamp = utils.get_utc_now()
     host_count = self.count_hosts()
     running_count = len(self.get_running_jobs())
     queued_count = len(self.get_queued_jobs())
     slot_count = self.count_total_slots()
     mean_duration = self.avg_job_duration()
     mean_wait = self.avg_wait_time()
     host_loads = self.get_loads()
     # No exec hosts means no loads to average.
     mean_load = 0.0
     if host_loads:
         total_load = float(reduce(self._add, host_loads))
         mean_load = total_load / len(host_loads)
     return [timestamp, host_count, running_count, queued_count,
             slot_count, mean_duration, mean_wait, mean_load]
Beispiel #11
0
 def __init__(self, remote_tzinfo=None):
     """
     Set up empty host/job/queue containers and the job stats cache.

     ``remote_tzinfo`` defaults to the tzinfo of ``utils.get_utc_now()``
     when it is not supplied (or is falsy).
     """
     if not remote_tzinfo:
         remote_tzinfo = utils.get_utc_now().tzinfo
     self.remote_tzinfo = remote_tzinfo
     self.hosts = []
     self.jobs = []
     self.queues = {}
     self.max_job_id = 0
     # Fixed-size cache, pre-filled with None placeholders.
     self.jobstat_cachesize = 200
     self.jobstats = [None] * self.jobstat_cachesize
Beispiel #12
0
 def __init__(self, remote_tzinfo=None):
     """
     Initialise empty collections and a 200-entry job stats cache.

     When ``remote_tzinfo`` is missing or falsy, the tzinfo attached to
     ``utils.get_utc_now()`` is used instead.
     """
     cache_size = 200
     self.jobstat_cachesize = cache_size
     self.jobstats = [None for _ in range(cache_size)]
     self.hosts, self.jobs, self.queues = [], [], {}
     self.max_job_id = 0
     if remote_tzinfo:
         self.remote_tzinfo = remote_tzinfo
     else:
         self.remote_tzinfo = utils.get_utc_now().tzinfo
Beispiel #13
0
 def has_cluster_stabilized(self):
     """
     Report whether the cluster has been left alone long enough.

     Returns True when at least ``stabilization_time`` seconds have
     passed since the last recorded cluster modification; otherwise logs
     that the cluster is still settling and returns False.
     """
     now = utils.get_utc_now()
     since_mod = now - self.__last_cluster_mod_time
     # timedelta.seconds is only the sub-day remainder (0..86399); add
     # the days back so a cluster untouched for more than 24 hours is
     # not mistaken for a freshly modified one.
     elapsed = since_mod.days * 86400 + since_mod.seconds
     is_stabilized = elapsed >= self.stabilization_time
     if not is_stabilized:
         log.info("Cluster was modified less than %d seconds ago" % self.stabilization_time)
         log.info("Waiting for cluster to stabilize...")
     return is_stabilized
Beispiel #14
0
 def has_cluster_stabilized(self):
     """
     Tell whether ``stabilization_time`` seconds have elapsed since the
     cluster was last modified.

     Returns True once the cluster is considered stable. While it is not,
     two informational messages are logged and False is returned.
     """
     now = utils.get_utc_now()
     delta = now - self.__last_cluster_mod_time
     # Count whole days too: timedelta.seconds by itself wraps around
     # every 24 hours and would make an old modification look recent.
     elapsed = delta.days * 86400 + delta.seconds
     is_stabilized = elapsed >= self.stabilization_time
     if not is_stabilized:
         log.info("Cluster was modified less than %d seconds ago" %
                  self.stabilization_time)
         log.info("Waiting for cluster to stabilize...")
     return is_stabilized
Beispiel #15
0
 def __init__(
     self,
     interval=60,
     max_nodes=None,
     wait_time=900,
     add_pi=1,
     kill_after=45,
     stab=180,
     lookback_win=3,
     min_nodes=None,
     kill_cluster=False,
     plot_stats=False,
     plot_output_dir=None,
     dump_stats=False,
     stats_file=None,
     reboot_interval=10,
     n_reboot_restart=False,
     ignore_grp=False,
     instance_type=None,
     spot_bid=None,
 ):
     """
     Store the configuration on the instance and reset runtime state.

     Renamed keywords: ``interval`` -> ``polling_interval``,
     ``wait_time`` -> ``longest_allowed_queue_time``, ``add_pi`` ->
     ``add_nodes_per_iteration``, ``stab`` -> ``stabilization_time``,
     ``lookback_win`` -> ``lookback_window``. ``instance_type`` and
     ``spot_bid`` are kept as private attributes. ``ignore_grp`` sets
     ``_placement_group`` to False; otherwise it is left as None.
     """
     # Runtime state
     self._cluster = None
     self._visualizer = None
     self._stat = None
     self._keep_polling = True
     self.__last_cluster_mod_time = utils.get_utc_now()
     # Cluster size limits
     self.min_nodes = min_nodes
     self.max_nodes = max_nodes
     # Timing and scaling knobs
     self.polling_interval = interval
     self.longest_allowed_queue_time = wait_time
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.kill_cluster = kill_cluster
     # Stats dumping and plotting
     self.dump_stats = dump_stats
     self.stats_file = stats_file
     self.plot_output_dir = plot_output_dir
     self.plot_stats = plot_stats
     if plot_stats:
         # ``visualizer`` is defined outside this block.
         assert self.visualizer is not None
     # Node launch settings
     self._placement_group = False if ignore_grp else None
     self.reboot_interval = reboot_interval
     self.n_reboot_restart = n_reboot_restart
     self._instance_type = instance_type
     self._spot_bid = spot_bid
Beispiel #16
0
 def __init__(self,
              interval=60,
              max_nodes=None,
              wait_time=900,
              add_pi=1,
              kill_after=45,
              stab=180,
              lookback_win=3,
              min_nodes=None,
              kill_cluster=False,
              plot_stats=False,
              plot_output_dir=None,
              dump_stats=False,
              stats_file=None,
              reboot_interval=10,
              n_reboot_restart=False,
              ignore_grp=False,
              instance_type=None,
              spot_bid=None):
     """
     Record every setting as an instance attribute.

     Most keywords keep their name. The exceptions are ``interval``
     (``polling_interval``), ``wait_time``
     (``longest_allowed_queue_time``), ``add_pi``
     (``add_nodes_per_iteration``), ``stab`` (``stabilization_time``),
     ``lookback_win`` (``lookback_window``), ``instance_type``
     (``_instance_type``) and ``spot_bid`` (``_spot_bid``).
     ``_placement_group`` is False when ``ignore_grp`` is truthy and None
     otherwise.
     """
     # Internal state, all unset until the instance is put to use.
     self._cluster = self._visualizer = self._stat = None
     self._keep_polling = True
     self.__last_cluster_mod_time = utils.get_utc_now()

     self.polling_interval = interval
     self.kill_after = kill_after
     self.kill_cluster = kill_cluster
     self.longest_allowed_queue_time = wait_time
     self.add_nodes_per_iteration = add_pi
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.max_nodes, self.min_nodes = max_nodes, min_nodes
     self.dump_stats, self.stats_file = dump_stats, stats_file
     self.plot_stats, self.plot_output_dir = plot_stats, plot_output_dir
     if plot_stats:
         # ``visualizer`` is defined outside this block.
         assert self.visualizer is not None

     self._placement_group = None
     if ignore_grp:
         self._placement_group = False
     self.reboot_interval = reboot_interval
     self.n_reboot_restart = n_reboot_restart
     self._instance_type = instance_type
     self._spot_bid = spot_bid
Beispiel #17
0
 def check(self):
     """
     Manages the reboot/restart/terminate (when spot) of a node.

     An impaired node is handed to ``handle_reboot`` right away; a healthy
     node only once the current time is past ``_next_reboot``.
     Returns True if the node is still alive, False otherwise.
     """
     log.debug("{} next reboot {}".format(
         self.node.alias, self._next_reboot))
     log.debug("{} next restart {}".format(
         self.node.alias, self._next_restart))
     if not self.node.is_impaired():
         if utils.get_utc_now() > self._next_reboot:
             return self.handle_reboot()
         return True
     log.info("{} is impaired".format(self.node.alias))
     alive = self.handle_reboot()
     # Log the restart time again after the reboot has been handled.
     log.debug("{} next restart {}".format(
         self.node.alias, self._next_restart))
     return alive
Beispiel #18
0
 def check(self):
     """
     Manages the reboot/restart/terminate (when spot) of a node.

     Returns the result of ``handle_reboot`` when the node is impaired or
     its scheduled reboot time has passed; returns True otherwise, meaning
     the node is still alive.
     """
     reboot_msg = "{} next reboot {}"
     restart_msg = "{} next restart {}"
     log.debug(reboot_msg.format(self.node.alias, self._next_reboot))
     log.debug(restart_msg.format(self.node.alias, self._next_restart))
     if self.node.is_impaired():
         log.info("{} is impaired".format(self.node.alias))
         outcome = self.handle_reboot()
         log.debug(restart_msg.format(self.node.alias, self._next_restart))
         return outcome
     reboot_due = utils.get_utc_now() > self._next_reboot
     if reboot_due:
         return self.handle_reboot()
     return True
Beispiel #19
0
 def __init__(self, interval=60, max_nodes=None, wait_time=900,
              add_pi=1, kill_after=45, stab=180, lookback_win=3,
              min_nodes=None, kill_cluster=False, plot_stats=False,
              plot_output_dir=None, dump_stats=False, stats_file=None):
     """
     Keep the settings on the instance and clear the runtime state.

     ``interval``, ``wait_time``, ``add_pi``, ``stab`` and
     ``lookback_win`` are stored as ``polling_interval``,
     ``longest_allowed_queue_time``, ``add_nodes_per_iteration``,
     ``stabilization_time`` and ``lookback_window``; the other keywords
     keep their own names.
     """
     # Runtime state
     self._cluster = None
     self._visualizer = None
     self._stat = None
     self._keep_polling = True
     self.__last_cluster_mod_time = utils.get_utc_now()
     # Cluster size limits
     self.min_nodes = min_nodes
     self.max_nodes = max_nodes
     # Timing and scaling knobs
     self.polling_interval = interval
     self.longest_allowed_queue_time = wait_time
     self.stabilization_time = stab
     self.lookback_window = lookback_win
     self.add_nodes_per_iteration = add_pi
     self.kill_after = kill_after
     self.kill_cluster = kill_cluster
     # Stats dumping and plotting
     self.dump_stats = dump_stats
     self.stats_file = stats_file
     self.plot_output_dir = plot_output_dir
     self.plot_stats = plot_stats
     if plot_stats:
         # ``visualizer`` is defined outside this block.
         assert self.visualizer is not None
 def test_qacct_parser(self):
     """Parse the sample qacct text and check the two derived averages."""
     sge_stats = sge.SGEStats()
     parse_time = utils.get_utc_now()
     sample = sge_balancer.qacct_txt
     self.jobstats = sge_stats.parse_qacct(sample, parse_time)
     assert sge_stats.avg_job_duration() == 90
     assert sge_stats.avg_wait_time() == 263
Beispiel #21
0
    def _eval_add_node(self):
        """
        Inspect the current state of the SGE queue and add nodes to the
        cluster when they are needed.

        Nodes are requested when the cluster is below ``min_nodes``, when
        it has no slots at all, or when queued jobs need more slots than
        are free and the oldest queued job has waited longer than
        ``longest_allowed_queue_time`` seconds. The request is capped by
        ``add_nodes_per_iteration`` and by the room left under
        ``max_nodes``.

        Returns False when no nodes had to be added, and True once an add
        was attempted, even if the attempt failed or added nothing.
        """
        num_nodes = len(self._cluster.nodes)
        if num_nodes >= self.max_nodes:
            log.info("Not adding nodes: already at or above maximum (%d)" % self.max_nodes)
            return False
        queued_jobs = self.stat.get_queued_jobs()
        if not queued_jobs and num_nodes >= self.min_nodes:
            log.info("Not adding nodes: at or above minimum nodes " "and no queued jobs...")
            return False
        total_slots = self.stat.count_total_slots()
        # An unstable cluster is only waited on when it already has slots.
        if not self.has_cluster_stabilized() and total_slots > 0:
            return False
        running_jobs = self.stat.get_running_jobs()
        used_slots = sum(int(j["slots"]) for j in running_jobs)
        qw_slots = sum(int(j["slots"]) for j in queued_jobs)
        slots_per_host = self.stat.slots_per_host()
        avail_slots = total_slots - used_slots
        need_to_add = 0
        if num_nodes < self.min_nodes:
            log.info("Adding node: below minimum (%d)" % self.min_nodes)
            need_to_add = self.min_nodes - num_nodes
        elif total_slots == 0:
            # no slots, add one now
            need_to_add = 1
        elif qw_slots > avail_slots:
            log.info("Queued jobs need more slots (%d) than available (%d)" % (qw_slots, avail_slots))
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            # timedelta.seconds only holds the sub-day remainder, so whole
            # days must be added back or a job queued for >24h looks young.
            age_seconds = age_delta.days * 86400 + age_delta.seconds
            if age_seconds > self.longest_allowed_queue_time:
                log.info(
                    "A job has been waiting for %d seconds "
                    "longer than max: %d" % (age_seconds, self.longest_allowed_queue_time)
                )
                if slots_per_host != 0:
                    # Floor division can yield 0 when the queued slots fit
                    # on less than one host; a job is overdue, so request
                    # at least one node.
                    need_to_add = max(1, qw_slots // slots_per_host)
                else:
                    need_to_add = 1
            else:
                log.info("No queued jobs older than %d seconds" % self.longest_allowed_queue_time)
        max_add = self.max_nodes - len(self._cluster.running_nodes)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add < 1:
            return False

        log.warn("Adding %d nodes at %s" % (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(
                need_to_add,
                reboot_interval=self.reboot_interval,
                n_reboot_restart=self.n_reboot_restart,
                placement_group=self._placement_group,
                spot_bid=self._spot_bid,
                instance_type=self._instance_type,
            )
            # Only treat the cluster as modified when the node count grew.
            if num_nodes < len(self._cluster.nodes):
                self.__last_cluster_mod_time = utils.get_utc_now()
                log.info("Done adding nodes at %s" % str(self.__last_cluster_mod_time))
            else:
                log.info("No nodes were successfully added.")
        except ThreadPoolException as tpe:
            traceback.print_exc()
            log.error("Failed to add new host", exc_info=True)
            log.debug(traceback.format_exc())
            log.error("Individual errors follow")
            for exc in tpe.exceptions:
                # Parenthesised so the line is valid on Python 2 and 3;
                # the printed text is unchanged.
                print(exc[1])
        except Exception:
            traceback.print_exc()
            log.error("Failed to add new host", exc_info=True)
            log.debug(traceback.format_exc())
        return True
Beispiel #22
0
 def test_qacct_parser(self):
     """Feed the sample qacct output to the parser and verify averages."""
     parser = sge.SGEStats()
     timestamp = utils.get_utc_now()
     self.jobstats = parser.parse_qacct(sge_balancer.qacct_txt, timestamp)
     expected_duration, expected_wait = 90, 263
     assert parser.avg_job_duration() == expected_duration
     assert parser.avg_wait_time() == expected_wait
Beispiel #23
0
 def _set_next_reboot(self):
     """Schedule the next reboot ``reboot_interval`` minutes from now."""
     now = utils.get_utc_now()
     delay = datetime.timedelta(minutes=self.reboot_interval)
     self._next_reboot = now + delay
Beispiel #24
0
 def _set_next_reboot(self):
     """Set ``_next_reboot`` to now plus ``reboot_interval`` minutes."""
     self._next_reboot = utils.get_utc_now() + datetime.timedelta(
         minutes=self.reboot_interval)
Beispiel #25
0
    def execute(self, args):
        """
        Request spot history, taking the instance type and zone from a
        cluster when one is named.

        When ``opts.cluster_name`` is set, the cluster's node instance
        type and the placement of its first node are used; they take
        precedence over a positional instance type and over ``opts.zone``.
        Otherwise exactly one positional instance type is required and it
        must be a key of ``static.INSTANCE_TYPES``.

        ``opts.days_ago`` derives the start time from the end time (or
        from ``utils.get_utc_now()`` when no end time is given) and cannot
        be combined with ``opts.start_time``. ``opts.classic`` and
        ``opts.vpc`` are mutually exclusive. Invalid input is reported
        through ``self.parser.error``.
        """
        # Sorted so the option list in error messages is deterministic.
        instance_types = ', '.join(sorted(static.INSTANCE_TYPES.keys()))

        zone = None
        instance_type = None
        if self.opts.cluster_name:
            cl = self.cm.get_cluster(self.opts.cluster_name,
                                     require_keys=False)
            instance_type = cl.node_instance_type
            zone = cl.nodes[0].placement
            self.log.info("Cluster zone: " + zone)
            self.log.info("Cluster node instance type: " + instance_type)
        if self.opts.zone:
            if zone:
                self.log.info("You specified a zone and a cluster to get the "
                              "zone from. Using the cluster zone.")
            else:
                zone = self.opts.zone
                self.log.info("Specified zone: " + zone)
        if instance_type:
            if len(args) == 1:
                self.log.info("You provided an instance type and a cluster to "
                              "get the instance type from. Using the cluster "
                              "instance type.")
        elif len(args) != 1:
            self.parser.error(
                'please provide an instance type (options: %s)' %
                instance_types)
        else:
            instance_type = args[0]
            self.log.info("Specified instance type: " + instance_type)
            if instance_type not in static.INSTANCE_TYPES:
                self.parser.error(
                    'invalid instance type. possible options: %s' %
                    instance_types)
        if self.opts.classic and self.opts.vpc:
            self.parser.error("options -c and -v cannot be specified at "
                              "the same time")
        # instance_type is final here. It must not be re-read from args:
        # that would discard the cluster's type and fail with IndexError
        # when only a cluster name was given.
        start = self.opts.start_time
        end = self.opts.end_time
        if self.opts.days_ago:
            if self.opts.start_time:
                self.parser.error("options -d and -s cannot be specified at "
                                  "the same time")
            if self.opts.end_time:
                end_tup = utils.iso_to_datetime_tuple(self.opts.end_time)
            else:
                end_tup = utils.get_utc_now()
            start = utils.datetime_tuple_to_iso(
                end_tup - timedelta(days=self.opts.days_ago))
        browser_cmd = self.cfg.globals.get("web_browser")
        # Pass the resolved zone (cluster zone first, then opts.zone) so
        # the query matches what was logged above.
        self.ec2.get_spot_history(instance_type, start, end,
                                  zone=zone, plot=self.opts.plot,
                                  plot_web_browser=browser_cmd,
                                  vpc=self.opts.vpc,
                                  classic=self.opts.classic)
Beispiel #26
0
    def execute(self, args):
        """
        Resolve the instance type and zone, validate the options, and
        call ``self.ec2.get_spot_history``.

        Instance type and zone come from the cluster named by
        ``opts.cluster_name`` when it is set (node instance type and the
        first node's placement); in that case a positional instance type
        and ``opts.zone`` are ignored with a log message. Without a
        cluster, exactly one positional instance type from
        ``static.INSTANCE_TYPES`` is required and ``opts.zone`` is used
        as the zone.

        ``-c`` and ``-v`` cannot be combined, nor can ``-d`` and ``-s``.
        With ``opts.days_ago`` the start time is that many days before
        the end time, or before ``utils.get_utc_now()`` when no end time
        is given. Invalid input is reported via ``self.parser.error``.
        """
        # Sorted so the option list in error messages is deterministic.
        instance_types = ', '.join(sorted(static.INSTANCE_TYPES.keys()))

        zone = None
        instance_type = None
        if self.opts.cluster_name:
            cl = self.cm.get_cluster(self.opts.cluster_name,
                                     require_keys=False)
            instance_type = cl.node_instance_type
            zone = cl.nodes[0].placement
            self.log.info("Cluster zone: " + zone)
            self.log.info("Cluster node instance type: " + instance_type)
        if self.opts.zone:
            if zone:
                self.log.info("You specified a zone and a cluster to get the "
                              "zone from. Using the cluster zone.")
            else:
                zone = self.opts.zone
                self.log.info("Specified zone: " + zone)
        if instance_type:
            if len(args) == 1:
                self.log.info("You provided an instance type and a cluster to "
                              "get the instance type from. Using the cluster "
                              "instance type.")
        elif len(args) != 1:
            self.parser.error('please provide an instance type (options: %s)' %
                              instance_types)
        else:
            instance_type = args[0]
            self.log.info("Specified instance type: " + instance_type)
            if instance_type not in static.INSTANCE_TYPES:
                self.parser.error(
                    'invalid instance type. possible options: %s' %
                    instance_types)
        if self.opts.classic and self.opts.vpc:
            self.parser.error("options -c and -v cannot be specified at "
                              "the same time")
        # Do not reassign instance_type from args at this point: the
        # value resolved above (cluster type or validated argument) is
        # the one to use, and args may be empty when a cluster was given.
        start = self.opts.start_time
        end = self.opts.end_time
        if self.opts.days_ago:
            if self.opts.start_time:
                self.parser.error("options -d and -s cannot be specified at "
                                  "the same time")
            if self.opts.end_time:
                end_tup = utils.iso_to_datetime_tuple(self.opts.end_time)
            else:
                end_tup = utils.get_utc_now()
            start = utils.datetime_tuple_to_iso(end_tup - timedelta(
                days=self.opts.days_ago))
        browser_cmd = self.cfg.globals.get("web_browser")
        # Use the resolved zone rather than opts.zone so a cluster's zone
        # is honoured, as the log message above promises.
        self.ec2.get_spot_history(instance_type,
                                  start,
                                  end,
                                  zone=zone,
                                  plot=self.opts.plot,
                                  plot_web_browser=browser_cmd,
                                  vpc=self.opts.vpc,
                                  classic=self.opts.classic)
Beispiel #27
0
    def _eval_add_node(self):
        """
        Inspect the current state of the SGE queue and, if warranted, add
        nodes to the cluster.

        Nodes are requested when the cluster is below ``min_nodes``, when it
        has no execution slots at all, or when queued jobs need more slots
        than are free and the oldest queued job has waited longer than
        ``longest_allowed_queue_time`` seconds. The request is capped by
        ``add_nodes_per_iteration`` and by the room left below ``max_nodes``.

        Returns False when no attempt to add nodes was made, and True when an
        attempt was made (even if it failed or added nothing).
        """
        num_nodes = len(self._cluster.nodes)
        if num_nodes >= self.max_nodes:
            log.info("Not adding nodes: already at or above maximum (%d)" %
                     self.max_nodes)
            return False
        queued_jobs = self.stat.get_queued_jobs()
        if not queued_jobs and num_nodes >= self.min_nodes:
            log.info("Not adding nodes: at or above minimum nodes "
                     "and no queued jobs...")
            return False
        total_slots = self.stat.count_total_slots()
        # Hold off while the cluster is still settling, unless it has no
        # slots at all (in which case a node is needed regardless).
        if not self.has_cluster_stabilized() and total_slots > 0:
            return False
        running_jobs = self.stat.get_running_jobs()
        used_slots = sum([int(j['slots']) for j in running_jobs])
        qw_slots = sum([int(j['slots']) for j in queued_jobs])
        slots_per_host = self.stat.slots_per_host()
        avail_slots = total_slots - used_slots
        need_to_add = 0
        if num_nodes < self.min_nodes:
            log.info("Adding node: below minimum (%d)" % self.min_nodes)
            need_to_add = self.min_nodes - num_nodes
        elif total_slots == 0:
            # no slots, add one now
            need_to_add = 1
        elif qw_slots > avail_slots:
            log.info("Queued jobs need more slots (%d) than available (%d)" %
                     (qw_slots, avail_slots))
            oldest_job_dt = self.stat.oldest_queued_job_age()
            now = self.get_remote_time()
            age_delta = now - oldest_job_dt
            # timedelta.seconds only holds the sub-day remainder; fold the
            # days back in so jobs queued for more than 24h are not treated
            # as freshly submitted.
            waited_secs = age_delta.days * 86400 + age_delta.seconds
            if waited_secs > self.longest_allowed_queue_time:
                log.info("A job has been waiting for %d seconds "
                         "longer than max: %d" %
                         (waited_secs, self.longest_allowed_queue_time))
                if slots_per_host != 0:
                    # Explicit integer division; never round down to zero,
                    # otherwise a starved job needing less than one host's
                    # worth of slots would never trigger a new node.
                    need_to_add = max(1, qw_slots // slots_per_host)
                else:
                    need_to_add = 1
            else:
                log.info("No queued jobs older than %d seconds" %
                         self.longest_allowed_queue_time)
        # Cap the request by the per-iteration growth rate and by the room
        # left below max_nodes; a non-positive result means nothing to do.
        max_add = self.max_nodes - len(self._cluster.running_nodes)
        need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
        if need_to_add < 1:
            return False

        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add,
                                    reboot_interval=self.reboot_interval,
                                    n_reboot_restart=self.n_reboot_restart,
                                    placement_group=self._placement_group,
                                    spot_bid=self._spot_bid,
                                    instance_type=self._instance_type)
            # Only record a cluster modification if the node count grew.
            if num_nodes < len(self._cluster.nodes):
                self.__last_cluster_mod_time = utils.get_utc_now()
                log.info("Done adding nodes at %s" %
                         str(self.__last_cluster_mod_time))
            else:
                log.info("No nodes were successfully added.")
        except ThreadPoolException as tpe:
            traceback.print_exc()
            log.error("Failed to add new host", exc_info=True)
            log.debug(traceback.format_exc())
            log.error("Individual errors follow")
            for exc in tpe.exceptions:
                # Report through the logger rather than a bare print so the
                # details end up alongside the other error output.
                log.error(exc[1])
        except Exception:
            # Best effort: a failed add is logged and the caller keeps
            # polling instead of crashing.
            traceback.print_exc()
            log.error("Failed to add new host", exc_info=True)
            log.debug(traceback.format_exc())
        return True
Beispiel #28
0
 def run(self, cluster):
     """
     Validate the balancer settings against ``cluster`` and then poll it
     in a loop for as long as ``self._keep_polling`` is true.

     Each iteration recovers and cleans the cluster, refreshes the stats
     via ``self.get_stats()``, logs a summary, and evaluates whether nodes
     should be added or removed. Stats are optionally written to CSV and
     plotted. It does not currently take job durations into account when
     deciding (they are only logged).

     cluster - the cluster object to balance; stored on ``self._cluster``

     Raises exception.BaseException when min_nodes exceeds max_nodes, when
     the stats file destination is a directory, when the plot output
     destination is a file, or when plotting raises IOError. Raises
     exception.ClusterNotRunning when the cluster is not up at start.

     When ``self.kill_cluster`` is set and ``_eval_terminate_cluster()``
     returns a true value, returns the result of
     ``self._cluster.terminate_cluster()``.
     """
     self._cluster = cluster
     # Fill in size limits that were not configured explicitly.
     if self.max_nodes is None:
         self.max_nodes = cluster.cluster_size
     if self.min_nodes is None:
         self.min_nodes = 1
     # With kill_cluster set the minimum is forced to zero.
     if self.kill_cluster:
         self.min_nodes = 0
     if self.min_nodes > self.max_nodes:
         raise exception.BaseException(
             "min_nodes cannot be greater than max_nodes")
     # Create the default stats directory only if a default location is
     # actually going to be used for the stats file or the plots.
     use_default_stats_file = self.dump_stats and not self.stats_file
     use_default_plots_dir = self.plot_stats and not self.plot_output_dir
     if use_default_stats_file or use_default_plots_dir:
         self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
     if not self.stats_file:
         self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
     if not self.plot_output_dir:
         self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
     if not cluster.is_cluster_up():
         raise exception.ClusterNotRunning(cluster.cluster_tag)
     # Sanity-check the output destinations before entering the loop.
     if self.dump_stats:
         if os.path.isdir(self.stats_file):
             raise exception.BaseException("stats file destination '%s' is"
                                           " a directory" % self.stats_file)
         sfdir = os.path.dirname(os.path.abspath(self.stats_file))
         self._validate_dir(sfdir, msg_prefix="stats file destination")
     if self.plot_stats:
         if os.path.isfile(self.plot_output_dir):
             raise exception.BaseException("plot output destination '%s' "
                                           "is a file" %
                                           self.plot_output_dir)
         self._validate_dir(self.plot_output_dir,
                            msg_prefix="plot output destination")
     # Passed as ``extra`` to the summary log calls below.
     # NOTE(review): presumably tells the log formatter to emit the
     # message without its usual prefix -- confirm against the logger.
     raw = dict(__raw__=True)
     log.info("Starting load balancer (Use ctrl-c to exit)")
     log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
     log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
     log.info("Cluster growth rate: %d nodes/iteration\n" %
              self.add_nodes_per_iteration,
              extra=raw)
     if self.dump_stats:
         log.info("Writing stats to file: %s" % self.stats_file)
     if self.plot_stats:
         log.info("Plotting stats to directory: %s" % self.plot_output_dir)
     while (self._keep_polling):
         cluster.recover(reboot_interval=self.reboot_interval,
                         n_reboot_restart=self.n_reboot_restart)
         cluster.clean()
         # Do not evaluate anything until the cluster reports itself up;
         # wait one polling interval and try again.
         if not cluster.is_cluster_up():
             log.info("Waiting for all nodes to come up...")
             time.sleep(self.polling_interval)
             continue
         self.get_stats()
         log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
         log.info("Execution slots: %d" % self.stat.count_total_slots(),
                  extra=raw)
         log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                  extra=raw)
         # Only logged when the stats object returns a truthy value.
         oldest_queued_job_age = self.stat.oldest_queued_job_age()
         if oldest_queued_job_age:
             log.info("Oldest queued job: %s" % oldest_queued_job_age,
                      extra=raw)
         log.info("Avg job duration: %d secs" %
                  self.stat.avg_job_duration(),
                  extra=raw)
         log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                  extra=raw)
         log.info("Last cluster modification time: %s" %
                  self.__last_cluster_mod_time.isoformat(),
                  extra=dict(__raw__=True))
         # evaluate if nodes need to be added; a true result means the
         # sleep at the bottom of this iteration is skipped
         skip_sleep = self._eval_add_node()
         # evaluate if nodes need to be removed
         self._eval_remove_node()
         # The CSV is written when plotting is enabled too, not only when
         # dump_stats is set.
         if self.dump_stats or self.plot_stats:
             self.stat.write_stats_to_csv(self.stats_file)
         # call the visualizer
         if self.plot_stats:
             try:
                 self.visualizer.graph_all()
             except IOError, e:
                 # Re-raised as the project's own exception type.
                 raise exception.BaseException(str(e))
         # evaluate if cluster should be terminated
         if self.kill_cluster:
             if self._eval_terminate_cluster():
                 log.info("Terminating cluster and exiting...")
                 return self._cluster.terminate_cluster()
         if not skip_sleep:
             log.info("Sleeping...(looping again in %d secs)\n" %
                      self.polling_interval)
             log.info("Sleeping, it's " + utils.get_utc_now().isoformat())
             time.sleep(self.polling_interval)
             log.info("Waking up, it's " + utils.get_utc_now().isoformat())