Esempio n. 1
0
 def run(self, cluster):
     """
     Poll the cluster in a loop for as long as self._keep_polling is true.

     The cluster is stored on self._cluster and must be up when this method
     is called, otherwise exception.ClusterNotRunning is raised. On each
     iteration where the cluster is up, self.get_stats() refreshes the
     stats, a summary of the job queue is logged, and the add-node,
     remove-node and visualizer hooks are invoked in that order. Every
     iteration ends by sleeping for self.polling_interval seconds.

     Returns None, either when polling stops or as soon as
     self.get_stats() returns -1.
     """
     self._cluster = cluster
     if not cluster.is_cluster_up():
         raise exception.ClusterNotRunning(cluster.cluster_tag)
     while self._keep_polling:
         if cluster.is_cluster_up():
             stats_status = self.get_stats()
             if stats_status == -1:
                 log.error(
                     "Failed to get stats. LoadBalancer is terminating.")
                 return
             # Gather the figures in the same order they are reported.
             oldest_age = self.stat.oldest_queued_job_age()
             num_queued = len(self.stat.get_queued_jobs())
             num_hosts = len(self.stat.hosts)
             log.info(
                 "Oldest job is from %s. # queued jobs = %d. # hosts = %d." %
                 (oldest_age, num_queued, num_hosts))
             avg_duration = self.stat.avg_job_duration()
             avg_wait = self.stat.avg_wait_time()
             log.info("Avg job duration = %d sec, Avg wait time = %d sec." %
                      (avg_duration, avg_wait))
             # Grow first, then shrink, then hand off to the visualizer.
             self._eval_add_node()
             self._eval_remove_node()
             self._call_visualizer()
             sleep_msg = "Sleeping, looping again in %d seconds.\n"
             log.info(sleep_msg % self.polling_interval)
         else:
             # Cluster is not fully up: take no action this round.
             log.info("Entire cluster is not up, nodes added/removed. "
                      "No Action.")
         # Both branches wait one polling interval before the next pass.
         time.sleep(self.polling_interval)
Esempio n. 2
0
 def run(self, cluster):
     """
     Run the load balancer's polling loop against ``cluster``.

     Setup (performed once, before polling):

     * self.max_nodes defaults to cluster.cluster_size and self.min_nodes
       defaults to 1 when they are None; self.min_nodes is forced to 0
       when self.kill_cluster is set.
     * When stats dumping or plotting is enabled without an explicit
       destination, the default stats directory for this cluster tag is
       created. Unset self.stats_file / self.plot_output_dir are then
       filled in from DEFAULT_STATS_FILE / DEFAULT_STATS_DIR.
     * The stats file and plot output destinations are validated.

     Each loop iteration (while self._keep_polling is true) calls
     cluster.recover() and cluster.clean(), waits if the cluster is not
     fully up, otherwise refreshes the stats via self.get_stats(), logs a
     summary, evaluates adding and removing nodes, optionally writes the
     stats CSV and plots, and optionally terminates the cluster. The sleep
     at the end of an iteration is skipped when self._eval_add_node()
     returns a true value.

     Returns the result of self._cluster.terminate_cluster() when
     self.kill_cluster is set and self._eval_terminate_cluster() is true;
     otherwise None once polling stops.

     Raises exception.BaseException when min_nodes > max_nodes, when the
     stats file destination is a directory, when the plot output
     destination is a file, or when plotting raises IOError. Raises
     exception.ClusterNotRunning when the cluster is not up at start.
     """
     self._cluster = cluster
     if self.max_nodes is None:
         self.max_nodes = cluster.cluster_size
     if self.min_nodes is None:
         self.min_nodes = 1
     if self.kill_cluster:
         # Overrides any configured minimum, explicit or defaulted.
         self.min_nodes = 0
     if self.min_nodes > self.max_nodes:
         raise exception.BaseException(
             "min_nodes cannot be greater than max_nodes")
     # Only create the default stats directory when a default destination
     # is actually going to be used.
     use_default_stats_file = self.dump_stats and not self.stats_file
     use_default_plots_dir = self.plot_stats and not self.plot_output_dir
     if use_default_stats_file or use_default_plots_dir:
         self._mkdir(DEFAULT_STATS_DIR % cluster.cluster_tag, makedirs=True)
     if not self.stats_file:
         self.stats_file = DEFAULT_STATS_FILE % cluster.cluster_tag
     if not self.plot_output_dir:
         self.plot_output_dir = DEFAULT_STATS_DIR % cluster.cluster_tag
     if not cluster.is_cluster_up():
         raise exception.ClusterNotRunning(cluster.cluster_tag)
     if self.dump_stats:
         if os.path.isdir(self.stats_file):
             raise exception.BaseException("stats file destination '%s' is"
                                           " a directory" % self.stats_file)
         # Validate the directory that will contain the stats file.
         sfdir = os.path.dirname(os.path.abspath(self.stats_file))
         self._validate_dir(sfdir, msg_prefix="stats file destination")
     if self.plot_stats:
         if os.path.isfile(self.plot_output_dir):
             raise exception.BaseException("plot output destination '%s' "
                                           "is a file" %
                                           self.plot_output_dir)
         self._validate_dir(self.plot_output_dir,
                            msg_prefix="plot output destination")
     # Passed as ``extra`` on the per-setting/per-stat log lines below.
     raw = dict(__raw__=True)
     log.info("Starting load balancer (Use ctrl-c to exit)")
     log.info("Maximum cluster size: %d" % self.max_nodes, extra=raw)
     log.info("Minimum cluster size: %d" % self.min_nodes, extra=raw)
     log.info("Cluster growth rate: %d nodes/iteration\n" %
              self.add_nodes_per_iteration,
              extra=raw)
     if self.dump_stats:
         log.info("Writing stats to file: %s" % self.stats_file)
     if self.plot_stats:
         log.info("Plotting stats to directory: %s" % self.plot_output_dir)
     while self._keep_polling:
         cluster.recover(reboot_interval=self.reboot_interval,
                         n_reboot_restart=self.n_reboot_restart)
         cluster.clean()
         if not cluster.is_cluster_up():
             log.info("Waiting for all nodes to come up...")
             time.sleep(self.polling_interval)
             continue
         self.get_stats()
         log.info("Execution hosts: %d" % len(self.stat.hosts), extra=raw)
         log.info("Execution slots: %d" % self.stat.count_total_slots(),
                  extra=raw)
         log.info("Queued jobs: %d" % len(self.stat.get_queued_jobs()),
                  extra=raw)
         oldest_queued_job_age = self.stat.oldest_queued_job_age()
         # Only reported when a (truthy) age is available.
         if oldest_queued_job_age:
             log.info("Oldest queued job: %s" % oldest_queued_job_age,
                      extra=raw)
         log.info("Avg job duration: %d secs" %
                  self.stat.avg_job_duration(),
                  extra=raw)
         log.info("Avg job wait time: %d secs" % self.stat.avg_wait_time(),
                  extra=raw)
         log.info("Last cluster modification time: %s" %
                  self.__last_cluster_mod_time.isoformat(),
                  extra=raw)
         # evaluate if nodes need to be added; a true result means the
         # sleep at the end of this iteration is skipped
         skip_sleep = self._eval_add_node()
         # evaluate if nodes need to be removed
         self._eval_remove_node()
         # plotting also triggers the CSV write, even without dump_stats
         if self.dump_stats or self.plot_stats:
             self.stat.write_stats_to_csv(self.stats_file)
         # call the visualizer
         if self.plot_stats:
             try:
                 self.visualizer.graph_all()
             except IOError as e:
                 # Re-raise as the project's exception type; the
                 # "as" form is valid on Python 2.6+ and Python 3.
                 raise exception.BaseException(str(e))
         # evaluate if cluster should be terminated
         if self.kill_cluster:
             if self._eval_terminate_cluster():
                 log.info("Terminating cluster and exiting...")
                 return self._cluster.terminate_cluster()
         if not skip_sleep:
             log.info("Sleeping...(looping again in %d secs)\n" %
                      self.polling_interval)
             time.sleep(self.polling_interval)