def __init__(self, mode, config_file=None, hosts=None, hadoop_cluster=None):
    """Create a new Spark cluster. It can be created as a standalone
    cluster or on top of YARN.

    Args:
      mode (int):
        The cluster manager that is used (STANDALONE_MODE or YARN_MODE).
      config_file (str, optional):
        The path of the config file to be used.
      hosts (list of Host, optional):
        The hosts of the cluster (standalone operation).
      hadoop_cluster (HadoopCluster, optional):
        The Hadoop cluster to link.

    Raises:
      SparkException: If neither hosts nor a Hadoop cluster are given, or
        if YARN_MODE is requested without a reference to a Hadoop cluster.
    """

    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Use a context manager so the config file handle is closed after
        # parsing (the original leaked an anonymous open() to readfp).
        with open(config_file) as conf_fd:
            config.readfp(conf_fd)

    # Deployment properties
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "spark-init-", "/tmp")
    self.conf_mandatory_files = [SPARK_CONF_FILE]

    self.base_dir = config.get("cluster", "spark_base_dir")
    self.conf_dir = config.get("cluster", "spark_conf_dir")
    self.logs_dir = config.get("cluster", "spark_logs_dir")
    self.evs_log_dir = config.get("cluster", "spark_events_dir")
    self.work_dir = config.get("cluster", "spark_work_dir")
    self.port = config.getint("cluster", "spark_port")
    # NOTE: the duplicate re-read of "local_base_conf_dir" that followed
    # here was removed; it assigned the same value a second time.

    self.bin_dir = self.base_dir + "/bin"
    self.sbin_dir = self.base_dir + "/sbin"

    self.mode = mode
    self.java_home = None

    # Initialize hosts: taken directly, or borrowed from the Hadoop cluster
    if hosts:
        self.hosts = hosts
        self.master = hosts[0]
    elif hadoop_cluster:
        self.hosts = hadoop_cluster.hosts
        self.master = hadoop_cluster.master
    else:
        # Fixed missing space: the adjacent literals used to concatenate
        # into "eitherdirectly".
        logger.error("Hosts in the cluster must be specified either "
                     "directly or indirectly through a Hadoop cluster.")
        raise SparkException("Hosts in the cluster must be specified "
                             "either directly or indirectly through a "
                             "Hadoop cluster.")

    # Store cluster information
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_host_cluster(self.master)

    # Store reference to Hadoop cluster and check if mandatory
    self.hc = hadoop_cluster
    if not self.hc and self.mode == YARN_MODE:
        logger.error("When using a YARN_MODE mode, a reference to the "
                     "Hadoop cluster should be provided.")
        raise SparkException("When using a YARN_MODE mode, a reference "
                             "to the Hadoop cluster should be provided")

    if self.mode == STANDALONE_MODE:
        mode_text = "in standalone mode"
    else:
        mode_text = "on top of YARN"
    logger.info("Spark cluster created %s in hosts %s." +
                (" It is linked to a Hadoop cluster." if self.hc else ""),
                mode_text,
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]))
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host):
        The hosts to be assigned a topology.
      topo_list (list of str, optional):
        The racks to be assigned to each host. len(hosts) should be equal
        to len(topo_list).
      config_file (str, optional):
        The path of the config file to be used.
    """

    # Load properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Use a context manager so the config file handle is closed after
        # parsing (the original leaked an anonymous open() to readfp).
        with open(config_file) as conf_fd:
            config.readfp(conf_fd)

    # Deployment properties
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
    self.conf_mandatory_files = [CORE_CONF_FILE,
                                 HDFS_CONF_FILE,
                                 MR_CONF_FILE]

    # Node properties
    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")

    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir points at bin/ (Hadoop 1.x ships its admin
    # scripts there) while the Spark cluster uses sbin/ — confirm "/bin"
    # is intended and not a copy-paste slip.
    self.sbin_dir = self.base_dir + "/bin"

    self.java_home = None

    # Configure master and slaves
    self.hosts = list(hosts)
    self.master = self.hosts[0]

    # Create topology
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_host_cluster(self.master)

    # Create a string to display the topology: rack -> list of host names
    t = {v: [] for v in self.topology.topology.values()}
    for key, value in self.topology.topology.iteritems():
        t[value].append(key.address)
    log_topo = ', '.join([style.user2(k) + ': ' +
                          ' '.join(map(lambda x: style.host(x.split('.')[0]),
                                       v))
                          for k, v in t.iteritems()])

    logger.info("Hadoop cluster created with master %s, hosts %s and "
                "topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
def __init__(self, hosts):
    """Build the deployment hardware description for the given hosts.

    Args:
      hosts (iterable of Host):
        The hosts to register in the hardware description.
    """
    host_list = list(hosts)
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(host_list)