def start_cluster(self, cluster):
    nn_instance = utils.get_instance(cluster, "namenode")
    sm_instance = utils.get_instance(cluster, "master")
    dn_instances = utils.get_instances(cluster, "datanode")

    # Start the name node
    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_processes(r, "namenode")

    # start the data nodes
    self._start_slave_datanode_processes(dn_instances)
    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    with remote.get_remote(nn_instance) as r:
        r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
        r.execute_command("sudo -u hdfs hdfs dfs -chown $USER "
                          "/user/$USER/")

    # start spark nodes
    if sm_instance:
        with remote.get_remote(sm_instance) as r:
            run.start_spark_master(r, self._spark_home(cluster))
            LOG.info(_LI("Spark service at '%s' has been started"),
                     sm_instance.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
    self._set_cluster_info(cluster)
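# The two HDFS commands above create the running user's home directory in
# HDFS and hand ownership to that user; with $USER expanding to, say,
# "ubuntu" (illustrative value), the remote shell effectively runs:
#
#   sudo -u hdfs hdfs dfs -mkdir -p /user/ubuntu/
#   sudo -u hdfs hdfs dfs -chown ubuntu /user/ubuntu/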
def _extract_configs_to_extra(self, cluster):
    nn = utils.get_instance(cluster, "namenode")
    sp_master = utils.get_instance(cluster, "master")
    sp_slaves = utils.get_instances(cluster, "slave")

    extra = dict()

    config_master = config_slaves = ""
    if sp_master is not None:
        config_master = c_helper.generate_spark_env_configs(cluster)

    if sp_slaves is not None:
        slavenames = [slave.hostname() for slave in sp_slaves]
        config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
    else:
        config_slaves = "\n"

    for ng in cluster.node_groups:
        extra[ng.id] = {
            "xml": c_helper.generate_xml_configs(
                ng.configuration(), ng.storage_paths(),
                nn.hostname(), None),
            "setup_script": c_helper.generate_hadoop_setup_script(
                ng.storage_paths(),
                c_helper.extract_hadoop_environment_confs(
                    ng.configuration())),
            "sp_master": config_master,
            "sp_slaves": config_slaves,
        }

    if c_helper.is_data_locality_enabled(cluster):
        topology_data = th.generate_topology_map(
            cluster, CONF.enable_hypervisor_awareness)
        extra["topology_data"] = "\n".join(
            [k + " " + v for k, v in topology_data.items()]) + "\n"

    return extra
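# A hedged sketch of the shape of the returned ``extra`` mapping, assuming
# a single node group with id "ng-1" (all values purely illustrative):
#
#   {
#       "ng-1": {
#           "xml": "<configuration>...</configuration>",
#           "setup_script": "#!/bin/bash\n...",
#           "sp_master": "SPARK_MASTER_IP=master-host\n...",
#           "sp_slaves": "slave-1\nslave-2",
#       },
#       "topology_data": "10.0.0.2 /rack1\n",  # only with data locality on
#   }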
def scale_cluster(self, cluster, instances):
    master = utils.get_instance(cluster, "master")
    r_master = remote.get_remote(master)

    run.stop_spark(r_master, self._spark_home(cluster))

    self._setup_instances(cluster, instances)
    nn = utils.get_instance(cluster, "namenode")
    run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
    self._start_slave_datanode_processes(instances)

    run.start_spark_master(r_master, self._spark_home(cluster))
    LOG.info(_LI("Spark master service at '%s' has been restarted"),
             master.hostname())
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)
        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
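# The role partitioning above is plain membership filtering. A
# self-contained sketch of the same step, with list data standing in for
# the cluster objects (names here are illustrative, not the plugin's API):
def _partition_new_hosts(new_hosts, dn_hosts, tt_hosts):
    to_dn = [h for h in new_hosts if h in dn_hosts]
    to_tt = [h for h in new_hosts if h in tt_hosts]
    return to_dn, to_tt

# _partition_new_hosts(["n3", "n4"], ["n1", "n3"], ["n4"])
# -> (["n3"], ["n4"])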
def install_cluster(cluster):
    mng_instance = u.get_instance(cluster, 'manager')

    all_hosts = list(set([i.fqdn() for i in u.get_instances(cluster)]))

    client = c.IntelClient(mng_instance, cluster.name)

    LOG.info("Create cluster")
    client.cluster.create()

    LOG.info("Add nodes to cluster")
    rack = '/Default'
    client.nodes.add(all_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')

    LOG.info("Install software")
    client.cluster.install_software(all_hosts)

    LOG.info("Configure services")
    _configure_services(client, cluster)

    LOG.info("Deploy cluster")
    client.nodes.config(force=True)

    LOG.info("Provisioning configs")
    # cinder and ephemeral drive support
    _configure_storage(client, cluster)
    # swift support
    _configure_swift(client, cluster)
    # user configs
    _add_user_params(client, cluster)

    LOG.info("Format HDFS")
    client.services.hdfs.format()
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        # TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if (client.services.hdfs.get_datanode_status(host) ==
                        'Decomissioned'):  # (sic)
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-hdfs-datanode')
            if instance.fqdn() in nm_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-yarn-nodemanager')
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
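# Both wait loops above are instances of a poll-with-timeout pattern. A
# self-contained sketch of the same idea (``check`` is any zero-argument
# callable that returns True once the desired state is reached; plain
# time.sleep stands in for context.sleep):
import time

def wait_for(check, timeout=600, interval=5):
    waited = 0
    while waited < timeout:
        if check():
            return True
        time.sleep(interval)
        waited += interval
    return False

# Example, with a hypothetical status callable:
# wait_for(lambda: get_status() == 'Decomissioned', timeout=14400)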
def _set_cluster_info(self, cluster):
    nn = utils.get_instance(cluster, "namenode")
    sp_master = utils.get_instance(cluster, "master")
    info = {}

    if nn:
        address = c_helper.get_config_value(
            "HDFS", "dfs.http.address", cluster)
        port = address[address.rfind(":") + 1:]
        info["HDFS"] = {
            "Web UI": "http://%s:%s" % (nn.management_ip, port)
        }
        info["HDFS"]["NameNode"] = "hdfs://%s:8020" % nn.hostname()

    if sp_master:
        port = c_helper.get_config_value(
            "Spark", "Master webui port", cluster)
        if port is not None:
            info["Spark"] = {
                "Web UI": "http://%s:%s" % (sp_master.management_ip, port)
            }

    ctx = context.ctx()
    conductor.cluster_update(ctx, cluster, {"info": info})
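# The web UI port is parsed as everything after the last ':' in the
# configured "dfs.http.address", e.g. (illustrative value):
#
#   address = "0.0.0.0:50070"
#   address[address.rfind(":") + 1:]  # -> "50070"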
def decommission_nodes(self, cluster, instances):
    sls = utils.get_instances(cluster, "slave")
    dns = utils.get_instances(cluster, "datanode")
    decommission_dns = False
    decommission_sls = False

    for i in instances:
        if 'datanode' in i.node_group.node_processes:
            dns.remove(i)
            decommission_dns = True
        if 'slave' in i.node_group.node_processes:
            sls.remove(i)
            decommission_sls = True

    nn = utils.get_instance(cluster, "namenode")
    spark_master = utils.get_instance(cluster, "master")

    if decommission_sls:
        sc.decommission_sl(spark_master, instances, sls)
    if decommission_dns:
        sc.decommission_dn(nn, instances, dns)
def start_cluster(self, cluster):
    sm_instance = utils.get_instance(cluster, "master")
    sl_instances = utils.get_instances(cluster, "slave")

    # start storm master
    if sm_instance:
        with remote.get_remote(sm_instance) as r:
            run.start_storm_master(r)
            LOG.info("Storm master at '%s' has been started",
                     sm_instance.hostname())

    # start storm slaves
    self._start_slave_processes(sl_instances)

    LOG.info("Cluster %s has been started successfully", cluster.name)
    self._set_cluster_info(cluster)
def start_cluster(cluster):
    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    LOG.debug("Starting hadoop services")
    client.services.hdfs.start()

    if u.get_jobtracker(cluster):
        client.services.mapred.start()

    if u.get_hiveserver(cluster):
        client.services.hive.start()

    if u.get_oozie(cluster):
        LOG.info("Setup oozie")
        _setup_oozie(cluster)

        client.services.oozie.start()
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]
    to_scale_dn = []
    to_scale_nm = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)
        if i in nm_hosts:
            to_scale_nm.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_nm:
        client.services.yarn.add_nodes('NodeManager', to_scale_nm)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    # IDH 3.0.2 resets cluster parameters (bug #1300603), so restore
    # them here
    LOG.info("Provisioning configs")
    # cinder and ephemeral drive support
    _configure_storage(client, cluster)
    # swift support
    _configure_swift(client, cluster)
    # user configs
    _add_user_params(client, cluster)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_nm:
        client.services.yarn.start()
def generate_spark_env_configs(cluster):
    configs = []

    # master configuration
    sp_master = utils.get_instance(cluster, "master")
    configs.append('SPARK_MASTER_IP=' + sp_master.hostname())

    masterport = get_config_value("Spark", "Master port", cluster)
    if masterport and masterport != _get_spark_opt_default("Master port"):
        configs.append('SPARK_MASTER_PORT=' + str(masterport))

    masterwebport = get_config_value("Spark", "Master webui port", cluster)
    if (masterwebport and
            masterwebport != _get_spark_opt_default("Master webui port")):
        configs.append('SPARK_MASTER_WEBUI_PORT=' + str(masterwebport))

    # configuration for workers
    workercores = get_config_value("Spark", "Worker cores", cluster)
    if workercores and workercores != _get_spark_opt_default("Worker cores"):
        configs.append('SPARK_WORKER_CORES=' + str(workercores))

    workermemory = get_config_value("Spark", "Worker memory", cluster)
    if (workermemory and
            workermemory != _get_spark_opt_default("Worker memory")):
        configs.append('SPARK_WORKER_MEMORY=' + str(workermemory))

    workerport = get_config_value("Spark", "Worker port", cluster)
    if workerport and workerport != _get_spark_opt_default("Worker port"):
        configs.append('SPARK_WORKER_PORT=' + str(workerport))

    workerwebport = get_config_value("Spark", "Worker webui port", cluster)
    if (workerwebport and
            workerwebport != _get_spark_opt_default("Worker webui port")):
        configs.append('SPARK_WORKER_WEBUI_PORT=' + str(workerwebport))

    workerinstances = get_config_value("Spark", "Worker instances", cluster)
    if (workerinstances and
            workerinstances != _get_spark_opt_default("Worker instances")):
        configs.append('SPARK_WORKER_INSTANCES=' + str(workerinstances))

    return '\n'.join(configs)
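# Every option above follows the same rule: emit a SPARK_* line only when
# the configured value differs from its default. A self-contained sketch of
# that rule (``DEFAULTS``, ``ENV_NAMES`` and ``env_lines`` are illustrative
# stand-ins for _get_spark_opt_default() and the repeated if-blocks above):
DEFAULTS = {"Master port": 7077, "Worker cores": 1}
ENV_NAMES = {"Master port": "SPARK_MASTER_PORT",
             "Worker cores": "SPARK_WORKER_CORES"}

def env_lines(settings):
    lines = []
    for key, value in settings.items():
        if value and value != DEFAULTS.get(key):
            lines.append(ENV_NAMES[key] + "=" + str(value))
    return "\n".join(lines)

# env_lines({"Master port": 7077, "Worker cores": 4})
# -> "SPARK_WORKER_CORES=4"   (the default master port is skipped)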
def _extract_configs_to_extra(self, cluster):
    st_master = utils.get_instance(cluster, "master")
    st_slaves = utils.get_instances(cluster, "slave")
    zk_servers = utils.get_instances(cluster, "zookeeper")

    extra = dict()

    config_master = config_slaves = ''
    if st_master is not None:
        zknames = []
        if zk_servers is not None:
            zknames = [zk.hostname() for zk in zk_servers]
        config_master = c_helper.generate_storm_config(
            cluster, st_master.hostname(), zknames)

    # TODO: figure out how to get the slave IPs and build config_slaves
    # from st_slaves; it stays empty until then

    for ng in cluster.node_groups:
        extra[ng.id] = {
            'setup_script': c_helper.generate_hosts_setup_script(
                ng.storage_paths(),
                c_helper.extract_hadoop_environment_confs(
                    ng.configuration())),
            'sp_master': config_master,
            'sp_slaves': config_slaves
        }

    if c_helper.is_data_locality_enabled(cluster):
        topology_data = th.generate_topology_map(
            cluster, CONF.enable_hypervisor_awareness)
        extra['topology_data'] = "\n".join(
            [k + " " + v for k, v in topology_data.items()]) + "\n"

    return extra
def get_resourcemanager(cluster):
    return u.get_instance(cluster, 'RESOURCEMANAGER')
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: '%s'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s" %
                               idh_tarball_path, e)

        # unpack archive
        LOG.info("Unpack manager %s", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s" %
                               idh_tarball_filename, e)

        # install idh
        LOG.debug("Install manager with %s", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        # TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx permissions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # wait for the IDH manager to start
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    LOG.debug("Waiting %s seconds for Manager to start", timeout)
    remaining = timeout
    while remaining:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            remaining -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'"
                   % (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
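# Deriving the unpack directory from the tarball URL, as above
# (hypothetical URL, for illustration):
#
#   path = "http://example.com/idh/IDH-2.5.1.tar.gz"
#   filename = path.rsplit('/', 1)[-1]               # "IDH-2.5.1.tar.gz"
#   idh_dir = filename[:filename.find('.tar.gz')]    # "IDH-2.5.1"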
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    proxy_configs = job_execution.job_configs.get('proxy_configs')

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id)
    paths = job_utils.upload_job_files(master, wf_dir, job,
                                       libs_subdir=False,
                                       proxy_configs=proxy_configs)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)

    # The rest of the paths will be passed with --jars
    additional_jars = ",".join(paths)
    if additional_jars:
        additional_jars = "--jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    job_class = job_execution.job_configs.configs["edp.java.main_class"]

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = " ".join(job_execution.job_configs.get('args', []))

    # The redirects of stdout and stderr will preserve output in the wf_dir
    cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
        spark_submit, app_jar, job_class, additional_jars, host, port, args)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError(_("Spark job execution failed. Exit status = "
                       "%(status)s, stdout = %(stdout)s") %
                     {'status': ret, 'stdout': stdout})
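# On success the returned job id is "<pid>@<instance id>". A hedged
# illustration of that shape (values are hypothetical):
#
#   job_id = "12345@instance-abc"
#   pid, _, instance_id = job_id.partition("@")  # ("12345", "instance-abc")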
def get_oozie(cluster):
    return u.get_instance(cluster, 'OOZIE_SERVER')


def get_historyserver(cluster):
    return u.get_instance(cluster, 'historyserver')


def get_manager(cluster):
    return u.get_instance(cluster, 'MANAGER')


def get_jobtracker(cluster):
    return u.get_instance(cluster, "jobtracker")


def get_oozie(cluster):
    return u.get_instance(cluster, "oozie")


def get_namenode(cluster):
    return u.get_instance(cluster, "namenode")
def get_oozie_server(self, cluster):
    return u.get_instance(cluster, "oozie")
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id)
    paths = job_utils.upload_job_files(master, wf_dir, job,
                                       libs_subdir=False)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)

    # The rest of the paths will be passed with --jars
    additional_jars = ",".join(paths)
    if additional_jars:
        additional_jars = "--jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    job_class = job_execution.job_configs.configs["edp.java.main_class"]

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = " ".join(job_execution.job_configs.get('args', []))

    # The redirects of stdout and stderr will preserve output in the wf_dir
    cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
        spark_submit, app_jar, job_class, additional_jars, host, port, args)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError("Spark job execution failed. Exit status = %s, "
                     "stdout = %s" % (ret, stdout))
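# Shape of the assembled command line, with purely illustrative values
# (the ordering mirrors the format string above):
#
#   /opt/spark/bin/spark-submit app.jar --class org.example.Main \
#       --jars lib1.jar,lib2.jar --master spark://master:7077 arg1 arg2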
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        # TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if (client.services.hdfs.get_datanode_status(host) ==
                        'Decomissioned'):  # (sic)
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decommission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    # TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
def get_resourcemanager(cluster):
    return u.get_instance(cluster, 'resourcemanager')


def get_oozie_server(self, cluster):
    return u.get_instance(cluster, "oozie_server")


def get_hiveserver(cluster):
    return u.get_instance(cluster, "hiveserver")


def get_historyserver(cluster):
    return u.get_instance(cluster, 'JOBHISTORY')


def get_namenode(cluster):
    return u.get_instance(cluster, "NAMENODE")


def get_secondarynamenode(cluster):
    return u.get_instance(cluster, 'SECONDARYNAMENODE')
def test_get_instance(self):
    self.assertIsNone(u.get_instance(self.c1, 'wrong-process'))

    self.assertEqual(u.get_instance(self.c1, 'nn'),
                     self.ng1.instances[0])

    with testtools.ExpectedException(ex.InvalidComponentCountException):
        u.get_instance(self.c1, 'dn')
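# The test above pins down get_instance's contract as exercised here:
# None when no instance runs the named process, the single matching
# instance when exactly one does, and InvalidComponentCountException when
# several match (multi-instance processes go through get_instances).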
def get_oozie_server(self, cluster):
    return u.get_instance(cluster, "OOZIE_SERVER")