def bootstrap(self, tar_file):
    """Install Hive in all cluster nodes from the specified tar.gz file.

    Args:
      tar_file (str):
        The file containing Hive binaries.
    """

    # 0. Check that required packages are present
    required_packages = "openjdk-7-jre openjdk-7-jdk"
    check_packages = TaktukRemote("dpkg -s " + required_packages,
                                  self.hosts)
    for p in check_packages.processes:
        p.nolog_exit_code = p.nolog_error = True
    check_packages.run()
    if not check_packages.ok:
        logger.info("Packages not installed, trying to install")
        install_packages = TaktukRemote(
            "export DEBIAN_FRONTEND=noninteractive ; " +
            "apt-get update && apt-get install -y --force-yes " +
            required_packages,
            self.hosts).run()
        if not install_packages.ok:
            logger.error("Unable to install the packages")

    get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | '
                               'sed "s:/bin/javac::")', self.master)
    get_java_home.run()
    self.java_home = get_java_home.stdout.strip()

    logger.info("All required packages are present")

    # 1. Copy Hive tar file and uncompress
    logger.info("Copy " + tar_file + " to hosts and uncompress")
    rm_dirs = TaktukRemote("rm -rf " + self.base_dir +
                           " " + self.conf_dir +
                           " " + self.warehouse_dir +
                           " " + self.logs_dir,
                           self.hosts)
    put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
    tar_xf = TaktukRemote(
        "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
        self.hosts)
    SequentialActions([rm_dirs, put_tar, tar_xf]).run()

    # 2. Move installation to base dir
    logger.info("Create installation directories")
    mv_base_dir = TaktukRemote(
        "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") +
        " " + self.base_dir,
        self.hosts)
    mkdirs = TaktukRemote("mkdir -p " + self.conf_dir +
                          " && mkdir -p " + self.warehouse_dir,
                          self.hosts)
    chmods = TaktukRemote("chmod g+w " + self.base_dir +
                          " && chmod g+w " + self.conf_dir +
                          " && chmod g+w " + self.warehouse_dir,
                          self.hosts)
    SequentialActions([mv_base_dir, mkdirs, chmods]).run()

    # 3. Specify environment variables
    command = "cat >> " + self.conf_dir + "/hive-env.sh << EOF\n"
    command += "JAVA_HOME=" + self.java_home + "\n"
    command += "HIVE_HOME=" + self.base_dir + "\n"
    command += "HIVE_CONF_DIR=" + self.conf_dir + "\n"
    command += "HADOOP_HOME=" + self.hc.base_dir + "\n"
    command += "EOF\n"
    command += "chmod +x " + self.conf_dir + "/hive-env.sh"
    action = Remote(command, self.hosts)
    action.run()
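# Illustrative sketch (not part of the library): the shell snippet that step 3
# above assembles, rendered locally with placeholder paths. At bootstrap time
# the same string is executed on every host through Remote(); the heredoc
# appends the variable assignments to hive-env.sh and marks it executable.
java_home = "/usr/lib/jvm/java-7-openjdk-amd64"   # placeholder
base_dir = "/tmp/hive"                            # placeholder
conf_dir = "/tmp/hive/conf"                       # placeholder
hadoop_base_dir = "/tmp/hadoop"                   # placeholder

command = "cat >> " + conf_dir + "/hive-env.sh << EOF\n"
command += "JAVA_HOME=" + java_home + "\n"
command += "HIVE_HOME=" + base_dir + "\n"
command += "HIVE_CONF_DIR=" + conf_dir + "\n"
command += "HADOOP_HOME=" + hadoop_base_dir + "\n"
command += "EOF\n"
command += "chmod +x " + conf_dir + "/hive-env.sh"
print(command)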
def check_java_version(java_major_version, hosts):
    """Check that all hosts run at least Java 1.<java_major_version>."""

    tr = TaktukRemote("java -version 2>&1 | grep version", hosts)
    tr.run()

    for p in tr.processes:
        match = re.match(r'.*[^.0-9]1\.([0-9]+)\.[0-9].*', p.stdout)
        if not match:
            # Java missing or unexpected version string on this host
            return False
        version = int(match.group(1))
        if java_major_version > version:
            return False

    return True
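# Illustrative sketch (not part of the library): how the version regex used in
# check_java_version extracts the Java major number from typical
# `java -version` output. The sample strings are placeholders.
import re

for sample in ('java version "1.7.0_75"',
               'openjdk version "1.8.0_292"'):
    match = re.match(r'.*[^.0-9]1\.([0-9]+)\.[0-9].*', sample)
    print(sample + " -> " + str(int(match.group(1))))   # prints 7, then 8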
def bootstrap(self, tar_file): # 0. Check requirements java_major_version = 7 if not check_java_version(java_major_version, self.hosts): msg = "Java 1.%d+ required" % java_major_version logger.error(msg) raise SparkException(msg) self.java_home = get_java_home(self.master) # 1. Copy hadoop tar file and uncompress logger.info("Copy " + tar_file + " to hosts and uncompress") rm_dirs = TaktukRemote("rm -rf " + self.base_dir + " " + self.conf_dir, self.hosts) put_tar = TaktukPut(self.hosts, [tar_file], "/tmp") tar_xf = TaktukRemote( "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp", self.hosts) rm_tar = TaktukRemote("rm /tmp/" + os.path.basename(tar_file), self.hosts) SequentialActions([rm_dirs, put_tar, tar_xf, rm_tar]).run() # 2. Move installation to base dir logger.info("Create installation directories") mv_base_dir = TaktukRemote( "mv /tmp/" + os.path.basename(tar_file).replace(".tgz", "") + " " + self.base_dir, self.hosts) mkdirs = TaktukRemote( "mkdir -p " + self.conf_dir + " && mkdir -p " + self.logs_dir, self.hosts) chmods = TaktukRemote( "chmod g+w " + self.base_dir + " && chmod g+w " + self.conf_dir + " && chmod g+w " + self.logs_dir, self.hosts) SequentialActions([mv_base_dir, mkdirs, chmods]).run() # 2.1. Create spark-events dir if self.evs_log_dir: if self.evs_log_dir.startswith("file://") or \ "://" not in self.evs_log_dir: mk_evs_dir = TaktukRemote( "mkdir -p " + self.evs_log_dir + " && chmod g+w " + self.evs_log_dir, self.hosts) mk_evs_dir.run() elif self.evs_log_dir.startswith("hdfs://"): self.hc.execute("fs -mkdir -p " + self.evs_log_dir) # 3. Specify environment variables env_file = self.conf_dir + "/spark-env.sh" command = "cat >> " + env_file + " << EOF\n" command += "JAVA_HOME=" + self.java_home + "\n" command += "SPARK_LOG_DIR=" + self.logs_dir + "\n" if self.hc: command += "HADOOP_CONF_DIR=" + self.hc.conf_dir + "\n" if self.mode == YARN_MODE: command += "YARN_CONF_DIR=" + self.hc.conf_dir + "\n" command += "EOF\n" command += "echo SPARK_PUBLIC_DNS=$(hostname) >> " + env_file command += " && chmod +x " + env_file action = Remote(command, self.hosts) action.run() # 4. Generate initial configuration self._initialize_conf()
def load(self, hc, dest, desired_size=None):
    """Load the dataset in the given dfs folder by copying it from the
    local folder.

    Args:
      hc (HadoopCluster):
        The Hadoop cluster where to deploy the dataset.
      dest (str):
        The dfs destination folder.
      desired_size (int, optional):
        The size of the data to be copied. If indicated, only the first
        files of the dataset up to the given size are copied; if not, the
        whole dataset is transferred.
    """

    dataset_files = [os.path.join(self.local_path, f) for f in
                     os.listdir(self.local_path)]
    hosts = hc.hosts

    # Define and create temp dir
    tmp_dir = "/tmp" + dest
    action_remove = TaktukRemote("rm -rf " + tmp_dir, hosts)
    action_remove.run()
    action_create = TaktukRemote("mkdir -p " + tmp_dir, hosts)
    action_create.run()

    # Generate list of files to copy
    if desired_size:
        all_files_to_copy = []
        dataset_files.sort()
        real_size = 0
        while real_size < desired_size:
            if dataset_files:
                all_files_to_copy.append(dataset_files[0])
                real_size += os.path.getsize(dataset_files[0])
                del dataset_files[0]
            else:
                logger.warn(
                    "Dataset files do not fill up to desired size "
                    "(real size = " + str(real_size) + ")")
                break
    else:
        real_size = 0
        all_files_to_copy = dataset_files
        for f in all_files_to_copy:
            real_size += os.path.getsize(f)

    # Assign files to hosts (round-robin)
    files_per_host = [[]] * len(hosts)
    for idx in range(0, len(hosts)):
        files_per_host[idx] = all_files_to_copy[idx::len(hosts)]

    # Create threads and launch them
    logger.info("Loading dataset in parallel into " + str(len(hosts)) +
                " hosts")
    if not hc.running:
        hc.start()

    class SizeCollector:
        """Thread-safe accumulator for the size of the copied files."""

        size = 0
        lock = threading.Lock()

        def __init__(self):
            pass

        def increment(self, qty):
            self.lock.acquire()
            try:
                self.size += qty
            finally:
                self.lock.release()

    def copy_function(host, files_to_copy, collector=None):
        action = Put([host], files_to_copy, tmp_dir)
        action.run()

        local_final_size = 0

        for f in files_to_copy:
            src_file = os.path.join(tmp_dir, os.path.basename(f))
            if self.pre_load_function:
                src_file = self.pre_load_function(src_file, host)

                action = SshProcess("du -b " + src_file + "| cut -f1",
                                    host)
                action.run()

                local_final_size += int(action.stdout.strip())

            hc.execute("fs -put " + src_file + " " +
                       os.path.join(dest, os.path.basename(src_file)),
                       host, True, False)

        if collector:
            collector.increment(local_final_size)

    if self.pre_load_function:
        final_size = SizeCollector()
    else:
        final_size = None

    threads = []
    for idx, h in enumerate(hosts):
        if files_per_host[idx]:
            t = threading.Thread(target=copy_function,
                                 args=(h, files_per_host[idx], final_size))
            t.start()
            threads.append(t)

    # Wait for the threads to finish
    for t in threads:
        t.join()

    logger.info("Loading completed: real local size = " + str(real_size) +
                ", final remote size = " +
                (str(final_size.size) if final_size else "unknown"))

    self.deployments[hc, desired_size] = dest
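# Illustrative sketch (not part of the library): the file-selection rule used
# above when desired_size is given, reduced to pure Python over (name, size)
# pairs so it runs without a cluster. Names and sizes are placeholders.
def select_files(files_with_sizes, desired_size):
    remaining = sorted(files_with_sizes)
    selected, real_size = [], 0
    while real_size < desired_size and remaining:
        name, size = remaining.pop(0)
        selected.append(name)
        real_size += size
    return selected, real_size

print(select_files([("part-00", 400), ("part-01", 400), ("part-02", 400)],
                   700))   # (['part-00', 'part-01'], 800): stops once 700 bytes are covered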
def bootstrap(self, tar_file): """Install Hadoop in all cluster nodes from the specified tar.gz file. Args: tar_file (str): The file containing Hadoop binaries. """ # 0. Check that required packages are present required_packages = "openjdk-7-jre openjdk-7-jdk" check_packages = TaktukRemote("dpkg -s " + required_packages, self.hosts) for p in check_packages.processes: p.nolog_exit_code = p.nolog_error = True check_packages.run() if not check_packages.ok: logger.info("Packages not installed, trying to install") install_packages = TaktukRemote( "export DEBIAN_MASTER=noninteractive ; " + "apt-get update && apt-get install -y --force-yes " + required_packages, self.hosts).run() if not install_packages.ok: logger.error("Unable to install the packages") get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | ' 'sed "s:/bin/javac::")', self.master) get_java_home.run() self.java_home = get_java_home.stdout.strip() logger.info("All required packages are present") # 1. Copy hadoop tar file and uncompress logger.info("Copy " + tar_file + " to hosts and uncompress") rm_dirs = Remote("rm -rf " + self.base_dir + " " + self.conf_dir + " " + self.logs_dir + " " + self.hadoop_temp_dir, self.hosts) put_tar = TaktukPut(self.hosts, [tar_file], "/tmp") tar_xf = TaktukRemote( "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp", self.hosts) SequentialActions([rm_dirs, put_tar, tar_xf]).run() # 2. Move installation to base dir and create other dirs logger.info("Create installation directories") mv_base_dir = TaktukRemote( "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") + " " + self.base_dir, self.hosts) mkdirs = TaktukRemote("mkdir -p " + self.conf_dir + " && mkdir -p " + self.logs_dir + " && mkdir -p " + self.hadoop_temp_dir, self.hosts) chmods = TaktukRemote("chmod g+w " + self.base_dir + " && chmod g+w " + self.conf_dir + " && chmod g+w " + self.logs_dir + " && chmod g+w " + self.hadoop_temp_dir, self.hosts) SequentialActions([mv_base_dir, mkdirs, chmods]).run() # 4. Specify environment variables command = "cat >> " + self.conf_dir + "/hadoop-env.sh << EOF\n" command += "export JAVA_HOME=" + self.java_home + "\n" command += "export HADOOP_LOG_DIR=" + self.logs_dir + "\n" command += "HADOOP_HOME_WARN_SUPPRESS=\"TRUE\"\n" command += "EOF" action = Remote(command, self.hosts) action.run() # 5. Check version return self._check_version_compliance()
def check_packages(packages, hosts):
    """Check that the given packages are installed on all hosts."""

    tr = TaktukRemote("dpkg -s " + packages, hosts)
    for p in tr.processes:
        p.nolog_exit_code = p.nolog_error = True
    tr.run()

    return tr.ok
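# Hedged usage sketch: combine check_packages with the apt-get fallback used in
# the package-checking bootstrap methods above. The function name is an
# assumption; `packages` and `hosts` follow the same conventions as
# check_packages, and TaktukRemote/logger are assumed to be imported as in the
# surrounding module (run() returning the action is relied on above as well).
def ensure_packages(packages, hosts):
    if check_packages(packages, hosts):
        return True
    logger.info("Packages not installed, trying to install")
    install = TaktukRemote(
        "export DEBIAN_FRONTEND=noninteractive ; "
        "apt-get update && apt-get install -y --force-yes " + packages,
        hosts).run()
    return install.ok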
def bootstrap(self, tar_file): """Install Hadoop in all cluster nodes from the specified tar.gz file. Args: tar_file (str): The file containing Hadoop binaries. """ # 0. Check requirements java_major_version = 7 if not check_java_version(java_major_version, self.hosts): msg = "Java 1.%d+ required" % java_major_version logger.error(msg) raise HadoopException(msg) self.java_home = get_java_home(self.master) # 1. Copy hadoop tar file and uncompress logger.info("Copy " + tar_file + " to hosts and uncompress") rm_dirs = TaktukRemote( "rm -rf " + self.base_dir + " " + self.conf_dir + " " + self.logs_dir + " " + self.hadoop_temp_dir, self.hosts) put_tar = TaktukPut(self.hosts, [tar_file], "/tmp") tar_xf = TaktukRemote( "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp", self.hosts) rm_tar = TaktukRemote("rm /tmp/" + os.path.basename(tar_file), self.hosts) SequentialActions([rm_dirs, put_tar, tar_xf, rm_tar]).run() # 2. Move installation to base dir and create other dirs logger.info("Create installation directories") mv_base_dir = TaktukRemote( "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") + " " + self.base_dir, self.hosts) mkdirs = TaktukRemote( "mkdir -p " + self.conf_dir + " && mkdir -p " + self.logs_dir + " && mkdir -p " + self.hadoop_temp_dir, self.hosts) chmods = TaktukRemote( "chmod g+w " + self.base_dir + " && chmod g+w " + self.conf_dir + " && chmod g+w " + self.logs_dir + " && chmod g+w " + self.hadoop_temp_dir, self.hosts) SequentialActions([mv_base_dir, mkdirs, chmods]).run() # 4. Specify environment variables command = "cat >> " + self.conf_dir + "/hadoop-env.sh << EOF\n" command += "export JAVA_HOME=" + self.java_home + "\n" command += "export HADOOP_LOG_DIR=" + self.logs_dir + "\n" command += "HADOOP_HOME_WARN_SUPPRESS=\"TRUE\"\n" command += "EOF" action = Remote(command, self.hosts) action.run() # 5. Check version (cannot do it before) if not self._check_version_compliance(): return False # 6. Generate initial configuration self._initialize_conf() return True