Exemples Python de SshProcess : exemples d'utilisation de execo.process.SshProcess

Exemple #1

0

Afficher le fichier

Fichier : engine.py Projet : sarlam/hadoop_g5k

    def _copy_xp_output(self):
        """Copy experiment's output.

        Does nothing when ``self.output_path`` is not set. Otherwise the path
        stored in the ``"xp.output"`` test macro is fetched with ``fs -get``
        into ``/tmp`` on the master and then copied from the master into
        ``<output_path>/<comb_id>``.
        """

        if not self.output_path:
            return

        remote_path = self.macro_manager.test_macros[
            "xp.output"]  # TODO: what happens if not specified?
        local_path = os.path.join(self.output_path, str(self.comb_id))
        logger.info("Copying output to " + local_path)

        tmp_dir = "/tmp"

        # Drop trailing slashes before taking the basename: for "out/" the
        # basename is empty and the cleanup below would run "rm -rf /tmp/".
        output_name = os.path.basename(remote_path.rstrip("/"))
        if not output_name:
            logger.warning("Cannot copy output: invalid remote path '" +
                           remote_path + "'")
            return
        tmp_path = os.path.join(tmp_dir, output_name)

        # Remove file in tmp dir if exists
        proc = SshProcess("rm -rf " + tmp_path, self.hc.master)
        proc.run()

        # Get files in master
        self.hc.execute("fs -get " + remote_path + " " + tmp_dir,
                        verbose=False)

        # Copy files from master
        action = Get([self.hc.master], [tmp_path], local_path)
        action.run()

Exemple #2

0

Afficher le fichier

Fichier : spark.py Projet : sarlam/hadoop_g5k

    def bootstrap(self, tar_file):
        """Install the distribution contained in the given tar file on all hosts.

        Checks (and tries to install) the required Java packages, reads the
        Java home from the master, uncompresses the tar file in /tmp on every
        host, moves the result to ``self.base_dir``, creates
        ``self.conf_dir`` and appends the environment variables to
        ``spark-env.sh``.

        Args:
          tar_file (str):
            Local path of the tar file. The uncompressed directory is expected
            to be named like the file without its ".tgz" suffix.
        """

        # 0. Check that required packages are present
        required_packages = "openjdk-7-jre openjdk-7-jdk"
        check_packages = TaktukRemote("dpkg -s " + required_packages,
                                      self.hosts)
        for p in check_packages.processes:
            p.nolog_exit_code = p.nolog_error = True
        check_packages.run()
        packages_ok = check_packages.ok
        if not packages_ok:
            logger.info("Packages not installed, trying to install")
            # DEBIAN_FRONTEND (not DEBIAN_MASTER) is the variable that makes
            # apt/debconf run without prompting.
            install_packages = TaktukRemote(
                "export DEBIAN_FRONTEND=noninteractive ; " +
                "apt-get update && apt-get install -y --force-yes " +
                required_packages, self.hosts).run()
            packages_ok = install_packages.ok
            if not packages_ok:
                logger.error("Unable to install the packages")

        # JAVA_HOME is derived from the real location of javac in the master
        get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | '
                                   'sed "s:/bin/javac::")', self.master)
        get_java_home.run()
        self.java_home = get_java_home.stdout.strip()

        # Only claim success when the check or the installation succeeded
        if packages_ok:
            logger.info("All required packages are present")

        # 1. Copy tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        rm_dirs = TaktukRemote("rm -rf " + self.base_dir +
                               " " + self.conf_dir,
                               self.hosts)
        put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
        tar_xf = TaktukRemote(
            "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
            self.hosts)
        SequentialActions([rm_dirs, put_tar, tar_xf]).run()

        # 2. Move installation to base dir
        logger.info("Create installation directories")
        mv_base_dir = TaktukRemote(
            "mv /tmp/" + os.path.basename(tar_file).replace(".tgz", "") + " " +
            self.base_dir,
            self.hosts)
        mkdirs = TaktukRemote("mkdir -p " + self.conf_dir, self.hosts)
        chmods = TaktukRemote("chmod g+w " + self.base_dir +
                              " && chmod g+w " + self.conf_dir,
                              self.hosts)
        SequentialActions([mv_base_dir, mkdirs, chmods]).run()

        # 3. Specify environment variables (appended through a here-document)
        command = "cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
        command += "JAVA_HOME=" + self.java_home + "\n"
        command += "SPARK_LOG_DIR=" + self.logs_dir + "\n"
        if self.hc:
            command += "HADOOP_CONF_DIR=" + self.hc.conf_dir + "\n"
        if self.mode == YARN_MODE:
            command += "YARN_CONF_DIR=" + self.hc.conf_dir + "\n"
        command += "EOF\n"
        command += "chmod +x " + self.conf_dir + "/spark-env.sh"
        action = Remote(command, self.hosts)
        action.run()

Exemple #3

0

Afficher le fichier

Fichier : engine.py Projet : djamelinfo/hadoop_g5k

    def _copy_xp_output(self):
        """Copy experiment's output.

        When ``self.output_path`` is set, the path given by the
        ``"xp.output"`` test macro is retrieved with ``fs -get`` into ``/tmp``
        on the master, and from there copied to ``<output_path>/<comb_id>``.
        """

        if self.output_path:
            remote_path = self.macro_manager.test_macros["xp.output"]  # TODO: what happens if not specified?
            local_path = os.path.join(self.output_path, str(self.comb_id))
            logger.info("Copying output to " + local_path)

            tmp_dir = "/tmp"

            # A trailing slash would make the basename empty, and the cleanup
            # command would then become "rm -rf /tmp/"; strip it first and
            # refuse to continue if no name is left.
            base_name = os.path.basename(remote_path.rstrip("/"))
            if not base_name:
                logger.warning("Cannot copy output: invalid remote path '" +
                               remote_path + "'")
                return
            tmp_copy = os.path.join(tmp_dir, base_name)

            # Remove file in tmp dir if exists
            proc = SshProcess("rm -rf " + tmp_copy, self.hc.master)
            proc.run()

            # Get files in master
            self.hc.execute("fs -get " + remote_path + " " + tmp_dir,
                            verbose=False)

            # Copy files from master
            action = Get([self.hc.master], [tmp_copy], local_path)
            action.run()

Exemple #4

0

Afficher le fichier

Fichier : spark.py Projet : djamelinfo/hadoop_g5k

    def start_spark(self):
        """Start Spark according to the configured mode.

        STANDALONE mode: the master and slaves start scripts are run on the
        master node. YARN mode: the Hadoop cluster is started (and waited
        for) when it is not running yet. Returns early, leaving
        ``self.running`` untouched, if Spark was already running or the start
        scripts failed.
        """

        logger.info("Starting Spark")

        if self.running:
            logger.warn("Spark was already started")
            return

        if self.mode == STANDALONE_MODE:
            start_cmd = (self.sbin_dir + "/start-master.sh;" +
                         self.sbin_dir + "/start-slaves.sh;")
            starter = SshProcess(start_cmd, self.master)
            starter.run()
            if not starter.finished_ok:
                logger.warn("Error while starting Spark")
                return
        elif self.mode == YARN_MODE and not self.hc.running:
            logger.warn("YARN services must be started first")
            self.hc.start_and_wait()

        self.running = True

Exemple #5

0

Afficher le fichier

    def start_spark(self):
        """Bring up the Spark services.

        In STANDALONE mode both start scripts are executed over SSH on the
        master. In YARN mode only the Hadoop cluster is started, and only if
        it is not running already.
        """

        logger.info("Starting Spark")

        already_running = self.running
        if already_running:
            logger.warn("Spark was already started")
            return

        mode = self.mode
        if mode == STANDALONE_MODE:
            master_script = self.sbin_dir + "/start-master.sh;"
            slaves_script = self.sbin_dir + "/start-slaves.sh;"
            launch = SshProcess(master_script + slaves_script, self.master)
            launch.run()
            if not launch.finished_ok:
                # Leave self.running unset when the scripts failed
                logger.warn("Error while starting Spark")
                return
        elif mode == YARN_MODE:
            hadoop_up = self.hc.running
            if not hadoop_up:
                logger.warn("YARN services must be started first")
                self.hc.start_and_wait()

        self.running = True

Exemple #6

0

Afficher le fichier

Fichier : hive.py Projet : rwfazul/hadoop_g5k

    def bootstrap(self, tar_file):
        """Install Hive on all hosts from the given tar file.

        Checks (and tries to install) the required Java packages, reads the
        Java home from the master, uncompresses the tar file in /tmp on every
        host, moves the result to ``self.base_dir``, creates the
        configuration and warehouse directories and appends the environment
        variables to ``hive-env.sh``.

        Args:
          tar_file (str):
            Local path of the tar file. The uncompressed directory is expected
            to be named like the file without its ".tar.gz" suffix.
        """

        # 0. Check that required packages are present
        required_packages = "openjdk-7-jre openjdk-7-jdk"
        check_packages = TaktukRemote("dpkg -s " + required_packages,
                                      self.hosts)
        for p in check_packages.processes:
            p.nolog_exit_code = p.nolog_error = True
        check_packages.run()
        packages_ok = check_packages.ok
        if not packages_ok:
            logger.info("Packages not installed, trying to install")
            # The variable honoured by apt/debconf is DEBIAN_FRONTEND; the
            # previous DEBIAN_MASTER had no effect.
            install_packages = TaktukRemote(
                "export DEBIAN_FRONTEND=noninteractive ; " +
                "apt-get update && apt-get install -y --force-yes " +
                required_packages, self.hosts).run()
            packages_ok = install_packages.ok
            if not packages_ok:
                logger.error("Unable to install the packages")

        # JAVA_HOME is derived from the real location of javac in the master
        get_java_home = SshProcess(
            'echo $(readlink -f /usr/bin/javac | '
            'sed "s:/bin/javac::")', self.master)
        get_java_home.run()
        self.java_home = get_java_home.stdout.strip()

        # Do not report success after a failed installation
        if packages_ok:
            logger.info("All required packages are present")

        # 1. Copy Hive tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        rm_dirs = TaktukRemote(
            "rm -rf " + self.base_dir + " " + self.conf_dir + " " +
            self.warehouse_dir + " " + self.logs_dir, self.hosts)
        put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
        tar_xf = TaktukRemote(
            "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
            self.hosts)
        SequentialActions([rm_dirs, put_tar, tar_xf]).run()

        # 2. Move installation to base dir
        logger.info("Create installation directories")
        mv_base_dir = TaktukRemote(
            "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") +
            " " + self.base_dir, self.hosts)
        mkdirs = TaktukRemote(
            "mkdir -p " + self.conf_dir + " && mkdir -p " + self.warehouse_dir,
            self.hosts)
        chmods = TaktukRemote(
            "chmod g+w " + self.base_dir + " && chmod g+w " + self.conf_dir +
            " && chmod g+w " + self.warehouse_dir, self.hosts)
        SequentialActions([mv_base_dir, mkdirs, chmods]).run()

        # 3. Specify environment variables (appended through a here-document)
        command = "cat >> " + self.conf_dir + "/hive-env.sh << EOF\n"
        command += "JAVA_HOME=" + self.java_home + "\n"
        command += "HIVE_HOME=" + self.base_dir + "\n"
        command += "HIVE_CONF_DIR=" + self.conf_dir + "\n"
        command += "HADOOP_HOME=" + self.hc.base_dir + "\n"
        command += "EOF\n"
        command += "chmod +x " + self.conf_dir + "/hive-env.sh"
        action = Remote(command, self.hosts)
        action.run()

Exemple #7

0

Afficher le fichier

Fichier : spark.py Projet : djamelinfo/hadoop_g5k

    def execute_job(self, job, node=None, verbose=True):
        """Run a Spark job through spark-submit on a node of the cluster.

        The cluster is started first if it is not running. The files
        reported by the job are copied to /tmp on the target node, the job
        is submitted, and its outputs and success flag are stored in the job
        object.

        Args:
          job (SparkJob):
            The job object.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          verbose (bool, optional):
            If True stdout and stderr of remote process is displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the job.
        """

        if not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        target = self.master if node is None else node
        exec_dir = "/tmp"

        # Ship the files required by the job to the execution directory
        Put([target], job.get_files_to_copy(), exec_dir).run()

        # Build the full spark-submit command line
        command = job.get_command(exec_dir)
        submit_cmd = self.bin_dir + "/spark-submit " + command

        logger.info("Executing spark job. Command = {" + submit_cmd +
                    "} in " + str(target))

        submit = SshProcess(submit_cmd, target)

        if verbose:
            # Echo remote output locally; stderr is shown in red
            red_color = '\033[01;31m'
            submit.stdout_handlers.append(sys.stdout)
            submit.stderr_handlers.append(
                ColorDecorator(sys.stderr, red_color))

        submit.start()
        submit.wait()

        # Record the results in the job object
        out, err = submit.stdout, submit.stderr
        job.stdout = out
        job.stderr = err
        job.success = (submit.exit_code == 0)

        return out, err

Exemple #8

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def get_version(self):
        """Query the Hadoop version from the master.

        Returns (str):
          The first line printed by ``hadoop version`` on the master.
        """

        version_cmd = ("export JAVA_HOME=" + self.java_home + ";" +
                       self.bin_dir + "/hadoop version")
        query = SshProcess(version_cmd, self.master)
        query.run()

        # Only the first output line is returned
        return query.stdout.splitlines()[0]

Exemple #9

0

Afficher le fichier

    def format_dfs(self):
        """Run ``hadoop namenode -format`` on the master and log the outcome."""

        logger.info("Formatting HDFS")

        format_cmd = self.bin_dir + "/hadoop namenode -format"
        formatter = SshProcess(format_cmd, self.master)
        formatter.run()

        if not formatter.finished_ok:
            logger.warn("Error while formatting HDFS")
        else:
            logger.info("HDFS formatted successfully")

Exemple #10

0

Afficher le fichier

    def get_version(self):
        """Return the version reported by the Hadoop installation.

        Returns (str):
          The first line of the output of ``hadoop version`` executed in the
          master.
        """

        java_export = "export JAVA_HOME=" + self.java_home + ";"
        hadoop_version = self.bin_dir + "/hadoop version"

        proc = SshProcess(java_export + hadoop_version, self.master)
        proc.run()

        output_lines = proc.stdout.splitlines()
        return output_lines[0]

Exemple #11

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def format_dfs(self):
        """Format HDFS by running the namenode format command on the master."""

        logger.info("Formatting HDFS")

        namenode_format = SshProcess(
            self.bin_dir + "/hadoop namenode -format", self.master)
        namenode_format.run()

        # Report the outcome; the failure case is only logged
        if namenode_format.finished_ok:
            logger.info("HDFS formatted successfully")
            return
        logger.warn("Error while formatting HDFS")

Exemple #12

0

Afficher le fichier

    def copy_history(self, dest, job_ids=None):
        """Copy history logs from master.

        The history entries are located with ``find`` under
        ``<logs_dir>/history`` in the master and then copied to ``dest``.

        Args:
          dest (str):
            The path of the local dir where the logs will be copied.
          job_ids (list of str, optional):
            A list with the ids of the jobs for which the history should be
            copied. If nothing is passed, the history of all jobs is copied.
        """

        if not os.path.exists(dest):
            logger.warning("Destination directory " + dest +
                           " does not exist. It will be created")
            os.makedirs(dest)

        history_dir = os.path.join(self.logs_dir, "history")

        # The patterns are single-quoted so that the remote shell passes them
        # to find verbatim instead of expanding them against the files of its
        # working directory.
        if job_ids:
            pattern = " -o ".join("-name '" + jid + "*'" for jid in job_ids)
        else:
            pattern = "-name 'job_*'"

        list_dirs = SshProcess("find " + history_dir + " " + pattern,
                               self.master)
        list_dirs.run()

        # One path per output line
        remote_files = list_dirs.stdout.splitlines()

        action = Get([self.master], remote_files, dest)
        action.run()

Exemple #13

0

Afficher le fichier

    def stop_map_reduce(self):
        """Run the MapReduce stop script on the master.

        ``self.running_map_reduce`` is cleared only when the script finishes
        correctly.
        """

        self._check_initialization()

        logger.info("Stopping MapReduce")

        stopper = SshProcess(self.sbin_dir + "/stop-mapred.sh", self.master)
        stopper.run()

        if stopper.finished_ok:
            self.running_map_reduce = False
        else:
            logger.warn("Error while stopping MapReduce")

Exemple #14

0

Afficher le fichier

    def stop_dfs(self):
        """Run the HDFS stop script on the master.

        ``self.running_dfs`` is cleared only when the script finishes
        correctly.
        """

        self._check_initialization()

        logger.info("Stopping HDFS")

        stop_script = self.sbin_dir + "/stop-dfs.sh"
        stopper = SshProcess(stop_script, self.master)
        stopper.run()

        if not stopper.finished_ok:
            logger.warn("Error while stopping HDFS")
            return

        self.running_dfs = False

Exemple #15

0

Afficher le fichier

    def __force_clean(self):
        """Stop previous Spark processes (if any) and remove all remote files
        created by it.

        Every host is inspected with ``jps``; the processes named "Master" or
        "Worker" are killed with ``kill -9``. Logs are cleaned afterwards.
        """

        spark_processes = ["Master", "Worker"]

        force_kill = False
        for h in self.hosts:
            proc = SshProcess("jps", h)
            proc.run()

            ids_to_kill = []
            for line in proc.stdout.splitlines():
                field = line.split()
                # Lines are expected to be "<pid> <name>"; blank lines or
                # lines with only a pid are skipped instead of raising
                # IndexError.
                if len(field) > 1 and field[1] in spark_processes:
                    ids_to_kill.append(field[0])

            if ids_to_kill:
                force_kill = True

                logger.warn("Killing running Spark processes in host %s" %
                            style.host(h.address.split('.')[0]))

                proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous hadoop deployments had to be killed")

        self.clean_logs()

Exemple #16

0

Afficher le fichier

Fichier : cluster_v2.py Projet : sarlam/hadoop_g5k

    def stop_yarn(self):
        """Run the YARN stop script on the master.

        ``self.running_yarn`` is cleared only when the script finishes
        correctly.
        """

        self._check_initialization()

        logger.info("Stopping YARN")

        yarn_stop = SshProcess(self.sbin_dir + "/stop-yarn.sh", self.master)
        yarn_stop.run()

        stopped_ok = yarn_stop.finished_ok
        if stopped_ok:
            self.running_yarn = False
        else:
            logger.warn("Error while stopping YARN")

Exemple #17

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def stop_dfs(self):
        """Shut down HDFS through the stop script executed in the master.

        On failure a warning is logged and ``self.running_dfs`` keeps its
        value.
        """

        self._check_initialization()

        logger.info("Stopping HDFS")

        dfs_stop = SshProcess(self.sbin_dir + "/stop-dfs.sh", self.master)
        dfs_stop.run()

        if dfs_stop.finished_ok:
            self.running_dfs = False
        else:
            logger.warn("Error while stopping HDFS")

Exemple #18

0

Afficher le fichier

Fichier : hive.py Projet : rwfazul/hadoop_g5k

    def __force_clean(self):
        """Stop previous Hive processes (if any) and remove all remote files
        created by it.

        Every host is inspected with ``jps`` and the processes whose name is
        listed in ``hive_processes`` are killed with ``kill -9``. The list is
        currently empty, so no process is actually killed. Logs are cleaned
        afterwards.
        """

        hive_processes = []

        force_kill = False
        for h in self.hosts:
            # Inspect the host being iterated: the pids are killed in h, so
            # they must be listed in h too (not in the master).
            proc = SshProcess("jps", h)
            proc.run()

            ids_to_kill = []
            for line in proc.stdout.splitlines():
                field = line.split()
                # Skip blank lines or lines with only a pid instead of
                # raising IndexError.
                if len(field) > 1 and field[1] in hive_processes:
                    ids_to_kill.append(field[0])

            if ids_to_kill:
                force_kill = True

                proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous hadoop deployments had to be killed")

        self.clean_logs()

Exemple #19

0

Afficher le fichier

Fichier : cluster_v2.py Projet : mliroz/bigdata_dpy

    def stop_yarn(self):
        """Shut down YARN through the stop script executed in the master.

        On failure a warning is logged and ``self.running_yarn`` keeps its
        value.
        """

        self._check_initialization()

        logger.info("Stopping YARN")

        stop_script = self.sbin_dir + "/stop-yarn.sh"
        stopper = SshProcess(stop_script, self.master)
        stopper.run()

        if not stopper.finished_ok:
            logger.warn("Error while stopping YARN")
            return

        self.running_yarn = False

Exemple #20

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def stop_map_reduce(self):
        """Shut down MapReduce through the stop script executed in the master.

        On failure a warning is logged and ``self.running_map_reduce`` keeps
        its value.
        """

        self._check_initialization()

        logger.info("Stopping MapReduce")

        stop_script = self.sbin_dir + "/stop-mapred.sh"
        mapred_stop = SshProcess(stop_script, self.master)
        mapred_stop.run()

        if not mapred_stop.finished_ok:
            logger.warn("Error while stopping MapReduce")
            return

        self.running_map_reduce = False

Exemple #21

0

Afficher le fichier

    def stop_spark(self):
        """Stop Spark and mark it as not running.

        In STANDALONE mode the slaves and master stop scripts are run on the
        master first; if they fail, ``self.running`` is left untouched.
        """

        logger.info("Stopping Spark")

        standalone = self.mode == STANDALONE_MODE
        if standalone:
            stop_cmd = (self.sbin_dir + "/stop-slaves.sh;" +
                        self.sbin_dir + "/stop-master.sh;")
            stopper = SshProcess(stop_cmd, self.master)
            stopper.run()
            if not stopper.finished_ok:
                logger.warn("Error while stopping Spark")
                return

        self.running = False

Exemple #22

0

Afficher le fichier

Fichier : spark.py Projet : djamelinfo/hadoop_g5k

    def stop_spark(self):
        """Shut down the Spark services.

        The stop scripts are only executed in STANDALONE mode. The running
        flag is cleared unless those scripts report an error.
        """

        logger.info("Stopping Spark")

        if self.mode == STANDALONE_MODE:
            slaves_script = self.sbin_dir + "/stop-slaves.sh;"
            master_script = self.sbin_dir + "/stop-master.sh;"
            shutdown = SshProcess(slaves_script + master_script, self.master)
            shutdown.run()
            failed = not shutdown.finished_ok
            if failed:
                logger.warn("Error while stopping Spark")
                return

        self.running = False

Exemple #23

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def start_dfs_and_wait(self):
        """Start HDFS and block until the safe mode is left.

        After calling ``start_dfs``, ``hadoop dfsadmin -safemode wait`` is
        executed in the master; ``self.running_dfs`` is set when that command
        finishes correctly.
        """

        self._check_initialization()

        self.start_dfs()

        logger.info("Waiting for safe mode to be off")
        wait_cmd = self.bin_dir + "/hadoop dfsadmin -safemode wait"
        waiter = SshProcess(wait_cmd, self.master)
        waiter.run()

        if waiter.finished_ok:
            self.running_dfs = True
        else:
            logger.warn("Error while starting HDFS")

Exemple #24

0

Afficher le fichier

    def start_dfs_and_wait(self):
        """Start the distributed filesystem and wait for it to exit safe mode.

        A warning is logged, and ``self.running_dfs`` is not modified here,
        when the safe mode wait command fails.
        """

        self._check_initialization()

        self.start_dfs()

        logger.info("Waiting for safe mode to be off")
        safemode_wait = SshProcess(
            self.bin_dir + "/hadoop dfsadmin -safemode wait", self.master)
        safemode_wait.run()

        if not safemode_wait.finished_ok:
            logger.warn("Error while starting HDFS")
            return

        self.running_dfs = True

Exemple #25

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def execute(self, command, node=None, should_be_running=True,
                verbose=True):
        """Run a Hadoop command over SSH and return its outputs.

        Args:
          command (str):
            The command to be executed.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          should_be_running (bool, optional):
            True if the cluster needs to be running in order to execute the
            command. If so, and it is not running, it is automatically started.
          verbose: (bool, optional):
            If True stdout and stderr of remote process is displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the command.
        """

        self._check_initialization()

        if should_be_running and not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        # Fall back to the master when no node is given
        node = node or self.master

        hadoop_cmd = self.bin_dir + "/hadoop " + command

        if verbose:
            logger.info("Executing {" + hadoop_cmd + "} in " + str(node))

        runner = SshProcess(hadoop_cmd, node)

        if verbose:
            # Echo remote output locally; stderr is shown in red
            red_color = '\033[01;31m'
            runner.stdout_handlers.append(sys.stdout)
            runner.stderr_handlers.append(
                ColorDecorator(sys.stderr, red_color))

        runner.start()
        runner.wait()

        outputs = (runner.stdout, runner.stderr)
        return outputs

Exemple #26

0

Afficher le fichier

Fichier : cluster_v2.py Projet : mliroz/bigdata_dpy

    def start_yarn(self):
        """Run the YARN start script on the master.

        On success ``self.running_yarn`` is set, and ``self.running`` too
        when the DFS is already flagged as running.
        """

        logger.info("Starting YARN")

        self._check_initialization()

        starter = SshProcess(self.sbin_dir + "/start-yarn.sh", self.master)
        starter.run()

        if not starter.finished_ok:
            logger.warn("Error while starting YARN")
            return

        #TODO: get success or not from super.
        self.running_yarn = True
        if self.running_dfs:
            self.running = True

Exemple #27

0

Afficher le fichier

    def start_yarn(self):
        """Bring up YARN through the start script executed in the master.

        A warning is logged when the script fails. Otherwise the YARN flag is
        raised, and the global running flag as well if the DFS is running.
        """

        logger.info("Starting YARN")

        self._check_initialization()

        start_script = self.sbin_dir + "/start-yarn.sh"
        yarn_start = SshProcess(start_script, self.master)
        yarn_start.run()

        if yarn_start.finished_ok:
            #TODO: get success or not from super.
            self.running_yarn = True
            dfs_up = self.running_dfs
            if dfs_up:
                self.running = True
        else:
            logger.warn("Error while starting YARN")

Exemple #28

0

Afficher le fichier

Fichier : wrapper.py Projet : mliroz/diversity_p2p

    def execute(self):
        """Execute a single test.

        The jar in ``self.jar_path`` is run over SSH in ``self.host`` with the
        properties file ``self.props_path``; the standard output of the
        process is stored in a new temporary file under /tmp.

        Return:
          str: Local path of the file containing the process output.
        """
        import os  # local import: only needed to close mkstemp's descriptor

        test = SshProcess("java -jar " + self.jar_path +
                          " -p " + self.props_path,
                          self.host)

        # Output is stored in a local temporary file. mkstemp returns an open
        # file descriptor together with the path; only the path is handed to
        # the process, so the descriptor is closed right away to avoid
        # leaking one per executed test.
        (fd, temp_file) = tempfile.mkstemp("", "div_p2p-out-", "/tmp")
        os.close(fd)
        test.stdout_handlers.append(temp_file)

        test.run()

        return temp_file

Exemple #29

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def start_dfs(self):
        """Run the HDFS start script on the master.

        Nothing is executed when the DFS is already flagged as running.
        ``self.running_dfs`` is set when the script finishes correctly.
        """

        self._check_initialization()

        logger.info("Starting HDFS")

        if self.running_dfs:
            logger.warn("Dfs was already started")
            return

        starter = SshProcess(self.sbin_dir + "/start-dfs.sh", self.master)
        starter.run()

        if starter.finished_ok:
            self.running_dfs = True
        else:
            logger.warn("Error while starting HDFS")

Exemple #30

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def start_map_reduce(self):
        """Start the JobTracker and TaskTrackers.

        Nothing is executed when MapReduce is already flagged as running.
        Otherwise the start script is run on the master and
        ``self.running_map_reduce`` is set when it finishes correctly.
        """

        self._check_initialization()

        logger.info("Starting MapReduce")

        if self.running_map_reduce:
            # Being already started is not an error of the start script
            logger.warn("MapReduce was already started")
            return

        proc = SshProcess(self.sbin_dir + "/start-mapred.sh", self.master)
        proc.run()

        # The messages used to be swapped: a failure was reported as success
        if not proc.finished_ok:
            logger.warn("Error while starting MapReduce")
        else:
            logger.info("MapReduce started successfully")
            self.running_map_reduce = True

Exemple #31

0

Afficher le fichier

    def start_map_reduce(self):
        """Start the JobTracker and TaskTrackers.

        Returns immediately, with a warning, if MapReduce is already flagged
        as running. Otherwise the start script is executed in the master; the
        running flag is raised only when the script finishes correctly.
        """

        self._check_initialization()

        logger.info("Starting MapReduce")

        if self.running_map_reduce:
            # Same wording as the equivalent check for the DFS
            logger.warn("MapReduce was already started")
            return

        proc = SshProcess(self.sbin_dir + "/start-mapred.sh", self.master)
        proc.run()

        if proc.finished_ok:
            logger.info("MapReduce started successfully")
            self.running_map_reduce = True
        else:
            # Previously this branch logged a success message
            logger.warn("Error while starting MapReduce")

Exemple #32

0

Afficher le fichier

    def start_dfs(self):
        """Bring up HDFS through the start script executed in the master.

        Returns immediately, with a warning, if the DFS is already flagged as
        running. A warning is also logged when the script fails.
        """

        self._check_initialization()

        logger.info("Starting HDFS")

        already_started = self.running_dfs
        if already_started:
            logger.warn("Dfs was already started")
            return

        start_script = self.sbin_dir + "/start-dfs.sh"
        dfs_start = SshProcess(start_script, self.master)
        dfs_start.run()

        if not dfs_start.finished_ok:
            logger.warn("Error while starting HDFS")
            return

        self.running_dfs = True

Exemple #33

0

Afficher le fichier

Fichier : mahout.py Projet : djamelinfo/hadoop_g5k

    def bootstrap(self, tar_file):
        """Install Mahout on the hosts of the Hadoop cluster.

        Removes previous installation directories, uncompresses the tar file
        in /tmp on every host, moves the result to ``self.base_dir``, creates
        ``self.conf_dir`` and copies the jars found in the base dir of the
        master into the ``lib`` directory of Hadoop on every host.

        Args:
          tar_file (str):
            Local path of the tar file. The uncompressed directory is expected
            to be named like the file without its ".tar.gz" suffix.
        """

        # 0. Remove used dirs if existing
        action = Remote("rm -rf " + self.base_dir, self.hc.hosts)
        action.run()
        action = Remote("rm -rf " + self.conf_dir, self.hc.hosts)
        action.run()

        # 1. Copy Mahout tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        action = Put(self.hc.hosts, [tar_file], "/tmp")
        action.run()
        action = Remote(
            "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
            self.hc.hosts)
        action.run()

        # 2. Move installation to base dir
        logger.info("Create installation directories")
        action = Remote(
            "mv /tmp/" +
            os.path.basename(tar_file).replace(".tar.gz", "") + " " +
            self.base_dir,
            self.hc.hosts)
        action.run()

        # 3. Create other dirs
        action = Remote("mkdir -p " + self.conf_dir, self.hc.hosts)
        action.run()

        # 4. Include libraries in Hadoop's classpath
        list_dirs = SshProcess("ls -1 " + self.base_dir + "/*.jar",
                               self.hc.master)
        list_dirs.run()
        libs = " ".join(list_dirs.stdout.splitlines())
        action = Remote("cp " + libs + " " + self.hc.base_dir + "/lib",
                        self.hc.hosts)
        action.run()

        # The flag must be stored in the instance: a bare local assignment
        # was discarded when the method returned.
        self.initialized = True  # No need to call initialize()

Exemple #34

0

Afficher le fichier

Fichier : hive.py Projet : djamelinfo/hadoop_g5k

    def __force_clean(self):
        """Stop previous Hive processes (if any) and remove all remote files
        created by it.

        ``jps`` is run in each host and the processes whose name appears in
        ``hive_processes`` are killed with ``kill -9``. That list is empty at
        the moment, so nothing is killed. Logs are cleaned at the end.
        """

        hive_processes = []

        force_kill = False
        for h in self.hosts:
            # jps has to run in the same host where the pids are killed; it
            # used to run always in the master.
            proc = SshProcess("jps", h)
            proc.run()

            ids_to_kill = [
                field[0]
                for field in (line.split()
                              for line in proc.stdout.splitlines())
                # Lines without a process name are ignored (no IndexError)
                if len(field) > 1 and field[1] in hive_processes
            ]

            if ids_to_kill:
                force_kill = True

                proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous hadoop deployments had to be killed")

        self.clean_logs()

Exemple #35

0

Afficher le fichier

    def bootstrap(self, tar_file):
        """Deploy Mahout from a tar file on every host of the Hadoop cluster.

        The previous base and configuration directories are removed, the tar
        file is uncompressed in /tmp and moved to ``self.base_dir``,
        ``self.conf_dir`` is created and the jars listed in the base dir of
        the master are copied to the ``lib`` directory of Hadoop.

        Args:
          tar_file (str):
            Local path of the tar file. The uncompressed directory is expected
            to be named like the file without its ".tar.gz" suffix.
        """

        hosts = self.hc.hosts

        # 0. Remove used dirs if existing
        Remote("rm -rf " + self.base_dir, hosts).run()
        Remote("rm -rf " + self.conf_dir, hosts).run()

        # 1. Copy Mahout tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        tar_name = os.path.basename(tar_file)
        Put(hosts, [tar_file], "/tmp").run()
        Remote("tar xf /tmp/" + tar_name + " -C /tmp", hosts).run()

        # 2. Move installation to base dir
        logger.info("Create installation directories")
        Remote("mv /tmp/" + tar_name.replace(".tar.gz", "") + " " +
               self.base_dir,
               hosts).run()

        # 3. Create other dirs
        Remote("mkdir -p " + self.conf_dir, hosts).run()

        # 4. Include libraries in Hadoop's classpath
        list_dirs = SshProcess("ls -1 " + self.base_dir + "/*.jar",
                               self.hc.master)
        list_dirs.run()
        libs = " ".join(list_dirs.stdout.splitlines())
        Remote("cp " + libs + " " + self.hc.base_dir + "/lib", hosts).run()

        # Store the flag in the instance; assigning a local variable had no
        # effect once the method returned.
        self.initialized = True  # No need to call initialize()

Exemple #36

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def copy_history(self, dest, job_ids=None):
        """Copy history logs from master.

        ``find`` is executed in the master under ``<logs_dir>/history`` and
        every path it prints is copied to ``dest``, which is created when it
        does not exist.

        Args:
          dest (str):
            The path of the local dir where the logs will be copied.
          job_ids (list of str, optional):
            A list with the ids of the jobs for which the history should be
            copied. If nothing is passed, the history of all jobs is copied.
        """

        if not os.path.exists(dest):
            logger.warning("Destination directory " + dest +
                           " does not exist. It will be created")
            os.makedirs(dest)

        history_dir = os.path.join(self.logs_dir, "history")

        # Each glob is wrapped in single quotes: unquoted, the remote shell
        # could expand it against its working directory before find sees it.
        name_globs = [jid + "*" for jid in job_ids] if job_ids else ["job_*"]
        pattern = " -o ".join("-name '" + glob + "'" for glob in name_globs)

        list_dirs = SshProcess("find " + history_dir + " " + pattern,
                               self.master)
        list_dirs.run()

        # find prints one path per line
        remote_files = list_dirs.stdout.splitlines()

        action = Get([self.master], remote_files, dest)
        action.run()

Exemple #37

0

Afficher le fichier

Fichier : mahout.py Projet : djamelinfo/hadoop_g5k

    def execute(self, command, node=None, verbose=True):
        """Run a mahout command through ssh and return its outputs.

        Args:
          command (str):
            The arguments appended to the mahout executable of self.bin_dir.
          node (optional):
            The host where the command is launched. If not provided,
            self.hc.master is chosen.
          verbose (bool, optional):
            If True the command is logged and the stdout and stderr of the
            remote process are forwarded to the local ones.

        Returns (tuple):
          The stdout and stderr attributes of the finished process.
        """

        target = node if node else self.hc.master
        mahout_call = self.bin_dir + "/mahout " + command

        if verbose:
            logger.info("Executing {" + mahout_call + "} in " + str(target))

        # JAVA_HOME and HADOOP_HOME are exported in the same remote shell
        # that launches mahout
        env_setup = ("export JAVA_HOME='" + self.hc.java_home + "';" +
                     "export HADOOP_HOME='" + self.hc.base_dir + "';")
        remote = SshProcess(env_setup + mahout_call, target)

        if verbose:
            # stderr is wrapped with the red ANSI color code
            remote.stdout_handlers.append(sys.stdout)
            remote.stderr_handlers.append(
                ColorDecorator(sys.stderr, '\033[01;31m'))

        remote.start()
        remote.wait()

        return remote.stdout, remote.stderr

Exemple #38

0

Afficher le fichier

    def execute_job(self, job, node=None, verbose=True):
        """Submit a Spark job with spark-submit from a node of the cluster.

        The cluster is started first if it is not running. The files required
        by the job are copied to /tmp in the node before the submission, and
        the outputs and success of the process are stored in the job object.

        Args:
          job (SparkJob):
            The job object.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          verbose (bool, optional):
            If True stdout and stderr of remote process is displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the job.
        """

        if not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        target = self.master if node is None else node
        work_dir = "/tmp"

        # Upload whatever the job needs before asking for its command line
        Put([target], job.get_files_to_copy(), work_dir).run()

        submit_line = (self.bin_dir + "/spark-submit " +
                       job.get_command(work_dir))

        logger.info("Executing spark job. Command = {" + submit_line +
                    "} in " + str(target))

        submission = SshProcess(submit_line, target)

        if verbose:
            # stderr is wrapped with the red ANSI color code
            submission.stdout_handlers.append(sys.stdout)
            submission.stderr_handlers.append(
                ColorDecorator(sys.stderr, '\033[01;31m'))

        submission.start()
        submission.wait()

        # Keep the results of the execution in the job itself
        job.stdout = submission.stdout
        job.stderr = submission.stderr
        job.success = (submission.exit_code == 0)

        return submission.stdout, submission.stderr

Exemple #39

0

Afficher le fichier

Fichier : cluster_v2.py Projet : sarlam/hadoop_g5k

    def copy_history(self, dest, job_ids=None):
        """Copy history logs from dfs.

        The logs are first retrieved from the dfs into a temporary dir of the
        master and then copied from the master to the local destination.

        Args:
          dest (str):
            The path of the local dir where the logs will be copied. It is
            created if it does not exist.
          job_ids (list of str, optional):
            A list with the ids of the jobs for which the history should be
            copied. If nothing is passed, the history of all jobs is copied.
        """

        if not os.path.exists(dest):
            logger.warning("Destination directory " + dest +
                           " does not exist. It will be created")
            os.makedirs(dest)

        # Source dir in the dfs (per user) and staging dir in the master
        dfs_history = ("/tmp/hadoop-yarn/staging/history/done_intermediate/" +
                       getpass.getuser())
        master_tmp = "/tmp/hadoop_hist"

        # Start from a clean staging dir
        SshProcess("rm -rf " + master_tmp, self.master).run()

        if not job_ids:
            # Whole history dir: "fs -get" creates the staging dir itself
            self.execute("fs -get " + dfs_history + " " + master_tmp,
                         verbose=False)
        else:
            # Only the files whose name starts with one of the given ids
            SshProcess("mkdir " + master_tmp, self.master).run()
            for jid in job_ids:
                self.execute(
                    "fs -get " + dfs_history + "/" + jid + "* " + master_tmp,
                    verbose=False)

        # Bring the staging dir from the master to the local destination
        Get([self.master], [master_tmp], dest).run()

Exemple #40

0

Afficher le fichier

    def execute(self,
                command,
                node=None,
                should_be_running=True,
                verbose=True):
        """Run a Hadoop command through ssh and return its outputs.

        Args:
          command (str):
            The arguments appended to the hadoop executable of self.bin_dir.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          should_be_running (bool, optional):
            True if the cluster needs to be running in order to execute the
            command. If so, and it is not running, it is automatically started.
          verbose (bool, optional):
            If True the command is logged and stdout and stderr of the remote
            process are displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the command.
        """

        self._check_initialization()

        if should_be_running and not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        target = node if node else self.master
        hadoop_call = self.bin_dir + "/hadoop " + command

        if verbose:
            logger.info("Executing {" + hadoop_call + "} in " + str(target))

        remote = SshProcess(hadoop_call, target)

        if verbose:
            # stderr is wrapped with the red ANSI color code
            remote.stdout_handlers.append(sys.stdout)
            remote.stderr_handlers.append(
                ColorDecorator(sys.stderr, '\033[01;31m'))

        remote.start()
        remote.wait()

        return remote.stdout, remote.stderr

Exemple #41

0

Afficher le fichier

Fichier : hive.py Projet : rwfazul/hadoop_g5k

    def _copy_base_conf(self):
        """Copy base configuration files to tmp dir.

        A local temporary dir is created and stored in self.temp_conf_dir.
        The files of self.local_base_conf_dir, if it exists, are copied into
        it. Every mandatory file that is not provided locally is retrieved
        from the conf dir of the master when it is listed there, and created
        with create_xml_file otherwise.
        """

        self.temp_conf_dir = tempfile.mkdtemp("", "hive-", "/tmp")
        if os.path.exists(self.local_base_conf_dir):
            base_conf_files = [
                os.path.join(self.local_base_conf_dir, f)
                for f in os.listdir(self.local_base_conf_dir)
            ]
            for f in base_conf_files:
                shutil.copy(f, self.temp_conf_dir)
        else:
            logger.warn(
                "Local conf dir does not exist. Using default configuration")
            base_conf_files = []

        mandatory_files = ["hive-site.xml"]

        # Mandatory files not provided locally. A new list is built so that
        # mandatory_files is not modified through an alias.
        provided_files = set(os.path.basename(f) for f in base_conf_files)
        missing_conf_files = [f for f in mandatory_files
                              if f not in provided_files]

        # Copy or create mandatory files
        action = SshProcess("ls -1 " + self.conf_dir, self.master)
        action.run()
        # Compare against whole file names: a substring test on the raw
        # output would also accept e.g. "hive-site.xml.template".
        files_in_conf_dir = set(line.strip()
                                for line in action.stdout.splitlines())

        remote_missing_files = []
        for f in missing_conf_files:
            if f in files_in_conf_dir:
                remote_missing_files.append(os.path.join(self.conf_dir, f))
            else:
                create_xml_file(os.path.join(self.temp_conf_dir, f))

        if remote_missing_files:
            logger.info("Copying missing conf files from master: " +
                        str(remote_missing_files))

            action = Get([self.master], remote_missing_files,
                         self.temp_conf_dir)
            action.run()

Exemple #42

0

Afficher le fichier

Fichier : dataset.py Projet : sarlam/hadoop_g5k

        def copy_function(host, files_to_copy, collector=None):
            """Upload files to a host and put them into the dfs from there.

            Args:
              host:
                The host used as intermediate step.
              files_to_copy (list of str):
                The local paths of the files to upload.
              collector (optional):
                If provided, its increment method is called with the
                accumulated size of the files after the pre-load function.
            """

            # Stage the files in the tmp dir of the host
            Put([host], files_to_copy, tmp_dir).run()

            size_after_preload = 0

            for local_file in files_to_copy:
                staged_file = os.path.join(tmp_dir,
                                           os.path.basename(local_file))
                if self.pre_load_function:
                    # The pre-load function returns the path to be loaded
                    staged_file = self.pre_load_function(staged_file, host)

                    # Size is only measured for pre-loaded files
                    du = SshProcess("du -b " + staged_file + "| cut -f1",
                                    host)
                    du.run()

                    size_after_preload += int(du.stdout.strip())

                dfs_target = os.path.join(dest, os.path.basename(staged_file))
                hc.execute("fs -put " + staged_file + " " + dfs_target,
                           host, True, False)

            if collector:
                collector.increment(size_after_preload)

Exemple #43

0

Afficher le fichier

Fichier : hive.py Projet : djamelinfo/hadoop_g5k

    def _copy_base_conf(self):
        """Copy base configuration files to tmp dir.

        A local temporary dir is created and stored in self.temp_conf_dir.
        The files of self.local_base_conf_dir, if it exists, are copied into
        it. Every mandatory file that is not provided locally is retrieved
        from the conf dir of the master when it is listed there, and created
        with create_xml_file otherwise.
        """

        self.temp_conf_dir = tempfile.mkdtemp("", "hive-", "/tmp")
        if os.path.exists(self.local_base_conf_dir):
            base_conf_files = [os.path.join(self.local_base_conf_dir, f)
                               for f in os.listdir(self.local_base_conf_dir)]
            for f in base_conf_files:
                shutil.copy(f, self.temp_conf_dir)
        else:
            logger.warn(
                "Local conf dir does not exist. Using default configuration")
            base_conf_files = []

        mandatory_files = ["hive-site.xml"]

        # Mandatory files not provided locally. A new list is built so that
        # mandatory_files is not modified through an alias.
        provided_files = set(os.path.basename(f) for f in base_conf_files)
        missing_conf_files = [f for f in mandatory_files
                              if f not in provided_files]

        # Copy or create mandatory files
        action = SshProcess("ls -1 " + self.conf_dir, self.master)
        action.run()
        # Compare against whole file names: a substring test on the raw
        # output would also accept e.g. "hive-site.xml.template".
        files_in_conf_dir = set(line.strip()
                                for line in action.stdout.splitlines())

        remote_missing_files = []
        for f in missing_conf_files:
            if f in files_in_conf_dir:
                remote_missing_files.append(os.path.join(self.conf_dir, f))
            else:
                create_xml_file(os.path.join(self.temp_conf_dir, f))

        if remote_missing_files:
            logger.info("Copying missing conf files from master: " + str(
                remote_missing_files))

            action = Get([self.master], remote_missing_files,
                         self.temp_conf_dir)
            action.run()

Exemple #44

0

Afficher le fichier

Fichier : cluster_v2.py Projet : mliroz/bigdata_dpy

    def copy_history(self, dest, job_ids=None):
        """Copy history logs from dfs.

        The logs are first retrieved from the dfs into a temporary dir of the
        master and then copied from the master to the local destination.

        Args:
          dest (str):
            The path of the local dir where the logs will be copied. It is
            created if it does not exist.
          job_ids (list of str, optional):
            A list with the ids of the jobs for which the history should be
            copied. If nothing is passed, the history of all jobs is copied.
        """

        if not os.path.exists(dest):
            logger.warning("Destination directory " + dest +
                           " does not exist. It will be created")
            os.makedirs(dest)

        # Source dir in the dfs (per user) and staging dir in the master
        dfs_history = ("/tmp/hadoop-yarn/staging/history/done_intermediate/" +
                       getpass.getuser())
        master_tmp = "/tmp/hadoop_hist"

        # Start from a clean staging dir
        SshProcess("rm -rf " + master_tmp, self.master).run()

        if not job_ids:
            # Whole history dir: "fs -get" creates the staging dir itself
            self.execute("fs -get " + dfs_history + " " + master_tmp,
                         verbose=False)
        else:
            # Only the files whose name starts with one of the given ids
            SshProcess("mkdir " + master_tmp, self.master).run()
            for jid in job_ids:
                self.execute(
                    "fs -get " + dfs_history + "/" + jid + "* " + master_tmp,
                    verbose=False)

        # Bring the staging dir from the master to the local destination
        Get([self.master], [master_tmp], dest).run()

Exemple #45

0

Afficher le fichier

Fichier : spark.py Projet : djamelinfo/hadoop_g5k

    def __force_clean(self):
        """Stop previous Spark processes (if any) and clean the logs.

        jps is executed in every host and the processes reported as "Master"
        or "Worker" are killed with "kill -9". self.clean_logs() is called
        afterwards in any case.
        """

        spark_processes = [
            "Master",
            "Worker"
        ]

        force_kill = False
        for h in self.hosts:
            proc = SshProcess("jps", h)
            proc.run()

            # Each line of jps is expected to be "<pid> <name>". The name is
            # optional in the output of jps, so lines with fewer than two
            # fields (or blank lines) are skipped instead of raising an
            # IndexError.
            ids_to_kill = []
            for line in proc.stdout.splitlines():
                field = line.split()
                if len(field) > 1 and field[1] in spark_processes:
                    ids_to_kill.append(field[0])

            if ids_to_kill:
                force_kill = True

                logger.warn(
                    "Killing running Spark processes in host %s" %
                    style.host(h.address.split('.')[0]))

                proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous hadoop deployments had to be killed")

        self.clean_logs()

Exemple #46

0

Afficher le fichier

    def execute(self, command, node=None, verbose=True):
        """Run a mahout command through ssh and return its outputs.

        Args:
          command (str):
            The arguments appended to the mahout executable of self.bin_dir.
          node (optional):
            The host where the command is launched. If not provided,
            self.hc.master is chosen.
          verbose (bool, optional):
            If True the command is logged and the stdout and stderr of the
            remote process are forwarded to the local ones.

        Returns (tuple):
          The stdout and stderr attributes of the finished process.
        """

        target = node if node else self.hc.master
        mahout_call = self.bin_dir + "/mahout " + command

        if verbose:
            logger.info("Executing {" + mahout_call + "} in " + str(target))

        # JAVA_HOME and HADOOP_HOME are exported in the same remote shell
        # that launches mahout
        env_setup = ("export JAVA_HOME='" + self.hc.java_home + "';" +
                     "export HADOOP_HOME='" + self.hc.base_dir + "';")
        remote = SshProcess(env_setup + mahout_call, target)

        if verbose:
            # stderr is wrapped with the red ANSI color code
            remote.stdout_handlers.append(sys.stdout)
            remote.stderr_handlers.append(
                ColorDecorator(sys.stderr, '\033[01;31m'))

        remote.start()
        remote.wait()

        return remote.stdout, remote.stderr

Exemple #47

0

Afficher le fichier

Fichier : deploy_execware.py Projet : nirvanesque/execo-g5k-tools

        if resources[c] > 1:
            wanted = {c: 1}
            break
    jobs_specs = get_jobs_specs(wanted, name=job_name)
    for sub, frontend in jobs_specs:
        sub.walltime = walltime
        sub.job_type = "deploy"
    job = oarsub(jobs_specs)[0]

nodes = get_oar_job_nodes(job[0], job[1])
logger.info('Deploying host %s', nodes[0].address)
deployed, undeployed = deploy(Deployment(nodes, env_name="jessie-x64-base"))

execware_host = list(deployed)[0]
logger.info('Installing required packages %s', style.emph(packages))
install_packages = SshProcess(
    'apt-get update && apt-get install -y ' + packages, execware_host).run()
logger.info('Copying files to host')
put_files = Put(execware_host, [source_code], remote_location="/tmp").run()

xml_file = """
<settings>
     <proxies>
      <proxy>
         <id>g5k-proxy</id>
         <active>true</active>
         <protocol>http</protocol>
         <host>proxy</host>
         <port>3128</port>
       </proxy>
      <proxy>
         <id>g5k-proxy-https</id>

Exemple #48

0

Afficher le fichier

    def bootstrap(self, tar_file):
        """Install Hadoop in all cluster nodes from the specified tar.gz file.

        Args:
          tar_file (str):
            The file containing Hadoop binaries.

        Returns:
          The value returned by self._check_version_compliance().
        """

        # 0. Check that required packages are present
        required_packages = "openjdk-7-jre openjdk-7-jdk"
        check_packages = TaktukRemote("dpkg -s " + required_packages,
                                      self.hosts)
        # NOTE(review): presumably silences the logging of the failure of
        # dpkg, which is handled right below - confirm against execo docs
        for p in check_packages.processes:
            p.nolog_exit_code = p.nolog_error = True
        check_packages.run()
        if not check_packages.ok:
            logger.info("Packages not installed, trying to install")
            # DEBIAN_FRONTEND is the variable read by debconf to select the
            # non-interactive mode (DEBIAN_MASTER has no effect)
            install_packages = TaktukRemote(
                "export DEBIAN_FRONTEND=noninteractive ; " +
                "apt-get update && apt-get install -y --force-yes " +
                required_packages, self.hosts).run()
            if not install_packages.ok:
                logger.error("Unable to install the packages")

        # Java home is obtained in the master by resolving the javac link and
        # removing the trailing /bin/javac
        get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | '
                                   'sed "s:/bin/javac::")', self.master)
        get_java_home.run()
        self.java_home = get_java_home.stdout.strip()

        logger.info("All required packages are present")

        # 1. Copy hadoop tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        rm_dirs = Remote("rm -rf " + self.base_dir +
                         " " + self.conf_dir +
                         " " + self.logs_dir +
                         " " + self.hadoop_temp_dir,
                         self.hosts)
        put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
        tar_xf = TaktukRemote(
            "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
            self.hosts)
        SequentialActions([rm_dirs, put_tar, tar_xf]).run()

        # 2. Move installation to base dir and create other dirs
        logger.info("Create installation directories")
        mv_base_dir = TaktukRemote(
            "mv /tmp/" +
            os.path.basename(tar_file).replace(".tar.gz", "") + " " +
            self.base_dir,
            self.hosts)
        mkdirs = TaktukRemote("mkdir -p " + self.conf_dir +
                              " && mkdir -p " + self.logs_dir +
                              " && mkdir -p " + self.hadoop_temp_dir,
                              self.hosts)
        chmods = TaktukRemote("chmod g+w " + self.base_dir +
                              " && chmod g+w " + self.conf_dir +
                              " && chmod g+w " + self.logs_dir +
                              " && chmod g+w " + self.hadoop_temp_dir,
                              self.hosts)
        SequentialActions([mv_base_dir, mkdirs, chmods]).run()

        # 3. Specify environment variables: the lines of the here-document
        # are appended to hadoop-env.sh in the conf dir of every host
        command = "cat >> " + self.conf_dir + "/hadoop-env.sh << EOF\n"
        command += "export JAVA_HOME=" + self.java_home + "\n"
        command += "export HADOOP_LOG_DIR=" + self.logs_dir + "\n"
        command += "HADOOP_HOME_WARN_SUPPRESS=\"TRUE\"\n"
        command += "EOF"
        action = Remote(command, self.hosts)
        action.run()

        # 4. Check version
        return self._check_version_compliance()

Exemple #49

0

Afficher le fichier

    def execute_job(self, job, node=None, verbose=True):
        """Execute the given MapReduce job in the specified node.

        The cluster is started first if it is not running. The files required
        by the job are copied to /tmp in the node before the execution. The
        outputs, the success and, when it can be parsed from the standard
        output, the id of the job are stored in the job object.

        Args:
          job (HadoopJarJob):
            The job object.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          verbose (bool, optional):
            If True stdout and stderr of remote process is displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the job.
        """

        self._check_initialization()

        if not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        if not node:
            node = self.master

        exec_dir = "/tmp"

        # Copy necessary files to cluster
        files_to_copy = job.get_files_to_copy()
        action = Put([node], files_to_copy, exec_dir)
        action.run()

        # Get command
        command = job.get_command(exec_dir)

        # Execute
        logger.info("Executing jar job. Command = {" + self.bin_dir +
                    "/hadoop " + command + "} in " + str(node))

        proc = SshProcess(self.bin_dir + "/hadoop " + command, node)

        if verbose:
            red_color = '\033[01;31m'

            proc.stdout_handlers.append(sys.stdout)
            proc.stderr_handlers.append(
                ColorDecorator(sys.stderr, red_color))

        proc.start()
        proc.wait()

        # Get job info
        job.stdout = proc.stdout
        job.stderr = proc.stderr
        job.success = (proc.exit_code == 0)

        # The job id is taken from the first line announcing the running job.
        # Lines which do not match the expected format are skipped explicitly
        # instead of relying on a bare except.
        for line in job.stdout.splitlines():
            if "Running job" in line:
                if "mapred.JobClient" in line or "mapreduce.Job" in line:
                    # TODO: more possible formats?
                    match = re.match('.*Running job: (.*)', line)
                    if match:
                        job.job_id = match.group(1)
                        break

        return (proc.stdout, proc.stderr)

Exemple #50

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def execute_job(self, job, node=None, verbose=True):
        """Execute the given MapReduce job in the specified node.

        The cluster is started first if it is not running. The files required
        by the job are copied to /tmp in the node before the execution. The
        outputs, the success and, when it can be parsed from the standard
        output, the id of the job are stored in the job object.

        Args:
          job (HadoopJarJob):
            The job object.
          node (Host, optional):
            The host where the command should be executed. If not provided,
            self.master is chosen.
          verbose (bool, optional):
            If True stdout and stderr of remote process is displayed.

        Returns (tuple of str):
          A tuple with the standard and error outputs of the process executing
          the job.
        """

        self._check_initialization()

        if not self.running:
            logger.warn("The cluster was stopped. Starting it automatically")
            self.start()

        if not node:
            node = self.master

        exec_dir = "/tmp"

        # Copy necessary files to cluster
        files_to_copy = job.get_files_to_copy()
        action = Put([node], files_to_copy, exec_dir)
        action.run()

        # Get command
        command = job.get_command(exec_dir)

        # Execute
        logger.info("Executing jar job. Command = {" + self.bin_dir +
                    "/hadoop " + command + "} in " + str(node))

        proc = SshProcess(self.bin_dir + "/hadoop " + command, node)

        if verbose:
            red_color = '\033[01;31m'

            proc.stdout_handlers.append(sys.stdout)
            proc.stderr_handlers.append(
                ColorDecorator(sys.stderr, red_color))

        proc.start()
        proc.wait()

        # Get job info
        job.stdout = proc.stdout
        job.stderr = proc.stderr
        job.success = (proc.exit_code == 0)

        # The job id is taken from the first line announcing the running job.
        # Lines which do not match the expected format are skipped explicitly
        # instead of relying on a bare except.
        for line in job.stdout.splitlines():
            if "Running job" in line:
                if "mapred.JobClient" in line or "mapreduce.Job" in line:
                    # TODO: more possible formats?
                    match = re.match('.*Running job: (.*)', line)
                    if match:
                        job.job_id = match.group(1)
                        break

        return (proc.stdout, proc.stderr)

Exemple #51

0

Afficher le fichier

Fichier : cluster.py Projet : lmolina/hadoop_g5k

    def bootstrap(self, tar_file):
        """Install Hadoop in all cluster nodes from the specified tar.gz file.

        Args:
          tar_file (str):
            The file containing Hadoop binaries.

        Returns:
          The value returned by self._check_version_compliance().
        """

        # 0. Check that required packages are present
        required_packages = "openjdk-7-jre openjdk-7-jdk"
        check_packages = TaktukRemote("dpkg -s " + required_packages,
                                      self.hosts)
        # NOTE(review): presumably silences the logging of the failure of
        # dpkg, which is handled right below - confirm against execo docs
        for p in check_packages.processes:
            p.nolog_exit_code = p.nolog_error = True
        check_packages.run()
        if not check_packages.ok:
            logger.info("Packages not installed, trying to install")
            # DEBIAN_FRONTEND is the variable read by debconf to select the
            # non-interactive mode (DEBIAN_MASTER has no effect)
            install_packages = TaktukRemote(
                "export DEBIAN_FRONTEND=noninteractive ; " +
                "apt-get update && apt-get install -y --force-yes " +
                required_packages, self.hosts).run()
            if not install_packages.ok:
                logger.error("Unable to install the packages")

        # Java home is obtained in the master by resolving the javac link and
        # removing the trailing /bin/javac
        get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | '
                                   'sed "s:/bin/javac::")', self.master)
        get_java_home.run()
        self.java_home = get_java_home.stdout.strip()

        logger.info("All required packages are present")

        # 1. Copy hadoop tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        rm_dirs = Remote("rm -rf " + self.base_dir +
                         " " + self.conf_dir +
                         " " + self.logs_dir +
                         " " + self.hadoop_temp_dir,
                         self.hosts)
        put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
        tar_xf = TaktukRemote(
            "tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp",
            self.hosts)
        SequentialActions([rm_dirs, put_tar, tar_xf]).run()

        # 2. Move installation to base dir and create other dirs
        logger.info("Create installation directories")
        mv_base_dir = TaktukRemote(
            "mv /tmp/" +
            os.path.basename(tar_file).replace(".tar.gz", "") + " " +
            self.base_dir,
            self.hosts)
        mkdirs = TaktukRemote("mkdir -p " + self.conf_dir +
                              " && mkdir -p " + self.logs_dir +
                              " && mkdir -p " + self.hadoop_temp_dir,
                              self.hosts)
        chmods = TaktukRemote("chmod g+w " + self.base_dir +
                              " && chmod g+w " + self.conf_dir +
                              " && chmod g+w " + self.logs_dir +
                              " && chmod g+w " + self.hadoop_temp_dir,
                              self.hosts)
        SequentialActions([mv_base_dir, mkdirs, chmods]).run()

        # 3. Specify environment variables: the lines of the here-document
        # are appended to hadoop-env.sh in the conf dir of every host
        command = "cat >> " + self.conf_dir + "/hadoop-env.sh << EOF\n"
        command += "export JAVA_HOME=" + self.java_home + "\n"
        command += "export HADOOP_LOG_DIR=" + self.logs_dir + "\n"
        command += "HADOOP_HOME_WARN_SUPPRESS=\"TRUE\"\n"
        command += "EOF"
        action = Remote(command, self.hosts)
        action.run()

        # 4. Check version
        return self._check_version_compliance()