Example No. 1
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        print("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(
                localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
                remotepath='/tmp/download-hadoop.py')

        ssh_check_output(
            client=ssh_client,
            command="""
                set -e

                python /tmp/download-hadoop.py "{version}"

                mkdir "hadoop"
                mkdir "hadoop/conf"

                tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
                rm "hadoop-{version}.tar.gz"
            """.format(version=self.version))
Example No. 2
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):

        print("[{h}] Installing Spark...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        try:
            if self.version:
                with ssh_client.open_sftp() as sftp:
                    sftp.put(
                        localpath=os.path.join(SCRIPTS_DIR, 'install-spark.sh'),
                        remotepath='/tmp/install-spark.sh')
                    sftp.chmod(path='/tmp/install-spark.sh', mode=0o755)
                url = self.download_source.format(v=self.version)
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        /tmp/install-spark.sh {url}
                        rm -f /tmp/install-spark.sh
                    """.format(url=shlex.quote(url)))
            else:
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        sudo yum install -y git
                        sudo yum install -y java-devel
                        """)
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        git clone {repo} spark
                        cd spark
                        git reset --hard {commit}
                        if [ -e "make-distribution.sh" ]; then
                            ./make-distribution.sh -Phadoop-2.6
                        else
                            ./dev/make-distribution.sh -Phadoop-2.6
                        fi
                    """.format(
                        repo=shlex.quote(self.git_repository),
                        commit=shlex.quote(self.git_commit)))
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
                        sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                    done
                    echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
                """)
        except Exception as e:
            # TODO: This should be a more specific exception.
            print("Error: Failed to install Spark.", file=sys.stderr)
            print(e, file=sys.stderr)
            raise
Example No. 3
    def install(self, ssh_client: paramiko.client.SSHClient,
                cluster: FlintrockCluster):
        logger.info("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
                     remotepath='/tmp/download-hadoop.py')

        ssh_check_output(client=ssh_client,
                         command="""
                set -e

                python /tmp/download-hadoop.py "{version}" "{download_source}"

                mkdir "hadoop"
                mkdir "hadoop/conf"

                tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
                rm "hadoop-{version}.tar.gz"

                for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done
                echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
            """.format(version=self.version,
                       download_source=self.download_source))
Example No. 4
    def install(
        self,
        ssh_client: paramiko.client.SSHClient,
        cluster: FlintrockCluster,
    ):
        logger.info("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(localpath=os.path.join(SCRIPTS_DIR,
                                            'download-package.py'),
                     remotepath='/tmp/download-package.py')

        logger.debug("[{h}] Downloading Hadoop from: {s}".format(
            h=ssh_client.get_transport().getpeername()[0],
            s=self.download_source,
        ))

        ssh_check_output(
            client=ssh_client,
            command="""
                set -e

                python /tmp/download-package.py "{download_source}" "hadoop"

                for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done

                echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
            """.format(
                # version=self.version,
                download_source=self.download_source.format(v=self.version), ))
Example No. 5
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        logger.info("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(
                localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
                remotepath='/tmp/download-package.py')

        ssh_check_output(
            client=ssh_client,
            command="""
                set -e

                python /tmp/download-package.py "{download_source}" "hadoop"

                for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done

                echo "export HADOOP_LIBEXEC_DIR='$(pwd)/hadoop/libexec'" >> .bashrc
            """.format(
                version=self.version,
                download_source=self.download_source.format(v=self.version),
            ))
Example No. 6
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        print("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(
                localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
                remotepath='/tmp/download-hadoop.py')

        ssh_check_output(
            client=ssh_client,
            command="""
                set -e

                python /tmp/download-hadoop.py "{version}" "{download_source}"

                mkdir "hadoop"
                mkdir "hadoop/conf"

                tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
                rm "hadoop-{version}.tar.gz"

                for f in $(find hadoop/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done
            """.format(version=self.version, download_source=self.download_source))
Example No. 7
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        print("[{h}] Installing HDFS...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        with ssh_client.open_sftp() as sftp:
            sftp.put(
                localpath=os.path.join(SCRIPTS_DIR, 'download-hadoop.py'),
                remotepath='/tmp/download-hadoop.py')

        ssh_check_output(
            client=ssh_client,
            command="""
                set -e

                python /tmp/download-hadoop.py "{version}" "{download_source}"

                mkdir "hadoop"
                mkdir "hadoop/conf"

                tar xzf "hadoop-{version}.tar.gz" -C "hadoop" --strip-components=1
                rm "hadoop-{version}.tar.gz"
            """.format(version=self.version, download_source=self.download_source))
Example No. 8
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        logger.info("[{h}] Installing Spark...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        if self.version:
            with ssh_client.open_sftp() as sftp:
                sftp.put(
                    localpath=os.path.join(SCRIPTS_DIR, 'download-package.py'),
                    remotepath='/tmp/download-package.py')

            ssh_check_output(
                client=ssh_client,
                command="""
                    python /tmp/download-package.py "{download_source}" "spark"
                """.format(
                    version=self.version,
                    download_source=self.download_source.format(v=self.version),
                ))

        else:
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    sudo yum install -y git
                    sudo yum install -y java-devel
                    """)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    git clone {repo} spark
                    cd spark
                    git reset --hard {commit}
                    if [ -e "make-distribution.sh" ]; then
                        ./make-distribution.sh -Phadoop-{hadoop_short_version}
                    else
                        ./dev/make-distribution.sh -Phadoop-{hadoop_short_version}
                    fi
                """.format(
                    repo=shlex.quote(self.git_repository),
                    commit=shlex.quote(self.git_commit),
                    # Hardcoding this here until we figure out a better way to handle
                    # the supported build profiles.
                    hadoop_short_version='2.7',
                ))
        ssh_check_output(
            client=ssh_client,
            command="""
                set -e
                for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done
                echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
            """)
Example No. 9
def setup_node(
        *,
        # Change this to take host, user, and identity_file?
        # Add some kind of caching for SSH connections so that they
        # can be looked up by host and reused?
        ssh_client: paramiko.client.SSHClient,
        services: list,
        java_version: int,
        cluster: FlintrockCluster):
    """
    Setup a new node.

    Cluster methods like provision_node() and add_slaves_node() should
    delegate the main work of setting up new nodes to this function.
    """
    host = ssh_client.get_transport().getpeername()[0]
    ssh_check_output(client=ssh_client,
                     command="""
            set -e

            echo {private_key} > "$HOME/.ssh/id_rsa"
            echo {public_key} >> "$HOME/.ssh/authorized_keys"

            chmod 400 "$HOME/.ssh/id_rsa"
        """.format(private_key=shlex.quote(cluster.ssh_key_pair.private),
                   public_key=shlex.quote(cluster.ssh_key_pair.public)))

    with ssh_client.open_sftp() as sftp:
        sftp.put(localpath=os.path.join(SCRIPTS_DIR,
                                        'setup-ephemeral-storage.py'),
                 remotepath='/tmp/setup-ephemeral-storage.py')

    logger.info("[{h}] Configuring ephemeral storage...".format(h=host))
    # TODO: Print some kind of warning if storage is large, since formatting
    #       will take several minutes (~4 minutes for 2TB).
    storage_dirs_raw = ssh_check_output(client=ssh_client,
                                        command="""
            set -e
            python /tmp/setup-ephemeral-storage.py
            rm -f /tmp/setup-ephemeral-storage.py
        """)
    storage_dirs = json.loads(storage_dirs_raw)

    cluster.storage_dirs.root = storage_dirs['root']
    cluster.storage_dirs.ephemeral = storage_dirs['ephemeral']

    ensure_java(ssh_client, java_version)

    for service in services:
        try:
            service.install(
                ssh_client=ssh_client,
                cluster=cluster,
            )
        except Exception as e:
            raise Exception("Failed to install {}.".format(
                type(service).__name__)) from e
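
The docstring above notes that cluster methods like provision_node() and add_slaves_node() should delegate node setup to this function. A minimal sketch of such a caller is shown below; the provision_node() signature and the connection handling are illustrative assumptions, not the actual API.

def provision_node(
        *,
        host: str,
        user: str,
        identity_file: str,
        services: list,
        java_version: int,
        cluster: FlintrockCluster):
    # Hypothetical caller; only setup_node() above is real, the rest is assumed.
    client = paramiko.client.SSHClient()
    client.set_missing_host_key_policy(paramiko.client.AutoAddPolicy())
    client.connect(hostname=host, username=user, key_filename=identity_file)
    try:
        setup_node(
            ssh_client=client,
            services=services,
            java_version=java_version,
            cluster=cluster,
        )
    finally:
        client.close()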
Example No. 10
    def install(self, ssh_client: paramiko.client.SSHClient,
                cluster: FlintrockCluster):
        logger.info("[{h}] Installing Spark...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        if self.version:
            with ssh_client.open_sftp() as sftp:
                sftp.put(localpath=os.path.join(SCRIPTS_DIR,
                                                'download-package.py'),
                         remotepath='/tmp/download-package.py')

            ssh_check_output(client=ssh_client,
                             command="""
                    python /tmp/download-package.py "{download_source}" "spark"
                """.format(
                                 version=self.version,
                                 download_source=self.download_source.format(
                                     v=self.version),
                             ))

        else:
            ssh_check_output(client=ssh_client,
                             command="""
                    set -e
                    sudo yum install -y git
                    sudo yum install -y java-devel
                    """)
            ssh_check_output(
                client=ssh_client,
                command="""
                    set -e
                    git clone {repo} spark
                    cd spark
                    git reset --hard {commit}
                    if [ -e "make-distribution.sh" ]; then
                        ./make-distribution.sh -Phadoop-{hadoop_short_version}
                    else
                        ./dev/make-distribution.sh -Phadoop-{hadoop_short_version}
                    fi
                """.format(
                    repo=shlex.quote(self.git_repository),
                    commit=shlex.quote(self.git_commit),
                    # Hardcoding this here until we figure out a better way to handle
                    # the supported build profiles.
                    hadoop_short_version='2.7',
                ))
        ssh_check_output(client=ssh_client,
                         command="""
                set -e
                for f in $(find spark/bin -type f -executable -not -name '*.cmd'); do
                    sudo ln -s "$(pwd)/$f" "/usr/local/bin/$(basename $f)"
                done
                echo "export SPARK_HOME='$(pwd)/spark'" >> .bashrc
            """)
Example No. 11
def setup_node(
        *,
        # Change this to take host, user, and identity_file?
        # Add some kind of caching for SSH connections so that they
        # can be looked up by host and reused?
        ssh_client: paramiko.client.SSHClient,
        services: list,
        cluster: FlintrockCluster):
    """
    Setup a new node.

    Cluster methods like provision_node() and add_slaves_node() should
    delegate the main work of setting up new nodes to this function.
    """
    host = ssh_client.get_transport().getpeername()[0]
    ssh_check_output(
        client=ssh_client,
        command="""
            set -e

            echo {private_key} > "$HOME/.ssh/id_rsa"
            echo {public_key} >> "$HOME/.ssh/authorized_keys"

            chmod 400 "$HOME/.ssh/id_rsa"
        """.format(
            private_key=shlex.quote(cluster.ssh_key_pair.private),
            public_key=shlex.quote(cluster.ssh_key_pair.public)))

    with ssh_client.open_sftp() as sftp:
        sftp.put(
            localpath=os.path.join(SCRIPTS_DIR, 'setup-ephemeral-storage.py'),
            remotepath='/tmp/setup-ephemeral-storage.py')

    logger.info("[{h}] Configuring ephemeral storage...".format(h=host))
    # TODO: Print some kind of warning if storage is large, since formatting
    #       will take several minutes (~4 minutes for 2TB).
    storage_dirs_raw = ssh_check_output(
        client=ssh_client,
        command="""
            set -e
            python /tmp/setup-ephemeral-storage.py
            rm -f /tmp/setup-ephemeral-storage.py
        """)
    storage_dirs = json.loads(storage_dirs_raw)

    cluster.storage_dirs.root = storage_dirs['root']
    cluster.storage_dirs.ephemeral = storage_dirs['ephemeral']

    ensure_java8(ssh_client)

    for service in services:
        service.install(
            ssh_client=ssh_client,
            cluster=cluster)
Example No. 12
    def install(
            self,
            ssh_client: paramiko.client.SSHClient,
            cluster: FlintrockCluster):
        # TODO: Allow users to specify the Spark "distribution". (?)
        distribution = 'hadoop2.6'

        print("[{h}] Installing Spark...".format(
            h=ssh_client.get_transport().getpeername()[0]))

        try:
            if self.version:
                with ssh_client.open_sftp() as sftp:
                    sftp.put(
                        localpath=os.path.join(SCRIPTS_DIR, 'install-spark.sh'),
                        remotepath='/tmp/install-spark.sh')
                    sftp.chmod(path='/tmp/install-spark.sh', mode=0o755)
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        /tmp/install-spark.sh {spark_version} {distribution}
                        rm -f /tmp/install-spark.sh
                    """.format(
                            spark_version=shlex.quote(self.version),
                            distribution=shlex.quote(distribution)))
            else:
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        sudo yum install -y git
                        sudo yum install -y java-devel
                        """)
                ssh_check_output(
                    client=ssh_client,
                    command="""
                        set -e
                        git clone {repo} spark
                        cd spark
                        git reset --hard {commit}
                        if [ -e "make-distribution.sh" ]; then
                            ./make-distribution.sh -Phadoop-2.6
                        else
                            ./dev/make-distribution.sh -Phadoop-2.6
                        fi
                    """.format(
                        repo=shlex.quote(self.git_repository),
                        commit=shlex.quote(self.git_commit)))
        except Exception as e:
            # TODO: This should be a more specific exception.
            print("Error: Failed to install Spark.", file=sys.stderr)
            print(e, file=sys.stderr)
            raise
Example No. 13
def uploadFile(sourcePath: str,
               targetPath: str,
               sshc: paramiko.client.SSHClient,
               compress_method: str = None,
               verbose: bool = True) -> pathlib.Path:
    def show_progress(filename, size, sent):
        print(f"Uploading {filename} progress: " +
              f"{float(sent)/float(size)*100:.2f}%",
              end="\r")

    progress = show_progress if verbose else None

    try:
        if compress_method:
            fileName = pathlib.Path(sourcePath).name
            # change targetPath for uploading to
            # targetPath's directory / sourcePath's name + ext.
            targetPath = pathlib.Path(
                str(pathlib.Path(targetPath).parent / fileName) + "." +
                compress_method)
            sourcePath = archiveFile(sourcePath,
                                     verbose=verbose,
                                     method=compress_method)
            isArchived = True

        with scp.SCPClient(sshc.get_transport(), progress=progress) as scpc:
            # in case Path is PosixPath, casting them to str
            scpc.put(str(sourcePath), str(targetPath))
            print("\n")  # nextline

        if compress_method:
            unarchiveSSH(targetPath,
                         sshc,
                         method=compress_method,
                         verbose=verbose)
            isUnarchived = True
            # change targetPath to uploaded raw file
            uploadedPath = str(pathlib.Path(targetPath).parent / fileName)
    finally:  # delete archive files
        if 'isArchived' in locals():
            with verbosity_context(f"Deleting archive {sourcePath}", verbose):
                os.remove(sourcePath)
        if 'isUnarchived' in locals():
            with verbosity_context(f"Deleting archive {targetPath} via SFTP",
                                   verbose):
                with sshc.open_sftp() as sftp:
                    sftp.remove(str(targetPath))

    return uploadedPath if compress_method else targetPath
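
A possible call site for uploadFile, assuming client is an already-connected paramiko.client.SSHClient and that archiveFile/unarchiveSSH accept a method value such as "gzip"; the paths and the method name are placeholders.

# Illustrative usage only; the paths and the "gzip" method value are assumptions.
# With compress_method set, the file is archived locally, copied over SCP,
# unpacked on the remote host, and the intermediate archives are cleaned up.
remote_path = uploadFile(
    sourcePath="data/large_input.csv",
    targetPath="/remote/data/large_input.csv",
    sshc=client,
    compress_method="gzip",
    verbose=True,
)
print(f"Uploaded file available at {remote_path}")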