Example #1
 def __setup_path( self ):
     globally = True
     if globally:
         with remote_open( '/etc/environment', use_sudo=True ) as f:
             new_path = [ fmt( '{install_dir}/{package}/bin' )
                 for package in ('spark', 'hadoop') ]
             self.__patch_etc_environment( f, new_path )
     else:
         for _user in (user, self.admin_account( )):
             with settings( user=_user ):
                 with remote_open( '~/.profile' ) as f:
                     f.write( '\n' )
                     for package in ('spark', 'hadoop'):
                         # We don't include sbin here because too many file names collide in
                         # Spark's and Hadoop's sbin
                         f.write( fmt( 'PATH="$PATH:{install_dir}/{package}/bin"\n' ) )
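
The fmt() helper used throughout these examples is not defined in the snippets. A minimal stand-in is sketched below, assuming fmt() merely substitutes {name} placeholders from the caller's local and global variables; the project's real helper may behave differently.

import sys

def fmt(template):
    # Hypothetical stand-in for the project's fmt() helper: resolve {name}
    # placeholders against the caller's locals and globals, like str.format()
    # driven by the calling frame's namespace.
    frame = sys._getframe(1)
    namespace = dict(frame.f_globals)
    namespace.update(frame.f_locals)
    return template.format(**namespace)

# Usage:
#   install_dir, package = '/opt', 'spark'
#   fmt('{install_dir}/{package}/bin')  ->  '/opt/spark/bin'
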
Example #2
 def __setup_ssh_config(self):
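     # For the 'spark-master' alias, disable the known_hosts IP check and keep
     # known_hosts entries unhashed (presumably so the alias keeps working when
     # the master's address changes).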
     with remote_open('/etc/ssh/ssh_config', use_sudo=True) as f:
         f.write(
             heredoc("""
             Host spark-master
                 CheckHostIP no
                 HashKnownHosts no"""))
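
The heredoc() helper is likewise external to the snippet. The sketch below assumes it only strips the newline after the opening triple quote and dedents the block; any placeholder interpolation the real helper performs is omitted.

import textwrap

def heredoc(text):
    # Hypothetical stand-in: drop the leading newline, remove the common
    # indentation, and end with a newline so the block can be appended to a
    # config file verbatim.
    if text.startswith('\n'):
        text = text[1:]
    return textwrap.dedent(text) + '\n'
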
Example #3
 def __setup_path( self ):
     globally = True
     if globally:
         with remote_open( '/etc/environment', use_sudo=True ) as f:
             new_path = [ fmt( '{install_dir}/{package}/bin' )
                 for package in ('spark', 'hadoop') ]
             self._patch_etc_environment( f, dirs=new_path )
     else:
         for _user in (user, self.admin_account( )):
             with settings( user=_user ):
                 with remote_open( '~/.profile' ) as f:
                     f.write( '\n' )
                     for package in ('spark', 'hadoop'):
                         # We don't include sbin here because too many file names collide in
                         # Spark's and Hadoop's sbin
                         f.write( fmt( 'PATH="$PATH:{install_dir}/{package}/bin"\n' ) )
Example #4
    def __install_spark(self):
        # Download and extract Spark
        path = fmt(
            'spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz'
        )
        self._install_apache_package(path, install_dir)

        spark_dir = var_dir + "/spark"

        # Add environment variables to spark_env.sh
        spark_env_sh_path = fmt("{install_dir}/spark/conf/spark-env.sh")
        sudo(fmt("cp {spark_env_sh_path}.template {spark_env_sh_path}"))
        spark_env = dict(
            SPARK_LOG_DIR=self._lazy_mkdir(log_dir, "spark"),
            SPARK_WORKER_DIR=self._lazy_mkdir(spark_dir, "work"),
            SPARK_LOCAL_DIRS=self._lazy_mkdir(spark_dir, "local"),
            JAVA_HOME='/usr/lib/jvm/java-8-oracle',
            SPARK_MASTER_IP='spark-master',
            HADOOP_CONF_DIR=fmt("{install_dir}/hadoop/etc/hadoop"))
        with remote_open(spark_env_sh_path, use_sudo=True) as spark_env_sh:
            spark_env_sh.write('\n')
            for name, value in spark_env.iteritems():
                spark_env_sh.write(fmt('export {name}="{value}"\n'))

        # Configure Spark properties
        spark_defaults = {
            'spark.eventLog.enabled': 'true',
            'spark.eventLog.dir': self._lazy_mkdir(spark_dir, "history"),
            'spark.master': 'spark://spark-master:7077'
        }
        spark_defaults_conf_path = fmt(
            "{install_dir}/spark/conf/spark-defaults.conf")
        sudo(fmt("cp {spark_defaults_conf_path}.template {spark_defaults_conf_path}"))
        with remote_open(spark_defaults_conf_path,
                         use_sudo=True) as spark_defaults_conf:
            for name, value in spark_defaults.iteritems():
                spark_defaults_conf.write(fmt("{name}\t{value}\n"))

        # Make shell auto completion easier
        sudo(fmt('find {install_dir}/spark -name "*.cmd" | xargs rm'))

        # Install upstart jobs
        self.__register_upstart_jobs(spark_services)
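
The self._lazy_mkdir() calls above return a directory path that is then written into the generated configuration. The method itself is not shown here; the following is only a guess at its minimal behaviour (create the directory on the remote host and return its path) and it ignores the persistent flag entirely.

import posixpath

from fabric.operations import sudo

def lazy_mkdir(parent, name, persistent=False):
    # Hypothetical stand-in for self._lazy_mkdir(): ensure {parent}/{name}
    # exists on the remote host and return the path for use in config files.
    # The real method presumably does more (ownership, persistent volumes).
    path = posixpath.join(parent, name)
    sudo('mkdir -p ' + path)
    return path
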
Example #5
    def __install_yarn(self):
        # Download and extract Hadoop
        path = fmt(
            'hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz'
        )
        self._install_apache_package(path, install_dir)

        # patch path
        with remote_open('/etc/environment', use_sudo=True) as f:
            yarn_path = fmt('{install_dir}/hadoop')
            self._patch_etc_environment(f,
                                        env_pairs=dict(HADOOP_HOME=yarn_path))
Example #6
    def __install_spark( self ):
        # Download and extract Spark
        path = fmt( 'spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz' )
        self._install_apache_package( path, install_dir )

        spark_dir = var_dir + "/spark"

        # Add environment variables to spark_env.sh
        spark_env_sh_path = fmt( "{install_dir}/spark/conf/spark-env.sh" )
        sudo( fmt( "cp {spark_env_sh_path}.template {spark_env_sh_path}" ) )
        spark_env = dict(
            SPARK_LOG_DIR=self._lazy_mkdir( log_dir, "spark" ),
            SPARK_WORKER_DIR=self._lazy_mkdir( spark_dir, "work" ),
            SPARK_LOCAL_DIRS=self._lazy_mkdir( spark_dir, "local" ),
            JAVA_HOME='/usr/lib/jvm/java-8-oracle',
            SPARK_MASTER_IP='spark-master',
            HADOOP_CONF_DIR=fmt( "{install_dir}/hadoop/etc/hadoop" ),
            SPARK_PUBLIC_DNS="$(curl -s http://169.254.169.254/latest/meta-data/public-hostname)" )
        with remote_open( spark_env_sh_path, use_sudo=True ) as spark_env_sh:
            spark_env_sh.write( '\n' )
            for name, value in spark_env.iteritems( ):
                spark_env_sh.write( fmt( 'export {name}="{value}"\n' ) )

        # Configure Spark properties
        spark_defaults = {
            'spark.eventLog.enabled': 'true',
            'spark.eventLog.dir': self._lazy_mkdir( spark_dir, "history" ),
            'spark.master': 'spark://spark-master:7077'
        }
        spark_defaults_conf_path = fmt( "{install_dir}/spark/conf/spark-defaults.conf" )
        sudo( fmt( "cp {spark_defaults_conf_path}.template {spark_defaults_conf_path}" ) )
        with remote_open( spark_defaults_conf_path, use_sudo=True ) as spark_defaults_conf:
            for name, value in spark_defaults.iteritems( ):
                spark_defaults_conf.write( fmt( "{name}\t{value}\n" ) )

        # Make shell auto completion easier
        sudo( fmt( 'find {install_dir}/spark -name "*.cmd" | xargs rm' ) )

        # Install upstart jobs
        self.__register_upstart_jobs( spark_services )
Example #7
    def __install_hadoop(self):
        # Download and extract Hadoop
        path = fmt(
            'hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz'
        )
        self._install_apache_package(path, install_dir)

        # Add environment variables to hadoop_env.sh
        hadoop_env = dict(HADOOP_LOG_DIR=self._lazy_mkdir(log_dir, "hadoop"),
                          JAVA_HOME='/usr/lib/jvm/java-8-oracle')
        hadoop_env_sh_path = fmt(
            "{install_dir}/hadoop/etc/hadoop/hadoop-env.sh")
        with remote_open(hadoop_env_sh_path, use_sudo=True) as hadoop_env_sh:
            hadoop_env_sh.write('\n')
            for name, value in hadoop_env.iteritems():
                hadoop_env_sh.write(fmt('export {name}="{value}"\n'))

        # Configure HDFS
        hdfs_dir = var_dir + "/hdfs"
        put(use_sudo=True,
            remote_path=fmt('{install_dir}/hadoop/etc/hadoop/hdfs-site.xml'),
            local_path=StringIO(
                self.__to_hadoop_xml_config({
                    'dfs.replication': str(hdfs_replication),
                    'dfs.permissions': 'false',
                    'dfs.name.dir': self._lazy_mkdir(hdfs_dir, 'name', persistent=True),
                    'dfs.data.dir': self._lazy_mkdir(hdfs_dir, 'data', persistent=True),
                    'fs.checkpoint.dir': self._lazy_mkdir(hdfs_dir, 'checkpoint', persistent=True),
                    'dfs.namenode.http-address': 'spark-master:50070',
                    'dfs.namenode.secondary.http-address': 'spark-master:50090'
                })))

        # Configure Hadoop
        put(use_sudo=True,
            remote_path=fmt('{install_dir}/hadoop/etc/hadoop/core-site.xml'),
            local_path=StringIO(
                self.__to_hadoop_xml_config(
                    {'fs.default.name': 'hdfs://spark-master:8020'})))

        # Make shell auto completion easier
        sudo(fmt('find {install_dir}/hadoop -name "*.cmd" | xargs rm'))

        # Install upstart jobs
        self.__register_upstart_jobs(hadoop_services)
Example #8
    def __install_spark(self):
        # Download and extract Spark
        path = fmt(
            'spark/spark-{spark_version}/spark-{spark_version}-bin-hadoop{spark_hadoop_version}.tgz'
        )
        self._install_apache_package(path, install_dir)

        # Patch paths
        with remote_open('/etc/environment', use_sudo=True) as f:
            spark_home = fmt('{install_dir}/spark')
            # These two PYTHONPATH entries are also added by the 'pyspark' wrapper script.
            # We need to replicate them globally because we want to be able to just do
            # 'import pyspark' in Toil's Spark service code and associated tests.
            python_path = [
                fmt('{spark_home}/python'),
                run(fmt('ls {spark_home}/python/lib/py4j-*-src.zip')).strip()
            ]
            self._patch_etc_environment(f,
                                        env_pairs=dict(SPARK_HOME=spark_home),
                                        dirs=python_path,
                                        dirs_var='PYTHONPATH')
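
Examples #3, #5 and #8 pass an open handle on /etc/environment to self._patch_etc_environment(). That method is not part of these snippets; the sketch below is an assumption of what such a helper could do (merge NAME=value pairs and append directories to a PATH-like variable), not the project's actual implementation.

def patch_etc_environment(f, env_pairs=None, dirs=None, dirs_var='PATH'):
    # Hypothetical stand-in for self._patch_etc_environment(). 'f' is assumed
    # to be a read/write file-like object holding the current contents of
    # /etc/environment.
    def parse(line):
        name, _, value = line.strip().partition('=')
        return name, value.strip('"')

    f.seek(0)
    env = dict(parse(line) for line in f if '=' in line)
    if dirs:
        # Append the new directories to the PATH-like variable named by
        # dirs_var, keeping whatever entries were already present.
        old = [p for p in env.get(dirs_var, '').split(':') if p]
        env[dirs_var] = ':'.join(old + list(dirs))
    if env_pairs:
        env.update(env_pairs)
    f.seek(0)
    f.truncate()
    f.write(''.join('%s="%s"\n' % item for item in sorted(env.items())))
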
Example #9
    def __install_hadoop( self ):
        # Download and extract Hadoop
        path = fmt( 'hadoop/common/hadoop-{hadoop_version}/hadoop-{hadoop_version}.tar.gz' )
        self.__install_apache_package( path )

        # Add environment variables to hadoop_env.sh
        hadoop_env = dict(
            HADOOP_LOG_DIR=self._lazy_mkdir( log_dir, "hadoop" ),
            JAVA_HOME='/usr/lib/jvm/java-7-oracle' )
        hadoop_env_sh_path = fmt( "{install_dir}/hadoop/etc/hadoop/hadoop-env.sh" )
        with remote_open( hadoop_env_sh_path, use_sudo=True ) as hadoop_env_sh:
            hadoop_env_sh.write( '\n' )
            for name, value in hadoop_env.iteritems( ):
                hadoop_env_sh.write( fmt( 'export {name}="{value}"\n' ) )

        # Configure HDFS
        hdfs_dir = var_dir + "/hdfs"
        put( use_sudo=True,
             remote_path=fmt( '{install_dir}/hadoop/etc/hadoop/hdfs-site.xml' ),
             local_path=StringIO( self.__to_hadoop_xml_config( {
                 'dfs.replication': str( hdfs_replication ),
                 'dfs.permissions': 'false',
                 'dfs.name.dir': self._lazy_mkdir( hdfs_dir, 'name', persistent=True ),
                 'dfs.data.dir': self._lazy_mkdir( hdfs_dir, 'data', persistent=True ),
                 'fs.checkpoint.dir': self._lazy_mkdir( hdfs_dir, 'checkpoint', persistent=True ),
                 'dfs.namenode.http-address': 'spark-master:50070',
                 'dfs.namenode.secondary.http-address': 'spark-master:50090' } ) ) )

        # Configure Hadoop
        put( use_sudo=True,
             remote_path=fmt( '{install_dir}/hadoop/etc/hadoop/core-site.xml' ),
             local_path=StringIO( self.__to_hadoop_xml_config( {
                 'fs.default.name': 'hdfs://spark-master:8020' } ) ) )

        # Make shell auto completion easier
        sudo( fmt( 'find {install_dir}/hadoop -name "*.cmd" | xargs rm' ) )

        # Install upstart jobs
        self.__register_upstart_jobs( hadoop_services )
Example #10
 def __setup_ssh_config( self ):
     with remote_open( '/etc/ssh/ssh_config', use_sudo=True ) as f:
         f.write( heredoc( """
             Host spark-master
                 CheckHostIP no
                 HashKnownHosts no""" ) )
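
Finally, remote_open() is the context manager at the heart of all of these examples. A rough sketch built on Fabric 1.x get()/put() follows, assuming the helper downloads the remote file into an in-memory buffer, yields it positioned at the end for appending, and uploads the result on exit; the project's real implementation may handle privileged reads and error cases differently.

from contextlib import contextmanager
from StringIO import StringIO

from fabric.operations import get, put

@contextmanager
def remote_open(remote_path, use_sudo=False):
    # Hypothetical stand-in: pull the remote file into a buffer, hand it to
    # the caller positioned at the end so writes append, then push the
    # modified buffer back. Privileged reads are glossed over here.
    buf = StringIO()
    get(remote_path=remote_path, local_path=buf)
    buf.seek(0, 2)
    yield buf
    buf.seek(0)
    put(local_path=buf, remote_path=remote_path, use_sudo=use_sudo)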