def install_java():
    """
    Install java just like we do for Hadoop Base.

    This is the same method used to install java in HadoopBase:
    https://github.com/juju-solutions/jujubigdata/blob/master/jujubigdata/handlers.py#L134

    This allows us to run Pig in local mode (which requires Java) without
    any Hadoop. If Hadoop comes along later, we'll already have java installed
    in a way that is compatible with the plugin.

    NOTE: this will go away if/when we support the java interface.
    """
    env = utils.read_etc_env()
    java_installer = Path(jujuresources.resource_path('java-installer'))
    java_installer.chmod(0o755)
    output = check_output([java_installer], env=env).decode('utf8')
    lines = output.strip().splitlines()
    if len(lines) != 2:
        raise ValueError('Unexpected output from java-installer: %s' % output)
    java_home, java_version = lines
    if '_' in java_version:
        java_major, java_release = java_version.split("_")
    else:
        java_major, java_release = java_version, ''
    unitdata.kv().set('java.home', java_home)
    unitdata.kv().set('java.version', java_major)
    unitdata.kv().set('java.version.release', java_release)
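# Every handler in this section pulls its environment from /etc/environment via
# utils.read_etc_env(). As a rough, standalone illustration of what that helper
# amounts to (a sketch of the idea, not jujubigdata's actual implementation),
# parsing that file is essentially reading KEY=VALUE pairs and stripping any
# surrounding quotes:
def read_etc_environment(path='/etc/environment'):
    """Illustrative parser for /etc/environment-style KEY=VALUE files."""
    env = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, _, value = line.partition('=')
            env[key.strip()] = value.strip().strip('"').strip("'")
    return env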
def reconfigure_zeppelin(self):
    '''
    Configure zeppelin based on current environment
    '''
    raise NotImplementedError()
    # NB (kwm): this method is not currently called because Bigtop spark
    # doesn't expose these settings. Leaving this here just in case
    # we update the bigtop charms to provide these bits in the future.
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_CLASSPATH_OVERRIDES={}\n'.format(
            hadoop_extra_classpath))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export MASTER={}\n'.format(spark_exe_mode))
def install_java(self):
    """
    Run the java-installer resource to install Java and determine
    the JAVA_HOME and Java version.

    The java-installer must be idempotent and its only output (on stdout)
    should be two lines: the JAVA_HOME path, and the Java version,
    respectively.

    If there is an error installing Java, the installer should exit
    with a non-zero exit code.
    """
    env = utils.read_etc_env()
    java_installer = Path(jujuresources.resource_path('java-installer'))
    java_installer.chmod(0o755)
    output = check_output([java_installer], env=env).decode('utf8')
    lines = output.strip().splitlines()
    if len(lines) != 2:
        raise ValueError('Unexpected output from java-installer: %s' % output)
    java_home, java_version = lines
    if '_' in java_version:
        java_major, java_release = java_version.split("_")
    else:
        java_major, java_release = java_version, ''
    unitdata.kv().set('java.home', java_home)
    unitdata.kv().set('java.version', java_major)
    unitdata.kv().set('java.version.release', java_release)
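# Illustrative follow-up (an assumption, not part of the charm code above):
# once install_java() has recorded its results, other handlers can read them
# back out of the same unitdata store. `java_details` is a hypothetical helper
# name used only for this sketch.
from charmhelpers.core import unitdata

def java_details():
    """Return the (java_home, java_version) recorded by install_java(), if any."""
    kv = unitdata.kv()
    return kv.get('java.home'), kv.get('java.version')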
def configure_zeppelin(self):
    '''
    Configure zeppelin environment for all users
    '''
    zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zeppelin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
        env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

    zeppelin_site = self.dist_config.path('zeppelin_conf') / 'zeppelin-site.xml'
    with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
        xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
        xml['zeppelin.notebook.dir'] = self.dist_config.path('zeppelin_notebooks')

    etc_env = utils.read_etc_env()
    hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
    spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_HOME={}\n'.format(self.dist_config.path('zeppelin')))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export ZEPPELIN_LOG_DIR={}\n'.format(self.dist_config.path('zeppelin_logs')))
        f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
        f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(self.dist_config.path('zeppelin_notebooks')))
        f.write('export SPARK_HOME={}\n'.format(spark_home))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
        f.write('export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'.format(s=spark_home))
        f.write('export MASTER={}\n'.format(spark_exe_mode))

    # User needs write access to zepp's conf to write interpreter.json
    # on server start. chown the whole conf dir, though we could probably
    # touch that file and chown it, leaving the rest owned as root:root.
    # TODO: weigh implications of having zepp's conf dir owned by non-root.
    cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path('zeppelin_conf'))
    call(cmd.split())
def run_bg(self, user, command, *args):
    """
    Run a Kafka command as the given user in the background.

    :param str user: User to run the command as
    :param str command: Command to run
    :param list args: Additional args to pass to the command
    """
    parts = [command] + list(args)
    quoted = ' '.join("'%s'" % p for p in parts)
    e = utils.read_etc_env()
    Popen(['su', user, '-c', quoted], env=e)
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    dc = self.dist_config
    prefix = dc.path('log_prefix')
    events_dir = dc.path('spark_events')
    events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 events_dir)

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # ensure user-provided Hadoop works
    hadoop_classpath = utils.run_as('hdfs', 'hadoop', 'classpath',
                                    capture_output=True)
    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DIST_CLASSPATH.*':
            'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
    }, append_non_matches=True)

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
def run_bg(self, user, command, *args):
    """
    Run a Hive command as the given user in the background.

    :param str user: User to run the command as
    :param str command: Command to run
    :param list args: Additional args to pass to the command
    """
    parts = [command] + list(args)
    quoted = ' '.join("'%s'" % p for p in parts)
    e = utils.read_etc_env()
    Popen(['su', user, '-c', quoted], env=e)
def run_bg(self, user, output_log, command, *args):
    """
    Run a command as the given user in the background.

    :param str user: User to run the command as
    :param str output_log: File to redirect the command's output to
    :param str command: Command to run
    :param list args: Additional args to pass to the command
    """
    parts = [command] + list(args)
    quoted = ' '.join("'%s'" % p for p in parts)
    e = utils.read_etc_env()
    Popen(['su', user, '-c', '{} &> {} &'.format(quoted, output_log)],
          env=e)
def configure_hadoop_libs(self):
    if unitdata.kv().get('hadoop.extra.installed', False):
        return

    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.driver.extraClassPath .*':
            'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
        r'.*spark.jars .*':
            'spark.jars {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hadoop.extra.installed', True)
    unitdata.kv().flush(True)
def run_bg(self, user, command, *args):
    """
    Start a Flume agent as the given user in the background.

    :param str user: User to run the flume agent as
    :param str command: Command to run
    :param list args: Additional args to pass to the command
    """
    parts = [command] + list(args)
    quoted = ' '.join("'%s'" % p for p in parts)
    # This is here to force explicit execution in the background;
    # too much output causes Popen to fail.
    silent = ' '.join([quoted, "2>", "/dev/null", "&"])
    e = utils.read_etc_env()
    Popen(['su', user, '-c', silent], env=e)
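# Standalone sketch of the backgrounding pattern the run_bg variants above
# share: quote each argument, then hand the whole command line to `su -c` so it
# runs as the target user, with output redirected so a chatty process cannot
# block or break the parent. The 'ubuntu' user and the example command are
# illustrative assumptions only.
from subprocess import Popen

def run_in_background(user, env, command, *args):
    parts = [command] + list(args)
    quoted = ' '.join("'%s'" % p for p in parts)
    # Redirect all output and background the command inside the shell su spawns.
    Popen(['su', user, '-c', '{} > /dev/null 2>&1 &'.format(quoted)], env=env)

# Example usage (assumes an 'ubuntu' user exists on the unit):
# run_in_background('ubuntu', {'PATH': '/usr/bin:/bin'}, 'sleep', '30')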
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    dc = self.dist_config
    prefix = dc.path('log_prefix')
    events_dir = dc.path('spark_events')
    events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', events_dir)
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 events_dir)

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # ensure user-provided Hadoop works
    hadoop_classpath = utils.run_as('hdfs', 'hadoop', 'classpath',
                                    capture_output=True)
    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DIST_CLASSPATH.*':
            'SPARK_DIST_CLASSPATH={}'.format(hadoop_classpath),
    }, append_non_matches=True)

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.driver.extraClassPath .*':
            'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
def configure_zeppelin(self):
    """
    Configure zeppelin environment for all users
    """
    zeppelin_bin = self.dist_config.path("zeppelin") / "bin"
    with utils.environment_edit_in_place("/etc/environment") as env:
        if zeppelin_bin not in env["PATH"]:
            env["PATH"] = ":".join([env["PATH"], zeppelin_bin])
        env["ZEPPELIN_CONF_DIR"] = self.dist_config.path("zeppelin_conf")

    zeppelin_site = self.dist_config.path("zeppelin_conf") / "zeppelin-site.xml"
    with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
        xml["zeppelin.server.port"] = self.dist_config.port("zeppelin")
        xml["zeppelin.notebook.dir"] = self.dist_config.path("zeppelin_notebooks")

    etc_env = utils.read_etc_env()
    hadoop_conf_dir = etc_env.get("HADOOP_CONF_DIR", "/etc/hadoop/conf")
    spark_home = etc_env.get("SPARK_HOME", "/usr/lib/spark")
    spark_driver_mem = etc_env.get("SPARK_DRIVER_MEMORY", "1g")
    spark_exe_mode = os.environ.get("MASTER", "yarn-client")
    spark_executor_mem = etc_env.get("SPARK_EXECUTOR_MEMORY", "1g")
    zeppelin_env = self.dist_config.path("zeppelin_conf") / "zeppelin-env.sh"
    with open(zeppelin_env, "a") as f:
        f.write("export ZEPPELIN_HOME={}\n".format(self.dist_config.path("zeppelin")))
        f.write(
            'export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
                spark_driver_mem, spark_executor_mem
            )
        )
        f.write("export ZEPPELIN_LOG_DIR={}\n".format(self.dist_config.path("zeppelin_logs")))
        f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
        f.write("export ZEPPELIN_NOTEBOOK_DIR={}\n".format(self.dist_config.path("zeppelin_notebooks")))
        f.write("export SPARK_HOME={}\n".format(spark_home))
        f.write(
            'export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
                spark_driver_mem, spark_executor_mem
            )
        )
        f.write("export HADOOP_CONF_DIR={}\n".format(hadoop_conf_dir))
        f.write("export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n".format(s=spark_home))
        f.write("export MASTER={}\n".format(spark_exe_mode))

    # User needs write access to zepp's conf to write interpreter.json
    # on server start. chown the whole conf dir, though we could probably
    # touch that file and chown it, leaving the rest owned as root:root.
    # TODO: weigh implications of having zepp's conf dir owned by non-root.
    cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path("zeppelin_conf"))
    call(cmd.split())
def configure_yarn_mode(self):
    # put the spark jar in hdfs
    spark_assembly_jar = glob('{}/lib/spark-assembly-*.jar'.format(
        self.dist_config.path('spark')))[0]
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/share/lib')
    try:
        utils.run_as('hdfs', 'hdfs', 'dfs', '-put', spark_assembly_jar,
                     '/user/ubuntu/share/lib/spark-assembly.jar')
    except CalledProcessError:
        pass  # jar already in HDFS from another Spark
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # create hdfs storage space for history server
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 self.dist_config.path('spark_events'))
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 self.dist_config.path('spark_events'))

    # create hdfs storage space for spark-bench
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p',
                 '/user/ubuntu/spark-bench')
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop',
                 '/user/ubuntu/spark-bench')

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled .*': 'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir .*': 'spark.eventLog.dir hdfs://{}'.format(
            self.dist_config.path('spark_events')),
        r'.*spark.driver.extraClassPath .*':
            'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hdfs.available', True)
    unitdata.kv().flush(True)
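# The configure_* methods above lean on utils.re_edit_in_place with
# append_non_matches=True to keep config files idempotent: each regex-keyed
# replacement rewrites matching lines, and is appended when no line matched.
# A rough standalone sketch of that idea (an illustration, not jujubigdata's
# actual implementation):
import re

def edit_config_in_place(path, replacements):
    """Rewrite lines matching each pattern; append the replacement if none match."""
    with open(path) as f:
        lines = f.read().splitlines()
    for pattern, replacement in replacements.items():
        regex = re.compile(pattern)
        matched = False
        for i, line in enumerate(lines):
            if regex.match(line):
                lines[i] = replacement
                matched = True
        if not matched:
            lines.append(replacement)
    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')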
def configure_zeppelin(self):
    '''
    Configure zeppelin environment for all users
    '''
    zeppelin_bin = self.dist_config.path('zeppelin') / 'bin'
    with utils.environment_edit_in_place('/etc/environment') as env:
        if zeppelin_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], zeppelin_bin])
        env['ZEPPELIN_CONF_DIR'] = self.dist_config.path('zeppelin_conf')

    zeppelin_site = self.dist_config.path('zeppelin_conf') / 'zeppelin-site.xml'
    with utils.xmlpropmap_edit_in_place(zeppelin_site) as xml:
        xml['zeppelin.server.port'] = self.dist_config.port('zeppelin')
        xml['zeppelin.notebook.dir'] = self.dist_config.path('zeppelin_notebooks')

    etc_env = utils.read_etc_env()
    hadoop_conf_dir = etc_env.get('HADOOP_CONF_DIR', '/etc/hadoop/conf')
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    spark_home = etc_env.get('SPARK_HOME', '/usr/lib/spark')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_CLASSPATH_OVERRIDES={}\n'.format(hadoop_extra_classpath))
        f.write('export ZEPPELIN_HOME={}\n'.format(self.dist_config.path('zeppelin')))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export ZEPPELIN_LOG_DIR={}\n'.format(self.dist_config.path('zeppelin_logs')))
        f.write('export ZEPPELIN_MEM="-Xms128m -Xmx1024m -XX:MaxPermSize=512m"\n')
        f.write('export ZEPPELIN_NOTEBOOK_DIR={}\n'.format(self.dist_config.path('zeppelin_notebooks')))
        f.write('export SPARK_HOME={}\n'.format(spark_home))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export HADOOP_CONF_DIR={}\n'.format(hadoop_conf_dir))
        f.write('export PYTHONPATH={s}/python:{s}/python/lib/py4j-0.8.2.1-src.zip\n'.format(s=spark_home))
        f.write('export MASTER={}\n'.format(spark_exe_mode))

    # User needs write access to zepp's conf to write interpreter.json
    # on server start. chown the whole conf dir, though we could probably
    # touch that file and chown it, leaving the rest owned as root:root.
    # TODO: weigh implications of having zepp's conf dir owned by non-root.
    cmd = "chown -R ubuntu:hadoop {}".format(self.dist_config.path('zeppelin_conf'))
    call(cmd.split())
def configure_hadoop_libs(self):
    if unitdata.kv().get('hadoop.extra.installed', False):
        return

    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.driver.extraClassPath .*':
            'spark.driver.extraClassPath {}'.format(hadoop_extra_classpath),
        r'.*spark.jars .*':
            'spark.jars {}'.format(hadoop_extra_classpath),
    }, append_non_matches=True)

    unitdata.kv().set('hadoop.extra.installed', True)
    unitdata.kv().flush(True)
def reconfigure_zeppelin(self):
    '''
    Configure zeppelin based on current environment
    '''
    raise NotImplementedError()
    # NB (kwm): this method is not currently called because Bigtop spark
    # doesn't expose these settings. Leaving this here just in case
    # we update the bigtop charms to provide these bits in the future.
    etc_env = utils.read_etc_env()
    hadoop_extra_classpath = etc_env.get('HADOOP_EXTRA_CLASSPATH', '')
    spark_driver_mem = etc_env.get('SPARK_DRIVER_MEMORY', '1g')
    spark_exe_mode = os.environ.get('MASTER', 'yarn-client')
    spark_executor_mem = etc_env.get('SPARK_EXECUTOR_MEMORY', '1g')
    zeppelin_env = self.dist_config.path('zeppelin_conf') / 'zeppelin-env.sh'
    with open(zeppelin_env, "a") as f:
        f.write('export ZEPPELIN_CLASSPATH_OVERRIDES={}\n'.format(hadoop_extra_classpath))
        f.write('export ZEPPELIN_JAVA_OPTS="-Dspark.driver.memory={} -Dspark.executor.memory={}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export SPARK_SUBMIT_OPTIONS="--driver-memory {} --executor-memory {}"\n'.format(
            spark_driver_mem, spark_executor_mem))
        f.write('export MASTER={}\n'.format(spark_exe_mode))
def configure_oozie_hdfs(self):
    # config = hookenv.config()
    e = utils.read_etc_env()
    utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/oozie', env=e)
    utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'oozie:hadoop',
                 '/user/oozie', env=e)