def postgresql_conf_defaults(): """Return the postgresql.conf defaults, which we parse from config.yaml""" # We load defaults from the extra_pg_conf default in config.yaml, # which ensures that they never get out of sync. raw = helpers.config_yaml()["options"]["extra_pg_conf"]["default"] defaults = postgresql.parse_config(raw) # And recalculate some defaults, which could get out of sync. # Settings with mandatory minimums like wal_senders are handled # later, in ensure_viable_postgresql_conf(). ram = int(host.get_total_ram() / (1024 * 1024)) # Working in megabytes. # Default shared_buffers to 25% of ram, minimum 16MB, maximum 8GB, # per current best practice rules of thumb. Rest is cache. shared_buffers = max(min(math.ceil(ram * 0.25), 8192), 16) effective_cache_size = max(1, ram - shared_buffers) defaults["shared_buffers"] = "{} MB".format(shared_buffers) defaults["effective_cache_size"] = "{} MB".format(effective_cache_size) # PostgreSQL 10 introduces multiple password encryption methods. if postgresql.has_version("10"): # Change this to scram-sha-256 next LTS release, when we can # start assuming clients have libpq 10. The setting can of # course still be overridden in the config. defaults["password_encryption"] = "md5" else: defaults["password_encryption"] = True return defaults
def _get_hugepages():
    pages = config.get("dpdk-hugepages")
    if not pages:
        return None
    if not pages.endswith("%"):
        return pages
    pp = int(pages.rstrip("%"))
    return int(get_total_ram() * pp / 100 / 1024 / 2048)
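# A rough worked example of the percentage branch above, assuming (as the
# other snippets here do) that get_total_ram() returns bytes and that the
# divisor 2048 corresponds to 2 MB hugepages (2048 KB each). On a
# hypothetical 16 GiB host with dpdk-hugepages set to "20%":
total_ram = 16 * 1024 ** 3   # bytes, assumed for illustration
pp = 20                      # parsed from "20%"
hugepages = int(total_ram * pp / 100 / 1024 / 2048)
assert hugepages == 1638     # ~3.2 GiB worth of 2 MB hugepages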
def postgresql_conf_defaults():
    '''Return the postgresql.conf defaults, which we parse from config.yaml'''
    # We load defaults from the extra_pg_conf default in config.yaml,
    # which ensures that they never get out of sync.
    raw = helpers.config_yaml()['options']['extra_pg_conf']['default']
    defaults = postgresql.parse_config(raw)

    # And recalculate some defaults, which could get out of sync.
    # Settings with mandatory minimums like wal_senders are handled
    # later, in ensure_viable_postgresql_conf().
    ram = int(host.get_total_ram() / (1024 * 1024))  # Working in megabytes.

    # Default shared_buffers to 25% of ram, minimum 16MB, maximum 8GB,
    # per current best practice rules of thumb. Rest is cache.
    shared_buffers = max(min(math.ceil(ram * 0.25), 8192), 16)
    effective_cache_size = max(1, ram - shared_buffers)
    defaults['shared_buffers'] = '{} MB'.format(shared_buffers)
    defaults['effective_cache_size'] = '{} MB'.format(effective_cache_size)

    return defaults
def calculate_watermark_scale_factor():
    """Calculates optimal vm.watermark_scale_factor value

    :returns: watermark_scale_factor
    :rtype: int
    """
    memtotal = get_total_ram()
    normal_managed_pages = get_normal_managed_pages()
    try:
        wmark = min([
            watermark_scale_factor(memtotal, managed_pages)
            for managed_pages in normal_managed_pages
        ])
    except ValueError as e:
        log(
            "Failed to calculate watermark_scale_factor from normal managed pages: {}"
            .format(normal_managed_pages),
            ERROR)
        raise e
    log("vm.watermark_scale_factor: {}".format(wmark), DEBUG)
    return wmark
def configure(self, available_hosts, zk_units, peers, extra_libs):
    """
    This is the core logic of setting up spark.

    :param dict available_hosts: Hosts that Spark should know about.
    :param list zk_units: List of Zookeeper dicts with host/port info.
    :param list peers: List of Spark peer tuples (unit name, IP).
    :param list extra_libs: List of extra lib paths for driver/executors.
    """
    # Set KV based on connected applications
    unitdata.kv().set('zookeeper.units', zk_units)
    unitdata.kv().set('sparkpeer.units', peers)
    unitdata.kv().flush(True)

    # Get our config ready
    dc = self.dist_config
    mode = hookenv.config()['spark_execution_mode']
    master_ip = utils.resolve_private_address(
        available_hosts['spark-master'])
    master_url = self.get_master_url(master_ip)
    req_driver_mem = hookenv.config()['driver_memory']
    req_executor_mem = hookenv.config()['executor_memory']
    if mode.startswith('yarn'):
        spark_events = 'hdfs://{}'.format(dc.path('spark_events'))
    else:
        spark_events = 'file://{}'.format(dc.path('spark_events'))

    # handle tuning options that may be set as percentages
    driver_mem = '1g'
    executor_mem = '1g'
    if req_driver_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "driver_memory percentage in non-local mode. "
                "Using 1g default.",
                level=hookenv.WARNING)
    else:
        driver_mem = req_driver_mem

    if req_executor_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "executor_memory percentage in non-local mode. "
                "Using 1g default.",
                level=hookenv.WARNING)
    else:
        executor_mem = req_executor_mem

    # Some spark applications look for envars in /etc/environment
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MASTER'] = master_url
        env['SPARK_HOME'] = dc.path('spark_home')

    # Setup hosts dict
    hosts = {
        'spark': master_ip,
    }
    if 'namenode' in available_hosts:
        hosts['namenode'] = available_hosts['namenode']
    if 'resourcemanager' in available_hosts:
        hosts['resourcemanager'] = available_hosts['resourcemanager']

    # Setup roles dict. We always include the history server and client.
    # Determine other roles based on our execution mode.
    roles = ['spark-history-server', 'spark-client']
    if mode == 'standalone':
        roles.append('spark-master')
        roles.append('spark-worker')
    elif mode.startswith('yarn'):
        roles.append('spark-on-yarn')
        roles.append('spark-yarn-slave')

    # Setup overrides dict
    override = {
        'spark::common::master_url': master_url,
        'spark::common::event_log_dir': spark_events,
        'spark::common::history_log_dir': spark_events,
        'spark::common::extra_lib_dirs':
            ':'.join(extra_libs) if extra_libs else None,
        'spark::common::driver_mem': driver_mem,
        'spark::common::executor_mem': executor_mem,
    }
    if zk_units:
        zks = []
        for unit in zk_units:
            ip = utils.resolve_private_address(unit['host'])
            zks.append("%s:%s" % (ip, unit['port']))

        zk_connect = ",".join(zks)
        override['spark::common::zookeeper_connection_string'] = zk_connect
    else:
        override['spark::common::zookeeper_connection_string'] = None

    # Create our site.yaml and trigger puppet.
    # NB: during an upgrade, we configure the site.yaml, but do not
    # trigger puppet. The user must do that with the 'reinstall' action.
    bigtop = Bigtop()
    bigtop.render_site_yaml(hosts, roles, override)
    if unitdata.kv().get('spark.version.repo', False):
        hookenv.log(
            "An upgrade is available and the site.yaml has been "
            "configured. Run the 'reinstall' action to continue.",
            level=hookenv.INFO)
    else:
        bigtop.trigger_puppet()
        self.patch_worker_master_url(master_ip, master_url)

        # Packages don't create the event dir by default. Do it each time
        # spark is (re)installed to ensure location/perms are correct.
        self.configure_events_dir(mode)

    # Handle examples and Spark-Bench. Do this each time this method is
    # called in case we need to act on a new resource or user config.
    self.configure_examples()
    self.configure_sparkbench()
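# A small worked example of the percentage-based memory handling used in the
# configure() variants above (hypothetical values, not from the snippets):
# with 8 GiB of total RAM and driver_memory set to "25%", mem_mb is 8192 and
# the resulting driver memory string is "2048m".
total_ram_bytes = 8 * 1024 ** 3           # assumed for illustration
req_driver_mem = "25%"
mem_mb = total_ram_bytes / 1024 / 1024    # 8192.0
req_percentage = float(req_driver_mem.strip('%')) / 100
driver_mem = str(int(mem_mb * req_percentage)) + 'm'
assert driver_mem == '2048m'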
def configure(self):
    '''
    Configure spark environment for all users
    '''
    dc = self.dist_config
    spark_home = self.dist_config.path('spark')
    spark_bin = spark_home / 'bin'

    # handle tuning options that may be set as percentages
    driver_mem = '1g'
    req_driver_mem = hookenv.config()['driver_memory']
    executor_mem = '1g'
    req_executor_mem = hookenv.config()['executor_memory']
    if req_driver_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        driver_mem = req_driver_mem

    if req_executor_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        executor_mem = req_executor_mem

    # update environment variables
    with utils.environment_edit_in_place('/etc/environment') as env:
        if spark_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], spark_bin])
        env['MASTER'] = self.get_master()
        env['PYSPARK_DRIVER_PYTHON'] = "ipython"
        env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
        env['SPARK_DRIVER_MEMORY'] = driver_mem
        env['SPARK_EXECUTOR_MEMORY'] = executor_mem
        env['SPARK_HOME'] = spark_home

    events_dir = 'file://{}'.format(dc.path('spark_events'))
    if unitdata.kv().get('hdfs.available', False):
        prefix = dc.path('log_prefix')
        events_dir = dc.path('spark_events')
        events_dir = 'hdfs:///{}'.format(events_dir.replace(prefix, ''))

    # update spark-defaults
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master .*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled .*': 'spark.eventLog.enabled true',
        r'.*spark.history.fs.logDirectory .*':
            'spark.history.fs.logDirectory {}'.format(events_dir),
        r'.*spark.eventLog.dir .*': 'spark.eventLog.dir {}'.format(events_dir),
    }, append_non_matches=True)

    # update spark-env
    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DRIVER_MEMORY.*': 'SPARK_DRIVER_MEMORY={}'.format(driver_mem),
        r'.*SPARK_EXECUTOR_MEMORY.*': 'SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
        r'.*SPARK_LOG_DIR.*': 'SPARK_LOG_DIR={}'.format(self.dist_config.path('spark_logs')),
        r'.*SPARK_WORKER_DIR.*': 'SPARK_WORKER_DIR={}'.format(self.dist_config.path('spark_work')),
    })

    # If zookeeper is available we should be in HA mode so we should not set the MASTER_IP
    if not unitdata.kv().get('zookeepers.available', False):
        master_ip = self.get_master_ip()
        utils.re_edit_in_place(spark_env, {
            r'.*SPARK_MASTER_IP.*': 'SPARK_MASTER_IP={}'.format(master_ip),
        })

    # manage SparkBench
    install_sb = hookenv.config()['spark_bench_enabled']
    sb_dir = '/home/ubuntu/spark-bench'
    if install_sb:
        if not unitdata.kv().get('spark_bench.installed', False):
            if utils.cpu_arch() == 'ppc64le':
                sb_url = hookenv.config()['spark_bench_ppc64le']
            else:
                # TODO: may need more arch cases (go with x86 sb for now)
                sb_url = hookenv.config()['spark_bench_x86_64']

            Path(sb_dir).rmtree_p()
            au = ArchiveUrlFetchHandler()
            au.install(sb_url, '/home/ubuntu')

            # #####
            # Handle glob if we use a .tgz that doesn't expand to sb_dir
            # sb_archive_dir = glob('/home/ubuntu/spark-bench-*')[0]
            # SparkBench expects to live in ~/spark-bench, so put it there
            # Path(sb_archive_dir).rename(sb_dir)
            # #####

            # comment out mem tunings (let them come from /etc/environment)
            sb_env = Path(sb_dir) / 'conf/env.sh'
            utils.re_edit_in_place(sb_env, {
                r'^SPARK_DRIVER_MEMORY.*': '# SPARK_DRIVER_MEMORY (use value from environment)',
                r'^SPARK_EXECUTOR_MEMORY.*': '# SPARK_EXECUTOR_MEMORY (use value from environment)',
            })

            unitdata.kv().set('spark_bench.installed', True)
            unitdata.kv().flush(True)
    else:
        Path(sb_dir).rmtree_p()
        unitdata.kv().set('spark_bench.installed', False)
        unitdata.kv().flush(True)

    self.setup_init_scripts()
def configure(self, available_hosts, zk_units, peers):
    """
    This is the core logic of setting up spark.

    Two flags are needed:

      * Namenode exists aka HDFS is ready
      * Resource manager exists aka YARN is ready

    Both flags are inferred from the available hosts.

    :param dict available_hosts: Hosts that Spark should know about.
    """
    # Bootstrap spark
    if not unitdata.kv().get('spark.bootstrapped', False):
        self.setup()
        unitdata.kv().set('spark.bootstrapped', True)

    # Set KV based on connected applications
    unitdata.kv().set('zookeeper.units', zk_units)
    unitdata.kv().set('sparkpeer.units', peers)
    unitdata.kv().flush(True)

    # Get our config ready
    dc = self.dist_config
    events_log_dir = 'file://{}'.format(dc.path('spark_events'))
    mode = hookenv.config()['spark_execution_mode']
    master_ip = utils.resolve_private_address(available_hosts['spark-master'])
    master_url = self.get_master_url(master_ip)

    # Setup hosts dict
    hosts = {
        'spark': master_ip,
    }
    if 'namenode' in available_hosts:
        hosts['namenode'] = available_hosts['namenode']
        events_log_dir = self.setup_hdfs_logs()
    if 'resourcemanager' in available_hosts:
        hosts['resourcemanager'] = available_hosts['resourcemanager']

    # Setup roles dict. We always include the history server and client.
    # Determine other roles based on our execution mode.
    roles = ['spark-history-server', 'spark-client']
    if mode == 'standalone':
        roles.append('spark-master')
        roles.append('spark-worker')
    elif mode.startswith('yarn'):
        roles.append('spark-on-yarn')
        roles.append('spark-yarn-slave')

    # Setup overrides dict
    override = {
        'spark::common::master_url': master_url,
        'spark::common::event_log_dir': events_log_dir,
        'spark::common::history_log_dir': events_log_dir,
    }
    if zk_units:
        zks = []
        for unit in zk_units:
            ip = utils.resolve_private_address(unit['host'])
            zks.append("%s:%s" % (ip, unit['port']))

        zk_connect = ",".join(zks)
        override['spark::common::zookeeper_connection_string'] = zk_connect
    else:
        override['spark::common::zookeeper_connection_string'] = None

    # Create our site.yaml and trigger puppet
    bigtop = Bigtop()
    bigtop.render_site_yaml(hosts, roles, override)
    bigtop.trigger_puppet()

    # Do this after our puppet bits in case puppet overrides needed perms
    if 'namenode' not in available_hosts:
        # Local event dir (not in HDFS) needs to be 777 so non-spark
        # users can write job history there. It needs to be g+s so
        # all entries will be readable by spark (in the spark group).
        # It needs to be +t so users cannot remove files they don't own.
        dc.path('spark_events').chmod(0o3777)

    self.patch_worker_master_url(master_ip, master_url)

    # handle tuning options that may be set as percentages
    driver_mem = '1g'
    req_driver_mem = hookenv.config()['driver_memory']
    executor_mem = '1g'
    req_executor_mem = hookenv.config()['executor_memory']
    if req_driver_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        driver_mem = req_driver_mem

    if req_executor_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        executor_mem = req_executor_mem

    spark_env = '/etc/spark/conf/spark-env.sh'
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DRIVER_MEMORY.*': 'export SPARK_DRIVER_MEMORY={}'.format(driver_mem),
        r'.*SPARK_EXECUTOR_MEMORY.*': 'export SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
    }, append_non_matches=True)

    # Install SB (subsequent calls will reconfigure existing install)
    # SparkBench looks for the spark master in /etc/environment
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MASTER'] = master_url

    self.install_benchmark()
def configure(self):
    '''
    Configure spark environment for all users
    '''
    spark_home = self.dist_config.path('spark')
    spark_bin = spark_home / 'bin'

    # handle tuning options that may be set as percentages
    driver_mem = '1g'
    req_driver_mem = hookenv.config()['driver_memory']
    executor_mem = '1g'
    req_executor_mem = hookenv.config()['executor_memory']
    if req_driver_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("driver_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        driver_mem = req_driver_mem

    if req_executor_mem.endswith('%'):
        if self.is_spark_local():
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log("executor_memory percentage in non-local mode. Using 1g default.",
                        level=None)
    else:
        executor_mem = req_executor_mem

    # update environment variables
    with utils.environment_edit_in_place('/etc/environment') as env:
        if spark_bin not in env['PATH']:
            env['PATH'] = ':'.join([env['PATH'], spark_bin])
        env['MASTER'] = self.get_master()
        env['PYSPARK_DRIVER_PYTHON'] = "ipython"
        env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
        env['SPARK_DRIVER_MEMORY'] = driver_mem
        env['SPARK_EXECUTOR_MEMORY'] = executor_mem
        env['SPARK_HOME'] = spark_home
        env['SPARK_JAR'] = "hdfs:///user/ubuntu/share/lib/spark-assembly.jar"

    # update spark config
    spark_conf = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
    utils.re_edit_in_place(spark_conf, {
        r'.*spark.master *.*': 'spark.master {}'.format(self.get_master()),
        r'.*spark.eventLog.enabled *.*': 'spark.eventLog.enabled true',
        r'.*spark.eventLog.dir *.*': 'spark.eventLog.dir hdfs:///user/ubuntu/directory',
    })

    spark_env = self.dist_config.path('spark_conf') / 'spark-env.sh'
    local_ip = utils.resolve_private_address(hookenv.unit_private_ip())
    utils.re_edit_in_place(spark_env, {
        r'.*SPARK_DRIVER_MEMORY.*': 'SPARK_DRIVER_MEMORY={}'.format(driver_mem),
        r'.*SPARK_EXECUTOR_MEMORY.*': 'SPARK_EXECUTOR_MEMORY={}'.format(executor_mem),
        r'.*SPARK_LOG_DIR.*': 'SPARK_LOG_DIR={}'.format(self.dist_config.path('spark_logs')),
        r'.*SPARK_MASTER_IP.*': 'SPARK_MASTER_IP={}'.format(local_ip),
        r'.*SPARK_WORKER_DIR.*': 'SPARK_WORKER_DIR={}'.format(self.dist_config.path('spark_work')),
    })

    # manage SparkBench
    install_sb = hookenv.config()['spark_bench_enabled']
    sb_dir = '/home/ubuntu/spark-bench'
    if install_sb:
        if utils.cpu_arch() == 'ppc64le':
            sb_url = hookenv.config()['spark_bench_ppc64le']
        else:
            # TODO: may need more arch cases (go with x86 sb for now)
            sb_url = hookenv.config()['spark_bench_x86_64']

        Path(sb_dir).rmtree_p()
        fetcher = ArchiveUrlFetchHandler()
        fetcher.install(sb_url, '/home/ubuntu')

        # #####
        # Handle glob if we use a .tgz that doesn't expand to sb_dir
        # sb_archive_dir = glob('/home/ubuntu/spark-bench-*')[0]
        # SparkBench expects to live in ~/spark-bench, so put it there
        # Path(sb_archive_dir).rename(sb_dir)
        # #####

        # comment out mem tunings (let them come from /etc/environment)
        sb_env = Path(sb_dir) / 'conf/env.sh'
        utils.re_edit_in_place(sb_env, {
            r'^SPARK_DRIVER_MEMORY.*': '# SPARK_DRIVER_MEMORY (use value from environment)',
            r'^SPARK_EXECUTOR_MEMORY.*': '# SPARK_EXECUTOR_MEMORY (use value from environment)',
        })
    else:
        Path(sb_dir).rmtree_p()
def configure(self, available_hosts, zk_units, peers, extra_libs):
    """
    This is the core logic of setting up spark.

    :param dict available_hosts: Hosts that Spark should know about.
    :param list zk_units: List of Zookeeper dicts with host/port info.
    :param list peers: List of Spark peer tuples (unit name, IP).
    :param list extra_libs: List of extra lib paths for driver/executors.
    """
    # Bootstrap spark
    if not unitdata.kv().get('spark.bootstrapped', False):
        self.setup()
        unitdata.kv().set('spark.bootstrapped', True)

    # Set KV based on connected applications
    unitdata.kv().set('zookeeper.units', zk_units)
    unitdata.kv().set('sparkpeer.units', peers)
    unitdata.kv().flush(True)

    # Get our config ready
    dc = self.dist_config
    events_log_dir = 'file://{}'.format(dc.path('spark_events'))
    mode = hookenv.config()['spark_execution_mode']
    master_ip = utils.resolve_private_address(
        available_hosts['spark-master'])
    master_url = self.get_master_url(master_ip)
    req_driver_mem = hookenv.config()['driver_memory']
    req_executor_mem = hookenv.config()['executor_memory']

    # handle tuning options that may be set as percentages
    driver_mem = '1g'
    executor_mem = '1g'
    if req_driver_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_driver_mem.strip('%')) / 100
            driver_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "driver_memory percentage in non-local mode. Using 1g default.",
                level=None)
    else:
        driver_mem = req_driver_mem

    if req_executor_mem.endswith('%'):
        if mode == 'standalone' or mode.startswith('local'):
            mem_mb = host.get_total_ram() / 1024 / 1024
            req_percentage = float(req_executor_mem.strip('%')) / 100
            executor_mem = str(int(mem_mb * req_percentage)) + 'm'
        else:
            hookenv.log(
                "executor_memory percentage in non-local mode. Using 1g default.",
                level=None)
    else:
        executor_mem = req_executor_mem

    # Setup hosts dict
    hosts = {
        'spark': master_ip,
    }
    if 'namenode' in available_hosts:
        hosts['namenode'] = available_hosts['namenode']
        events_log_dir = self.setup_hdfs_logs()
    else:
        # Bigtop includes a default hadoop_head_node if we do not specify
        # any namenode info. To ensure spark standalone doesn't get
        # invalid hadoop config, set our NN to an empty string.
        hosts['namenode'] = ''
    if 'resourcemanager' in available_hosts:
        hosts['resourcemanager'] = available_hosts['resourcemanager']

    # Setup roles dict. We always include the history server and client.
    # Determine other roles based on our execution mode.
    roles = ['spark-history-server', 'spark-client']
    if mode == 'standalone':
        roles.append('spark-master')
        roles.append('spark-worker')
    elif mode.startswith('yarn'):
        roles.append('spark-on-yarn')
        roles.append('spark-yarn-slave')

    # Setup overrides dict
    override = {
        'spark::common::master_url': master_url,
        'spark::common::event_log_dir': events_log_dir,
        'spark::common::history_log_dir': events_log_dir,
        'spark::common::extra_lib_dirs':
            ':'.join(extra_libs) if extra_libs else None,
        'spark::common::driver_mem': driver_mem,
        'spark::common::executor_mem': executor_mem,
    }
    if zk_units:
        zks = []
        for unit in zk_units:
            ip = utils.resolve_private_address(unit['host'])
            zks.append("%s:%s" % (ip, unit['port']))

        zk_connect = ",".join(zks)
        override['spark::common::zookeeper_connection_string'] = zk_connect
    else:
        override['spark::common::zookeeper_connection_string'] = None

    # Create our site.yaml and trigger puppet
    bigtop = Bigtop()
    bigtop.render_site_yaml(hosts, roles, override)
    bigtop.trigger_puppet()

    # Do this after our puppet bits in case puppet overrides needed perms
    if 'namenode' not in available_hosts:
        # Local event dir (not in HDFS) needs to be 777 so non-spark
        # users can write job history there. It needs to be g+s so
        # all entries will be readable by spark (in the spark group).
        # It needs to be +t so users cannot remove files they don't own.
        dc.path('spark_events').chmod(0o3777)

    self.patch_worker_master_url(master_ip, master_url)

    # Install SB (subsequent calls will reconfigure existing install)
    # SparkBench looks for the spark master in /etc/environment
    with utils.environment_edit_in_place('/etc/environment') as env:
        env['MASTER'] = master_url

    self.install_benchmark()