def test_hive_site() -> None:
    configuration = Configuration(
        "hive-site",
        {
            "hive.execution.engine": "tez",
            "hive.security.metastore.authorization.manager": "org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider",
        },
    )
    serialized_conf = configuration.serialized
    assert (
        serialized_conf
        == " <property>\n <name>hive.execution.engine</name>\n <value>tez</value>\n </property>\n <property>\n <name>hive.security.metastore.authorization.manager</name>\n <value>org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider</value>\n </property>\n"
    )
def get_regional_configs(self) -> List[Configuration]:
    """Return a core-site override pointing fs.s3a.endpoint at the partition-specific S3 endpoint, based on AWS_REGION."""
    aws_region = os.getenv("AWS_REGION")
    if aws_region is None:
        logging.warning("Unable to detect AWS region from environment variable AWS_REGION")
        return []
    elif aws_region in ["cn-northwest-1", "cn-north-1"]:
        aws_domain = "amazonaws.com.cn"
        s3_endpoint = f"s3.{aws_region}.{aws_domain}"
    elif aws_region in ["us-gov-west-1", "us-gov-east-1"]:
        aws_domain = "amazonaws.com"
        s3_endpoint = f"s3.{aws_region}.{aws_domain}"
    else:
        # no special regional configs needed
        return []

    return [Configuration(Classification="core-site", Properties={"fs.s3a.endpoint": s3_endpoint})]
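Only the China and GovCloud partitions get an `fs.s3a.endpoint` override; every other region falls through to the empty list. A minimal sketch of the endpoint string that formatting produces, using a hypothetical helper (`_expected_s3a_endpoint` is not part of the source, and the regions below are just illustrative inputs):

```python
# Hypothetical helper mirroring the f-string used in get_regional_configs.
# Pure string checks; no AWS calls are made.
def _expected_s3a_endpoint(aws_region: str, aws_domain: str) -> str:
    return f"s3.{aws_region}.{aws_domain}"

assert _expected_s3a_endpoint("cn-north-1", "amazonaws.com.cn") == "s3.cn-north-1.amazonaws.com.cn"
assert _expected_s3a_endpoint("us-gov-west-1", "amazonaws.com") == "s3.us-gov-west-1.amazonaws.com"
```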
def get_yarn_spark_resource_config(
    self, instance_count: int, instance_mem_mb: int, instance_cores: int
) -> Tuple[Configuration, Configuration]:
    """Compute yarn-site and spark-defaults settings sized to the cluster's instance count, memory, and cores."""
    aws_region = os.getenv("AWS_REGION")

    # One executor per instance, using all of the instance's cores.
    executor_cores = instance_cores
    executor_count_per_instance = int(instance_cores / executor_cores)
    executor_count_total = instance_count * executor_count_per_instance
    default_parallelism = instance_count * instance_cores * 2

    # Let's leave 3% of the instance memory free
    instance_mem_mb = int(instance_mem_mb * 0.97)

    driver_mem_mb = 2 * 1024
    driver_mem_ovr_pct = 0.1
    driver_mem_ovr_mb = int(driver_mem_mb * driver_mem_ovr_pct)
    executor_mem_ovr_pct = 0.1
    # Remaining memory is split between executor heap and its overhead.
    executor_mem_mb = int(
        (instance_mem_mb - driver_mem_mb - driver_mem_ovr_mb)
        / (executor_count_per_instance + executor_count_per_instance * executor_mem_ovr_pct)
    )
    executor_mem_ovr_mb = int(executor_mem_mb * executor_mem_ovr_pct)

    driver_gc_config = (
        "-XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 "
        "-XX:+CMSClassUnloadingEnabled"
    )
    driver_java_opts = f"-XX:OnOutOfMemoryError='kill -9 %p' " f"{driver_gc_config}"
    executor_gc_config = (
        f"-XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 "
        f"-XX:ConcGCThreads={max(int(executor_cores / 4), 1)} "
        f"-XX:ParallelGCThreads={max(int(3 * executor_cores / 4), 1)} "
    )
    executor_java_opts = (
        f"-verbose:gc -XX:OnOutOfMemoryError='kill -9 %p' "
        f"-XX:+PrintGCDetails -XX:+PrintGCDateStamps "
        f"{executor_gc_config}"
    )

    yarn_site_config = Configuration(
        "yarn-site",
        {
            "yarn.scheduler.minimum-allocation-mb": "1",
            "yarn.scheduler.maximum-allocation-mb": str(instance_mem_mb),
            "yarn.scheduler.minimum-allocation-vcores": "1",
            "yarn.scheduler.maximum-allocation-vcores": str(instance_cores),
            "yarn.nodemanager.resource.memory-mb": str(instance_mem_mb),
            "yarn.nodemanager.resource.cpu-vcores": str(instance_cores),
        },
    )

    spark_defaults_config = Configuration(
        "spark-defaults",
        {
            "spark.driver.memory": f"{driver_mem_mb}m",
            "spark.driver.memoryOverhead": f"{driver_mem_ovr_mb}m",
            "spark.driver.defaultJavaOptions": f"{driver_java_opts}",
            "spark.executor.memory": f"{executor_mem_mb}m",
            "spark.executor.memoryOverhead": f"{executor_mem_ovr_mb}m",
            "spark.executor.cores": f"{executor_cores}",
            "spark.executor.defaultJavaOptions": f"{executor_java_opts}",
            "spark.executor.instances": f"{executor_count_total}",
            "spark.default.parallelism": f"{default_parallelism}",
            "spark.yarn.appMasterEnv.AWS_REGION": f"{aws_region}",
            "spark.executorEnv.AWS_REGION": f"{aws_region}",
        },
    )

    return yarn_site_config, spark_defaults_config
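Because `executor_cores` equals `instance_cores`, the sizing works out to exactly one executor per instance, with the driver's 2 GiB (plus its 10% overhead) carved out first and the remainder split between executor heap and executor overhead. A worked sketch of that arithmetic under hypothetical hardware figures (3 nodes, 32 GiB and 8 vCPUs each; none of these numbers come from the source):

```python
# Worked example of the resource math above, using assumed instance specs.
instance_count, instance_mem_mb, instance_cores = 3, 32 * 1024, 8

usable_mem_mb = int(instance_mem_mb * 0.97)              # 31784 MB after the 3% reserve
driver_mem_mb = 2 * 1024                                 # 2048 MB
driver_mem_ovr_mb = int(driver_mem_mb * 0.1)             # 204 MB
executor_mem_mb = int(
    (usable_mem_mb - driver_mem_mb - driver_mem_ovr_mb) / (1 + 1 * 0.1)
)                                                        # 26847 MB for the single per-instance executor
executor_mem_ovr_mb = int(executor_mem_mb * 0.1)         # 2684 MB
executor_instances = instance_count * 1                  # 3 executors, one per instance
default_parallelism = instance_count * instance_cores * 2  # 48

print(executor_mem_mb, executor_mem_ovr_mb, executor_instances, default_parallelism)
```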
def _write_conf(conf: Configuration) -> None:
    logging.info("Writing user config to {}".format(conf.path))
    conf_string = conf.write_config()
    logging.info("Configuration at {} is: \n{}".format(conf.path, conf_string))