def test_use_aws_credentials_json(self, tmpdir):
    fp = tmpdir.join('test.json')
    fp.write(json.dumps({
        'accessKeyId': self.access_key,
        'secretAccessKey': self.secret_key,
    }))
    assert spark_config.get_aws_credentials(
        aws_credentials_json=str(fp)) == self.expected_creds
def test_use_service_credentials(self, tmpdir, monkeypatch):
    test_service = 'test_service'
    creds_dir = tmpdir.mkdir('creds')
    creds_file = creds_dir.join(f'{test_service}.yaml')
    creds_file.write(yaml.dump(self.creds))
    monkeypatch.setattr(spark_config, 'AWS_CREDENTIALS_DIR', str(creds_dir))
    assert spark_config.get_aws_credentials(
        service=test_service) == self.expected_creds
def test_use_service_credentials_missing_file(
    self, tmpdir, monkeypatch, mock_session, mock_log,
):
    test_service = 'not_exist'
    creds_dir = tmpdir.mkdir('creds')
    monkeypatch.setattr(spark_config, 'AWS_CREDENTIALS_DIR', str(creds_dir))
    assert spark_config.get_aws_credentials(
        service=test_service) == self.expected_temp_creds
    (warning_msg, ), _ = mock_log.warning.call_args
    expected_message = (
        f"Did not find service AWS credentials at "
        f"{os.path.join(creds_dir, test_service + '.yaml')}"
    )
    assert expected_message in warning_msg
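# The credential tests above and below reference class attributes and fixtures
# that are not part of this excerpt (self.creds, self.expected_temp_creds,
# mock_session, mock_log, ...). The following is only a minimal sketch of what
# they might look like; the class name, the dict key names, and the boto3
# patching details are assumptions for illustration, not the project's actual
# conftest. The excerpt's other imports (json, os, yaml, spark_config) are
# assumed to be present.
from unittest import mock

import pytest


class TestGetAWSCredentials:
    access_key = 'access_key'
    secret_key = 'secret_key'
    session_token = 'session_token'
    creds = {'aws_access_key_id': access_key, 'aws_secret_access_key': secret_key}
    temp_creds = {**creds, 'aws_session_token': session_token}
    # get_aws_credentials returns an (access_key, secret_key, session_token) tuple.
    expected_creds = (access_key, secret_key, None)
    expected_temp_creds = (access_key, secret_key, session_token)

    @pytest.fixture
    def mock_session(self, monkeypatch):
        # Fake boto3-style session whose credentials resolve to the temporary
        # creds; assumes spark_config falls back to boto3.Session internally.
        session = mock.Mock()
        session.get_credentials.return_value = mock.Mock(
            access_key=self.access_key,
            secret_key=self.secret_key,
            token=self.session_token,
        )
        monkeypatch.setattr(spark_config.boto3, 'Session', mock.Mock(return_value=session))
        return session

    @pytest.fixture
    def mock_log(self, monkeypatch):
        # Capture warnings that spark_config emits when a credentials file is missing;
        # assumes the module-level logger is named `log`.
        log = mock.Mock()
        monkeypatch.setattr(spark_config, 'log', log)
        return log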
def get_env(self):
    env = super().get_env()
    if self.get_executor() == "spark":
        spark_config_dict = self.get_spark_config_dict()
        env["EXECUTOR_CLUSTER"] = self.get_spark_paasta_cluster()
        env["EXECUTOR_POOL"] = self.get_spark_paasta_pool()
        env["SPARK_OPTS"] = stringify_spark_env(spark_config_dict)
        # The actual Mesos secret will be decrypted and injected on the Mesos
        # master when assigning tasks.
        env["SPARK_MESOS_SECRET"] = "SHARED_SECRET(SPARK_MESOS_SECRET)"
        if clusterman_metrics:
            env["CLUSTERMAN_RESOURCES"] = json.dumps(
                generate_clusterman_metrics_entries(
                    clusterman_metrics,
                    get_resources_requested(spark_config_dict),
                    spark_config_dict["spark.app.name"],
                    get_webui_url(spark_config_dict["spark.ui.port"]),
                )
            )
        else:
            env["CLUSTERMAN_RESOURCES"] = "{}"
        if "AWS_ACCESS_KEY_ID" not in env or "AWS_SECRET_ACCESS_KEY" not in env:
            try:
                access_key, secret_key, session_token = get_aws_credentials(
                    service=self.get_service(),
                    aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml"),
                )
                env["AWS_ACCESS_KEY_ID"] = access_key
                env["AWS_SECRET_ACCESS_KEY"] = secret_key
            except Exception:
                log.warning(
                    f"Cannot set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment "
                    f"variables for tron action {self.get_instance()} of service "
                    f"{self.get_service()} via credential file. Traceback:\n"
                    f"{traceback.format_exc()}"
                )
        if "AWS_DEFAULT_REGION" not in env:
            env["AWS_DEFAULT_REGION"] = DEFAULT_AWS_REGION
    return env
def get_spark_config_dict(self):
    # Cache the computed dict so that we don't process it multiple times and
    # risk getting inconsistent results.
    spark_config_dict = getattr(self, "_spark_config_dict", None)
    if spark_config_dict is not None:
        return spark_config_dict
    if self.get_spark_cluster_manager() == "mesos":
        mesos_leader = (
            f"zk://{load_system_paasta_config().get_zk_hosts()}"
            if not self.for_validation
            else "N/A"
        )
    else:
        mesos_leader = None
    aws_creds = get_aws_credentials(
        aws_credentials_yaml=self.config_dict.get("aws_credentials_yaml")
    )
    self._spark_config_dict = get_spark_conf(
        cluster_manager=self.get_spark_cluster_manager(),
        spark_app_base_name=f"tron_spark_{self.get_service()}_{self.get_instance()}",
        user_spark_opts=self.config_dict.get("spark_args", {}),
        paasta_cluster=self.get_spark_paasta_cluster(),
        paasta_pool=self.get_spark_paasta_pool(),
        paasta_service=self.get_service(),
        paasta_instance=self.get_instance(),
        docker_img=self.get_docker_url(),
        aws_creds=aws_creds,
        extra_volumes=self.get_volumes(load_system_paasta_config().get_volumes()),
        # Tron uses environment variables to load the required creds.
        with_secret=False,
        mesos_leader=mesos_leader,
        # load_system_paasta_config already loads the default volumes.
        load_paasta_default_volumes=False,
    )
    return self._spark_config_dict
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=system_paasta_config.get_cluster_aliases().get(
                args.cluster, args.cluster
            ),
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required: pyspark, spark-shell, spark-submit, or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    pod_template_path = generate_pod_template_path()
    args.enable_compact_bin_packing = should_enable_compact_bin_packing(
        args.disable_compact_bin_packing, args.cluster_manager
    )

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())

    if args.enable_compact_bin_packing:
        document = POD_TEMPLATE.format(
            spark_pod_label=limit_size_with_hash(f"exec-{app_base_name}"),
        )
        parsed_pod_template = yaml.safe_load(document)
        with open(pod_template_path, "w") as f:
            yaml.dump(parsed_pod_template, f)

    needs_docker_cfg = not args.build
    user_spark_opts = _parse_user_spark_args(
        args.spark_args, pod_template_path, args.enable_compact_bin_packing
    )

    args.cmd = _auto_add_timeout_for_job(args.cmd, args.timeout_job_runtime)

    # This is required if configs are provided as part of `spark-submit`;
    # the other way to provide them is with --spark-args.
    sub_cmds = args.cmd.split(" ")  # e.g. spark.driver.memory=10g
    for cmd in sub_cmds:
        if cmd.startswith("spark.driver.memory") or cmd.startswith(
            "spark.driver.cores"
        ):
            key, value = cmd.split("=")
            user_spark_opts[key] = value

    paasta_instance = get_smart_paasta_instance_name(args)
    auto_set_temporary_credentials_provider = (
        args.disable_temporary_credentials_provider is False
    )
    spark_conf = get_spark_conf(
        cluster_manager=args.cluster_manager,
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
        auto_set_temporary_credentials_provider=auto_set_temporary_credentials_provider,
    )

    # Experimental. TODO: move to service_configuration_lib once confirmed
    # that there are no issues.
    # Enable AQE (Adaptive Query Execution).
    if "spark.sql.adaptive.enabled" not in spark_conf:
        spark_conf["spark.sql.adaptive.enabled"] = "true"
        aqe_msg = (
            "Spark performance improving feature Adaptive Query Execution (AQE) is "
            "enabled. Set spark.sql.adaptive.enabled to false to disable."
        )
        log.info(aqe_msg)
        print(PaastaColors.blue(aqe_msg))

    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
        cluster_manager=args.cluster_manager,
        pod_template_path=pod_template_path,
    )
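# A standalone sketch of the driver-option extraction performed in
# paasta_spark_run above (the helper name _extract_driver_opts is hypothetical,
# used here only for illustration): tokens of the form spark.driver.memory=...
# or spark.driver.cores=... anywhere in the submitted command are copied into
# user_spark_opts so that get_spark_conf sizes the driver accordingly.
def _extract_driver_opts(cmd: str) -> dict:
    opts = {}
    for token in cmd.split(" "):
        if token.startswith("spark.driver.memory") or token.startswith("spark.driver.cores"):
            key, value = token.split("=")
            opts[key] = value
    return opts


assert _extract_driver_opts(
    "spark-submit --conf spark.driver.memory=10g --conf spark.driver.cores=2 job.py"
) == {"spark.driver.memory": "10g", "spark.driver.cores": "2"}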
def paasta_spark_run(args):
    # argparse does not work as expected with both default and
    # type=validate_work_dir.
    validate_work_dir(args.work_dir)

    try:
        system_paasta_config = load_system_paasta_config()
    except PaastaNotConfiguredError:
        print(
            PaastaColors.yellow(
                "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
                "PaaSTA is not configured locally on this host, and local-run may not behave "
                "the same way it would behave on a server configured for PaaSTA."
            ),
            sep="\n",
        )
        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")

    if args.cmd == "jupyter-lab" and not args.build and not args.image:
        print(
            PaastaColors.red(
                "The jupyter-lab command requires a prebuilt image with -I or --image."
            ),
            file=sys.stderr,
        )
        return 1

    # Use the default spark:client instance configs if not provided
    try:
        instance_config = get_instance_config(
            service=args.service,
            instance=args.instance,
            cluster=args.cluster,
            load_deployments=args.build is False and args.image is None,
            soa_dir=args.yelpsoa_config_root,
        )
    except NoConfigurationForServiceError as e:
        print(str(e), file=sys.stderr)
        return 1
    except NoDeploymentsAvailable:
        print(
            PaastaColors.red(
                "Error: No deployments.json found in %(soa_dir)s/%(service)s. "
                "You can generate this by running: "
                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
            ),
            sep="\n",
            file=sys.stderr,
        )
        return 1

    if not args.cmd and not instance_config.get_cmd():
        print(
            "A command is required: pyspark, spark-shell, spark-submit, or jupyter",
            file=sys.stderr,
        )
        return 1

    aws_creds = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    docker_image = get_docker_image(args, instance_config)
    if docker_image is None:
        return 1

    volumes = instance_config.get_volumes(system_paasta_config.get_volumes())
    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
    needs_docker_cfg = not args.build and not args.image
    user_spark_opts = _parse_user_spark_args(args.spark_args)
    paasta_instance = get_smart_paasta_instance_name(args)
    spark_conf = get_spark_conf(
        cluster_manager="mesos",
        spark_app_base_name=app_base_name,
        docker_img=docker_image,
        user_spark_opts=user_spark_opts,
        paasta_cluster=args.cluster,
        paasta_pool=args.pool,
        paasta_service=args.service,
        paasta_instance=paasta_instance,
        extra_volumes=volumes,
        aws_creds=aws_creds,
        needs_docker_cfg=needs_docker_cfg,
    )
    return configure_and_run_docker_container(
        args,
        docker_img=docker_image,
        instance_config=instance_config,
        system_paasta_config=system_paasta_config,
        spark_conf=spark_conf,
        aws_creds=aws_creds,
    )
def test_aws_credentials_yaml(self, tmpdir, aws_creds):
    fp = tmpdir.join('test.yaml')
    fp.write(yaml.dump(aws_creds))
    expected_output = (
        self.expected_temp_creds
        if aws_creds == self.temp_creds
        else self.expected_creds
    )
    assert spark_config.get_aws_credentials(
        aws_credentials_yaml=str(fp)) == expected_output
def test_no_aws_creds(self):
    assert spark_config.get_aws_credentials(
        no_aws_credentials=True) == (None, None, None)
def test_fail(self, tmpdir):
    fp = tmpdir.join('test.yaml')
    fp.write('not yaml file')
    with pytest.raises(ValueError):
        spark_config.get_aws_credentials(aws_credentials_yaml=str(fp))
def test_use_profile(self, mock_session):
    assert spark_config.get_aws_credentials(
        profile_name='test_profile') == self.expected_temp_creds
def test_use_session(self, mock_session):
    assert spark_config.get_aws_credentials(
        session=mock_session) == self.expected_temp_creds
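# The aws_creds argument to test_aws_credentials_yaml above is presumably
# supplied by a parametrized fixture roughly like the sketch below, defined on
# the same test class sketched earlier (an assumption for illustration, not the
# project's actual conftest). It yields both the long-lived and the temporary
# credential dicts so a single test covers both YAML shapes.
@pytest.fixture(params=['long_lived', 'temporary'])
def aws_creds(self, request):
    return self.creds if request.param == 'long_lived' else self.temp_creds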