Example #1
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> int:
    volumes = list()
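    # Translate the configured volumes into Docker bind-mount strings of the form
    # "host_path:container_path:mode", skipping host paths that do not exist.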
    for volume in instance_config.get_volumes(system_paasta_config.get_volumes()):
        if os.path.exists(volume["hostPath"]):
            volumes.append(
                "{}:{}:{}".format(
                    volume["hostPath"], volume["containerPath"], volume["mode"].lower()
                )
            )
        else:
            print(
                PaastaColors.yellow(
                    "Warning: Path %s does not exist on this host. Skipping this binding."
                    % volume["hostPath"]
                ),
                file=sys.stderr,
            )

    original_docker_cmd = args.cmd or instance_config.get_cmd()
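    # Pick a Spark UI port based on the service name and current PID, then derive
    # the Spark app name from the original docker command.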
    spark_ui_port = pick_random_port(args.service + str(os.getpid()))
    spark_app_name = get_spark_app_name(original_docker_cmd, spark_ui_port)

    access_key, secret_key = get_aws_credentials(
        service=args.service,
        no_aws_credentials=args.no_aws_credentials,
        aws_credentials_yaml=args.aws_credentials_yaml,
        profile_name=args.aws_profile,
    )
    spark_config_dict = get_spark_config(
        args=args,
        spark_app_name=spark_app_name,
        spark_ui_port=spark_ui_port,
        docker_img=docker_img,
        system_paasta_config=system_paasta_config,
        volumes=volumes,
        access_key=access_key,
        secret_key=secret_key,
    )
    spark_conf_str = create_spark_config_str(spark_config_dict, is_mrjob=args.mrjob)

    # Spark client specific volumes
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/etc/passwd:/etc/passwd:ro")
    volumes.append("/etc/group:/etc/group:ro")
    volumes.append("/nail/home:/nail/home:rw")

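    # Merge the instance's environment with the Spark-specific environment variables.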
    environment = instance_config.get_env_dictionary()
    environment.update(
        get_spark_env(args, spark_conf_str, spark_ui_port, access_key, secret_key)
    )

    webui_url = get_webui_url(spark_ui_port)

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        print(f"\nSpark monitoring URL {webui_url}\n")

    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            emit_resource_requirements(spark_config_dict, args.cluster, webui_url)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    return run_docker_container(
        container_name=spark_app_name,
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )
Example #2
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
    cluster_manager: str,
    pod_template_path: str,
) -> int:

    # driver specific volumes
    volumes: List[str] = []

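    # Derive the driver container's memory and CPU limits from the Spark config,
    # honoring explicit overrides (args.docker_memory_limit / args.docker_cpu_limit).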
    docker_memory_limit = _calculate_docker_memory_limit(
        spark_conf, args.docker_memory_limit
    )
    docker_cpu_limit = _calculate_docker_cpu_limit(
        spark_conf,
        args.docker_cpu_limit,
    )

    if cluster_manager == CLUSTER_MANAGER_MESOS:
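        # Mesos exposes executor volumes as a single comma-separated string.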
        volumes = (
            spark_conf.get("spark.mesos.executor.docker.volumes", "").split(",")
            if spark_conf.get("spark.mesos.executor.docker.volumes", "") != ""
            else []
        )
    elif cluster_manager == CLUSTER_MANAGER_K8S:
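        # Reconstruct host bind mounts from the Spark-on-Kubernetes executor config keys:
        # spark.kubernetes.executor.volumes.hostPath.<index>.{mount.path,mount.readOnly,options.path}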
        volume_names = [
            re.match(
                r"spark.kubernetes.executor.volumes.hostPath.(\d+).mount.path", key
            ).group(1)
            for key in spark_conf.keys()
            if "spark.kubernetes.executor.volumes.hostPath." in key
            and ".mount.path" in key
        ]
        for volume_name in volume_names:
            read_only = (
                "ro"
                if spark_conf.get(
                    f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.readOnly"
                )
                == "true"
                else "rw"
            )
            container_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.mount.path"
            )
            host_path = spark_conf.get(
                f"spark.kubernetes.executor.volumes.hostPath.{volume_name}.options.path"
            )
            volumes.append(f"{host_path}:{container_path}:{read_only}")

    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    if args.enable_compact_bin_packing:
        volumes.append(f"{pod_template_path}:{pod_template_path}:rw")

    environment = instance_config.get_env_dictionary()  # type: ignore
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, spark_conf["spark.ui.port"])
    )  # type:ignore

    webui_url = get_webui_url(spark_conf["spark.ui.port"])
    webui_url_msg = f"\nSpark monitoring URL {webui_url}\n"

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        signalfx_url = get_signalfx_url(spark_conf)
        signalfx_url_msg = f"\nSignalfx dashboard: {signalfx_url}\n"
        print(webui_url_msg)
        print(signalfx_url_msg)
        log.info(webui_url_msg)
        log.info(signalfx_url_msg)
        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            history_server_url_msg = (
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
                "Check y/spark-recent-history for faster access to prod logs\n"
            )
            print(history_server_url_msg)
            log.info(history_server_url_msg)
    print(f"Selected cluster manager: {cluster_manager}\n")

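    # Estimate the hourly cost of the requested resources; on Mesos the resource
    # request is also reported to Clusterman before the cost is calculated.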
    if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob):
        try:
            if cluster_manager == CLUSTER_MANAGER_MESOS:
                print("Sending resource request metrics to Clusterman")
                hourly_cost, resources = send_and_calculate_resources_cost(
                    clusterman_metrics, spark_conf, webui_url, args.pool
                )
            else:
                resources = get_resources_requested(spark_conf)
                hourly_cost = get_spark_hourly_cost(
                    clusterman_metrics,
                    resources,
                    spark_conf["spark.executorEnv.PAASTA_CLUSTER"],
                    args.pool,
                )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    final_spark_submit_cmd_msg = f"Final command: {docker_cmd}"
    print(PaastaColors.grey(final_spark_submit_cmd_msg))
    log.info(final_spark_submit_cmd_msg)
    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
        docker_memory_limit=docker_memory_limit,
        docker_cpu_limit=docker_cpu_limit,
    )
Example #3
def configure_and_run_docker_container(
    args: argparse.Namespace,
    docker_img: str,
    instance_config: InstanceConfig,
    system_paasta_config: SystemPaastaConfig,
    spark_conf: Mapping[str, str],
    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
) -> int:

    # driver specific volumes
    volumes = (
        spark_conf.get("spark.mesos.executor.docker.volumes", "").split(",")
        if spark_conf.get("spark.mesos.executor.docker.volumes", "") != ""
        else []
    )
    volumes.append("%s:rw" % args.work_dir)
    volumes.append("/nail/home:/nail/home:rw")

    environment = instance_config.get_env_dictionary()  # type: ignore
    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
    environment.update(
        get_spark_env(args, spark_conf_str, aws_creds, spark_conf["spark.ui.port"])
    )  # type:ignore

    webui_url = get_webui_url(spark_conf["spark.ui.port"])

    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
    if "history-server" in docker_cmd:
        print(f"\nSpark history server URL {webui_url}\n")
    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
        signalfx_url = get_signalfx_url(spark_conf)
        print(f"\nSpark monitoring URL {webui_url}\n")
        print(f"\nSignalfx dashboard: {signalfx_url}\n")
        history_server_url = get_history_url(spark_conf)
        if history_server_url:
            print(
                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
            )

    if clusterman_metrics and _should_emit_resource_requirements(
        docker_cmd, args.mrjob
    ):
        try:
            print("Sending resource request metrics to Clusterman")
            hourly_cost, resources = send_and_calculate_resources_cost(
                clusterman_metrics, spark_conf, webui_url, args.pool
            )
            message = (
                f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
                f" is estimated to cost ${hourly_cost} per hour"
            )
            if clusterman_metrics.util.costs.should_warn(hourly_cost):
                print(PaastaColors.red(f"WARNING: {message}"))
            else:
                print(message)
        except Boto3Error as e:
            print(
                PaastaColors.red(
                    f"Encountered {e} while attempting to send resource requirements to Clusterman."
                )
            )
            if args.suppress_clusterman_metrics_errors:
                print(
                    "Continuing anyway since --suppress-clusterman-metrics-errors was passed"
                )
            else:
                raise

    return run_docker_container(
        container_name=spark_conf["spark.app.name"],
        volumes=volumes,
        environment=environment,
        docker_img=docker_img,
        docker_cmd=docker_cmd,
        dry_run=args.dry_run,
        nvidia=args.nvidia,
    )