Example #1
def start_server(dockerized: bool = None, with_jupyter: bool = True, daemon: bool = None):
    if dockerized is None:
        # Default to a dockerized server unless a local Spark/Hadoop install is detected.
        dockerized = not any(
            ["SPARK_HOME" in os.environ, "HADOOP_CONF_DIR" in os.environ]
        )
    if daemon is None:
        daemon = dockerized
    if dockerized:
        container = _init_spark_container(with_jupyter=with_jupyter)
        if daemon:
            logging.info("Now serving spark requests via docker.")
        else:
            with logged_block("hosting spark container"):
                while True:
                    time.sleep(30)
    else:
        _init_spark(dockerized=False, with_jupyter=with_jupyter, daemon=daemon)
        logging.info(
            "Spark server started. "
            "Monitor via http://localhost:4040 or http://127.0.0.1:4040"
        )
        # if with_jupyter:
        #     start_jupyter()
        # else:
        #     logging.info("Skipping Jupyter notebooks server launch...")
        if not daemon:
            with logged_block(_SERVING_SPARK_REQUESTS):
                # NOTE: When run containerized, the above message triggers
                #       the host to stop echoing logs
                while True:
                    time.sleep(30)
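
A minimal usage sketch for start_server, assuming it is importable from the slalom.dataops.sparkutils module referenced in Example #5; the import path and the call sites below are illustrative, not taken from the source.

from slalom.dataops import sparkutils  # assumed import path

# Host a dockerized Spark server (with Jupyter) and block while serving requests.
sparkutils.start_server(dockerized=True, with_jupyter=True, daemon=False)

# Or launch in daemon mode, which returns without entering the serving loop.
sparkutils.start_server(dockerized=False, with_jupyter=False, daemon=True)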
Example #2
def smart_build(
    dockerfile_path: str,
    tag_as=None,
    push_core=True,
    push_final=False,
    with_login=False,
    addl_args=None,
    ignore_caches=False,
):
    """
    Builds the dockerfile if needed but pulls it from the remote if possible.
    """
    if bool(with_login):
        login()
    tag_as = _to_list(tag_as)
    result = smart_split(dockerfile_path, tag_as, addl_args=addl_args)
    image_core, dockerfile_path_core, image_derived, dockerfile_path_derived = result
    if not ignore_caches:
        if dockerfile_path_derived is None and exists_remotely(image_core):
            logging.info(
                "Image with matching hash already exists "
                "and no host files are referenced in the Dockerfile. "
                f"Attempting to retag existing image '{image_core}' as '{tag_as}'..."
            )
            remote_retag(
                image_name=image_core.split(":")[0],
                existing_tag=image_core.split(":")[1],
                tag_as=tag_as,
            )
            return
        pull(image_core, skip_if_exists=True, silent=True)
    if ignore_caches or not exists_locally(image_core):
        with logged_block(f"building interim (core) image as '{image_core}'"):
            build(dockerfile_path_core, image_core, addl_args=addl_args)
    if push_core:
        if ignore_caches or not exists_remotely(image_core):
            with logged_block(
                    f"pushing interim (core) image '{image_core}'"):
                push(image_core)
        else:
            logging.info(
                f"Already exists. Skipping push of image '{image_core}'")
    with logged_block(
            f"building '{dockerfile_path_derived}' as '{image_derived}'"):
        if dockerfile_path_derived:
            build(dockerfile_path_derived, image_derived, addl_args=addl_args)
        else:
            tag(image_core, image_derived)
    if tag_as:
        tag(image_derived, tag_as)
        if push_final:
            for image_name in tag_as:
                push(image_name)
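
A hedged usage sketch for smart_build; the Dockerfile path and image tags below are placeholders, not values from the source.

# Hypothetical call; paths and tags are placeholders.
smart_build(
    dockerfile_path="Dockerfile",
    tag_as=["myrepo/myimage:latest", "myrepo/myimage:v0.1.0"],
    push_core=True,       # push the interim (core) image so later builds can reuse it
    push_final=True,      # also push every tag listed in tag_as
    with_login=True,      # run login() before pulling or pushing
    ignore_caches=False,  # allow reuse of matching local/remote images
)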
Example #3
def install(*args, infra_dir="./infra", deploy=False, git_ref="master"):
    """
    Usage example:
    ```
    s-infra install catalog:aws-prereqs --infra_dir=infra/prereqs --deploy=True
    s-infra install samples:aws --infra_dir=infra --deploy=True
    ```
    Which is identical to:
    ```
    s-infra install catalog:aws-prereqs --infra_dir=infra/prereqs
    s-infra init+apply --infra_dir=infra/prereqs
    s-infra install samples:aws --infra_dir=infra
    s-infra init+apply --infra_dir=infra
    ```
    """
    uio.create_folder(infra_dir)
    for arg in args:
        with logged_block(f"installing terraform modules from '{arg}'"):
            infra_type, infra_name = arg.split(":")
            if infra_type not in ["catalog", "samples"]:
                raise ValueError(
                    f"Expected infra_type to be one of: 'catalog', 'samples'. Received type: {infra_type}"
                )
            uio.download_folder(
                remote_folder=(
                    "git://github.com/slalom-ggp/dataops-infra"
                    f"#{git_ref}//{infra_type}/{infra_name}"
                ),
                local_folder=infra_dir,
            )
    lf = "\n"
    logging.info(f"List of installed modules:\n{lf.join(uio.ls(infra_dir))}")
    init(infra_dir=infra_dir)
    if deploy:
        apply(infra_dir=infra_dir)
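
The docstring above shows the s-infra CLI form; a direct Python call would look roughly like the sketch below, mirroring the first CLI example.

# Hypothetical direct call mirroring the CLI usage shown in the docstring.
install(
    "catalog:aws-prereqs",
    infra_dir="infra/prereqs",
    deploy=True,      # also run init and apply after downloading the modules
    git_ref="master",
)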
Example #4
def _init_local_spark():
    """Return an initialized local spark object"""
    global spark, sc, thrift

    # context = SparkContext(conf=conf)
    for folder in [SPARK_WAREHOUSE_DIR]:
        uio.create_folder(folder)
    conf = SparkConf()
    hadoop_conf = _get_hadoop_conf()
    for fn in [conf.set]:
        # for fn in [conf.set, SparkContext.setSystemProperty, context.setSystemProperty]:
        for k, v in hadoop_conf.items():
            fn(k, v)
    os.environ["PYSPARK_PYTHON"] = sys.executable
    with logged_block("creating spark session"):
        spark = (
            SparkSession.builder.config(conf=conf)
            .master("local")
            .appName("Python Spark")
            .enableHiveSupport()
            .getOrCreate()
        )
        sc = spark.sparkContext
        # Set the property for the driver. Doesn't work using the same syntax
        # as the executor because the jvm has already been created.
        sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    if not ENABLE_SQL_JDBC:
        logging.info(f"Skipping Thrift server launch (ENABLE_SQL_JDBC={ENABLE_SQL_JDBC})")
        thrift = None
    else:
        with logged_block("starting Thrift server"):
            java_import(sc._gateway.jvm, "")
            spark_hive = sc._gateway.jvm.org.apache.spark.sql.hive
            thrift_class = spark_hive.thriftserver.HiveThriftServer2
            thrift = thrift_class.startWithContext(spark._jwrapped)
        logging.info("Sleeping while waiting for Thrift Server...")
        time.sleep(1)
    spark.sparkContext.setLogLevel(SPARK_LOG_LEVEL)
    _print_conf_debug(sc)
    if ENV_VAR_SPARK_UDF_MODULE in os.environ:
        add_udf_module(os.environ.get(ENV_VAR_SPARK_UDF_MODULE))
    else:
        logging.info("Skipping loading UDFs (env variable not set)")
    for jar_path in SPARK_EXTRA_AWS_JARS:
        sc.addPyFile(jar_path)
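
A short sketch of how the module-level globals set by _init_local_spark() might be used afterwards; the SQL statement is purely illustrative.

# Hypothetical usage after initialization.
_init_local_spark()
spark.sql("SHOW DATABASES").show()   # 'spark' is the global SparkSession created above
print(sc.uiWebUrl)                   # 'sc' is the global SparkContext; prints the Spark UI URL
if thrift is not None:
    print("Thrift JDBC server started (ENABLE_SQL_JDBC is set)")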
Example #5
def _init_spark(dockerized=False, with_jupyter=False, daemon=False):
    """Return an initialized spark object"""
    global spark, sc, thrift

    if dockerized:
        container = _init_spark_container(with_jupyter=with_jupyter)
        # context = SparkContext(conf=conf)
        os.environ["PYSPARK_PYTHON"] = sys.executable
        with logged_block("connecting to spark container"):
            spark = SparkSession.builder.master(CONTAINER_ENDPOINT).getOrCreate()
        spark.sparkContext.setLogLevel(SPARK_LOG_LEVEL)
        sc = spark.sparkContext
    elif daemon:
        cmd = f"{sys.executable} -m slalom.dataops.sparkutils start_server"
        wait_test = lambda line: _SERVING_SPARK_REQUESTS in line
        wait_max = 120  # Max wait in seconds
        if with_jupyter:
            cmd = f"{cmd} --with_jupyter"
        runnow.run(cmd, daemon=True, wait_test=wait_test, wait_max=wait_max)
    else:
        _init_local_spark()
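
A hedged sketch of the three dispatch paths in _init_spark; all keyword arguments come from the signature above.

# Hypothetical calls illustrating the three branches above.
_init_spark(dockerized=True, with_jupyter=True)  # connect to a dockerized Spark container
_init_spark(daemon=True)                         # launch 'start_server' as a background process
_init_spark()                                    # initialize Spark locally in this process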
Example #6
def _ecs_wait_for(
    wait_for,
    task_arn,
    cluster,
    region,
    timeout=1200,
    heartbeat_interval=None,
    raise_error=True,
):
    task_id = task_arn.split("/")[-1]
    wait_cmd = f"aws ecs wait tasks-{wait_for} --cluster {cluster} --tasks {task_arn}"
    desc_cmd = f"aws ecs describe-tasks --cluster {cluster} --tasks {task_arn}"

    with logged_block(
            f"waiting for ECS job to reach '{wait_for}' status",
            heartbeat_interval=heartbeat_interval,
    ):
        timeout_time = time.time() + (timeout or MAX_ECS_WAIT)
        return_code, output_text = runnow.run(wait_cmd, raise_error=False)
        while return_code == 255 and time.time() < timeout_time:
            # Exit code 255 means the AWS CLI waiter itself gave up; retry (without
            # raising) until our own timeout expires.
            logging.info("aws cli waiter timed out. Retrying...")
            return_code, output_text = runnow.run(wait_cmd, raise_error=False)
        if return_code != 0:
            raise RuntimeError(
                f"ECS wait command failed or timed out (return={return_code}).\n"
                f"{output_text}")
    return_code, output_text = runnow.run(desc_cmd, raise_error=False)
    if return_code != 0:
        raise RuntimeError(f"ECS task describe failed.\n{output_text}")

    jsonobj = json.loads(output_text)
    if len(jsonobj.get("tasks", [])) == 0 or len(jsonobj.get("failures", [])) > 0:
        raise RuntimeError(f"Could not start task ({jsonobj.get('failures', '')})")
    task_arn = jsonobj["tasks"][0]["taskArn"]
    logging.info(
        f"ECS task status: {get_ecs_task_detail_url(region, task_arn, cluster)}"
    )
    logging.info(f"ECS task logs:   {get_ecs_log_url(region, task_arn)}")
    return task_arn
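
A hedged example call for _ecs_wait_for; the ARN, cluster, and region values are placeholders, and the wait_for values map to the aws ecs wait tasks-running / tasks-stopped subcommands the function shells out to.

# Hypothetical call; all identifiers are placeholders.
final_arn = _ecs_wait_for(
    wait_for="stopped",      # or "running"
    task_arn="arn:aws:ecs:us-east-1:123456789012:task/my-cluster/0123456789abcdef0",
    cluster="my-cluster",
    region="us-east-1",
    timeout=1800,
    heartbeat_interval=60,   # emit a heartbeat log line every 60 seconds while waiting
)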
Example #7
def check_tf_metadata(
    tf_dir,
    recursive: bool = True,
    check_module_headers: bool = True,
    check_input_descriptions: bool = True,
    check_output_descriptions: bool = True,
    required_input_vars: list = [
        "name_prefix", "resource_tags", "environment"
    ],
    required_output_vars: list = ["summary"],
    raise_error=True,
    abspath=False,
):
    """
    Return a dictionary of reference paths to error messages and a dictionary
    of errors to reference paths.
    """
    def _log_issue(module_path, issue_desc, details_list):
        if details_list:
            if issue_desc in error_locations:
                error_locations[issue_desc].extend(details_list)
            else:
                error_locations[issue_desc] = details_list

    error_locations: Dict[str, List[str]] = {}
    with logged_block(
            "checking Terraform modules against repository code standards"):
        modules_metadata = get_tf_metadata(tf_dir, recursive=recursive)
        for module_path, metadata in modules_metadata.items():
            if abspath:
                path_sep = os.path.sep
                module_path = os.path.abspath(module_path)
                module_path = module_path.replace("\\", path_sep).replace(
                    "/", path_sep)
            else:
                path_sep = "/"
            if check_module_headers and not metadata["header"]:
                _log_issue(
                    module_path,
                    "1. Blank module headers",
                    [f"{module_path}{path_sep}main.tf"],
                )
            if required_input_vars:
                issue_details = [
                    f"{module_path}{path_sep}variables.tf:var.{required_input}"
                    for required_input in required_input_vars if required_input
                    not in [var["name"] for var in metadata.get("inputs", {})]
                ]
                _log_issue(module_path, "2. Missing required input variables",
                           issue_details)
            if required_output_vars:
                issue_details = [
                    f"{module_path}{path_sep}outputs.tf:output.{required_output}"
                    for required_output in required_output_vars
                    if required_output not in
                    [var["name"] for var in metadata.get("outputs", {})]
                ]
                _log_issue(module_path, "3. Missing required output variables",
                           issue_details)
            if check_input_descriptions:
                issue_details = [
                    f"{module_path}{path_sep}variables.tf:var.{var['name']}"
                    for var in metadata.get("inputs", {})
                    if not var.get("description")
                ]
                _log_issue(module_path,
                           "4. Missing input variable descriptions",
                           issue_details)
            if check_output_descriptions:
                issue_details = [
                    f"{module_path}{path_sep}outputs.tf:output.{var['name']}"
                    for var in metadata.get("outputs", {})
                    if not var.get("description")
                ]
                _log_issue(module_path,
                           "5. Missing output variable descriptions",
                           issue_details)
    result_str = "\n".join([
        f"\n{k}:\n    - [ ] " + ("\n    - [ ] ".join(error_locations[k]))
        for k in sorted(error_locations.keys())
    ])
    if raise_error and error_locations:
        raise ValueError(
            f"One or more validation errors occurred.\n{result_str}")
    return result_str
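
A hedged usage sketch for check_tf_metadata; the module directory is a placeholder, and raise_error=False returns the checklist string instead of raising.

# Hypothetical call; the directory path is a placeholder.
issues = check_tf_metadata(
    "infra/catalog",
    recursive=True,
    required_input_vars=["name_prefix", "resource_tags", "environment"],
    raise_error=False,   # collect issues as a string instead of raising ValueError
)
if issues:
    print(issues)        # checklist of issues grouped by issue type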
Example #8
def _init_spark_container(spark_image=DOCKER_SPARK_IMAGE, with_jupyter=False):
    """Launch (or reuse) the 'spark_server' docker container and return its handle."""
    global _spark_container

    if _spark_container:
        return _spark_container
    port_map = {
        "4040": "4040",  # App Web UI
        "7077": "7077",  # Standalone master driver
        "8080": "8080",  # Standalone-mode master Web UI
        "8081": "8081",  # Standalone-mode worker Web UI
        "8888": "8888",  # Jupyter Notebook Server
        "10000": "10000",  # Thrift JDBC port for SQL queries
        "18080": "18080",  # History Server Web UI
    }
    uio.set_aws_env_vars()
    env = [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "BATCH_ID=SparkContainerTest",
        "ENABLE_SQL_JDBC=True",
        "METASTORE_TYPE=MySQL",
    ]
    if "AWS_ACCESS_KEY_ID" in os.environ:
        env.append(f"AWS_ACCESS_KEY_ID={os.environ['AWS_ACCESS_KEY_ID']}")
    if "AWS_SECRET_ACCESS_KEY" in os.environ:
        env.append(f"AWS_SECRET_ACCESS_KEY={os.environ['AWS_SECRET_ACCESS_KEY']}")
    docker_client = docker.from_env()  # WSL1
    # docker_client = docker.DockerClient(base_url="npipe:////./pipe/docker_wsl")  # WSL2
    try:
        dock_r.pull(spark_image)
    except Exception as ex:
        logging.warning(f"Could not pull latest Spark image '{spark_image}'. {ex}")
    try:
        old_container = docker_client.containers.get("spark_server")
        if old_container:
            with logged_block("terminating previous 'spark_server' docker container"):
                old_container.stop()
                logging.info("Waiting for cleanup of old Spark container...")
                time.sleep(2)
    except Exception:
        # No existing 'spark_server' container to clean up.
        pass
    spark_image_cmd = "sparkutils start_server"
    if with_jupyter:
        spark_image_cmd = f"{spark_image_cmd} --with_jupyter"
    _spark_container = docker_client.containers.run(
        image=spark_image,
        name="spark_server",
        command=spark_image_cmd,
        detach=True,
        auto_remove=True,
        ports=port_map,
        environment=env,
        # stream=True,
    )
    logging.info(
        f"Attempting to initialize Spark docker container "
        f"(status={_spark_container.status})..."
    )
    MAX_WAIT_TIME = int(60 * 5)
    start = time.time()
    for line in _spark_container.logs(stream=True, until=int(start + MAX_WAIT_TIME)):
        logging.info(f"SPARK CONTAINER LOG: {line.decode('utf-8').rstrip()}")
        # time.sleep(0.2)
        if _SERVING_SPARK_REQUESTS in line.decode("utf-8"):
            logging.info(
                f"Spark container reported success after "
                f"{int(time.time() - start)} seconds"
            )
            break
        elif time.time() > start + MAX_WAIT_TIME:
            logging.info(f"Max timeout wait exceeded ({MAX_WAIT_TIME} seconds)")
            break
    if _spark_container.status in ["running", "created"]:
        return _spark_container
    else:
        raise RuntimeError(
            "Spark docker container exited unexpectedly "
            f"(status={_spark_container.status})."
        )
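
A hedged usage sketch for the container helper; it assumes Docker is available locally and that DOCKER_SPARK_IMAGE can be pulled.

# Hypothetical usage; requires a working local Docker daemon.
container = _init_spark_container(with_jupyter=False)
print(container.name, container.status)      # "spark_server", e.g. "created" or "running"
for line in container.logs(stream=True, tail=10):
    print(line.decode("utf-8").rstrip())     # tail the Spark server logs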