def start_server(dockerized: bool = None, with_jupyter: bool = True, daemon: bool = None):
    """Start a Spark server, either dockerized or using the local Spark installation."""
    if dockerized is None:
        # Default to docker unless a local Spark/Hadoop installation is detected.
        dockerized = not any(
            ["SPARK_HOME" in os.environ, "HADOOP_CONF_DIR" in os.environ]
        )
    if daemon is None:
        daemon = dockerized
    if dockerized:
        container = _init_spark_container(with_jupyter=with_jupyter)
        if daemon:
            logging.info("Now serving spark requests via docker.")
        else:
            with logged_block("hosting spark container"):
                while True:
                    time.sleep(30)
    else:
        _init_spark(dockerized=False, with_jupyter=with_jupyter, daemon=daemon)
        logging.info(
            "Spark server started. "
            "Monitor via http://localhost:4040 or http://127.0.0.1:4040"
        )
        # if with_jupyter:
        #     start_jupyter()
        # else:
        #     logging.info("Skipping Jupyter notebooks server launch...")
        if not daemon:
            with logged_block(_SERVING_SPARK_REQUESTS):
                # NOTE: When run containerized, the above message triggers
                #       the host to stop echoing logs.
                while True:
                    time.sleep(30)
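
# Hedged usage sketch (not part of the original module): with no SPARK_HOME or
# HADOOP_CONF_DIR set, `dockerized` defaults to True and `daemon` follows it; the
# explicit arguments below override those defaults and keep the server in the
# foreground, so the call blocks until interrupted.
def _example_start_server_foreground():
    """Illustrative only: host a dockerized Spark server in the foreground."""
    start_server(dockerized=True, with_jupyter=False, daemon=False)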
def smart_build(
    dockerfile_path: str,
    tag_as=None,
    push_core=True,
    push_final=False,
    with_login=False,
    addl_args=None,
    ignore_caches=False,
):
    """Build the image from the Dockerfile if needed, but pull it from the remote registry when possible."""
    if bool(with_login):
        login()
    tag_as = _to_list(tag_as)
    result = smart_split(dockerfile_path, tag_as, addl_args=addl_args)
    image_core, dockerfile_path_core, image_derived, dockerfile_path_derived = result
    if not ignore_caches:
        if dockerfile_path_derived is None and exists_remotely(image_core):
            logging.info(
                "Image with matching hash already exists "
                "and no host files are referenced in Dockerfile. "
                f"Attempting to retag existing image '{image_core}' as '{tag_as}'..."
            )
            remote_retag(
                image_name=image_core.split(":")[0],
                existing_tag=image_core.split(":")[1],
                tag_as=tag_as,
            )
            return
        pull(image_core, skip_if_exists=True, silent=True)
    if ignore_caches or not exists_locally(image_core):
        with logged_block(f"building interim (core) image as '{image_core}'"):
            build(dockerfile_path_core, image_core, addl_args=addl_args)
    if push_core:
        if ignore_caches or not exists_remotely(image_core):
            with logged_block(f"pushing interim (core) image '{image_core}'"):
                push(image_core)
        else:
            logging.info(f"Already exists. Skipping push of image '{image_core}'")
    with logged_block(f"building '{dockerfile_path_derived}' as '{image_derived}'"):
        if dockerfile_path_derived:
            build(dockerfile_path_derived, image_derived, addl_args=addl_args)
        else:
            tag(image_core, image_derived)
    if tag_as:
        tag(image_derived, tag_as)
    if push_final:
        for image_name in tag_as:
            push(image_name)
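
# Hedged usage sketch (not part of the original module): the Dockerfile path and
# image tag below are placeholders. `smart_build` splits the Dockerfile into a
# cacheable "core" stage and a derived stage, reusing or retagging the core image
# when a matching hash already exists remotely.
def _example_smart_build():
    """Illustrative only: build (or reuse) an image and tag the final result."""
    smart_build(
        dockerfile_path="./Dockerfile",    # placeholder path
        tag_as=["myrepo/myimage:latest"],  # placeholder tag
        push_core=True,                    # cache the interim (core) image remotely
        push_final=False,                  # skip pushing the final tag
    )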
def install(*args, infra_dir="./infra", deploy=False, git_ref="master"):
    """
    Usage example:
    ```
    s-infra install catalog:aws-prereqs --infra_dir=infra/prereqs --deploy=True
    s-infra install samples:aws --infra_dir=infra --deploy=True
    ```
    Which is identical to:
    ```
    s-infra install catalog:aws-prereqs --infra_dir=infra/prereqs
    s-infra init+apply --infra_dir=infra/prereqs
    s-infra install samples:aws --infra_dir=infra
    s-infra init+apply --infra_dir=infra
    ```
    """
    uio.create_folder(infra_dir)
    for arg in args:
        with logged_block(f"installing terraform modules from '{arg}'"):
            infra_type, infra_name = arg.split(":")
            if infra_type not in ["catalog", "samples"]:
                raise ValueError(
                    "Expected infra_type to be one of: 'catalog', 'samples'. "
                    f"Received type: {infra_type}"
                )
            uio.download_folder(
                remote_folder=(
                    "git://github.com/slalom-ggp/dataops-infra"
                    f"#{git_ref}//{infra_type}/{infra_name}"
                ),
                local_folder=infra_dir,
            )
    lf = "\n"
    logging.info(f"List of installed modules:\n{lf.join(uio.ls(infra_dir))}")
    init(infra_dir=infra_dir)
    if deploy:
        apply(infra_dir=infra_dir)
def _init_local_spark():
    """Return an initialized local spark object"""
    global spark, sc, thrift

    # context = SparkContext(conf=conf)
    for folder in [SPARK_WAREHOUSE_DIR]:
        uio.create_folder(folder)
    conf = SparkConf()
    hadoop_conf = _get_hadoop_conf()
    # for fn in [conf.set, SparkContext.setSystemProperty, context.setSystemProperty]:
    for fn in [conf.set]:
        for k, v in hadoop_conf.items():
            fn(k, v)
    os.environ["PYSPARK_PYTHON"] = sys.executable
    with logged_block("creating spark session"):
        spark = (
            SparkSession.builder.config(conf=conf)
            .master("local")
            .appName("Python Spark")
            .enableHiveSupport()
            .getOrCreate()
        )
        sc = spark.sparkContext
        # Set the property for the driver. Doesn't work using the same syntax
        # as the executor because the jvm has already been created.
        sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
    if not ENABLE_SQL_JDBC:
        logging.info(f"Skipping Thrift server launch (ENABLE_SQL_JDBC={ENABLE_SQL_JDBC})")
        thrift = None
    else:
        with logged_block("starting Thrift server"):
            java_import(sc._gateway.jvm, "")
            spark_hive = sc._gateway.jvm.org.apache.spark.sql.hive
            thrift_class = spark_hive.thriftserver.HiveThriftServer2
            thrift = thrift_class.startWithContext(spark._jwrapped)
        logging.info("Sleeping while waiting for Thrift Server...")
        time.sleep(1)
    spark.sparkContext.setLogLevel(SPARK_LOG_LEVEL)
    _print_conf_debug(sc)
    if ENV_VAR_SPARK_UDF_MODULE in os.environ:
        add_udf_module(os.environ.get(ENV_VAR_SPARK_UDF_MODULE))
    else:
        logging.info("Skipping loading UDFs (env variable not set)")
    for jar_path in SPARK_EXTRA_AWS_JARS:
        sc.addPyFile(jar_path)
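
# Hedged usage sketch (not part of the original module): once `_init_local_spark()`
# has populated the module-level `spark` session, it can be queried like any other
# SparkSession. The query below is a trivial placeholder.
def _example_query_local_spark():
    """Illustrative only: run a smoke-test SQL query against the local session."""
    _init_local_spark()
    spark.sql("SELECT 1 AS smoke_test").show()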
def _init_spark(dockerized=False, with_jupyter=False, daemon=False):
    """Return an initialized spark object"""
    global spark, sc, thrift

    if dockerized:
        container = _init_spark_container(with_jupyter=with_jupyter)
        # context = SparkContext(conf=conf)
        os.environ["PYSPARK_PYTHON"] = sys.executable
        with logged_block("connecting to spark container"):
            spark = SparkSession.builder.master(CONTAINER_ENDPOINT).getOrCreate()
            spark.sparkContext.setLogLevel(SPARK_LOG_LEVEL)
        sc = spark.sparkContext
    elif daemon:
        cmd = f"{sys.executable} -m slalom.dataops.sparkutils start_server"
        wait_test = lambda line: _SERVING_SPARK_REQUESTS in line
        wait_max = 120  # Max wait in seconds
        if with_jupyter:
            cmd = f"{cmd} --with_jupyter"
        runnow.run(cmd, daemon=True, wait_test=wait_test, wait_max=wait_max)
    else:
        _init_local_spark()
def _ecs_wait_for(
    wait_for,
    task_arn,
    cluster,
    region,
    timeout=1200,
    heartbeat_interval=None,
    raise_error=True,
):
    task_id = task_arn.split("/")[-1]
    wait_cmd = f"aws ecs wait tasks-{wait_for} --cluster {cluster} --tasks {task_arn}"
    desc_cmd = f"aws ecs describe-tasks --cluster {cluster} --tasks {task_arn}"
    with logged_block(
        f"waiting for ECS job to reach '{wait_for}' status",
        heartbeat_interval=heartbeat_interval,
    ):
        timeout_time = time.time() + (timeout or MAX_ECS_WAIT)
        return_code, output_text = runnow.run(wait_cmd, raise_error=False)
        while return_code == 255 and time.time() < timeout_time:
            logging.info("aws cli timeout expired. Retrying...")
            return_code, output_text = runnow.run(wait_cmd, raise_error=False)
        if return_code != 0:
            raise RuntimeError(
                f"ECS wait command failed or timed out (return={return_code}).\n"
                f"{output_text}"
            )
    return_code, output_text = runnow.run(desc_cmd, raise_error=False)
    if return_code != 0:
        raise RuntimeError(f"ECS task describe failed.\n{output_text}")
    jsonobj = json.loads(output_text)
    if len(jsonobj.get("tasks", [])) == 0 or len(jsonobj.get("failures", [])) > 0:
        raise RuntimeError(f"Could not start task ({jsonobj.get('failures', '')})")
    task_arn = jsonobj["tasks"][0]["taskArn"]
    logging.info(
        f"ECS task status: {get_ecs_task_detail_url(region, task_arn, cluster)}"
    )
    logging.info(f"ECS task logs: {get_ecs_log_url(region, task_arn)}")
    return task_arn
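
# Hedged usage sketch (not part of the original module): all identifiers below are
# placeholders. `wait_for` maps onto the AWS CLI waiter name, e.g. "running" or
# "stopped" (producing `aws ecs wait tasks-running` / `tasks-stopped`).
def _example_ecs_wait_for_stopped():
    """Illustrative only: block until a task stops, logging a heartbeat every 60s."""
    return _ecs_wait_for(
        wait_for="stopped",
        task_arn="arn:aws:ecs:us-east-2:123456789012:task/example-cluster/abc123",
        cluster="example-cluster",
        region="us-east-2",
        heartbeat_interval=60,
    )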
def check_tf_metadata(
    tf_dir,
    recursive: bool = True,
    check_module_headers: bool = True,
    check_input_descriptions: bool = True,
    check_output_descriptions: bool = True,
    required_input_vars: list = ["name_prefix", "resource_tags", "environment"],
    required_output_vars: list = ["summary"],
    raise_error=True,
    abspath=False,
):
    """
    Check Terraform modules against repository code standards.

    Returns a string summarizing any issues found, grouped by issue description with
    one checklist entry per offending file path. Raises ValueError if `raise_error`
    is True and any issues are detected.
    """

    def _log_issue(module_path, issue_desc, details_list):
        if details_list:
            if issue_desc in error_locations:
                error_locations[issue_desc].extend(details_list)
            else:
                error_locations[issue_desc] = details_list

    error_locations: Dict[str, List[str]] = {}
    with logged_block("checking Terraform modules against repository code standards"):
        modules_metadata = get_tf_metadata(tf_dir, recursive=recursive)
        for module_path, metadata in modules_metadata.items():
            if abspath:
                path_sep = os.path.sep
                module_path = os.path.abspath(module_path)
                module_path = module_path.replace("\\", path_sep).replace("/", path_sep)
            else:
                path_sep = "/"
            if check_module_headers and not metadata["header"]:
                _log_issue(
                    module_path,
                    "1. Blank module headers",
                    [f"{module_path}{path_sep}main.tf"],
                )
            if required_input_vars:
                issue_details = [
                    f"{module_path}{path_sep}variables.tf:var.{required_input}"
                    for required_input in required_input_vars
                    if required_input
                    not in [var["name"] for var in metadata.get("inputs", {})]
                ]
                _log_issue(
                    module_path, "2. Missing required input variables", issue_details
                )
            if required_output_vars:
                issue_details = [
                    f"{module_path}{path_sep}outputs.tf:output.{required_output}"
                    for required_output in required_output_vars
                    if required_output
                    not in [var["name"] for var in metadata.get("outputs", {})]
                ]
                _log_issue(
                    module_path, "3. Missing required output variables", issue_details
                )
            if check_input_descriptions:
                issue_details = [
                    f"{module_path}{path_sep}variables.tf:var.{var['name']}"
                    for var in metadata.get("inputs", {})
                    if not var.get("description")
                ]
                _log_issue(
                    module_path, "4. Missing input variable descriptions", issue_details
                )
            if check_output_descriptions:
                issue_details = [
                    f"{module_path}{path_sep}outputs.tf:output.{var['name']}"
                    for var in metadata.get("outputs", {})
                    if not var.get("description")
                ]
                _log_issue(
                    module_path, "5. Missing output variable descriptions", issue_details
                )
    result_str = "\n".join(
        [
            f"\n{k}:\n - [ ] " + ("\n - [ ] ".join(error_locations[k]))
            for k in sorted(error_locations.keys())
        ]
    )
    if raise_error and error_locations:
        raise ValueError(f"One or more validation errors occurred.\n{result_str}")
    return result_str
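
# Hedged usage sketch (not part of the original module): the directory below is a
# placeholder. With `raise_error=False` the function returns a markdown-style
# checklist of issues instead of raising.
def _example_check_tf_metadata():
    """Illustrative only: lint local Terraform modules and print the findings."""
    issues = check_tf_metadata("./infra", recursive=True, raise_error=False)
    if issues:
        print(issues)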
def _init_spark_container(spark_image=DOCKER_SPARK_IMAGE, with_jupyter=False):
    global _spark_container

    if _spark_container:
        return _spark_container
    port_map = {
        "4040": "4040",    # App Web UI
        "7077": "7077",    # Standalone master driver
        "8080": "8080",    # Standalone-mode master Web UI
        "8081": "8081",    # Standalone-mode worker Web UI
        "8888": "8888",    # Jupyter Notebook Server
        "10000": "10000",  # Thrift JDBC port for SQL queries
        "18080": "18080",  # History Server Web UI
    }
    uio.set_aws_env_vars()
    env = [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "BATCH_ID=SparkContainerTest",
        "ENABLE_SQL_JDBC=True",
        "METASTORE_TYPE=MySQL",
    ]
    if "AWS_ACCESS_KEY_ID" in os.environ:
        env.append(f"AWS_ACCESS_KEY_ID={os.environ['AWS_ACCESS_KEY_ID']}")
    if "AWS_SECRET_ACCESS_KEY" in os.environ:
        env.append(f"AWS_SECRET_ACCESS_KEY={os.environ['AWS_SECRET_ACCESS_KEY']}")
    docker_client = docker.from_env()  # WSL1
    # docker_client = docker.DockerClient(base_url="npipe:////./pipe/docker_wsl")  # WSL2
    try:
        dock_r.pull(spark_image)
    except Exception as ex:
        logging.warning(f"Could not pull latest Spark image '{spark_image}'. {ex}")
    try:
        old_container = docker_client.containers.get("spark_server")
        if old_container:
            with logged_block("terminating previous 'spark_server' docker container"):
                old_container.stop()
                logging.info("Waiting for cleanup of old Spark container...")
                time.sleep(2)
    except Exception as _:
        pass
    spark_image_cmd = "sparkutils start_server"
    if with_jupyter:
        spark_image_cmd = f"{spark_image_cmd} --with_jupyter"
    _spark_container = docker_client.containers.run(
        image=spark_image,
        name="spark_server",
        command=spark_image_cmd,
        detach=True,
        auto_remove=True,
        ports=port_map,
        environment=env,
        # stream=True,
    )
    logging.info(
        f"Attempting to initialize Spark docker container "
        f"(status={_spark_container.status})..."
    )
    MAX_WAIT_TIME = int(60 * 5)
    start = time.time()
    for line in _spark_container.logs(stream=True, until=int(start + MAX_WAIT_TIME)):
        logging.info(f"SPARK CONTAINER LOG: {line.decode('utf-8').rstrip()}")
        # time.sleep(0.2)
        if _SERVING_SPARK_REQUESTS in line.decode("utf-8"):
            logging.info(
                f"Spark container reported success after "
                f"{int(time.time() - start)} seconds"
            )
            break
        elif time.time() > start + MAX_WAIT_TIME:
            logging.info(f"Max timeout wait exceeded ({MAX_WAIT_TIME} seconds)")
            break
    if _spark_container.status in ["running", "created"]:
        return _spark_container
    else:
        raise RuntimeError(
            "Spark docker container exited unexpectedly "
            f"(status={_spark_container.status})."
        )
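
# Hedged usage sketch (not part of the original module): assumes a local docker
# daemon and network access to pull DOCKER_SPARK_IMAGE. The returned object is a
# docker-py Container; the port map above exposes the app UI on http://localhost:4040.
def _example_spark_container_logs():
    """Illustrative only: start (or reuse) the spark_server container and tail its logs."""
    container = _init_spark_container(with_jupyter=False)
    for log_line in container.logs(stream=True, tail=10):
        print(log_line.decode("utf-8").rstrip())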