def start_fn():
    provider = os.environ["CORTEX_PROVIDER"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    spec_path = os.environ["CORTEX_API_SPEC"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    cache_dir = os.getenv("CORTEX_CACHE_DIR")
    region = os.getenv("AWS_REGION")

    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with LockedFile("/run/used_ports.json", "r+") as f:
            used_ports = json.load(f)
            for port in used_ports.keys():
                if not used_ports[port]:
                    tf_serving_port = port
                    used_ports[port] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()

    try:
        api = get_api(provider, spec_path, model_dir, cache_dir, region)

        client = api.predictor.initialize_client(
            tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )

        with FileLock("/run/init_stagger.lock"):
            logger.info("loading the predictor from {}".format(api.predictor.path))
            predictor_impl = api.predictor.initialize_impl(project_dir, client)

        local_cache["api"] = api
        local_cache["provider"] = provider
        local_cache["client"] = client
        local_cache["predictor_impl"] = predictor_impl
        local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args

        if api.server_side_batching_enabled:
            dynamic_batching_config = api.api_spec["predictor"]["server_side_batching"]
            local_cache["dynamic_batcher"] = DynamicBatcher(
                predictor_impl,
                max_batch_size=dynamic_batching_config["max_batch_size"],
                batch_interval=dynamic_batching_config["batch_interval"]
                / NANOSECONDS_IN_SECOND,  # convert nanoseconds to seconds
            )

        if util.has_method(predictor_impl, "post_predict"):
            local_cache["post_predict_fn_args"] = inspect.getfullargspec(
                predictor_impl.post_predict
            ).args

        predict_route = "/predict"
        local_cache["predict_route"] = predict_route
    except:
        logger.exception("failed to start api")
        sys.exit(1)

    app.add_api_route(local_cache["predict_route"], predict, methods=["POST"])
    app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"])

    return app
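# A minimal sketch of the port-claiming pattern above, assuming LockedFile provides
# an exclusive, flock-style lock around the opened file and that /run/used_ports.json
# maps TF Serving ports to a "claimed" flag, e.g. {"9000": false, "9001": false}.
# The helper name claim_tf_serving_port is illustrative, not part of Cortex.
import fcntl
import json


def claim_tf_serving_port(path="/run/used_ports.json", default_port="9000"):
    with open(path, "r+") as f:
        fcntl.flock(f, fcntl.LOCK_EX)  # exclusive lock so concurrent workers don't race
        try:
            used_ports = json.load(f)
            port = default_port
            for candidate, claimed in used_ports.items():
                if not claimed:
                    port = candidate
                    used_ports[candidate] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()
            return port
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)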
def start():
    while not pathlib.Path("/mnt/workspace/init_script_run.txt").is_file():
        time.sleep(0.2)

    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    api_spec_path = os.environ["CORTEX_API_SPEC"]
    job_spec_path = os.environ["CORTEX_JOB_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    region = os.getenv("AWS_REGION")

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with LockedFile("/run/used_ports.json", "r+") as f:
            used_ports = json.load(f)
            for port in used_ports.keys():
                if not used_ports[port]:
                    tf_serving_port = port
                    used_ports[port] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()

    api = get_api(api_spec_path, model_dir, cache_dir, region)
    storage, _ = get_spec(api_spec_path, cache_dir, region)
    job_spec = get_job_spec(storage, cache_dir, job_spec_path)

    sqs_client = boto3.client("sqs", region_name=region)

    client = api.predictor.initialize_client(
        tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
    )

    try:
        log.info("loading the predictor from {}".format(api.predictor.path))

        metrics_client = MetricsClient(api.statsd)
        predictor_impl = api.predictor.initialize_impl(
            project_dir=project_dir,
            client=client,
            metrics_client=metrics_client,
            job_spec=job_spec,
        )
    except UserRuntimeException as err:
        err.wrap(f"failed to start job {job_spec['job_id']}")
        log.error(str(err), exc_info=True)
        sys.exit(1)
    except Exception as err:
        capture_exception(err)
        log.error(f"failed to start job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)

    # crons only stop if an unhandled exception occurs
    def check_if_crons_have_failed():
        while True:
            for cron in api.predictor.crons:
                if not cron.is_alive():
                    os.kill(os.getpid(), signal.SIGQUIT)
            time.sleep(1)

    threading.Thread(target=check_if_crons_have_failed, daemon=True).start()

    local_cache["api"] = api
    local_cache["job_spec"] = job_spec
    local_cache["predictor_impl"] = predictor_impl
    local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args
    local_cache["sqs_client"] = sqs_client

    open("/mnt/workspace/api_readiness.txt", "a").close()

    log.info("polling for batches...")

    try:
        sqs_handler = SQSHandler(
            sqs_client=sqs_client,
            queue_url=job_spec["sqs_url"],
            renewal_period=MESSAGE_RENEWAL_PERIOD,
            visibility_timeout=INITIAL_MESSAGE_VISIBILITY,
            not_found_sleep_time=MESSAGE_NOT_FOUND_SLEEP,
            message_wait_time=SQS_POLL_WAIT_TIME,
            dead_letter_queue_url=job_spec.get("sqs_dead_letter_queue"),
            stop_if_no_messages=True,
        )
        sqs_handler.start(
            message_fn=handle_batch_message,
            message_failure_fn=handle_batch_failure,
            on_job_complete_fn=handle_on_job_complete,
        )
    except UserRuntimeException as err:
        err.wrap(f"failed to run job {job_spec['job_id']}")
        log.error(str(err), exc_info=True)
        sys.exit(1)
    except Exception as err:
        capture_exception(err)
        log.error(f"failed to run job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)
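# A minimal sketch of the SQS polling that SQSHandler/sqs_loop perform above, using
# only boto3 calls (receive_message/delete_message): long-poll the job queue, hand
# each message body to a handler, and delete the message on success. poll_queue and
# handle_batch are hypothetical names; the real handlers are handle_batch_message,
# handle_batch_failure, and handle_on_job_complete.
import json


def poll_queue(sqs_client, queue_url, handle_batch, wait_time=10):
    while True:
        response = sqs_client.receive_message(
            QueueUrl=queue_url,
            MaxNumberOfMessages=1,
            WaitTimeSeconds=wait_time,  # long polling
        )
        messages = response.get("Messages", [])
        if not messages:
            # the real handler tracks empty receives and stops once the queue drains
            continue
        for message in messages:
            handle_batch(json.loads(message["Body"]))
            sqs_client.delete_message(
                QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"]
            )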
def start():
    while not pathlib.Path("/mnt/workspace/init_script_run.txt").is_file():
        time.sleep(0.2)

    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    api_spec_path = os.environ["CORTEX_API_SPEC"]
    job_spec_path = os.environ["CORTEX_JOB_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    region = os.getenv("AWS_REGION")

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with LockedFile("/run/used_ports.json", "r+") as f:
            used_ports = json.load(f)
            for port in used_ports.keys():
                if not used_ports[port]:
                    tf_serving_port = port
                    used_ports[port] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()

    api = get_api(provider, api_spec_path, model_dir, cache_dir, region)
    storage, api_spec = get_spec(provider, api_spec_path, cache_dir, region)
    job_spec = get_job_spec(storage, cache_dir, job_spec_path)

    try:
        client = api.predictor.initialize_client(
            tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )
        logger.info("loading the predictor from {}".format(api.predictor.path))
        predictor_impl = api.predictor.initialize_impl(project_dir, client, job_spec)
    except CortexException as err:
        err.wrap(f"failed to start job {job_spec['job_id']}")
        logger.error(str(err), exc_info=True)
        sys.exit(1)
    except:
        logger.error(f"failed to start job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)

    local_cache["api_spec"] = api
    local_cache["provider"] = provider
    local_cache["job_spec"] = job_spec
    local_cache["predictor_impl"] = predictor_impl
    local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args
    local_cache["sqs_client"] = boto3.client("sqs", region_name=region)

    open("/mnt/workspace/api_readiness.txt", "a").close()

    logger.info("polling for batches...")

    try:
        sqs_loop()
    except CortexException as err:
        err.wrap(f"failed to run job {job_spec['job_id']}")
        logger.error(str(err), exc_info=True)
        sys.exit(1)
    except:
        logger.error(f"failed to run job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)
def start_fn():
    provider = os.environ["CORTEX_PROVIDER"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    spec_path = os.environ["CORTEX_API_SPEC"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    cache_dir = os.getenv("CORTEX_CACHE_DIR")
    region = os.getenv("AWS_REGION")

    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    try:
        has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
        if has_multiple_servers:
            with LockedFile("/run/used_ports.json", "r+") as f:
                used_ports = json.load(f)
                for port in used_ports.keys():
                    if not used_ports[port]:
                        tf_serving_port = port
                        used_ports[port] = True
                        break
                f.seek(0)
                json.dump(used_ports, f)
                f.truncate()

        api = get_api(provider, spec_path, model_dir, cache_dir, region)

        client = api.predictor.initialize_client(
            tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )

        with FileLock("/run/init_stagger.lock"):
            logger.info("loading the predictor from {}".format(api.predictor.path))
            predictor_impl = api.predictor.initialize_impl(project_dir, client)

        # crons only stop if an unhandled exception occurs
        def check_if_crons_have_failed():
            while True:
                for cron in api.predictor.crons:
                    if not cron.is_alive():
                        os.kill(os.getpid(), signal.SIGQUIT)
                time.sleep(1)

        threading.Thread(target=check_if_crons_have_failed, daemon=True).start()

        local_cache["api"] = api
        local_cache["provider"] = provider
        local_cache["client"] = client
        local_cache["predictor_impl"] = predictor_impl
        local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args

        if api.python_server_side_batching_enabled:
            dynamic_batching_config = api.api_spec["predictor"]["server_side_batching"]
            local_cache["dynamic_batcher"] = DynamicBatcher(
                predictor_impl,
                max_batch_size=dynamic_batching_config["max_batch_size"],
                batch_interval=dynamic_batching_config["batch_interval"]
                / NANOSECONDS_IN_SECOND,  # convert nanoseconds to seconds
            )

        if util.has_method(predictor_impl, "post_predict"):
            local_cache["post_predict_fn_args"] = inspect.getfullargspec(
                predictor_impl.post_predict
            ).args

        predict_route = "/predict"
        local_cache["predict_route"] = predict_route
    except (UserRuntimeException, Exception) as err:
        if not isinstance(err, UserRuntimeException):
            capture_exception(err)
        logger.exception("failed to start api")
        sys.exit(1)

    app.add_api_route(local_cache["predict_route"], predict, methods=["POST"])
    app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"])

    return app
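# A minimal sketch of the idea behind DynamicBatcher, assuming the predictor's
# predict() accepts a list of payloads and returns predictions in the same order.
# SimpleBatcher is illustrative only: it flushes whatever has accumulated every
# batch_interval seconds (capped at max_batch_size), which is the policy configured
# above from the server_side_batching spec (batch_interval converted to seconds).
import threading
import time


class SimpleBatcher:
    def __init__(self, predict_fn, max_batch_size, batch_interval):
        self.predict_fn = predict_fn
        self.max_batch_size = max_batch_size
        self.batch_interval = batch_interval  # seconds
        self.lock = threading.Lock()
        self.pending = []  # (event, payload) tuples awaiting a batch
        self.results = {}  # event -> prediction
        threading.Thread(target=self._loop, daemon=True).start()

    def predict(self, payload):
        event = threading.Event()
        with self.lock:
            self.pending.append((event, payload))
        event.wait()  # set once the batch containing this payload has been processed
        with self.lock:
            return self.results.pop(event)

    def _loop(self):
        while True:
            time.sleep(self.batch_interval)
            with self.lock:
                batch = self.pending[: self.max_batch_size]
                self.pending = self.pending[self.max_batch_size :]
            if not batch:
                continue
            events, payloads = zip(*batch)
            predictions = self.predict_fn(list(payloads))
            with self.lock:
                for event, prediction in zip(events, predictions):
                    self.results[event] = prediction
            for event in events:
                event.set()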
def init():
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    spec_path = os.environ["CORTEX_API_SPEC"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    cache_dir = os.getenv("CORTEX_CACHE_DIR")
    region = os.getenv("AWS_REGION")

    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with LockedFile("/run/used_ports.json", "r+") as f:
            used_ports = json.load(f)
            for port in used_ports.keys():
                if not used_ports[port]:
                    tf_serving_port = port
                    used_ports[port] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()

    api = get_api(spec_path, model_dir, cache_dir, region)

    config: Dict[str, Any] = {
        "api": None,
        "client": None,
        "predictor_impl": None,
        "module_proto_pb2_grpc": None,
    }

    proto_without_ext = pathlib.Path(api.predictor.protobuf_path).stem
    module_proto_pb2 = importlib.import_module(proto_without_ext + "_pb2")
    module_proto_pb2_grpc = importlib.import_module(proto_without_ext + "_pb2_grpc")

    client = api.predictor.initialize_client(
        tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
    )

    with FileLock("/run/init_stagger.lock"):
        logger.info("loading the predictor from {}".format(api.predictor.path))
        metrics_client = MetricsClient(api.statsd)
        predictor_impl = api.predictor.initialize_impl(
            project_dir=project_dir,
            client=client,
            metrics_client=metrics_client,
            proto_module_pb2=module_proto_pb2,
        )

    # crons only stop if an unhandled exception occurs
    def check_if_crons_have_failed():
        while True:
            for cron in api.predictor.crons:
                if not cron.is_alive():
                    os.kill(os.getpid(), signal.SIGQUIT)
            time.sleep(1)

    threading.Thread(target=check_if_crons_have_failed, daemon=True).start()

    ServicerClass = get_servicer_from_module(module_proto_pb2_grpc)

    class PredictorServicer(ServicerClass):
        def __init__(self, predict_fn_args, predictor_impl, api):
            self.predict_fn_args = predict_fn_args
            self.predictor_impl = predictor_impl
            self.api = api

        def Predict(self, payload, context):
            try:
                kwargs = build_predict_kwargs(self.predict_fn_args, payload, context)
                response = self.predictor_impl.predict(**kwargs)
                self.api.post_status_code_request_metrics(200)
            except Exception:
                logger.error(traceback.format_exc())
                self.api.post_status_code_request_metrics(500)
                context.abort(grpc.StatusCode.INTERNAL, "internal server error")

            return response

    config["api"] = api
    config["client"] = client
    config["predictor_impl"] = predictor_impl
    config["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args
    config["module_proto_pb2"] = module_proto_pb2
    config["module_proto_pb2_grpc"] = module_proto_pb2_grpc
    config["predictor_servicer"] = PredictorServicer

    return config
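# A minimal sketch of how the config returned by init() could be wired into a gRPC
# server. The add_<Service>Servicer_to_server function is generated into the
# *_pb2_grpc module by grpcio-tools; because its exact name depends on the service
# declared in the API's protobuf, it is looked up by suffix here. serve_grpc and the
# address/worker defaults are illustrative assumptions, not Cortex's actual serving code.
from concurrent import futures

import grpc


def serve_grpc(config, address="0.0.0.0:9000", max_workers=8):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=max_workers))

    servicer = config["predictor_servicer"](
        predict_fn_args=config["predict_fn_args"],
        predictor_impl=config["predictor_impl"],
        api=config["api"],
    )

    # find the generated registration helper, e.g. add_PredictorServicer_to_server
    module = config["module_proto_pb2_grpc"]
    add_fn = next(
        getattr(module, name)
        for name in dir(module)
        if name.startswith("add_") and name.endswith("Servicer_to_server")
    )
    add_fn(servicer, server)

    server.add_insecure_port(address)
    server.start()
    server.wait_for_termination()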
def start():
    while not pathlib.Path("/mnt/workspace/init_script_run.txt").is_file():
        time.sleep(0.2)

    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    api_spec_path = os.environ["CORTEX_API_SPEC"]
    job_spec_path = os.environ["CORTEX_JOB_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    region = os.getenv("AWS_REGION")

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with LockedFile("/run/used_ports.json", "r+") as f:
            used_ports = json.load(f)
            for port in used_ports.keys():
                if not used_ports[port]:
                    tf_serving_port = port
                    used_ports[port] = True
                    break
            f.seek(0)
            json.dump(used_ports, f)
            f.truncate()

    api = get_api(provider, api_spec_path, model_dir, cache_dir, region)
    storage, _ = get_spec(provider, api_spec_path, cache_dir, region)
    job_spec = get_job_spec(storage, cache_dir, job_spec_path)

    client = api.predictor.initialize_client(
        tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
    )

    try:
        logger.info("loading the predictor from {}".format(api.predictor.path))

        metrics_client = MetricsClient(api.statsd)
        predictor_impl = api.predictor.initialize_impl(
            project_dir=project_dir,
            client=client,
            metrics_client=metrics_client,
            job_spec=job_spec,
        )
    except UserRuntimeException as err:
        err.wrap(f"failed to start job {job_spec['job_id']}")
        logger.error(str(err), exc_info=True)
        sys.exit(1)
    except Exception as err:
        capture_exception(err)
        logger.error(f"failed to start job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)

    # crons only stop if an unhandled exception occurs
    def check_if_crons_have_failed():
        while True:
            for cron in api.predictor.crons:
                if not cron.is_alive():
                    os.kill(os.getpid(), signal.SIGQUIT)
            time.sleep(1)

    threading.Thread(target=check_if_crons_have_failed, daemon=True).start()

    local_cache["api"] = api
    local_cache["provider"] = provider
    local_cache["job_spec"] = job_spec
    local_cache["predictor_impl"] = predictor_impl
    local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args
    local_cache["sqs_client"] = boto3.client("sqs", region_name=region)

    open("/mnt/workspace/api_readiness.txt", "a").close()

    logger.info("polling for batches...")

    try:
        sqs_loop()
    except UserRuntimeException as err:
        err.wrap(f"failed to run job {job_spec['job_id']}")
        logger.error(str(err), exc_info=True)
        sys.exit(1)
    except Exception as err:
        capture_exception(err)
        logger.error(f"failed to run job {job_spec['job_id']}", exc_info=True)
        sys.exit(1)