def main():
    """Serve-container entry point: load logging config, fetch the API spec,
    optionally preload TensorFlow models, then run the uvicorn server.

    Reads all of its configuration from CORTEX_* environment variables; a
    missing required variable raises KeyError before the server starts.
    """
    with open("/src/cortex/serve/log_config.yaml", "r") as f:
        # config file is shipped with the image, so FullLoader is acceptable here
        log_config = yaml.load(f, yaml.FullLoader)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        # reuse cache_dir (same env var, already validated via os.environ)
        # instead of re-reading it with os.getenv, which could yield None
        storage = LocalStorage(cache_dir)
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

    # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py
    uvicorn.run(
        "cortex.serve.wsgi:app",
        host="0.0.0.0",
        port=int(os.environ["CORTEX_SERVING_PORT"]),
        workers=int(os.environ["CORTEX_WORKERS_PER_REPLICA"]),
        limit_concurrency=int(os.environ["CORTEX_MAX_WORKER_CONCURRENCY"]),
        backlog=int(os.environ["CORTEX_SO_MAX_CONN"]),
        log_config=log_config,
        log_level="info",
    )
def main():
    """Replica init entry point: wait for the Inferentia sidecar (if active),
    publish the table of per-process TF Serving ports, fetch the API spec,
    and preload TensorFlow models into TFS when applicable.

    Reads its configuration from CORTEX_* environment variables; a missing
    required variable raises KeyError.
    """
    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        # one consecutive port per worker process, all initially unclaimed;
        # num_processes is already an int, so no redundant int() in the range
        used_ports = {str(base_serving_port + w): False for w in range(num_processes)}
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        # reuse cache_dir (already validated via os.environ) rather than
        # re-reading the env var with os.getenv, which could yield None
        storage = LocalStorage(cache_dir)
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()
def start():
    """Batch-worker startup: claim a TF Serving port (when several servers
    share the replica), initialize the predictor, populate the local cache,
    signal readiness, and poll SQS for batches until the job ends.
    """
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    api_spec_path = os.environ["CORTEX_API_SPEC"]
    job_spec_path = os.environ["CORTEX_JOB_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]

    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")

    storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])

    # with multiple TF servers per replica, atomically claim the first free
    # port from the shared table and persist the updated table in place
    if os.getenv("CORTEX_MULTIPLE_TF_SERVERS"):
        with FileLock("/run/used_ports.json.lock"):
            with open("/run/used_ports.json", "r+") as f:
                port_table = json.load(f)
                for candidate, taken in port_table.items():
                    if not taken:
                        tf_serving_port = candidate
                        port_table[candidate] = True
                        break
                f.seek(0)
                json.dump(port_table, f)
                f.truncate()

    raw_api_spec = get_spec(provider, storage, cache_dir, api_spec_path)
    job_spec = get_job_spec(storage, cache_dir, job_spec_path)

    api = API(
        provider=provider,
        storage=storage,
        model_dir=model_dir,
        cache_dir=cache_dir,
        **raw_api_spec,
    )
    client = api.predictor.initialize_client(
        tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
    )
    cx_logger().info("loading the predictor from {}".format(api.predictor.path))
    predictor_impl = api.predictor.initialize_impl(project_dir, client, raw_api_spec, job_spec)

    local_cache["api_spec"] = api
    local_cache["provider"] = provider
    local_cache["job_spec"] = job_spec
    local_cache["predictor_impl"] = predictor_impl
    local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args
    local_cache["sqs_client"] = boto3.client("sqs", region_name=os.environ["AWS_REGION"])

    # empty marker file tells the readiness probe the worker is up
    open("/mnt/workspace/api_readiness.txt", "a").close()

    cx_logger().info("polling for batches...")
    sqs_loop()
def main():
    """Unified serve entry point: wait for the Inferentia sidecar (if active),
    publish the per-process TF Serving port table, fetch the API spec, preload
    TensorFlow models, then either run uvicorn (RealtimeAPI) or start the
    batch worker.

    Reads its configuration from CORTEX_* environment variables; a missing
    required variable raises KeyError before anything is served.
    """
    with open("/src/cortex/serve/log_config.yaml", "r") as f:
        # config file is shipped with the image, so FullLoader is acceptable here
        log_config = yaml.load(f, yaml.FullLoader)

    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        # one consecutive port per worker process, all initially unclaimed;
        # num_processes is already an int, so no redundant int() in the range
        used_ports = {str(base_serving_port + w): False for w in range(num_processes)}
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        # reuse cache_dir (already validated via os.environ) rather than
        # re-reading the env var with os.getenv, which could yield None
        storage = LocalStorage(cache_dir)
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

    if raw_api_spec["kind"] == "RealtimeAPI":
        # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py
        uvicorn.run(
            "cortex.serve.wsgi:app",
            host="0.0.0.0",
            port=int(os.environ["CORTEX_SERVING_PORT"]),
            workers=int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]),
            limit_concurrency=int(
                os.environ["CORTEX_MAX_PROCESS_CONCURRENCY"]
            ),  # this is a per process limit
            backlog=int(os.environ["CORTEX_SO_MAX_CONN"]),
            log_config=log_config,
            log_level="info",
        )
    else:
        from cortex.serve import batch

        batch.start()
def start_fn():
    """Realtime-API startup: claim a TF Serving port (when several servers
    share the replica), load the predictor into the local cache, and compute
    the predict route. Exits the process with status 1 if startup fails.
    """
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    if provider == "local":
        # reuse cache_dir (already validated via os.environ) rather than
        # re-reading the env var with os.getenv, which could yield None
        storage = LocalStorage(cache_dir)
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])

    # with multiple TF servers per replica, atomically claim the first free
    # port from the shared table and persist the updated table in place
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        with FileLock("/run/used_ports.json.lock"):
            with open("/run/used_ports.json", "r+") as f:
                used_ports = json.load(f)
                for port in used_ports.keys():
                    if not used_ports[port]:
                        tf_serving_port = port
                        used_ports[port] = True
                        break
                f.seek(0)
                json.dump(used_ports, f)
                f.truncate()

    try:
        raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)
        api = API(
            provider=provider,
            storage=storage,
            model_dir=model_dir,
            cache_dir=cache_dir,
            **raw_api_spec,
        )
        client = api.predictor.initialize_client(
            tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )
        cx_logger().info("loading the predictor from {}".format(api.predictor.path))
        predictor_impl = api.predictor.initialize_impl(project_dir, client)

        local_cache["api"] = api
        local_cache["provider"] = provider
        local_cache["client"] = client
        local_cache["predictor_impl"] = predictor_impl
        local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args

        predict_route = "/"
        if provider != "local":
            predict_route = "/predict"
        local_cache["predict_route"] = predict_route
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt;
        # Exception is wide enough for any startup failure we want to log and exit on
        cx_logger().exception("failed to start api")
        sys.exit(1)

    if (
        provider != "local"
        and api.monitoring is not None
        and api.monitoring.model_type == "classification"
    ):
        try:
            local_cache["class_set"] = api.get_cached_classes()
        except Exception:
            # best-effort: classes can still be discovered later, so only warn
            cx_logger().warn(
                "an error occurred while attempting to load classes", exc_info=True
            )
# NOTE(review): fragment — the enclosing function's `def` header is outside
# this view, so only comments are added here.
# Registers the cached predict route for both POST (run a prediction) and
# GET (summary of the API) on the same path, then hands the configured
# app back to the caller.
app.add_api_route(local_cache["predict_route"], predict, methods=["POST"])
app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"])
return app