def start(args):
    ctx = Context(s3_path=args.context, cache_dir=args.cache_dir, workload_id=args.workload_id)
    package.install_packages(ctx.python_packages, ctx.bucket)

    api = ctx.apis_id_map[args.api]
    model = ctx.models[api["model_name"]]
    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    local_cache["ctx"] = ctx
    local_cache["api"] = api
    local_cache["model"] = model

    # download the exported model from S3 if it isn't already cached locally
    if not os.path.isdir(args.model_dir):
        aws.download_and_extract_zip(model["key"], args.model_dir, ctx.bucket)

    # cache transformer implementations and transformation args for the model's columns
    for column_name in model["feature_columns"] + [model["target_column"]]:
        if ctx.is_transformed_column(column_name):
            trans_impl, _ = ctx.get_transformer_impl(column_name)
            local_cache["trans_impls"][column_name] = trans_impl
            transformed_column = ctx.transformed_columns[column_name]
            input_args_schema = transformed_column["inputs"]["args"]
            # cache aggregates and constants in memory
            if input_args_schema is not None:
                local_cache["transform_args_cache"][column_name] = ctx.populate_args(input_args_schema)

    # open a gRPC channel to TensorFlow Serving
    channel = implementations.insecure_channel("localhost", args.tf_serve_port)
    local_cache["stub"] = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    local_cache["required_inputs"] = tf_lib.get_base_input_columns(model["name"], ctx)

    # wait a bit for tf serving to start before querying metadata
    limit = 600
    for i in range(limit):
        try:
            local_cache["metadata"] = run_get_model_metadata()
            break
        except Exception as e:
            if i == limit - 1:
                logger.exception(
                    "An error occurred, see `cx logs api {}` for more details.".format(api["name"])
                )
                sys.exit(1)
            time.sleep(1)

    logger.info("Serving model: {}".format(model["name"]))
    serve(app, listen="*:{}".format(args.port))
def start(args):
    ctx = Context(s3_path=args.context, cache_dir=args.cache_dir, workload_id=args.workload_id)

    api = ctx.apis_id_map[args.api]
    local_cache["api"] = api
    local_cache["ctx"] = ctx

    if api.get("request_handler_impl_key") is not None:
        local_cache["request_handler"] = ctx.get_request_handler_impl(api["name"])

    # api["model"] is either an external model path or a reference to a model resource
    if not util.is_resource_ref(api["model"]):
        if api.get("request_handler") is not None:
            package.install_packages(ctx.python_packages, ctx.storage)
        if not os.path.isdir(args.model_dir):
            ctx.storage.download_and_unzip_external(api["model"], args.model_dir)
    else:
        package.install_packages(ctx.python_packages, ctx.storage)

        model_name = util.get_resource_ref(api["model"])
        model = ctx.models[model_name]
        estimator = ctx.estimators[model["estimator"]]

        local_cache["model"] = model
        local_cache["estimator"] = estimator
        local_cache["target_col"] = ctx.columns[util.get_resource_ref(model["target_column"])]
        local_cache["target_col_type"] = ctx.get_inferred_column_type(
            util.get_resource_ref(model["target_column"])
        )

        log_level = "DEBUG"
        if ctx.environment is not None and ctx.environment.get("log_level") is not None:
            log_level = ctx.environment["log_level"].get("tensorflow", "DEBUG")
        tf_lib.set_logging_verbosity(log_level)

        if not os.path.isdir(args.model_dir):
            ctx.storage.download_and_unzip(model["key"], args.model_dir)

        # cache transformer implementations for the model's columns
        for column_name in ctx.extract_column_names([model["input"], model["target_column"]]):
            if ctx.is_transformed_column(column_name):
                trans_impl, _ = ctx.get_transformer_impl(column_name)
                local_cache["trans_impls"][column_name] = trans_impl
                transformed_column = ctx.transformed_columns[column_name]

                # cache aggregate values
                for resource_name in util.extract_resource_refs(transformed_column["input"]):
                    if resource_name in ctx.aggregates:
                        ctx.get_obj(ctx.aggregates[resource_name]["key"])

        local_cache["required_inputs"] = tf_lib.get_base_input_columns(model["name"], ctx)

        if util.is_dict(model["input"]) and model["input"].get("target_vocab") is not None:
            local_cache["target_vocab_populated"] = ctx.populate_values(
                model["input"]["target_vocab"], None, False
            )

    try:
        validate_model_dir(args.model_dir)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)

    # open a gRPC channel to TensorFlow Serving
    channel = grpc.insecure_channel("localhost:" + str(args.tf_serve_port))
    local_cache["stub"] = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    # wait a bit for tf serving to start before querying metadata
    limit = 300
    for i in range(limit):
        try:
            local_cache["metadata"] = run_get_model_metadata()
            break
        except Exception as e:
            if i == limit - 1:
                logger.exception(
                    "An error occurred, see `cortex logs -v api {}` for more details.".format(api["name"])
                )
                sys.exit(1)
            time.sleep(1)

    logger.info("Serving model: {}".format(util.remove_resource_ref(api["model"])))
    serve(app, listen="*:{}".format(args.port))
def train(model_name, model_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"], ctx.bucket)

    # when num_epochs is set, convert epochs to an equivalent number of steps
    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["training"]
                / float(model["training"]["batch_size"])
            )
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["evaluation"]
                / float(model["evaluation"]["batch_size"])
            )
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    # attach regression evaluation metrics for regression models
    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
def train(model_name, estimator_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    # when num_epochs is set, convert epochs to an equivalent number of steps
    train_num_steps = model["training"]["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["training_size"] / float(model["training"]["batch_size"]))
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["eval_size"] / float(model["evaluation"]["batch_size"]))
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    # models with a float target column are treated as regression models and get extra eval metrics
    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir