Example #1
0
def start(args):
    """Populate the serving cache for one API and start the web server.

    Builds a ``Context`` from ``args``, installs the context's Python
    packages, stores the API, its model and the transformer implementations
    of the model's columns in ``local_cache``, creates a prediction stub
    pointed at ``localhost:args.tf_serve_port``, waits until model metadata
    can be fetched and finally serves ``app`` on ``args.port``.

    Args:
        args: Parsed arguments. The attributes read here are ``context``,
            ``cache_dir``, ``workload_id``, ``api``, ``model_dir``,
            ``tf_serve_port`` and ``port``.

    The process exits with status 1 if the metadata query still fails on
    the 600th attempt (attempts are spaced one second apart).
    """
    ctx = Context(
        s3_path=args.context,
        cache_dir=args.cache_dir,
        workload_id=args.workload_id,
    )
    package.install_packages(ctx.python_packages, ctx.bucket)

    api = ctx.apis_id_map[args.api]
    model = ctx.models[api["model_name"]]
    tf_log_level = ctx.environment["log_level"]["tensorflow"]
    tf_lib.set_logging_verbosity(tf_log_level)

    local_cache["ctx"] = ctx
    local_cache["api"] = api
    local_cache["model"] = model

    # Only fetch the exported model when the target directory is absent.
    model_already_present = os.path.isdir(args.model_dir)
    if not model_already_present:
        aws.download_and_extract_zip(model["key"], args.model_dir, ctx.bucket)

    served_columns = model["feature_columns"] + [model["target_column"]]
    for name in served_columns:
        if not ctx.is_transformed_column(name):
            continue

        impl, _ = ctx.get_transformer_impl(name)
        local_cache["trans_impls"][name] = impl

        args_schema = ctx.transformed_columns[name]["inputs"]["args"]
        if args_schema is None:
            continue

        # cache aggregates and constants in memory
        local_cache["transform_args_cache"][name] = ctx.populate_args(
            args_schema)

    channel = implementations.insecure_channel("localhost", args.tf_serve_port)
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    local_cache["stub"] = stub

    required_inputs = tf_lib.get_base_input_columns(model["name"], ctx)
    local_cache["required_inputs"] = required_inputs

    # wait a bit for tf serving to start before querying metadata
    max_attempts = 600
    for attempts_left in reversed(range(max_attempts)):
        try:
            local_cache["metadata"] = run_get_model_metadata()
        except Exception:
            # Failures are ignored until the very last attempt.
            if attempts_left == 0:
                message = (
                    "An error occurred, see `cx logs api {}` for more details."
                    .format(api["name"]))
                logger.exception(message)
                sys.exit(1)
        else:
            break

        time.sleep(1)

    logger.info("Serving model: {}".format(model["name"]))
    listen_address = "*:{}".format(args.port)
    serve(app, listen=listen_address)
Example #2
0
def start(args):
    """Populate the serving cache for one API and start the web server.

    Builds a ``Context`` from ``args`` and caches it together with the API
    in ``local_cache``. When the API's ``model`` is a resource reference,
    the referenced model, its estimator, target column details and
    transformer implementations are cached and the model is downloaded;
    otherwise the external model is downloaded directly. The model directory
    is then validated, a gRPC prediction stub is created for
    ``localhost:args.tf_serve_port``, model metadata is polled and ``app``
    is served on ``args.port``.

    Args:
        args: Parsed arguments. The attributes read here are ``context``,
            ``cache_dir``, ``workload_id``, ``api``, ``model_dir``,
            ``tf_serve_port`` and ``port``.

    The process exits with status 1 if model-directory validation raises,
    or if the metadata query still fails on the 300th attempt (attempts are
    spaced one second apart).
    """
    ctx = Context(
        s3_path=args.context,
        cache_dir=args.cache_dir,
        workload_id=args.workload_id,
    )

    api = ctx.apis_id_map[args.api]
    local_cache["api"] = api
    local_cache["ctx"] = ctx

    # NOTE(review): the handler is loaded before any packages are installed,
    # and the external-model branch below checks "request_handler" rather
    # than "request_handler_impl_key" -- confirm both are intended.
    if api.get("request_handler_impl_key") is not None:
        handler = ctx.get_request_handler_impl(api["name"])
        local_cache["request_handler"] = handler

    if util.is_resource_ref(api["model"]):
        package.install_packages(ctx.python_packages, ctx.storage)
        model_name = util.get_resource_ref(api["model"])
        model = ctx.models[model_name]
        estimator = ctx.estimators[model["estimator"]]

        local_cache["model"] = model
        local_cache["estimator"] = estimator
        local_cache["target_col"] = ctx.columns[
            util.get_resource_ref(model["target_column"])]
        local_cache["target_col_type"] = ctx.get_inferred_column_type(
            util.get_resource_ref(model["target_column"]))

        # TensorFlow logging falls back to "DEBUG" when nothing is configured.
        if ctx.environment is None or ctx.environment.get("log_level") is None:
            tf_log_level = "DEBUG"
        else:
            tf_log_level = ctx.environment["log_level"].get(
                "tensorflow", "DEBUG")
        tf_lib.set_logging_verbosity(tf_log_level)

        if not os.path.isdir(args.model_dir):
            ctx.storage.download_and_unzip(model["key"], args.model_dir)

        model_columns = ctx.extract_column_names(
            [model["input"], model["target_column"]])
        for name in model_columns:
            if not ctx.is_transformed_column(name):
                continue

            impl, _ = ctx.get_transformer_impl(name)
            local_cache["trans_impls"][name] = impl
            transformed = ctx.transformed_columns[name]

            # cache aggregate values
            for ref in util.extract_resource_refs(transformed["input"]):
                if ref not in ctx.aggregates:
                    continue
                ctx.get_obj(ctx.aggregates[ref]["key"])

        required_inputs = tf_lib.get_base_input_columns(model["name"], ctx)
        local_cache["required_inputs"] = required_inputs

        model_input = model["input"]
        if (util.is_dict(model_input)
                and model_input.get("target_vocab") is not None):
            local_cache["target_vocab_populated"] = ctx.populate_values(
                model_input["target_vocab"], None, False)
    else:
        # The model is not a resource reference: download it as an
        # external model.
        if api.get("request_handler") is not None:
            package.install_packages(ctx.python_packages, ctx.storage)
        if not os.path.isdir(args.model_dir):
            ctx.storage.download_and_unzip_external(
                api["model"], args.model_dir)

    try:
        validate_model_dir(args.model_dir)
    except Exception as err:
        logger.exception(err)
        sys.exit(1)

    target = "localhost:" + str(args.tf_serve_port)
    channel = grpc.insecure_channel(target)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    local_cache["stub"] = stub

    # wait a bit for tf serving to start before querying metadata
    max_attempts = 300
    for attempts_left in reversed(range(max_attempts)):
        try:
            local_cache["metadata"] = run_get_model_metadata()
        except Exception:
            # Failures are ignored until the very last attempt.
            if attempts_left == 0:
                message = (
                    "An error occurred, see `cortex logs -v api {}` for more details."
                    .format(api["name"]))
                logger.exception(message)
                sys.exit(1)
        else:
            break

        time.sleep(1)

    served_name = util.remove_resource_ref(api["model"])
    logger.info("Serving model: {}".format(served_name))
    listen_address = "*:{}".format(args.port)
    serve(app, listen=listen_address)
Example #3
0
def train(model_name, model_impl, ctx, model_dir):
    """Train and evaluate the named model with ``tf.estimator``.

    Builds a ``RunConfig``, train/eval specs and a final exporter from the
    model's ``training`` and ``evaluation`` settings, asks ``model_impl``
    to create the estimator and runs ``tf.estimator.train_and_evaluate``.

    Args:
        model_name: Key of the model in ``ctx.models``.
        model_impl: Object exposing ``create_estimator(run_config,
            model_config)``.
        ctx: Context providing ``models``, ``environment``, ``bucket`` and
            ``model_config``.
        model_dir: Directory handed to the run config as ``model_dir``.

    Returns:
        ``model_dir``, unchanged.

    Raises:
        UserRuntimeException: If ``model_impl.create_estimator`` raises;
            the original exception is chained as the cause.
    """
    model = ctx.models[model_name]

    # Create, then remove, the directory.
    # NOTE(review): presumably this guarantees a clean location -- confirm.
    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    training_cfg = model["training"]
    run_config = tf.estimator.RunConfig(
        tf_random_seed=training_cfg["tf_random_seed"],
        save_summary_steps=training_cfg["save_summary_steps"],
        save_checkpoints_secs=training_cfg["save_checkpoints_secs"],
        save_checkpoints_steps=training_cfg["save_checkpoints_steps"],
        log_step_count_steps=training_cfg["log_step_count_steps"],
        keep_checkpoint_max=training_cfg["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=training_cfg[
            "keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    exporter = tf.estimator.FinalExporter(
        "estimator", serving_input_fn, as_text=False)

    metadata_key = model["dataset"]["metadata_key"]
    dataset_metadata = aws.read_json_from_s3(metadata_key, ctx.bucket)

    # An epoch count, when set, overrides the configured step count.
    train_max_steps = training_cfg["num_steps"]
    if training_cfg["num_epochs"]:
        train_examples = (dataset_metadata["dataset_size"] *
                          model["data_partition_ratio"]["training"])
        train_batches = math.ceil(
            train_examples / float(training_cfg["batch_size"]))
        train_max_steps = train_batches * training_cfg["num_epochs"]

    train_spec = tf.estimator.TrainSpec(
        train_input_fn, max_steps=train_max_steps)

    evaluation_cfg = model["evaluation"]
    eval_steps = evaluation_cfg["num_steps"]
    if evaluation_cfg["num_epochs"]:
        eval_examples = (dataset_metadata["dataset_size"] *
                         model["data_partition_ratio"]["evaluation"])
        eval_batches = math.ceil(
            eval_examples / float(evaluation_cfg["batch_size"]))
        eval_steps = eval_batches * evaluation_cfg["num_epochs"]

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=evaluation_cfg["start_delay_secs"],
        throttle_secs=evaluation_cfg["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as err:
        raise UserRuntimeException("model " + model_name) from err

    # Regression models get the extra regression evaluation metrics.
    is_regression = model["type"] == "regression"
    if is_regression:
        estimator = tf.contrib.estimator.add_metrics(
            estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
Example #4
0
def train(model_name, estimator_impl, ctx, model_dir):
    """Train and evaluate the named model with ``tf.estimator``.

    Builds a ``RunConfig``, train/eval specs and a final exporter from the
    model's ``training`` and ``evaluation`` settings, asks
    ``estimator_impl`` to create the estimator and runs
    ``tf.estimator.train_and_evaluate``.

    Args:
        model_name: Key of the model in ``ctx.models``.
        estimator_impl: Object exposing ``create_estimator(run_config,
            model_config)``; also forwarded to the input-function builders.
        ctx: Context providing ``models``, ``environment``,
            ``get_metadata``, ``model_config`` and
            ``get_inferred_column_type``.
        model_dir: Directory handed to the run config as ``model_dir``.

    Returns:
        ``model_dir``, unchanged.

    Raises:
        UserRuntimeException: If ``estimator_impl.create_estimator``
            raises; the original exception is chained as the cause.
    """
    model = ctx.models[model_name]

    # Create, then remove, the directory.
    # NOTE(review): presumably this guarantees a clean location -- confirm.
    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    training_cfg = model["training"]
    run_config = tf.estimator.RunConfig(
        tf_random_seed=training_cfg["tf_random_seed"],
        save_summary_steps=training_cfg["save_summary_steps"],
        save_checkpoints_secs=training_cfg["save_checkpoints_secs"],
        save_checkpoints_steps=training_cfg["save_checkpoints_steps"],
        log_step_count_steps=training_cfg["log_step_count_steps"],
        keep_checkpoint_max=training_cfg["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=training_cfg[
            "keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(
        model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(
        model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(
        model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter(
        "estimator", serving_input_fn, as_text=False)

    # An epoch count, when set, overrides the configured step count.
    train_max_steps = training_cfg["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if training_cfg["num_epochs"]:
        train_batches = math.ceil(
            dataset_metadata["training_size"] /
            float(training_cfg["batch_size"]))
        train_max_steps = train_batches * training_cfg["num_epochs"]

    train_spec = tf.estimator.TrainSpec(
        train_input_fn, max_steps=train_max_steps)

    evaluation_cfg = model["evaluation"]
    eval_steps = evaluation_cfg["num_steps"]
    if evaluation_cfg["num_epochs"]:
        eval_batches = math.ceil(
            dataset_metadata["eval_size"] /
            float(evaluation_cfg["batch_size"]))
        eval_steps = eval_batches * evaluation_cfg["num_epochs"]

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=evaluation_cfg["start_delay_secs"],
        throttle_secs=evaluation_cfg["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(
            run_config, model_config)
    except Exception as err:
        raise UserRuntimeException("model " + model_name) from err

    # Float-typed targets get the extra regression evaluation metrics.
    target_col_name = util.get_resource_ref(model["target_column"])
    target_type = ctx.get_inferred_column_type(target_col_name)
    if target_type == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(
            tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir