def get_label_placeholder(model_name, ctx):
    """Build a 1-D placeholder for the target (label) column of a model.

    Args:
        model_name: Key into ``ctx.models``.
        ctx: Context exposing ``models`` and ``columns``.

    Returns:
        A ``tf.placeholder`` with shape ``[None]`` whose dtype is the
        ``tf_lib.CORTEX_TYPE_TO_TF_TYPE`` entry for the target column's
        ``"type"``.
    """
    target_ref = ctx.models[model_name]["target_column"]
    target_name = util.get_resource_ref(target_ref)
    cortex_type = ctx.columns[target_name]["type"]
    tf_dtype = tf_lib.CORTEX_TYPE_TO_TF_TYPE[cortex_type]
    return tf.placeholder(shape=[None], dtype=tf_dtype)
def model_config(self, model_name):
    """Build the configuration dict for a model.

    The model entry is deep-copied, reduced to the keys listed in
    ``config_keys``, and its ``target_column``, ``input``, ``training_input``
    and ``hparams`` entries are replaced by resolved values.

    Args:
        model_name: Key into ``self.models``.

    Returns:
        The populated configuration dict, or ``None`` when the stored model
        entry is ``None``.

    Raises:
        UserException: If the estimator declares supported ``target_column``
            types and the target column's inferred type is not among them.
    """
    model = self.models[model_name]
    if model is None:
        return None

    estimator = self.estimators[model["estimator"]]
    target_column = self.columns[util.get_resource_ref(model["target_column"])]

    if estimator.get("target_column") is not None:
        target_col_type = self.get_inferred_column_type(target_column["name"])
        if target_col_type not in estimator["target_column"]:
            raise UserException(
                "model " + model_name,
                "target_column",
                target_column["name"],
                "unsupported type (expected type {}, got type {})".format(
                    util.data_type_str(estimator["target_column"]),
                    util.data_type_str(target_col_type),
                ),
            )

    config = deepcopy(model)
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated into a single (bogus) key otherwise.
    config_keys = [
        "name",
        "estimator",
        "estimator_path",
        "target_column",
        "input",
        "training_input",
        "hparams",
        "prediction_key",
        "data_partition_ratio",
        "training",
        "evaluation",
        "tags",
    ]
    # presumably prunes `config` in place to the listed keys (return value is
    # unused here) -- confirm against util.keep_dict_keys
    util.keep_dict_keys(config, config_keys)

    config["target_column"] = target_column["name"]
    config["input"] = self.populate_values(
        model["input"], estimator["input"], preserve_column_refs=False
    )
    if model.get("training_input") is not None:
        config["training_input"] = self.populate_values(
            model["training_input"], estimator["training_input"], preserve_column_refs=False
        )
    if model.get("hparams") is not None:
        config["hparams"] = self.populate_values(
            model["hparams"], estimator["hparams"], preserve_column_refs=False
        )

    return config
def create_transformer_inputs_from_map(input, col_value_map):
    """Substitute resource references inside ``input`` with mapped values.

    Strings that are resource references are replaced by the
    ``col_value_map`` entry for the referenced name; other strings are
    returned untouched. Lists and dicts are rebuilt recursively (dict keys
    are substituted as well as values). Any other value is returned as is.

    Args:
        input: A string, list, dict, or arbitrary leaf value.
        col_value_map: Mapping from resource name to its replacement value.

    Returns:
        A structure shaped like ``input`` with references substituted.
    """
    if util.is_str(input):
        if not util.is_resource_ref(input):
            return input
        return col_value_map[util.get_resource_ref(input)]

    if util.is_list(input):
        return [create_transformer_inputs_from_map(elem, col_value_map) for elem in input]

    if util.is_dict(input):
        # Pairs are built as tuples so the key is always resolved before its value.
        return dict(
            (
                create_transformer_inputs_from_map(k, col_value_map),
                create_transformer_inputs_from_map(v, col_value_map),
            )
            for k, v in input.items()
        )

    return input
def read_parquet(ctx, spark):
    """Read the environment's parquet dataset and rename columns to raw column names.

    Only schema entries whose referenced raw column is present in
    ``ctx.raw_columns`` are selected.

    Args:
        ctx: Context exposing ``environment["data"]`` (with ``path`` and
            ``schema``) and ``raw_columns``.
        spark: Spark session used to read the parquet data.

    Returns:
        A DataFrame with one column per selected raw column, each aliased
        from its configured parquet column.

    Raises:
        UserException: If a configured parquet column is absent from the
            dataset.
    """
    parquet_config = ctx.environment["data"]
    df = spark.read.parquet(parquet_config["path"])

    # raw column name -> name of the parquet column it is read from
    alias_map = {}
    for parquet_col_config in parquet_config["schema"]:
        col_name = util.get_resource_ref(parquet_col_config["raw_column"])
        if col_name in ctx.raw_columns:
            alias_map[col_name] = parquet_col_config["parquet_column_name"]

    # The DataFrame's columns carry the *parquet* names, so those (the map's
    # values, not its raw-column keys) are what must be present.
    missing_cols = set(alias_map.values()) - set(df.columns)
    if missing_cols:
        logger.error("found schema:")
        log_df_schema(df, logger.error)
        raise UserException("missing column(s) in input dataset", str(missing_cols))

    select_exprs = [
        "{} as {}".format(parq_name, col_name) for col_name, parq_name in alias_map.items()
    ]

    return df.selectExpr(*select_exprs)
def read_csv(ctx, spark):
    """Read the environment's CSV dataset and rename its columns per the schema.

    Non-null entries of the optional ``csv_config`` are forwarded to
    ``spark.read.csv`` with their names converted by ``util.snake_to_camel``.
    Columns are renamed positionally, so the schema order must match the
    column order of the file.

    Args:
        ctx: Context exposing ``environment["data"]`` (with ``path``,
            ``schema`` and optionally ``csv_config``).
        spark: Spark session used to read the CSV data.

    Returns:
        A DataFrame whose columns are named after the schema's resource
        references.

    Raises:
        UserException: If the number of columns read differs from the number
            of schema entries.
    """
    data_config = ctx.environment["data"]

    csv_config = {
        util.snake_to_camel(param_name): val
        for param_name, val in data_config.get("csv_config", {}).items()
        if val is not None
    }

    df = spark.read.csv(data_config["path"], inferSchema=True, mode="FAILFAST", **csv_config)

    schema = data_config["schema"]
    if len(schema) != len(df.columns):
        # Counts are ints; format them rather than concatenating to str.
        raise UserException(
            "expected {} column(s) but got {}".format(len(schema), len(df.columns))
        )

    col_names = [util.get_resource_ref(col_ref) for col_ref in schema]
    renamed_cols = [F.col(c).alias(col_names[idx]) for idx, c in enumerate(df.columns)]
    return df.select(*renamed_cols)
def start(args):
    """Prepare the API's serving state in ``local_cache`` and start the web server.

    Steps, in order:
      1. Build the context and cache the API definition.
      2. Load the request handler (if configured) and fetch the model, either
         from an external path or, when the API references a model resource,
         from storage together with its estimator, target column, transformer
         implementations and related cached values.
      3. Validate the model directory.
      4. Create a gRPC stub for TensorFlow Serving on localhost and poll for
         the model metadata, up to 300 attempts, sleeping one second after
         each failed attempt.
      5. Serve the app on ``args.port``.

    A failure in steps 2-4 is logged and ends the process with exit status 1.

    Args:
        args: Parsed arguments; reads ``context``, ``cache_dir``,
            ``workload_id``, ``api``, ``model_dir``, ``tf_serve_port`` and
            ``port``.
    """
    ctx = Context(s3_path=args.context, cache_dir=args.cache_dir, workload_id=args.workload_id)

    api = ctx.apis_id_map[args.api]
    local_cache["api"] = api
    local_cache["ctx"] = ctx

    def failure_message():
        # Built on demand so api["name"] is only read when a failure is reported.
        return "An error occurred, see `cortex logs -v api {}` for more details.".format(
            api["name"]
        )

    try:
        if api.get("request_handler_impl_key") is not None:
            local_cache["request_handler"] = ctx.get_request_handler_impl(api["name"])

        if util.is_resource_ref(api["model"]):
            # The API points at a model resource defined in the context.
            package.install_packages(ctx.python_packages, ctx.storage)
            model = ctx.models[util.get_resource_ref(api["model"])]
            estimator = ctx.estimators[model["estimator"]]

            local_cache["model"] = model
            local_cache["estimator"] = estimator
            local_cache["target_col"] = ctx.columns[util.get_resource_ref(model["target_column"])]
            local_cache["target_col_type"] = ctx.get_inferred_column_type(
                util.get_resource_ref(model["target_column"])
            )

            environment = ctx.environment
            log_levels = None if environment is None else environment.get("log_level")
            log_level = "DEBUG" if log_levels is None else log_levels.get("tensorflow", "DEBUG")
            tf_lib.set_logging_verbosity(log_level)

            if not os.path.isdir(args.model_dir):
                ctx.storage.download_and_unzip(model["key"], args.model_dir)

            for column_name in ctx.extract_column_names([model["input"], model["target_column"]]):
                if not ctx.is_transformed_column(column_name):
                    continue

                trans_impl, _ = ctx.get_transformer_impl(column_name)
                local_cache["trans_impls"][column_name] = trans_impl
                transformed_column = ctx.transformed_columns[column_name]

                # Fetch each referenced aggregate once up front; the result is
                # discarded here (presumably get_obj caches it -- confirm).
                for resource_name in util.extract_resource_refs(transformed_column["input"]):
                    if resource_name in ctx.aggregates:
                        ctx.get_obj(ctx.aggregates[resource_name]["key"])

            local_cache["required_inputs"] = tf_lib.get_base_input_columns(model["name"], ctx)

            model_input = model["input"]
            if util.is_dict(model_input) and model_input.get("target_vocab") is not None:
                local_cache["target_vocab_populated"] = ctx.populate_values(
                    model["input"]["target_vocab"], None, False
                )
        else:
            # The API points at an externally stored model.
            if api.get("request_handler") is not None:
                package.install_packages(ctx.python_packages, ctx.storage)
            if not os.path.isdir(args.model_dir):
                ctx.storage.download_and_unzip_external(api["model"], args.model_dir)
    except CortexException as e:
        e.wrap("error")
        logger.error(str(e))
        logger.exception(failure_message())
        sys.exit(1)
    except Exception:
        logger.exception(failure_message())
        sys.exit(1)

    try:
        validate_model_dir(args.model_dir)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)

    channel = grpc.insecure_channel("localhost:" + str(args.tf_serve_port))
    local_cache["stub"] = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    # TF Serving may still be starting up: retry the metadata query once per
    # second and give up (exit 1) after the final attempt fails.
    max_attempts = 300
    for attempt in range(1, max_attempts + 1):
        try:
            local_cache["metadata"] = run_get_model_metadata()
            break
        except Exception:
            if attempt == max_attempts:
                logger.exception(failure_message())
                sys.exit(1)

        time.sleep(1)

    serve(app, listen="*:{}".format(args.port))
def populate_values(self, input, input_schema, preserve_column_refs):
    """Recursively resolve ``input`` and validate it against ``input_schema``.

    Handling by kind of ``input``:
      * ``None``: returned as ``None`` when there is no schema or the schema
        sets ``_allow_null``; otherwise rejected.
      * Resource reference to a constant: the constant's ``value`` (or the
        JSON loaded from its ``path``) is resolved recursively.
      * Resource reference to an aggregate: the object fetched with
        ``self.get_obj`` is resolved recursively.
      * Resource reference to a column: the inferred column type is checked
        against the schema's ``_type`` (when a schema is given) and either the
        original reference or the bare column name is returned.
      * List: ``_min_count`` / ``_max_count`` are enforced and each element is
        resolved against ``_type[0]``.
      * Dict: without a schema, keys and values are resolved schema-less. With
        a schema, counts are enforced, then the dict is treated either as a
        generic map (single compound-type key in ``_type``) or as a fixed map
        driven by the schema's keys; input keys absent from a fixed-map schema
        are not copied to the result.
      * Anything else: returned unchanged without a schema, otherwise cast
        with ``cast_compound_type``.

    Args:
        input: The value to resolve.
        input_schema: Schema dict (``_type`` plus optional ``_allow_null``,
            ``_min_count``, ``_max_count``, ``_optional``, ``_default``), or
            ``None`` to skip validation.
        preserve_column_refs: If true, column references are returned as
            given; otherwise the referenced column name is returned.

    Returns:
        The resolved value.

    Raises:
        UserException: On a disallowed null, a type mismatch, a length outside
            the allowed range, or a missing required key.
        CortexException: Re-raised from nested resolution after being wrapped
            with the constant/aggregate name, list index, or map key.
    """
    if input is None:
        if input_schema is None:
            return None
        if input_schema.get("_allow_null") == True:
            return None
        raise UserException("Null value is not allowed")

    if util.is_resource_ref(input):
        res_name = util.get_resource_ref(input)
        if res_name in self.constants:
            constant = self.constants[res_name]
            # A constant with neither a value nor a path resolves to None and
            # goes through the normal null handling (previously the variable
            # was left unbound in that case).
            const_val = constant.get("value")
            if const_val is None and constant.get("path") is not None:
                const_val = self.storage.get_json_external(constant["path"])
            try:
                return self.populate_values(const_val, input_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap("constant " + res_name)
                raise

        if res_name in self.aggregates:
            agg_val = self.get_obj(self.aggregates[res_name]["key"])
            try:
                return self.populate_values(agg_val, input_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap("aggregate " + res_name)
                raise

        if res_name in self.columns:
            if input_schema is not None:
                col_type = self.get_inferred_column_type(res_name)
                if col_type not in input_schema["_type"]:
                    raise UserException(
                        "column {}: unsupported input type (expected type {}, got type {})".format(
                            res_name,
                            util.data_type_str(input_schema["_type"]),
                            util.data_type_str(col_type),
                        )
                    )
            if preserve_column_refs:
                return input
            return res_name
        # A reference that matches no known resource falls through and is
        # handled like any other value below.

    if util.is_list(input):
        elem_schema = None
        if input_schema is not None:
            if not util.is_list(input_schema["_type"]):
                raise UserException(
                    "unsupported input type (expected type {}, got {})".format(
                        util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                    )
                )
            elem_schema = input_schema["_type"][0]

            min_count = input_schema.get("_min_count")
            if min_count is not None and len(input) < min_count:
                raise UserException(
                    "list has length {}, but the minimum allowed length is {}".format(
                        len(input), min_count
                    )
                )

            max_count = input_schema.get("_max_count")
            if max_count is not None and len(input) > max_count:
                raise UserException(
                    "list has length {}, but the maximum allowed length is {}".format(
                        len(input), max_count
                    )
                )

        casted = []
        for i, elem in enumerate(input):
            try:
                casted.append(self.populate_values(elem, elem_schema, preserve_column_refs))
            except CortexException as e:
                # i is an int: convert it, otherwise the concatenation raises
                # TypeError and hides the real error.
                e.wrap("index " + str(i))
                raise
        return casted

    if util.is_dict(input):
        if input_schema is None:
            casted = {}
            for key, val in input.items():
                key_casted = self.populate_values(key, None, preserve_column_refs)
                try:
                    val_casted = self.populate_values(val, None, preserve_column_refs)
                except CortexException as e:
                    e.wrap(util.user_obj_str(key))
                    raise
                casted[key_casted] = val_casted
            return casted

        if not util.is_dict(input_schema["_type"]):
            raise UserException(
                "unsupported input type (expected type {}, got {})".format(
                    util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                )
            )

        min_count = input_schema.get("_min_count")
        if min_count is not None and len(input) < min_count:
            raise UserException(
                "map has length {}, but the minimum allowed length is {}".format(
                    len(input), min_count
                )
            )

        max_count = input_schema.get("_max_count")
        if max_count is not None and len(input) > max_count:
            raise UserException(
                "map has length {}, but the maximum allowed length is {}".format(
                    len(input), max_count
                )
            )

        # A schema with a single key that is itself a compound type describes
        # a generic map (any number of entries sharing key/value schemas).
        is_generic_map = False
        if len(input_schema["_type"]) == 1:
            input_type_key = next(iter(input_schema["_type"].keys()))
            if is_compound_type(input_type_key):
                is_generic_map = True
                generic_map_key_schema = input_schema_from_type_schema(input_type_key)
                generic_map_value = input_schema["_type"][input_type_key]

        if is_generic_map:
            casted = {}
            for key, val in input.items():
                key_casted = self.populate_values(
                    key, generic_map_key_schema, preserve_column_refs
                )
                try:
                    val_casted = self.populate_values(
                        val, generic_map_value, preserve_column_refs
                    )
                except CortexException as e:
                    e.wrap(util.user_obj_str(key))
                    raise
                casted[key_casted] = val_casted
            return casted

        # fixed map: the schema's keys drive the result
        casted = {}
        for key, val_schema in input_schema["_type"].items():
            if key in input:
                val = input[key]
            else:
                if val_schema.get("_optional") is not True:
                    raise UserException("missing key: " + util.user_obj_str(key))
                if val_schema.get("_default") is None:
                    continue
                val = val_schema["_default"]

            try:
                val_casted = self.populate_values(val, val_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap(util.user_obj_str(key))
                raise
            casted[key] = val_casted
        return casted

    if input_schema is None:
        return input
    if not util.is_str(input_schema["_type"]):
        raise UserException(
            "unsupported input type (expected type {}, got {})".format(
                util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
            )
        )
    return cast_compound_type(input, input_schema["_type"])
def _parse_example(example_proto):
    """Parse one serialized example and split off the target value.

    Relies on ``feature_spec`` and ``model`` from the enclosing scope.

    Args:
        example_proto: Serialized example passed to
            ``tf.parse_single_example``.

    Returns:
        A ``(features, target)`` tuple: the parsed feature dict with the
        target column removed, and the removed value (``None`` if the target
        column was not among the parsed features).
    """
    parsed = tf.parse_single_example(serialized=example_proto, features=feature_spec)
    target_name = util.get_resource_ref(model["target_column"])
    label = parsed.pop(target_name, None)
    return parsed, label
def train(model_name, estimator_impl, ctx, model_dir):
    """Train and evaluate a model with ``tf.estimator.train_and_evaluate``.

    The run configuration and the train/eval specs are built from the
    model's ``training`` and ``evaluation`` settings. When ``num_epochs`` is
    set for a phase, the step count is derived from the dataset size and
    batch size instead of ``num_steps``. Regression metrics are added when
    the target column's inferred type is ``consts.COLUMN_TYPE_FLOAT``.

    Args:
        model_name: Key into ``ctx.models``.
        estimator_impl: Object providing ``create_estimator(run_config,
            model_config)``.
        ctx: Context exposing models, environment, metadata and column types.
        model_dir: Directory handed to the estimator's run configuration.

    Returns:
        ``model_dir``.

    Raises:
        UserRuntimeException: If ``estimator_impl.create_estimator`` raises.
    """
    model = ctx.models[model_name]

    # mkdir_p followed by rm_dir, in this order (presumably so the directory
    # is guaranteed absent before training -- confirm against util).
    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    training = model["training"]
    run_config_options = {
        option: training[option]
        for option in (
            "tf_random_seed",
            "save_summary_steps",
            "save_checkpoints_secs",
            "save_checkpoints_steps",
            "log_step_count_steps",
            "keep_checkpoint_max",
            "keep_checkpoint_every_n_hours",
        )
    }
    run_config = tf.estimator.RunConfig(model_dir=model_dir, **run_config_options)

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    train_num_steps = training["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if training["num_epochs"]:
        # An epoch count takes precedence over an explicit step count.
        train_steps_per_epoch = math.ceil(
            dataset_metadata["training_size"] / float(training["batch_size"])
        )
        train_num_steps = train_steps_per_epoch * training["num_epochs"]

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    evaluation = model["evaluation"]
    eval_num_steps = evaluation["num_steps"]
    if evaluation["num_epochs"]:
        eval_steps_per_epoch = math.ceil(
            dataset_metadata["eval_size"] / float(evaluation["batch_size"])
        )
        eval_num_steps = eval_steps_per_epoch * evaluation["num_epochs"]

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=evaluation["start_delay_secs"],
        throttle_secs=evaluation["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    target_is_float = ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT
    if target_is_float:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir