def run_custom_aggregator(aggregator_resource, df, ctx, spark):
    aggregator = ctx.aggregators[aggregator_resource["aggregator"]]
    aggregate_name = aggregator_resource["name"]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate_name)

    input_schema = aggregator_resource["inputs"]
    aggregator_column_input = input_schema["columns"]
    args = {}
    if input_schema.get("args") is not None and len(input_schema["args"]) > 0:
        args = ctx.populate_args(input_schema["args"])

    try:
        result = aggregator_impl.aggregate_spark(df, aggregator_column_input, args)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if not util.validate_value_type(result, aggregator["output_type"]):
        raise UserException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "type of {} is not {}".format(
                util.str_rep(util.pp_str(result), truncate=100), aggregator["output_type"]
            ),
        )

    ctx.store_aggregate_result(result, aggregator_resource)
    return result
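# A minimal sketch of a user-provided aggregator as the function above invokes it:
# aggregate_spark(df, columns, args). The "col" key and the approximate-distinct-count
# logic are illustrative assumptions, not the framework's own example.
def aggregate_spark(data, columns, args):
    from pyspark.sql import functions as F  # assumes a PySpark runtime, as above

    return data.agg(F.approx_count_distinct(columns["col"]).alias("result")).collect()[0]["result"]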
def run_custom_aggregator(aggregate, df, ctx, spark):
    aggregator = ctx.aggregators[aggregate["aggregator"]]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate["name"])

    try:
        input = ctx.populate_values(
            aggregate["input"], aggregator["input"], preserve_column_refs=False
        )
    except CortexException as e:
        e.wrap("aggregate " + aggregate["name"], "input")
        raise

    try:
        result = aggregator_impl.aggregate_spark(df, input)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if aggregator.get("output_type") is not None and not util.validate_output_type(
        result, aggregator["output_type"]
    ):
        raise UserException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "unsupported return type (expected type {}, got {})".format(
                util.data_type_str(aggregator["output_type"]), util.user_obj_str(result)
            ),
        )

    result = util.cast_output_type(result, aggregator["output_type"])
    ctx.store_aggregate_result(result, aggregate)
    return result
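# A minimal sketch of the newer aggregate_spark interface used above, where the resolved
# input arrives as a single value (assumed here to be a dict). The "col" and "num_buckets"
# keys are illustrative assumptions.
def aggregate_spark(data, input):
    from pyspark.ml.feature import QuantileDiscretizer

    # fit bucket boundaries for the referenced column; getSplits() returns a list of floats
    discretizer_model = QuantileDiscretizer(
        numBuckets=input["num_buckets"], inputCol=input["col"], outputCol="_"
    ).fit(data)
    return discretizer_model.getSplits()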
def execute_transform_spark(column_name, df, ctx, spark):
    trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
    # executor pods need the implementation file because of the UDF
    spark.sparkContext.addPyFile(trans_impl_path)
    columns_input_config, impl_args = extract_inputs(column_name, ctx)
    try:
        return trans_impl.transform_spark(df, columns_input_config, impl_args, column_name)
    except Exception as e:
        raise UserRuntimeException("function transform_spark") from e
def transform_column(column_name, df, ctx, spark):
    if not ctx.is_transformed_column(column_name):
        return df  # not a transformed column, nothing to do
    if column_name in df.columns:
        return df  # the column has already been transformed

    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_spark"):
        try:
            df = execute_transform_spark(column_name, df, ctx, spark)
            return df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.get_inferred_column_type(column_name)]
                ),
            )
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e
    elif hasattr(trans_impl, "transform_python"):
        try:
            return execute_transform_python(column_name, df, ctx, spark)
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e
    else:
        raise UserException(
            "transformed column " + column_name,
            "transformer " + transformed_column["transformer"],
            "transform_spark(), transform_python(), or both must be defined",
        )
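# A minimal sketch of the transformer interface that transform_column dispatches on,
# following the call shape above: transform_spark(df, columns, args, column_name), plus a
# row-level transform_python hook (its signature is assumed here). The "num" column key
# and the "mean"/"stddev" args are illustrative assumptions.
def transform_spark(data, columns, args, transformed_column_name):
    from pyspark.sql import functions as F

    # standardize the input column and write it under the transformed column's name
    return data.withColumn(
        transformed_column_name, (F.col(columns["num"]) - args["mean"]) / args["stddev"]
    )


def transform_python(sample, args):
    # same standardization, applied to a single row dict
    return (sample["num"] - args["mean"]) / args["stddev"]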
def reverse_transform(value):
    ctx = local_cache["ctx"]
    target_col = local_cache["target_col"]

    trans_impl = local_cache["trans_impls"].get(target_col["name"])
    if not (trans_impl and hasattr(trans_impl, "reverse_transform_python")):
        return None

    input = ctx.populate_values(target_col["input"], None, preserve_column_refs=False)
    try:
        result = trans_impl.reverse_transform_python(value, input)
    except Exception as e:
        raise UserRuntimeException(
            "transformer " + target_col["transformer"], "function reverse_transform_python"
        ) from e

    return result
def reverse_transform(value):
    ctx = local_cache["ctx"]
    model = local_cache["model"]

    trans_impl = local_cache["trans_impls"].get(model["target_column"], None)
    if not (trans_impl and hasattr(trans_impl, "reverse_transform_python")):
        return None

    target_column_name = model["target_column"]
    input_schema = ctx.transformed_columns[target_column_name]["inputs"]
    args = {}  # default to no args so the call below is safe when none are configured
    if input_schema.get("args", None) is not None and len(input_schema["args"]) > 0:
        args = local_cache["transform_args_cache"].get(target_column_name, {})

    try:
        result = trans_impl.reverse_transform_python(value, args)
    except Exception as e:
        raise UserRuntimeException(
            "transformer " + ctx.transformed_columns[target_column_name]["transformer"],
            "function reverse_transform_python",
        ) from e

    return result
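# A minimal sketch of the reverse_transform_python hook called above with (value, args).
# It assumes the forward transformer standardized the target column; the "mean"/"stddev"
# arg names are illustrative.
def reverse_transform_python(transformed_value, args):
    # undo standardization: x = z * stddev + mean
    return transformed_value * args["stddev"] + args["mean"]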
def execute_transform_spark(column_name, df, ctx, spark):
    trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
    transformed_column = ctx.transformed_columns[column_name]
    transformer = ctx.transformers[transformed_column["transformer"]]

    if trans_impl_path not in ctx.spark_uploaded_impls:
        # executor pods need the implementation file because of the UDF
        spark.sparkContext.addPyFile(trans_impl_path)
        ctx.spark_uploaded_impls[trans_impl_path] = True

    try:
        input = ctx.populate_values(
            transformed_column["input"], transformer["input"], preserve_column_refs=False
        )
    except CortexException as e:
        e.wrap("input")
        raise

    try:
        return trans_impl.transform_spark(df, input, column_name)
    except Exception as e:
        raise UserRuntimeException("function transform_spark") from e
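# A minimal sketch of the newer transform_spark signature used above, where the resolved
# input is passed as one value (assumed here to be a dict) rather than separate column
# and arg maps. Key names are illustrative assumptions.
def transform_spark(data, input, transformed_column_name):
    from pyspark.sql import functions as F

    return data.withColumn(
        transformed_column_name,
        ((F.col(input["col"]) - input["mean"]) / input["stddev"]).cast("float"),
    )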
def train(model_name, model_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"], ctx.bucket)

    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        # derive the step count from dataset size, partition ratio, batch size, and epochs
        train_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["training"]
                / float(model["training"]["batch_size"])
            )
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["evaluation"]
                / float(model["evaluation"]["batch_size"])
            )
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
def validate_transformer(column_name, df, ctx, spark):
    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_python"):
        try:
            transform_python_collect = execute_transform_python(
                column_name, df, ctx, spark, validate=True
            ).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):
        try:
            transform_spark_df = execute_transform_spark(column_name, df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but found type {}".format(
                        type(transform_spark_df)
                    )
                )

            # check that a column was added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)
                raise UserException(
                    "output dataframe after running transformer does not have column {}".format(
                        column_name
                    )
                )

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            # check that the output column has the expected data type
            actual_structfield = transform_spark_df.select(column_name).schema.fields[0]
            if (
                actual_structfield.dataType
                not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[transformed_column["type"]]
            ):
                raise UserException(
                    "incorrect column type, expected {}, found {}".format(
                        " or ".join(
                            str(t)
                            for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                transformed_column["type"]
                            ]
                        ),
                        actual_structfield.dataType,
                    )
                )

            # perform the necessary upcast/downcast for the column, e.g. INT -> LONG or DOUBLE -> FLOAT
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.transformed_columns[column_name]["type"]]
                ),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(df.columns):
                logger.error("expected schema:")
                log_df_schema(df, logger.error)
                logger.error("found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name), logger.error)
                raise UserException(
                    "a column besides {} was modified in the output dataframe".format(column_name)
                )
        except CortexException as e:
            e.wrap(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            )
            raise

    if hasattr(trans_impl, "transform_spark") and hasattr(trans_impl, "transform_python"):
        # compare the two implementations row by row; float columns use a tolerance
        name_type_map = [(s.name, s.dataType) for s in transform_spark_df.schema]
        transform_spark_collect = transform_spark_df.collect()

        for tp_row, ts_row in zip(transform_python_collect, transform_spark_collect):
            tp_dict = tp_row.asDict()
            ts_dict = ts_row.asDict()

            for name, dataType in name_type_map:
                if tp_dict[name] == ts_dict[name]:
                    continue
                elif dataType == FloatType() and util.isclose(
                    tp_dict[name], ts_dict[name], FLOAT_PRECISION
                ):
                    continue
                raise UserException(
                    column_name,
                    "{0}.transform_spark and {0}.transform_python had differing values".format(
                        transformed_column["transformer"]
                    ),
                    "{} != {}".format(ts_row, tp_row),
                )
def validate_transformer(column_name, test_df, ctx, spark):
    transformed_column = ctx.transformed_columns[column_name]
    transformer = ctx.transformers[transformed_column["transformer"]]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    inferred_python_type = None
    inferred_spark_type = None

    if hasattr(trans_impl, "transform_python"):
        try:
            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                sample_df = test_df.collect()
                sample = sample_df[0]
                try:
                    input = ctx.populate_values(
                        transformed_column["input"], transformer["input"], preserve_column_refs=True
                    )
                except CortexException as e:
                    e.wrap("input")
                    raise

                transformer_input = create_transformer_inputs_from_map(input, sample)
                initial_transformed_value = trans_impl.transform_python(transformer_input)
                inferred_python_type = infer_python_type(initial_transformed_value)

                # the inferred type must be consistent across all sampled rows
                for row in sample_df:
                    transformer_input = create_transformer_inputs_from_map(input, row)
                    transformed_value = trans_impl.transform_python(transformer_input)
                    if inferred_python_type != infer_python_type(transformed_value):
                        raise UserException(
                            "transformed column " + column_name,
                            "type inference failed, mixed data types in dataframe.",
                            'expected type of "'
                            + util.user_obj_str(transformed_value)
                            + '" to be '
                            + inferred_python_type,
                        )

                ctx.write_metadata(transformed_column["id"], {"type": inferred_python_type})

            transform_python_collect = execute_transform_python(
                column_name, test_df, ctx, spark, validate=True
            ).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):
        try:
            transform_spark_df = execute_transform_spark(column_name, test_df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but got type {}".format(
                        type(transform_spark_df)
                    )
                )

            # check that a column was added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)
                raise UserException(
                    "output dataframe after running transformer does not have column {}".format(
                        column_name
                    )
                )

            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                inferred_spark_type = SPARK_TYPE_TO_CORTEX_TYPE[
                    transform_spark_df.select(column_name).schema[0].dataType
                ]
                ctx.write_metadata(transformed_column["id"], {"type": inferred_spark_type})

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            # check that the output column has the expected data type
            if transformer["output_type"] != consts.COLUMN_TYPE_INFERRED:
                actual_structfield = transform_spark_df.select(column_name).schema.fields[0]
                if (
                    actual_structfield.dataType
                    not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[transformer["output_type"]]
                ):
                    raise UserException(
                        "incorrect column type: expected {}, got {}".format(
                            " or ".join(
                                str(t)
                                for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                    transformer["output_type"]
                                ]
                            ),
                            actual_structfield.dataType,
                        )
                    )

            # perform the necessary casting for the column
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.get_inferred_column_type(column_name)]
                ),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(test_df.columns):
                logger.error("expected schema:")
                log_df_schema(test_df, logger.error)
                logger.error("found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name), logger.error)
                raise UserException(
                    "a column besides {} was modified in the output dataframe".format(column_name)
                )
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e

    if hasattr(trans_impl, "transform_spark") and hasattr(trans_impl, "transform_python"):
        # when the output type is inferred, both implementations must infer the same type
        if (
            transformer["output_type"] == consts.COLUMN_TYPE_INFERRED
            and inferred_spark_type != inferred_python_type
        ):
            raise UserException(
                "transformed column " + column_name,
                "type inference failed, transform_spark and transform_python had differing types.",
                "transform_python: " + inferred_python_type,
                "transform_spark: " + inferred_spark_type,
            )

        # compare the two implementations row by row; float columns use a tolerance
        name_type_map = [(s.name, s.dataType) for s in transform_spark_df.schema]
        transform_spark_collect = transform_spark_df.collect()

        for tp_row, ts_row in zip(transform_python_collect, transform_spark_collect):
            tp_dict = tp_row.asDict()
            ts_dict = ts_row.asDict()

            for name, dataType in name_type_map:
                if tp_dict[name] == ts_dict[name]:
                    continue
                elif dataType == FloatType() and util.isclose(
                    tp_dict[name], ts_dict[name], FLOAT_PRECISION
                ):
                    continue
                raise UserException(
                    column_name,
                    "{0}.transform_spark and {0}.transform_python had differing values".format(
                        transformed_column["transformer"]
                    ),
                    "{} != {}".format(ts_row, tp_row),
                )
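# A minimal sketch of a transformer defining both hooks consistently, which is what the
# row-by-row comparison above enforces (exact equality, or util.isclose within
# FLOAT_PRECISION for float columns). In this newer interface, transform_python receives
# the resolved input for a single row; key names are illustrative assumptions.
def transform_spark(data, input, transformed_column_name):
    from pyspark.sql import functions as F

    return data.withColumn(
        transformed_column_name, (F.col(input["col"]) * input["scale"]).cast("float")
    )


def transform_python(input):
    return float(input["col"] * input["scale"])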
def train(model_name, estimator_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])

    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["training_size"] / float(model["training"]["batch_size"]))
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["eval_size"] / float(model["evaluation"]["batch_size"]))
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir
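# A minimal sketch of the create_estimator hook that train() calls above, targeting the
# TF 1.x estimator API used in this file. Only the (run_config, model_config) signature
# comes from the calls above; the model_config keys ("numeric_columns", "hidden_units")
# are hypothetical.
import tensorflow as tf


def create_estimator(run_config, model_config):
    feature_columns = [
        tf.feature_column.numeric_column(col_name)
        for col_name in model_config["numeric_columns"]  # hypothetical key
    ]
    return tf.estimator.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=model_config["hidden_units"],  # hypothetical key
        config=run_config,
    )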