Example No. 1
def run_custom_aggregator(aggregator_resource, df, ctx, spark):
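    # Resolve the aggregator definition and its user implementation, populate
    # any configured args, run aggregate_spark on the dataframe, validate the
    # result against the declared output type, and persist the result.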
    aggregator = ctx.aggregators[aggregator_resource["aggregator"]]
    aggregate_name = aggregator_resource["name"]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate_name)
    input_schema = aggregator_resource["inputs"]
    aggregator_column_input = input_schema["columns"]
    args_schema = input_schema.get("args")
    args = {}
    if args_schema is not None and len(args_schema) > 0:
        args = ctx.populate_args(args_schema)
    try:
        result = aggregator_impl.aggregate_spark(df, aggregator_column_input,
                                                 args)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if not util.validate_value_type(result, aggregator["output_type"]):
        raise UserException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "type of {} is not {}".format(
                util.str_rep(util.pp_str(result), truncate=100),
                aggregator["output_type"]),
        )

    ctx.store_aggregate_result(result, aggregator_resource)
    return result
Example No. 2
def run_custom_aggregator(aggregate, df, ctx, spark):
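    # Resolve the aggregator's implementation, populate its input from the
    # aggregate config, run aggregate_spark, validate and cast the optional
    # output type, and persist the result.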
    aggregator = ctx.aggregators[aggregate["aggregator"]]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate["name"])

    try:
        input = ctx.populate_values(aggregate["input"],
                                    aggregator["input"],
                                    preserve_column_refs=False)
    except CortexException as e:
        e.wrap("aggregate " + aggregate["name"], "input")
        raise

    try:
        result = aggregator_impl.aggregate_spark(df, input)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if aggregator.get(
            "output_type") is not None and not util.validate_output_type(
                result, aggregator["output_type"]):
        raise UserException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "unsupported return type (expected type {}, got {})".format(
                util.data_type_str(aggregator["output_type"]),
                util.user_obj_str(result)),
        )

    result = util.cast_output_type(result, aggregator["output_type"])
    ctx.store_aggregate_result(result, aggregate)
    return result
Example No. 3
def execute_transform_spark(column_name, df, ctx, spark):
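    # Load the transformer implementation, make it importable on the executors,
    # and call its transform_spark function with the configured inputs and args.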
    trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
    spark.sparkContext.addPyFile(
        trans_impl_path)  # Executor pods need this because of the UDF
    columns_input_config, impl_args = extract_inputs(column_name, ctx)
    try:
        return trans_impl.transform_spark(df, columns_input_config, impl_args,
                                          column_name)
    except Exception as e:
        raise UserRuntimeException("function transform_spark") from e
Example No. 4
def transform_column(column_name, df, ctx, spark):
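    # Apply the column's transformer to the dataframe, preferring the Spark
    # implementation (transform_spark) and falling back to transform_python;
    # returns the dataframe unchanged if the column is not a transformed
    # column or is already present.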
    if not ctx.is_transformed_column(column_name):
        return df
    if column_name in df.columns:
        return df

    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_spark"):
        try:
            df = execute_transform_spark(column_name, df, ctx, spark)
            return df.withColumn(
                column_name,
                F.col(column_name).cast(CORTEX_TYPE_TO_SPARK_TYPE[
                    ctx.get_inferred_column_type(column_name)]),
            )
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e
    elif hasattr(trans_impl, "transform_python"):
        try:
            return execute_transform_python(column_name, df, ctx, spark)
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e
    else:
        raise UserException(
            "transformed column " + column_name,
            "transformer " + transformed_column["transformer"],
            "transform_spark(), transform_python(), or both must be defined",
        )
Example No. 5
def reverse_transform(value):
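    # Reverse-transform a predicted value for the target column using the
    # transformer's reverse_transform_python, if one is defined.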
    ctx = local_cache["ctx"]
    model = local_cache["model"]
    target_col = local_cache["target_col"]

    trans_impl = local_cache["trans_impls"].get(target_col["name"])
    if not (trans_impl and hasattr(trans_impl, "reverse_transform_python")):
        return None

    input = ctx.populate_values(target_col["input"],
                                None,
                                preserve_column_refs=False)
    try:
        result = trans_impl.reverse_transform_python(value, input)
    except Exception as e:
        raise UserRuntimeException("transformer " + target_col["transformer"],
                                   "function reverse_transform_python") from e

    return result
Example No. 6
def reverse_transform(value):
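    # Reverse-transform a predicted value for the model's target column using
    # the cached transformer args, if reverse_transform_python is defined.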
    ctx = local_cache["ctx"]
    model = local_cache["model"]

    trans_impl = local_cache["trans_impls"].get(model["target_column"], None)
    if not (trans_impl and hasattr(trans_impl, "reverse_transform_python")):
        return None

    transformer_name = model["target_column"]
    input_schema = ctx.transformed_columns[transformer_name]["inputs"]

    args = {}
    if input_schema.get("args", None) is not None and len(input_schema["args"]) > 0:
        args = local_cache["transform_args_cache"].get(transformer_name, {})
    try:
        result = trans_impl.reverse_transform_python(value, args)
    except Exception as e:
        raise UserRuntimeException(
            "transformer " + ctx.transformed_columns[model["target_column"]]["transformer"],
            "function reverse_transform_python",
        ) from e

    return result
Example No. 7
def execute_transform_spark(column_name, df, ctx, spark):
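    # Upload the transformer implementation to the executors once, populate the
    # transformer's input from the column config, and run transform_spark.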
    trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)
    transformed_column = ctx.transformed_columns[column_name]
    transformer = ctx.transformers[transformed_column["transformer"]]

    if trans_impl_path not in ctx.spark_uploaded_impls:
        spark.sparkContext.addPyFile(
            trans_impl_path)  # Executor pods need this because of the UDF
        ctx.spark_uploaded_impls[trans_impl_path] = True

    try:
        input = ctx.populate_values(transformed_column["input"],
                                    transformer["input"],
                                    preserve_column_refs=False)
    except CortexException as e:
        e.wrap("input")
        raise

    try:
        return trans_impl.transform_spark(df, input, column_name)
    except Exception as e:
        raise UserRuntimeException("function transform_spark") from e
Example No. 8
def train(model_name, model_impl, ctx, model_dir):
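    # Train and evaluate a TensorFlow estimator for the model: build the
    # RunConfig and input functions, derive step counts from epochs when
    # configured, create the user's estimator, and export the trained model.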
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]
        ["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    exporter = tf.estimator.FinalExporter("estimator",
                                          serving_input_fn,
                                          as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"],
                                             ctx.bucket)
    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["training"] /
                      float(model["training"]["batch_size"])) *
            model["training"]["num_epochs"])

    train_spec = tf.estimator.TrainSpec(train_input_fn,
                                        max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["evaluation"] /
                      float(model["evaluation"]["batch_size"])) *
            model["evaluation"]["num_epochs"])

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(
            estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
Example No. 9
def validate_transformer(column_name, df, ctx, spark):
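    # Validate a transformer implementation: run transform_python and/or
    # transform_spark on the dataframe, check the output schema and column
    # type, and verify both implementations produce matching values.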
    transformed_column = ctx.transformed_columns[column_name]

    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_python"):
        try:
            transform_python_collect = execute_transform_python(
                column_name, df, ctx, spark, validate=True).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):

        try:
            transform_spark_df = execute_transform_spark(
                column_name, df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but found type {}"
                    .format(type(transform_spark_df)))

            # check that a column is added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)

                raise UserException(
                    "output dataframe after running transformer does not have column {}"
                    .format(column_name))

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            actual_structfield = transform_spark_df.select(
                column_name).schema.fields[0]

            # check that expected output column has the correct data type
            if (actual_structfield.dataType
                    not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                        transformed_column["type"]]):
                raise UserException(
                    "incorrect column type, expected {}, found {}.".format(
                        " or ".join(
                            str(t)
                            for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                transformed_column["type"]]),
                        actual_structfield.dataType,
                    ))

            # perform the necessary upcast/downcast for the column, e.g. INT -> LONG or DOUBLE -> FLOAT
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(CORTEX_TYPE_TO_SPARK_TYPE[
                    ctx.transformed_columns[column_name]["type"]]),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(
                    df.columns):
                logger.error("expected schema:")

                log_df_schema(df, logger.error)

                logger.error(
                    "found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name),
                              logger.error)

                raise UserException(
                    "a column besides {} was modifed in the output dataframe".
                    format(column_name))
        except CortexException as e:
            e.wrap(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            )
            raise

        if hasattr(trans_impl, "transform_spark") and hasattr(
                trans_impl, "transform_python"):
            name_type_map = [(s.name, s.dataType)
                             for s in transform_spark_df.schema]
            transform_spark_collect = transform_spark_df.collect()

            for tp_row, ts_row in zip(transform_python_collect,
                                      transform_spark_collect):
                tp_dict = tp_row.asDict()
                ts_dict = ts_row.asDict()

                for name, dataType in name_type_map:
                    if tp_dict[name] == ts_dict[name]:
                        continue
                    elif dataType == FloatType() and util.isclose(
                            tp_dict[name], ts_dict[name], FLOAT_PRECISION):
                        continue
                    raise UserException(
                        column_name,
                        "{0}.transform_spark and {0}.transform_python had differing values"
                        .format(transformed_column["transformer"]),
                        "{} != {}".format(ts_row, tp_row),
                    )
Example No. 10
def validate_transformer(column_name, test_df, ctx, spark):
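    # Validate a transformer implementation on a test dataframe, additionally
    # inferring the output column type when it is declared as inferred and
    # checking that transform_python and transform_spark agree.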
    transformed_column = ctx.transformed_columns[column_name]
    transformer = ctx.transformers[transformed_column["transformer"]]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    inferred_python_type = None
    inferred_spark_type = None

    if hasattr(trans_impl, "transform_python"):
        try:
            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                sample_df = test_df.collect()
                sample = sample_df[0]
                try:
                    input = ctx.populate_values(transformed_column["input"],
                                                transformer["input"],
                                                preserve_column_refs=True)
                except CortexException as e:
                    e.wrap("input")
                    raise
                transformer_input = create_transformer_inputs_from_map(
                    input, sample)
                initial_transformed_value = trans_impl.transform_python(
                    transformer_input)
                inferred_python_type = infer_python_type(
                    initial_transformed_value)

                for row in sample_df:
                    transformer_input = create_transformer_inputs_from_map(
                        input, row)
                    transformed_value = trans_impl.transform_python(
                        transformer_input)
                    if inferred_python_type != infer_python_type(
                            transformed_value):
                        raise UserException(
                            "transformed column " + column_name,
                            "type inference failed, mixed data types in dataframe.",
                            'expected type of "' + str(transformed_value) +
                            '" to be ' + inferred_python_type,
                        )

                ctx.write_metadata(transformed_column["id"],
                                   {"type": inferred_python_type})

            transform_python_collect = execute_transform_python(
                column_name, test_df, ctx, spark, validate=True).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):
        try:
            transform_spark_df = execute_transform_spark(
                column_name, test_df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but got type {}".
                    format(type(transform_spark_df)))

            # check that a column is added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)

                raise UserException(
                    "output dataframe after running transformer does not have column {}"
                    .format(column_name))

            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                inferred_spark_type = SPARK_TYPE_TO_CORTEX_TYPE[
                    transform_spark_df.select(column_name).schema[0].dataType]
                ctx.write_metadata(transformed_column["id"],
                                   {"type": inferred_spark_type})

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            # check that expected output column has the correct data type
            if transformer["output_type"] != consts.COLUMN_TYPE_INFERRED:
                actual_structfield = transform_spark_df.select(
                    column_name).schema.fields[0]
                if (actual_structfield.dataType
                        not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                            transformer["output_type"]]):
                    raise UserException(
                        "incorrect column type: expected {}, got {}.".format(
                            " or ".join(
                                str(t)
                                for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                    transformer["output_type"]]),
                            actual_structfield.dataType,
                        ))

            # perform the necessary casting for the column
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(CORTEX_TYPE_TO_SPARK_TYPE[
                    ctx.get_inferred_column_type(column_name)]),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(
                    test_df.columns):
                logger.error("expected schema:")

                log_df_schema(test_df, logger.error)

                logger.error(
                    "found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name),
                              logger.error)

                raise UserException(
                    "a column besides {} was modifed in the output dataframe".
                    format(column_name))
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e

    if hasattr(trans_impl, "transform_spark") and hasattr(
            trans_impl, "transform_python"):
        if (transformer["output_type"] == consts.COLUMN_TYPE_INFERRED
                and inferred_spark_type != inferred_python_type):
            raise UserException(
                "transformed column " + column_name,
                "type inference failed, transform_spark and transform_python had differing types.",
                "transform_python: " + inferred_python_type,
                "transform_spark: " + inferred_spark_type,
            )

        name_type_map = [(s.name, s.dataType)
                         for s in transform_spark_df.schema]
        transform_spark_collect = transform_spark_df.collect()

        for tp_row, ts_row in zip(transform_python_collect,
                                  transform_spark_collect):
            tp_dict = tp_row.asDict()
            ts_dict = ts_row.asDict()

            for name, dataType in name_type_map:
                if tp_dict[name] == ts_dict[name]:
                    continue
                elif dataType == FloatType() and util.isclose(
                        tp_dict[name], ts_dict[name], FLOAT_PRECISION):
                    continue
                raise UserException(
                    column_name,
                    "{0}.transform_spark and {0}.transform_python had differing values"
                    .format(transformed_column["transformer"]),
                    "{} != {}".format(ts_row, tp_row),
                )
Example No. 11
def train(model_name, estimator_impl, ctx, model_dir):
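    # Train and evaluate a TensorFlow estimator: build the RunConfig and input
    # functions, compute step counts from dataset metadata when epochs are set,
    # create the estimator via the user implementation, add regression metrics
    # for float targets, and run train_and_evaluate.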
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    train_num_steps = model["training"]["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["training_size"] / float(model["training"]["batch_size"]))
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["eval_size"] / float(model["evaluation"]["batch_size"]))
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir