def build_packages(python_packages, bucket):
    cmd_partial = {}
    build_order = get_build_order(python_packages)

    for package_name in build_order:
        python_package = python_packages[package_name]
        if package_name == "requirements.txt":
            requirements_path = os.path.join(LOCAL_PACKAGE_PATH, package_name)
            aws.download_file_from_s3(python_package["src_key"], requirements_path, bucket)
            cmd_partial[package_name] = "-r " + requirements_path
        else:
            aws.download_and_extract_zip(python_package["src_key"], LOCAL_PACKAGE_PATH, bucket)
            cmd_partial[package_name] = os.path.join(LOCAL_PACKAGE_PATH, package_name)

    logger.info("Setting up packages")

    restricted_packages = get_restricted_packages()

    for package_name in build_order:
        package_wheel_path = os.path.join(WHEELHOUSE_PATH, package_name)
        requirement = cmd_partial[package_name]
        logger.info("Building: {}".format(package_name))
        completed_process = run(
            "pip3 wheel -w {} {}".format(package_wheel_path, requirement).split()
        )
        if completed_process.returncode != 0:
            raise UserException("creating wheels", package_name)

        for wheelname in os.listdir(package_wheel_path):
            name_split = wheelname.split("-")
            dist_name, version = name_split[0], name_split[1]
            expected_version = restricted_packages.get(dist_name, None)
            if expected_version is not None and version != expected_version:
                raise UserException(
                    "when installing {}, found {}=={} but cortex requires {}=={}".format(
                        package_name, dist_name, version, dist_name, expected_version
                    )
                )

    logger.info("Validating packages")

    for package_name in build_order:
        requirement = cmd_partial[package_name]
        logger.info("Installing: {}".format(package_name))
        completed_process = run(
            "pip3 install --no-index --find-links={} {}".format(
                os.path.join(WHEELHOUSE_PATH, package_name), requirement
            ).split()
        )
        if completed_process.returncode != 0:
            raise UserException("installing package", package_name)

    logger.info("Caching built packages")

    for package_name in build_order:
        aws.compress_zip_and_upload(
            os.path.join(WHEELHOUSE_PATH, package_name),
            python_packages[package_name]["package_key"],
            bucket,
        )
def run_custom_aggregator(aggregator_resource, df, ctx, spark):
    aggregator = ctx.aggregators[aggregator_resource["aggregator"]]
    aggregate_name = aggregator_resource["name"]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate_name)

    input_schema = aggregator_resource["inputs"]
    aggregator_column_input = input_schema["columns"]
    args_schema = input_schema.get("args")

    args = {}
    if args_schema is not None and len(args_schema) > 0:
        args = ctx.populate_args(input_schema["args"])

    try:
        result = aggregator_impl.aggregate_spark(df, aggregator_column_input, args)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if not util.validate_value_type(result, aggregator["output_type"]):
        raise UserException(
            "aggregate " + aggregator_resource["name"],
            "aggregator " + aggregator["name"],
            "type of {} is not {}".format(
                util.str_rep(util.pp_str(result), truncate=100), aggregator["output_type"]
            ),
        )

    ctx.store_aggregate_result(result, aggregator_resource)
    return result
def transform_sample(sample): ctx = local_cache["ctx"] model = local_cache["model"] transformed_sample = {} for column_name in ctx.extract_column_names(model["input"]): if ctx.is_raw_column(column_name): transformed_value = sample[column_name] else: transformed_column = ctx.transformed_columns[column_name] trans_impl = local_cache["trans_impls"][column_name] if not hasattr(trans_impl, "transform_python"): raise UserException( "transformed column " + column_name, "transformer " + transformed_column["transformer"], "transform_python function is missing", ) input = ctx.populate_values(transformed_column["input"], None, preserve_column_refs=True) transformer_input = create_transformer_inputs_from_map( input, sample) transformed_value = trans_impl.transform_python(transformer_input) transformed_sample[column_name] = transformed_value return transformed_sample
def ingest(ctx, spark): if ctx.environment["data"]["type"] == "csv": df = read_csv(ctx, spark) elif ctx.environment["data"]["type"] == "parquet": df = read_parquet(ctx, spark) input_type_map = {f.name: f.dataType for f in df.schema} for raw_column_name in ctx.raw_columns.keys(): raw_column = ctx.raw_columns[raw_column_name] expected_types = CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[ raw_column["type"]] actual_type = input_type_map[raw_column_name] if actual_type not in expected_types: logger.error("found schema:") log_df_schema(df, logger.error) raise UserException( "raw column " + raw_column_name, "type mismatch", "expected {} but found {}".format( " or ".join(str(x) for x in expected_types), actual_type), ) target_type = CORTEX_TYPE_TO_SPARK_TYPE[raw_column["type"]] if target_type != actual_type: df = df.withColumn(raw_column_name, F.col(raw_column_name).cast(target_type)) return df.select(*sorted(df.columns))
def transform_features(raw_features): ctx = local_cache["ctx"] model = local_cache["model"] transformed_features = {} for feature_name in model["features"]: if ctx.is_raw_feature(feature_name): transformed_feature = raw_features[feature_name] else: inputs = ctx.create_inputs_from_features_map( raw_features, feature_name) trans_impl = local_cache["trans_impls"][feature_name] if not hasattr(trans_impl, "transform_python"): raise UserException( "transformed feature " + feature_name, "transformer " + ctx.transformed_features[feature_name]["transformer"], "transform_python function missing", ) args = local_cache["transform_args_cache"].get(feature_name, {}) transformed_feature = trans_impl.transform_python(inputs, args) transformed_features[feature_name] = transformed_feature return transformed_features
def run_custom_aggregator(aggregate, df, ctx, spark):
    aggregator = ctx.aggregators[aggregate["aggregator"]]
    aggregator_impl, _ = ctx.get_aggregator_impl(aggregate["name"])

    try:
        input = ctx.populate_values(
            aggregate["input"], aggregator["input"], preserve_column_refs=False
        )
    except CortexException as e:
        e.wrap("aggregate " + aggregate["name"], "input")
        raise

    try:
        result = aggregator_impl.aggregate_spark(df, input)
    except Exception as e:
        raise UserRuntimeException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "function aggregate_spark",
        ) from e

    if aggregator.get("output_type") is not None and not util.validate_output_type(
        result, aggregator["output_type"]
    ):
        raise UserException(
            "aggregate " + aggregate["name"],
            "aggregator " + aggregator["name"],
            "unsupported return type (expected type {}, got {})".format(
                util.data_type_str(aggregator["output_type"]), util.user_obj_str(result)
            ),
        )

    result = util.cast_output_type(result, aggregator["output_type"])
    ctx.store_aggregate_result(result, aggregate)
    return result
def transform_sample(sample): ctx = local_cache["ctx"] model = local_cache["model"] transformed_sample = {} for column_name in model["feature_columns"]: if ctx.is_raw_column(column_name): transformed_value = sample[column_name] else: inputs = ctx.create_column_inputs_map(sample, column_name) trans_impl = local_cache["trans_impls"][column_name] if not hasattr(trans_impl, "transform_python"): raise UserException( "transformed column " + column_name, "transformer " + ctx.transformed_sample[column_name]["transformer"], "transform_python function missing", ) args = local_cache["transform_args_cache"].get(column_name, {}) transformed_value = trans_impl.transform_python(inputs, args) transformed_sample[column_name] = transformed_value return transformed_sample
def install_packages(python_packages, bucket):
    build_order = get_build_order(python_packages)

    for package_name in build_order:
        python_package = python_packages[package_name]
        aws.download_and_extract_zip(
            python_package["package_key"], os.path.join(WHEELHOUSE_PATH, package_name), bucket
        )

    if "requirements.txt" in python_packages:
        aws.download_file_from_s3(
            python_packages["requirements.txt"]["src_key"], "/requirements.txt", bucket
        )

    for package_name in build_order:
        cmd = package_name
        if package_name == "requirements.txt":
            cmd = "-r /requirements.txt"

        completed_process = run(
            "pip3 install --no-cache-dir --no-index --find-links={} {}".format(
                os.path.join(WHEELHOUSE_PATH, package_name), cmd
            ).split()
        )
        if completed_process.returncode != 0:
            raise UserException("installing package", package_name)

    util.rm_file("/requirements.txt")
    util.rm_dir(WHEELHOUSE_PATH)
def build_packages(python_packages, bucket): cmd_partial = {} build_order = get_build_order(python_packages) for package_name in build_order: python_package = python_packages[package_name] if package_name == "requirements.txt": requirements_path = os.path.join(LOCAL_PACKAGE_PATH, package_name) aws.download_file_from_s3(python_package["src_key"], requirements_path, bucket) cmd_partial[package_name] = "-r " + requirements_path else: aws.download_and_extract_zip(python_package["src_key"], LOCAL_PACKAGE_PATH, bucket) cmd_partial[package_name] = os.path.join(LOCAL_PACKAGE_PATH, package_name) logger.info("Setting up packages") for package_name in build_order: requirement = cmd_partial[package_name] logger.info("Building package {}".format(package_name)) completed_process = run("pip3 wheel -w {} {}".format( os.path.join(WHEELHOUSE_PATH, package_name), requirement).split()) if completed_process.returncode != 0: raise UserException("creating wheels", package_name) logger.info("Validating packages") for package_name in build_order: requirement = cmd_partial[package_name] logger.info("Installing package {}".format(package_name)) completed_process = run( "pip3 install --no-index --find-links={} {}".format( os.path.join(WHEELHOUSE_PATH, package_name), requirement).split()) if completed_process.returncode != 0: raise UserException("installing package", package_name) logger.info("Caching built packages") for package_name in build_order: aws.compress_zip_and_upload( os.path.join(WHEELHOUSE_PATH, package_name), python_packages[package_name]["package_key"], bucket, )
def validate_dataset(ctx, raw_df, cols_to_validate):
    total_row_count = ctx.get_metadata(ctx.raw_dataset["key"])["dataset_size"]
    conditions_dict = spark_util.value_check_data(ctx, raw_df, cols_to_validate)

    if len(conditions_dict) > 0:
        for column, cond_count_list in conditions_dict.items():
            for condition, fail_count in cond_count_list:
                logger.error(
                    "Data validation {} has been violated in {}/{} samples".format(
                        condition, fail_count, total_row_count
                    )
                )
        raise UserException("raw column validations failed")
def _transform_and_validate(*values): result = _transform(*values) if not util.validate_column_type(result, transformed_column["type"]): raise UserException( "transformed column " + column_name, "tranformation " + transformed_column["transformer"], "type of {} is not {}".format(result, transformed_column["type"]), ) return result
def _validate_required_fn_args(impl, fn_name, args):
    fn = getattr(impl, fn_name, None)
    if not fn:
        raise UserException("function " + fn_name, "could not find function")

    if not callable(fn):
        raise UserException("function " + fn_name, "not a function")

    argspec = inspect.getargspec(fn)
    if argspec.varargs is not None or argspec.keywords is not None or argspec.defaults is not None:
        raise UserException(
            "function " + fn_name,
            "invalid function signature, can only accept positional arguments",
        )

    if args:
        if argspec.args != args:
            raise UserException(
                "function " + fn_name,
                "expected function arguments ({}) but found ({})".format(
                    ", ".join(args), ", ".join(argspec.args)
                ),
            )
def validate_model_dir(model_dir): """ validates that model_dir has the expected directory tree. For example (your TF serving version number may be different): 1562353043/ saved_model.pb variables/ variables.data-00000-of-00001 variables.index """ version = os.listdir(model_dir)[0] if not version.isdigit(): raise UserException( "No versions of servable default found under base path in model_dir. See docs.cortex.dev for how to properly package your TensorFlow model" ) if "saved_model.pb" not in os.listdir(os.path.join(model_dir, version)): raise UserException( 'Expected packaged model to have a "saved_model.pb" file. See docs.cortex.dev for how to properly package your TensorFlow model' )
def model_config(self, model_name):
    model = self.models[model_name]
    if model is None:
        return None
    estimator = self.estimators[model["estimator"]]

    target_column = self.columns[util.get_resource_ref(model["target_column"])]
    if estimator.get("target_column") is not None:
        target_col_type = self.get_inferred_column_type(target_column["name"])
        if target_col_type not in estimator["target_column"]:
            raise UserException(
                "model " + model_name,
                "target_column",
                target_column["name"],
                "unsupported type (expected type {}, got type {})".format(
                    util.data_type_str(estimator["target_column"]),
                    util.data_type_str(target_col_type),
                ),
            )

    model_config = deepcopy(model)
    config_keys = [
        "name",
        "estimator",
        "estimator_path",
        "target_column",
        "input",
        "training_input",
        "hparams",
        "prediction_key",
        "data_partition_ratio",
        "training",
        "evaluation",
        "tags",
    ]
    util.keep_dict_keys(model_config, config_keys)

    model_config["target_column"] = target_column["name"]
    model_config["input"] = self.populate_values(
        model["input"], estimator["input"], preserve_column_refs=False
    )
    if model.get("training_input") is not None:
        model_config["training_input"] = self.populate_values(
            model["training_input"], estimator["training_input"], preserve_column_refs=False
        )
    if model.get("hparams") is not None:
        model_config["hparams"] = self.populate_values(
            model["hparams"], estimator["hparams"], preserve_column_refs=False
        )

    return model_config
def _transform_and_validate(*values):
    result = _transform(*values)
    if not util.validate_cortex_type(result, column_type):
        raise UserException(
            "transformed column " + column_name,
            "transformer " + transformed_column["transformer"],
            "incorrect return value type: expected {}, got {}.".format(
                " or ".join(CORTEX_TYPE_TO_ACCEPTABLE_PYTHON_TYPE_STRS[column_type]),
                util.user_obj_str(result),
            ),
        )
    return result
def load_module(self, module_prefix, module_name, impl_key):
    full_module_name = "{}_{}".format(module_prefix, module_name)

    try:
        impl_path = self.download_python_file(impl_key, full_module_name)
    except CortexException as e:
        e.wrap("unable to find python file " + module_name)
        raise

    try:
        impl = imp.load_source(full_module_name, impl_path)
    except Exception as e:
        raise UserException("unable to load python module " + module_name) from e

    return impl, impl_path
def read_parquet(ctx, spark): parquet_config = ctx.environment["data"] df = spark.read.parquet(parquet_config["path"]) parquet_columns = [c["column_name"] for c in parquet_config["schema"]] missing_cols = util.subtract_lists(parquet_columns, df.columns) if len(missing_cols) > 0: raise UserException("parquet dataset", "missing columns: " + str(missing_cols)) selectExprs = [ "{} as {}".format(c["column_name"], c["feature_name"]) for c in parquet_config["schema"] ] return df.selectExpr(*selectExprs)
def ingest(ctx, spark):
    expected_schema = expected_schema_from_context(ctx)

    if ctx.environment["data"]["type"] == "csv":
        df = read_csv(ctx, spark)
    elif ctx.environment["data"]["type"] == "parquet":
        df = read_parquet(ctx, spark)

    if compare_column_schemas(expected_schema, df.schema) is not True:
        logger.error("expected schema:")
        log_df_schema(spark.createDataFrame([], expected_schema), logger.error)
        logger.error("found schema:")
        log_df_schema(df, logger.error)
        raise UserException("raw data schema mismatch")

    return df
def cast_compound_type(value, type_str): allowed_types = type_str.split("|") if consts.VALUE_TYPE_INT in allowed_types: if util.is_int(value): return value if consts.VALUE_TYPE_FLOAT in allowed_types: if util.is_int(value): return float(value) if util.is_float(value): return value if consts.VALUE_TYPE_STRING in allowed_types: if util.is_str(value): return value if consts.VALUE_TYPE_BOOL in allowed_types: if util.is_bool(value): return value raise UserException( "unsupported input type (expected type {}, got {})".format( util.data_type_str(type_str), util.user_obj_str(value) ) )
def transform_column(column_name, df, ctx, spark):
    if not ctx.is_transformed_column(column_name):
        return df
    if column_name in df.columns:
        return df

    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, trans_impl_path = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_spark"):
        return execute_transform_spark(column_name, df, ctx, spark).withColumn(
            column_name,
            F.col(column_name).cast(
                CORTEX_TYPE_TO_SPARK_TYPE[ctx.transformed_columns[column_name]["type"]]
            ),
        )
    elif hasattr(trans_impl, "transform_python"):
        return execute_transform_python(column_name, df, ctx, spark)
    else:
        raise UserException(
            "transformed column " + column_name,
            "transformer " + transformed_column["transformer"],
            "transform_spark(), transform_python(), or both must be defined",
        )
def read_parquet(ctx, spark): parquet_config = ctx.environment["data"] df = spark.read.parquet(parquet_config["path"]) alias_map = {} for parquet_col_config in parquet_config["schema"]: col_name = util.get_resource_ref(parquet_col_config["raw_column"]) if col_name in ctx.raw_columns: alias_map[col_name] = parquet_col_config["parquet_column_name"] missing_cols = set(alias_map.keys()) - set(df.columns) if len(missing_cols) > 0: logger.error("found schema:") log_df_schema(df, logger.error) raise UserException("missing column(s) in input dataset", str(missing_cols)) selectExprs = [ "{} as {}".format(parq_name, col_name) for col_name, parq_name in alias_map.items() ] return df.selectExpr(*selectExprs)
def read_parquet(ctx, spark): parquet_config = ctx.environment["data"] df = spark.read.parquet(parquet_config["path"]) alias_map = { c["parquet_column_name"]: c["raw_column_name"] for c in parquet_config["schema"] if c["parquet_column_name"] in ctx.raw_columns } missing_cols = set(alias_map.keys()) - set(df.columns) if len(missing_cols) > 0: logger.error("found schema:") log_df_schema(df, logger.error) raise UserException("missing column(s) in input dataset", str(missing_cols)) selectExprs = [ "{} as {}".format(alias_map[alias], alias) for alias in alias_map.keys() ] return df.selectExpr(*selectExprs)
def read_csv(ctx, spark): data_config = ctx.environment["data"] csv_config = { util.snake_to_camel(param_name): val for param_name, val in data_config.get("csv_config", {}).items() if val is not None } df = spark.read.csv(data_config["path"], inferSchema=True, mode="FAILFAST", **csv_config) if len(data_config["schema"]) != len(df.columns): raise UserException("expected " + len(data_config["schema"]) + " column(s) but got " + len(df.columns)) col_names = [ util.get_resource_ref(col_ref) for col_ref in data_config["schema"] ] renamed_cols = [ F.col(c).alias(col_names[idx]) for idx, c in enumerate(df.columns) ] return df.select(*renamed_cols)
def transform_column(column_name, df, ctx, spark):
    if not ctx.is_transformed_column(column_name):
        return df
    if column_name in df.columns:
        return df

    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_spark"):
        try:
            df = execute_transform_spark(column_name, df, ctx, spark)
            return df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.get_inferred_column_type(column_name)]
                ),
            )
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e
    elif hasattr(trans_impl, "transform_python"):
        try:
            return execute_transform_python(column_name, df, ctx, spark)
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e
    else:
        raise UserException(
            "transformed column " + column_name,
            "transformer " + transformed_column["transformer"],
            "transform_spark(), transform_python(), or both must be defined",
        )
def ingest_raw_dataset(spark, ctx, features_to_validate, should_ingest):
    if should_ingest:
        features_to_validate = list(ctx.rf_id_map.keys())

    if len(features_to_validate) > 0:
        feature_resources_to_validate = [ctx.rf_id_map[f] for f in features_to_validate]
        ctx.upload_resource_status_start(*feature_resources_to_validate)
        try:
            if should_ingest:
                logger.info("Ingesting")
                logger.info(
                    "Ingesting {} data from {}".format(
                        ctx.app["name"], ctx.environment["data"]["path"]
                    )
                )
                ingest_df = spark_util.ingest(ctx, spark)
                full_dataset_counter = ingest_df.count()
                if ctx.environment["data"].get("drop_null"):
                    ingest_df = ingest_df.dropna()
                    logger.info("Dropping any rows that contain null values")
                    write_dataset_counter = ingest_df.count()

                logger.info(
                    "Caching {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
                )
                spark_util.write_raw_dataset(ingest_df, ctx)

                if ctx.environment["data"].get("drop_null"):
                    logger.info(
                        "{} rows read, {} rows dropped, {} rows ingested".format(
                            full_dataset_counter,
                            full_dataset_counter - write_dataset_counter,
                            write_dataset_counter,
                        )
                    )
                else:
                    logger.info("{} rows ingested".format(full_dataset_counter))

            logger.info(
                "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
            )
            raw_df = spark_util.read_raw_dataset(ctx, spark)
            total_row_count = raw_df.count()
            conditions_dict = spark_util.value_check_data(ctx, raw_df, features_to_validate)

            if len(conditions_dict) > 0:
                for column, cond_count_list in conditions_dict.items():
                    for condition, fail_count in cond_count_list:
                        logger.error(
                            "Data validation {} has been violated in {}/{} samples".format(
                                condition, fail_count, total_row_count
                            )
                        )
                raise UserException("raw feature validations failed")
        except:
            ctx.upload_resource_status_failed(*feature_resources_to_validate)
            raise
        ctx.upload_resource_status_success(*feature_resources_to_validate)

        logger.info("First {} samples:".format(3))
        show_df(raw_df, ctx, 3)
    else:
        logger.info(
            "Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version)
        )
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        spark_util.value_check_data(ctx, raw_df, features_to_validate)

    return raw_df
def ingest(ctx, spark): fileType = ctx.environment["data"]["type"] if fileType == "csv": df = read_csv(ctx, spark) elif fileType == "parquet": df = read_parquet(ctx, spark) input_type_map = {f.name: f.dataType for f in df.schema} for raw_column_name in ctx.raw_columns: raw_column = ctx.raw_columns[raw_column_name] expected_cortex_type = raw_column["type"] actual_spark_type = input_type_map[raw_column_name] if expected_cortex_type == consts.COLUMN_TYPE_INFERRED: if actual_spark_type not in SPARK_TYPE_TO_CORTEX_TYPE: df = df.withColumn(raw_column_name, F.col(raw_column_name).cast(StringType())) else: actual_cortex_type = SPARK_TYPE_TO_CORTEX_TYPE[ actual_spark_type] expected_spark_type = CORTEX_TYPE_TO_SPARK_TYPE[ actual_cortex_type] if actual_spark_type != expected_spark_type: df = df.withColumn( raw_column_name, F.col(raw_column_name).cast(expected_spark_type)) else: expected_spark_type = CORTEX_TYPE_TO_SPARK_TYPE[ expected_cortex_type] if actual_spark_type in SPARK_TYPE_TO_CORTEX_TYPE: expected_types = CORTEX_TYPE_TO_CASTABLE_SPARK_TYPES[fileType][ expected_cortex_type] if actual_spark_type not in expected_types: logger.error("found schema:") log_df_schema(df, logger.error) raise UserException( "raw column " + raw_column_name, "type mismatch", "expected {} but got {}".format( " or ".join(str(x) for x in expected_types), actual_spark_type), ) if actual_spark_type != expected_spark_type: df = df.withColumn( raw_column_name, F.col(raw_column_name).cast(expected_spark_type)) else: try: df = df.withColumn( raw_column_name, F.col(raw_column_name).cast(expected_spark_type)) except Exception as e: raise UserException( "tried casting " + raw_column_name, "from ingested type " + actual_spark_type, "to expected type " + expected_spark_type, "but got exception: " + e, ) return df.select(*sorted(df.columns))
def validate_transformer(column_name, df, ctx, spark):
    transformed_column = ctx.transformed_columns[column_name]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    if hasattr(trans_impl, "transform_python"):
        try:
            transform_python_collect = execute_transform_python(
                column_name, df, ctx, spark, validate=True
            ).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):
        try:
            transform_spark_df = execute_transform_spark(column_name, df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but found type {}".format(
                        type(transform_spark_df)
                    )
                )

            # check that a column is added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)
                raise UserException(
                    "output dataframe after running transformer does not have column {}".format(
                        column_name
                    )
                )

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            actual_structfield = transform_spark_df.select(column_name).schema.fields[0]

            # check that the expected output column has the correct data type
            if (
                actual_structfield.dataType
                not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[transformed_column["type"]]
            ):
                raise UserException(
                    "incorrect column type, expected {}, found {}.".format(
                        " or ".join(
                            str(t)
                            for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                transformed_column["type"]
                            ]
                        ),
                        actual_structfield.dataType,
                    )
                )

            # perform the necessary upcast/downcast for the column, e.g. INT -> LONG or DOUBLE -> FLOAT
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.transformed_columns[column_name]["type"]]
                ),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(df.columns):
                logger.error("expected schema:")
                log_df_schema(df, logger.error)
                logger.error("found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name), logger.error)
                raise UserException(
                    "a column besides {} was modified in the output dataframe".format(column_name)
                )
        except CortexException as e:
            e.wrap(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            )
            raise

    if hasattr(trans_impl, "transform_spark") and hasattr(trans_impl, "transform_python"):
        name_type_map = [(s.name, s.dataType) for s in transform_spark_df.schema]
        transform_spark_collect = transform_spark_df.collect()

        for tp_row, ts_row in zip(transform_python_collect, transform_spark_collect):
            tp_dict = tp_row.asDict()
            ts_dict = ts_row.asDict()

            for name, dataType in name_type_map:
                if tp_dict[name] == ts_dict[name]:
                    continue
                elif dataType == FloatType() and util.isclose(
                    tp_dict[name], ts_dict[name], FLOAT_PRECISION
                ):
                    continue
                raise UserException(
                    column_name,
                    "{0}.transform_spark and {0}.transform_python had differing values".format(
                        transformed_column["transformer"]
                    ),
                    "{} != {}".format(ts_row, tp_row),
                )
def validate_transformer(column_name, test_df, ctx, spark):
    transformed_column = ctx.transformed_columns[column_name]
    transformer = ctx.transformers[transformed_column["transformer"]]
    trans_impl, _ = ctx.get_transformer_impl(column_name)

    inferred_python_type = None
    inferred_spark_type = None

    if hasattr(trans_impl, "transform_python"):
        try:
            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                sample_df = test_df.collect()
                sample = sample_df[0]
                try:
                    input = ctx.populate_values(
                        transformed_column["input"], transformer["input"], preserve_column_refs=True
                    )
                except CortexException as e:
                    e.wrap("input")
                    raise
                transformer_input = create_transformer_inputs_from_map(input, sample)
                initial_transformed_value = trans_impl.transform_python(transformer_input)
                inferred_python_type = infer_python_type(initial_transformed_value)

                for row in sample_df:
                    transformer_input = create_transformer_inputs_from_map(input, row)
                    transformed_value = trans_impl.transform_python(transformer_input)
                    if inferred_python_type != infer_python_type(transformed_value):
                        raise UserException(
                            "transformed column " + column_name,
                            "type inference failed, mixed data types in dataframe.",
                            'expected type of "'
                            + str(transformed_value)
                            + '" to be '
                            + inferred_python_type,
                        )

                ctx.write_metadata(transformed_column["id"], {"type": inferred_python_type})

            transform_python_collect = execute_transform_python(
                column_name, test_df, ctx, spark, validate=True
            ).collect()
        except Exception as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_python",
            ) from e

    if hasattr(trans_impl, "transform_spark"):
        try:
            transform_spark_df = execute_transform_spark(column_name, test_df, ctx, spark)

            # check that the return object is a dataframe
            if type(transform_spark_df) is not DataFrame:
                raise UserException(
                    "expected pyspark.sql.dataframe.DataFrame but got type {}".format(
                        type(transform_spark_df)
                    )
                )

            # check that a column is added with the expected name
            if column_name not in transform_spark_df.columns:
                logger.error("schema of output dataframe:")
                log_df_schema(transform_spark_df, logger.error)
                raise UserException(
                    "output dataframe after running transformer does not have column {}".format(
                        column_name
                    )
                )

            if transformer["output_type"] == consts.COLUMN_TYPE_INFERRED:
                inferred_spark_type = SPARK_TYPE_TO_CORTEX_TYPE[
                    transform_spark_df.select(column_name).schema[0].dataType
                ]
                ctx.write_metadata(transformed_column["id"], {"type": inferred_spark_type})

            # check that the transformer runs on the data
            try:
                transform_spark_df.select(column_name).collect()
            except Exception as e:
                raise UserRuntimeException("function transform_spark") from e

            # check that the expected output column has the correct data type
            if transformer["output_type"] != consts.COLUMN_TYPE_INFERRED:
                actual_structfield = transform_spark_df.select(column_name).schema.fields[0]
                if (
                    actual_structfield.dataType
                    not in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[transformer["output_type"]]
                ):
                    raise UserException(
                        "incorrect column type: expected {}, got {}.".format(
                            " or ".join(
                                str(t)
                                for t in CORTEX_TYPE_TO_ACCEPTABLE_SPARK_TYPES[
                                    transformer["output_type"]
                                ]
                            ),
                            actual_structfield.dataType,
                        )
                    )

            # perform the necessary casting for the column
            transform_spark_df = transform_spark_df.withColumn(
                column_name,
                F.col(column_name).cast(
                    CORTEX_TYPE_TO_SPARK_TYPE[ctx.get_inferred_column_type(column_name)]
                ),
            )

            # check that the function doesn't modify the schema of the other columns in the input dataframe
            if set(transform_spark_df.columns) - set([column_name]) != set(test_df.columns):
                logger.error("expected schema:")
                log_df_schema(test_df, logger.error)
                logger.error("found schema (with {} dropped):".format(column_name))
                log_df_schema(transform_spark_df.drop(column_name), logger.error)
                raise UserException(
                    "a column besides {} was modified in the output dataframe".format(column_name)
                )
        except CortexException as e:
            raise UserRuntimeException(
                "transformed column " + column_name,
                transformed_column["transformer"] + ".transform_spark",
            ) from e

    if hasattr(trans_impl, "transform_spark") and hasattr(trans_impl, "transform_python"):
        if (
            transformer["output_type"] == consts.COLUMN_TYPE_INFERRED
            and inferred_spark_type != inferred_python_type
        ):
            raise UserException(
                "transformed column " + column_name,
                "type inference failed, transform_spark and transform_python had differing types.",
                "transform_python: " + inferred_python_type,
                "transform_spark: " + inferred_spark_type,
            )

        name_type_map = [(s.name, s.dataType) for s in transform_spark_df.schema]
        transform_spark_collect = transform_spark_df.collect()

        for tp_row, ts_row in zip(transform_python_collect, transform_spark_collect):
            tp_dict = tp_row.asDict()
            ts_dict = ts_row.asDict()

            for name, dataType in name_type_map:
                if tp_dict[name] == ts_dict[name]:
                    continue
                elif dataType == FloatType() and util.isclose(
                    tp_dict[name], ts_dict[name], FLOAT_PRECISION
                ):
                    continue
                raise UserException(
                    column_name,
                    "{0}.transform_spark and {0}.transform_python had differing values".format(
                        transformed_column["transformer"]
                    ),
                    "{} != {}".format(ts_row, tp_row),
                )
def populate_values(self, input, input_schema, preserve_column_refs):
    if input is None:
        if input_schema is None:
            return None
        if input_schema.get("_allow_null") == True:
            return None
        raise UserException("Null value is not allowed")

    if util.is_resource_ref(input):
        res_name = util.get_resource_ref(input)

        if res_name in self.constants:
            if self.constants[res_name].get("value") is not None:
                const_val = self.constants[res_name]["value"]
            elif self.constants[res_name].get("path") is not None:
                const_val = self.storage.get_json_external(self.constants[res_name]["path"])
            try:
                return self.populate_values(const_val, input_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap("constant " + res_name)
                raise

        if res_name in self.aggregates:
            agg_val = self.get_obj(self.aggregates[res_name]["key"])
            try:
                return self.populate_values(agg_val, input_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap("aggregate " + res_name)
                raise

        if res_name in self.columns:
            if input_schema is not None:
                col_type = self.get_inferred_column_type(res_name)
                if col_type not in input_schema["_type"]:
                    raise UserException(
                        "column {}: unsupported input type (expected type {}, got type {})".format(
                            res_name,
                            util.data_type_str(input_schema["_type"]),
                            util.data_type_str(col_type),
                        )
                    )
            if preserve_column_refs:
                return input
            else:
                return res_name

    if util.is_list(input):
        elem_schema = None
        if input_schema is not None:
            if not util.is_list(input_schema["_type"]):
                raise UserException(
                    "unsupported input type (expected type {}, got {})".format(
                        util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                    )
                )
            elem_schema = input_schema["_type"][0]

            min_count = input_schema.get("_min_count")
            if min_count is not None and len(input) < min_count:
                raise UserException(
                    "list has length {}, but the minimum allowed length is {}".format(
                        len(input), min_count
                    )
                )

            max_count = input_schema.get("_max_count")
            if max_count is not None and len(input) > max_count:
                raise UserException(
                    "list has length {}, but the maximum allowed length is {}".format(
                        len(input), max_count
                    )
                )

        casted = []
        for i, elem in enumerate(input):
            try:
                casted.append(self.populate_values(elem, elem_schema, preserve_column_refs))
            except CortexException as e:
                e.wrap("index " + str(i))
                raise
        return casted

    if util.is_dict(input):
        if input_schema is None:
            casted = {}
            for key, val in input.items():
                key_casted = self.populate_values(key, None, preserve_column_refs)
                try:
                    val_casted = self.populate_values(val, None, preserve_column_refs)
                except CortexException as e:
                    e.wrap(util.user_obj_str(key))
                    raise
                casted[key_casted] = val_casted
            return casted

        if not util.is_dict(input_schema["_type"]):
            raise UserException(
                "unsupported input type (expected type {}, got {})".format(
                    util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                )
            )

        min_count = input_schema.get("_min_count")
        if min_count is not None and len(input) < min_count:
            raise UserException(
                "map has length {}, but the minimum allowed length is {}".format(
                    len(input), min_count
                )
            )

        max_count = input_schema.get("_max_count")
        if max_count is not None and len(input) > max_count:
            raise UserException(
                "map has length {}, but the maximum allowed length is {}".format(
                    len(input), max_count
                )
            )

        is_generic_map = False
        if len(input_schema["_type"]) == 1:
            input_type_key = next(iter(input_schema["_type"].keys()))
            if is_compound_type(input_type_key):
                is_generic_map = True
                generic_map_key_schema = input_schema_from_type_schema(input_type_key)
                generic_map_value = input_schema["_type"][input_type_key]

        if is_generic_map:
            casted = {}
            for key, val in input.items():
                key_casted = self.populate_values(
                    key, generic_map_key_schema, preserve_column_refs
                )
                try:
                    val_casted = self.populate_values(
                        val, generic_map_value, preserve_column_refs
                    )
                except CortexException as e:
                    e.wrap(util.user_obj_str(key))
                    raise
                casted[key_casted] = val_casted
            return casted

        # fixed map
        casted = {}
        for key, val_schema in input_schema["_type"].items():
            if key in input:
                val = input[key]
            else:
                if val_schema.get("_optional") is not True:
                    raise UserException("missing key: " + util.user_obj_str(key))
                if val_schema.get("_default") is None:
                    continue
                val = val_schema["_default"]

            try:
                val_casted = self.populate_values(val, val_schema, preserve_column_refs)
            except CortexException as e:
                e.wrap(util.user_obj_str(key))
                raise
            casted[key] = val_casted
        return casted

    if input_schema is None:
        return input
    if not util.is_str(input_schema["_type"]):
        raise UserException(
            "unsupported input type (expected type {}, got {})".format(
                util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
            )
        )
    return cast_compound_type(input, input_schema["_type"])
def get_test_case_tags(conn, ts_id, testtype, rerun=None, type='failed'):
    """
    param: conn: jira connection object
    param: ts_id: test suite id
    """
    globfilepath = os.path.join(os.path.expanduser('~'), "global_conf.yaml")
    globdata = get_data_from_yaml(globfilepath)
    enable_test_mgt = True
    if 'EnableTestManagement' not in globdata or \
            globdata.get('EnableTestManagement', 'no').lower() == "no":
        enable_test_mgt = False

    # fetching test cases from the given master test suite
    if ',' in ts_id:
        test_cases = []
        ts_id = ts_id.split(',')
        for parent in ts_id:
            cases = conn.search_issues('parent=' + parent, startAt=0, maxResults=100)
            cases_b = conn.search_issues('parent=' + parent, startAt=100, maxResults=100)
            cases.extend(cases_b)
            test_cases.extend(cases)
    else:
        test_cases = conn.search_issues('parent=' + ts_id, startAt=0, maxResults=100)
        test_cases_b = conn.search_issues('parent=' + ts_id, startAt=100, maxResults=100)
        test_cases.extend(test_cases_b)
    if not test_cases:
        raise Exception("No Testcases found to clone for the given Suite {}".format(ts_id))

    fieldmap = {field['name']: field['id'] for field in conn.fields()}
    os.environ.__dict__['fieldmap'] = {}
    os.environ.__dict__['fieldmap'].update(fieldmap)

    # sutas_id and Automated are custom fields added for the sutas framework,
    # so we don't get these fields directly with the issue object.
    sutas_id_field = fieldmap['sutas_id']
    try:
        auto_field = fieldmap['Automated?']
    except KeyError:
        auto_field = fieldmap['Automated']

    not_planned = '\n Below mentioned test cases are in "not planned" ' \
                  'state, hence not running them.\n'
    not_automated = "\n Below mentioned test cases have the Automated field " \
                    "set to 'No', hence not running them.\n"
    np_summ = []
    na_summ = []
    np_id = []
    na_id = []
    tc_list = []
    tc_data = {}
    to_be_cloned_test_cases = []
    np_slack_str = ''
    na_slack_str = ''

    # checking whether the Automated and sutas_id fields exist
    if auto_field and sutas_id_field:
        # test cases marked as not planned, or which are not automatable, are ignored
        for test_case in test_cases:
            if test_case.fields().status.name.lower() == 'not planned':
                np_summ.append(test_case.fields().summary.encode('utf-8'))
                np_id.append(test_case.key)
            elif test_case.raw['fields'][auto_field] == None:
                na_summ.append(test_case.fields().summary.encode('utf-8'))
                na_id.append(test_case.key)
            elif test_case.fields().status.name.lower() != 'not planned' and \
                    test_case.raw['fields'][auto_field][0]['value'].lower() == 'yes':
                if test_case.raw['fields'][sutas_id_field]:
                    tc_list.append(test_case.raw['fields'][sutas_id_field])
                    to_be_cloned_test_cases.append(test_case)

        # sending the not planned and not automatable test case lists as notifications through slack
        np_slack_str = TestManagement._format_slack_str(test_case, np_summ, np_id)
        na_slack_str = TestManagement._format_slack_str(test_case, na_summ, na_id)
        if np_slack_str:
            notify.message(not_planned)
            notify.message(np_slack_str)
        if na_slack_str:
            logger.info(not_automated)
            logger.info(na_slack_str)
            #notify.message(not_automated)
            #notify.message(na_slack_str)
        if tc_list:
            logger.info(
                "Testcases which are automated and updated with sutas_id in jira {}".format(
                    tc_list))

        # filter test cases based on the configuration provided by the user in the
        # user_configuration file for test_type and sprint_no
        robosuitepath = os.environ['robosuitepath']
        suite = TestData(parent=None, source=robosuitepath)
        robotctagids = []
        robotcnames = []
        for testcase in suite.testcase_table:
            robotcnames.append(testcase.name.lower())
            robotctagids.append(testcase.tags.value)
        robotcdict = dict(list(zip(robotcnames, robotctagids)))

        if '-t' in sys.argv:
            to_be_cloned_test_cases = []
            tcnames = os.environ['testnames'].lower().split(',')
            for tc in tcnames:
                if tc in robotcnames:
                    for jira_tc in tc_list:
                        if jira_tc in robotcdict[tc]:
                            tcid = jira_tc
                            to_be_cloned_test_cases.append(conn.issue(tcid))
                else:
                    raise Exception(
                        "Robot file doesn't contain the mentioned tc {}".format(tc))
        else:
            to_be_cloned_test_cases = TestManagement.filter_tcs(
                conn, to_be_cloned_test_cases, testtype)
            if not to_be_cloned_test_cases:
                raise Exception(
                    "Not found any testcase to clone for the given suite "
                    "with the given configuration")

        final_tcs = []
        for test_case in to_be_cloned_test_cases:
            if not rerun:
                for robotctagid in robotctagids:
                    if robotctagid:
                        if test_case.raw['fields'][sutas_id_field] in robotctagid:
                            final_tcs.append(test_case.raw['fields'][sutas_id_field])
                            continue
            else:
                if test_case.fields().status.name.lower() == 'failed' and type == 'failed':
                    final_tcs.append(test_case.raw['fields'][sutas_id_field])
                elif test_case.fields().status.name.lower() == 'skipped' and type == 'skipped':
                    final_tcs.append(test_case.raw['fields'][sutas_id_field])
        tc_list = list(set(final_tcs))
        tags = 'OR'.join(tc_list)
        logger.info(tags)
    else:
        raise UserException("Make sure the fields 'sutas_id' and "
                            "'Automated' are present in every test case")

    if not tags:
        raise UserException("No test cases found in provided test suite : ", ts_id)

    issue_obj = None
    if isinstance(ts_id, list):
        os.environ['multiplesuiteids'] = ','.join(ts_id)
        iss_obj = conn.issue(ts_id[-1])
    else:
        iss_obj = conn.issue(ts_id)

    # checking whether test management is enabled in the configuration files
    if enable_test_mgt:
        if iss_obj:
            if not rerun:
                # cloning the test suite and its test cases from the master test suite
                clone_testcases = [conn.issue(tc) for tc in tc_list]
                issue_obj = TestManagement.clone_test_suite(
                    conn, iss_obj, clone_testcases, testtype)
                logger.warn("cloned test suite id : " + issue_obj.key)
                notify.message("cloned test suite id : " + issue_obj.key)
            else:
                issue_obj = iss_obj
                logger.warn("Rerunning test suite of id : " + issue_obj.key)
                notify.message("Rerunning test suite of id : " + issue_obj.key)
            ts_id = issue_obj.id

            # fetching test cases from the cloned test suite
            test_cases = conn.search_issues('parent=' + ts_id)
            for test_case in test_cases:
                tc_data[test_case.raw['fields'][sutas_id_field]] = \
                    (test_case.key, test_case.fields().summary)

            if issue_obj.fields().status.name.upper() in ['TODO', 'TO DO'] and not rerun:
                for test_case in test_cases:
                    if test_case.raw['fields'][sutas_id_field]:
                        status = test_case.fields().status.name.lower()
                        if status in ['test in progress', 'running',
                                      'passed', 'failed', 'skipped', 'blocked']:
                            raise Exception(
                                "Test case already executed. Before executing a test case, "
                                "make sure its status is in 'todo' or 'ready to run' state.")
                    else:
                        raise Exception(
                            "Make sure the 'sutas_id' field is updated with the test case ID")
                transitions = conn.transitions(ts_id)
                # moving the test suite to the running state
                for transition in transitions:
                    if transition['name'].lower() in ['run', 'suite in progress']:
                        conn.transition_issue(ts_id, str(transition['id']))
            elif issue_obj.fields().status.name.lower() in ['running', 'suite in progress']:
                if not rerun:
                    raise Exception(
                        "Test suite won't run because it is in Running state. "
                        "If you want to run the test suite, clone the master "
                        "suite and provide the cloned test suite id")
            elif issue_obj.fields().status.name.lower() in ['done', 'completed']:
                if not rerun:
                    raise Exception(
                        "Test suite won't run because it is in Completed state. "
                        "If you want to run the test suite, clone the master suite "
                        "and provide the cloned test suite id")
            os.environ.__dict__["testids"] = {}
            os.environ.__dict__["testids"].update(tc_data)
        else:
            raise Exception("No test suite found with provided test suite id :", ts_id)
    else:
        issue_obj = iss_obj
    return tags, issue_obj