def command(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_parameters):
    """Compute model metrics restricted to a subpopulation of the test set.

    The subpopulation is defined by ``computation_parameters`` ("column" and,
    for CATEGORY columns, "value"). Results are written to ``subpop.json``
    inside ``model_folder``.

    Returns:
        "ok" on success.

    Raises:
        NotImplementedError: for non-CATEGORY columns, or for prediction
            types other than binary classification.
    """
    # Load model information and the scored test dataframe.
    model_handler = PredictionModelInformationHandler(
        split_desc, core_params, preprocessing_folder, model_folder)
    test_df = model_handler.get_test_df()

    # Restrict the test set to the requested subpopulation.
    column = get_computation_parameter("column", computation_parameters)
    if get_type_of_column(column, model_handler) != "CATEGORY":
        raise NotImplementedError("Not implemented yet :-(")
    target_value = get_computation_parameter("value", computation_parameters)
    subpop_df = test_df[test_df[column] == target_value]

    # Recompute metrics on the subpopulation (binary classification only).
    if model_handler.get_prediction_type() != constants.BINARY_CLASSIFICATION:
        raise NotImplementedError("Not implemented yet :-(")
    results = compute_binary_subpopulation_metrics(subpop_df, model_handler)

    dkujson.dump_to_filepath(osp.join(model_folder, "subpop.json"), results)
    return "ok"
def compute(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_params):
    """Compute partial-dependence data for the requested features and save it.

    Args:
        job_id: identifier used to report computation progress.
        split_desc: dataset split description; its "schema" entry is handed
            to the results saver.
        core_params: core model parameters forwarded to the model handler.
        preprocessing_folder: folder holding the preprocessing artifacts.
        model_folder: folder holding the trained model; results are saved there.
        computation_params: dict that must contain "features_to_compute",
            the list of feature names to process.

    Raises:
        ValueError: if ``computation_params`` is missing or lacks the
            "features_to_compute" key. (ValueError subclasses Exception, so
            callers catching the previous generic Exception still match.)
    """
    if computation_params is None or "features_to_compute" not in computation_params:
        # Fixed grammar of the original message ("should contains").
        raise ValueError(
            "'computation_params' should contain a key 'features_to_compute'")
    model_handler = PredictionModelInformationHandler(
        split_desc, core_params, preprocessing_folder, model_folder)
    features_to_compute = computation_params["features_to_compute"]

    # With k-folding the whole dataset was scored; otherwise only the test split.
    if model_handler.is_kfolding():
        df = model_handler.get_full_df()
    else:
        df = model_handler.get_test_df()

    # Hoisted: total feature count is loop-invariant.
    n_features = len(features_to_compute)
    progress = PartialDependenciesProgress(job_id, n_features)
    saver = PartialDependenciesSaver(model_folder, split_desc["schema"])
    computer = PartialDependencyComputer(
        df, model_handler.get_prediction_type(), model_handler.predict, progress,
        model_handler.get_sample_weight_variable())

    for index, feature_name in enumerate(features_to_compute):
        drop_missing = model_handler.get_per_feature_col(feature_name).get(
            "missing_handling") == "DROP_ROW"
        feature_type = model_handler.get_type_of_column(feature_name)
        is_dummified = False
        category_possible_value = None
        if feature_type == 'CATEGORY':
            # nan values are replaced by a fake one because neither a scale
            # nor a distribution can be computed with nan
            feature_values = df[feature_name].fillna(
                constants.FILL_NA_VALUE).values
            is_dummified = model_handler.is_column_dummified(feature_name)
            category_possible_value = model_handler.category_possible_values(
                feature_name)
        else:
            feature_values = df[feature_name].values
        pd_feature = PartialDependencyFeature(
            feature_type, feature_values, feature_name, is_dummified,
            category_possible_value, drop_missing)
        result = computer.compute(pd_feature)
        saver.save(result)
        progress.set_percentage((index + 1) * 100 / n_features, no_fail=False)