コード例 #1
0
def command(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_parameters):
    """Compute metrics on a sub-population of the test dataframe and save them.

    The sub-population is made of the rows of ``model_handler.get_test_df()``
    whose "column" computation parameter equals the "value" computation
    parameter. The resulting metrics are dumped to ``subpop.json`` inside
    ``model_folder``.

    ``job_id`` is accepted but not used by this function.

    Returns:
        The string "ok" once the metrics have been written.

    Raises:
        NotImplementedError: if the selected column is not of type
            "CATEGORY", or if the prediction type is not
            ``constants.BINARY_CLASSIFICATION``.
    """
    # Load the model information and the dataframe to filter.
    handler = PredictionModelInformationHandler(split_desc, core_params, preprocessing_folder, model_folder)
    test_rows = handler.get_test_df()

    # Build the sub-population; only categorical columns are handled.
    column = get_computation_parameter("column", computation_parameters)
    if get_type_of_column(column, handler) != "CATEGORY":
        raise NotImplementedError("Not implemented yet :-(")
    wanted = get_computation_parameter("value", computation_parameters)
    subpopulation = test_rows[test_rows[column] == wanted]

    # Compute the metrics; only binary classification is handled.
    if handler.get_prediction_type() != constants.BINARY_CLASSIFICATION:
        raise NotImplementedError("Not implemented yet :-(")
    metrics = compute_binary_subpopulation_metrics(subpopulation, handler)

    output_path = osp.join(model_folder, "subpop.json")
    dkujson.dump_to_filepath(output_path, metrics)

    return "ok"
コード例 #2
0
def compute(job_id, split_desc, core_params, preprocessing_folder,
            model_folder, computation_params):
    """Compute and save a partial dependency result for each requested feature.

    ``computation_params`` must contain the key "features_to_compute", an
    iterable of feature names. For every name a ``PartialDependencyFeature``
    is built, passed to ``PartialDependencyComputer.compute`` and the result
    is handed to ``PartialDependenciesSaver.save``. Progress is reported after
    each feature as a percentage of the features processed so far.

    The dataframe used is ``model_handler.get_full_df()`` when the model
    handler reports k-folding, and ``model_handler.get_test_df()`` otherwise.

    Raises:
        Exception: if ``computation_params`` is None or lacks the
            "features_to_compute" key.
    """
    has_feature_list = (computation_params is not None
                        and "features_to_compute" in computation_params)
    if not has_feature_list:
        raise Exception(
            "'computation_params' should contains a key 'features_to_compute'")

    handler = PredictionModelInformationHandler(
        split_desc, core_params, preprocessing_folder, model_folder)
    feature_names = computation_params["features_to_compute"]
    total = len(feature_names)

    data = handler.get_full_df() if handler.is_kfolding() else handler.get_test_df()

    progress = PartialDependenciesProgress(job_id, total)
    saver = PartialDependenciesSaver(model_folder, split_desc["schema"])
    computer = PartialDependencyComputer(
        data,
        handler.get_prediction_type(),
        handler.predict,
        progress,
        handler.get_sample_weight_variable(),
    )

    for position, name in enumerate(feature_names, start=1):
        per_feature_settings = handler.get_per_feature_col(name)
        drop_missing = per_feature_settings.get("missing_handling") == "DROP_ROW"
        kind = handler.get_type_of_column(name)

        if kind == 'CATEGORY':
            # NaN is swapped for a placeholder value because neither a scale
            # nor a distribution can be computed with NaN present.
            values = data[name].fillna(constants.FILL_NA_VALUE).values
            dummified = handler.is_column_dummified(name)
            possible_values = handler.category_possible_values(name)
        else:
            values = data[name].values
            dummified = False
            possible_values = None

        feature = PartialDependencyFeature(
            kind, values, name, dummified, possible_values, drop_missing)
        saver.save(computer.compute(feature))
        progress.set_percentage(position * 100 / total, no_fail=False)