Code example #1

# Imports assumed for this snippet to run; create_logger and close_logger
# are helpers from the surrounding project.
from logging import Logger
from typing import Dict, List, Tuple

import distributed
from dask.delayed import Delayed
from distributed import Client, Future
def compute_delayed_functions(list_of_computations: List[Tuple[Delayed, Dict]],
                              client: Client, nb_of_retries_if_erred: int,
                              error_logger_name: str,
                              error_logger_file_name: str) -> None:
    print("start compute")
    print(list_of_computations)

    list_of_delayed_function_calls = [
        computation[0] for computation in list_of_computations
    ]

    list_of_futures: List[Future] = client.compute(
        list_of_delayed_function_calls, retries=nb_of_retries_if_erred)
    distributed.wait(list_of_futures)
    print("end compute")

    error_logger: Logger = create_logger(logger_name=error_logger_name,
                                         log_file_name=error_logger_file_name)
    future: Future
    for future, (_delayed, func_args) in zip(list_of_futures,
                                             list_of_computations):
        if future.status == 'error':
            exception = future.exception()
            error_logger.error(f"{exception.__class__}: {exception}\n"
                               f"\tfor arguments {func_args}")
    close_logger(error_logger)
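Code example #1 expects a list of (Delayed, kwargs) pairs. The usage sketch below is illustrative only: `train_fold` is a hypothetical unit of work and the cluster settings are made up; only `compute_delayed_functions`, `dask.delayed`, and `distributed.Client` come from the example above or from Dask itself.

# Hypothetical driver for compute_delayed_functions (train_fold is made up).
import dask
from distributed import Client

@dask.delayed
def train_fold(dataset_name: str, fold_i: int) -> None:
    ...  # one experiment run

if __name__ == '__main__':
    client = Client(n_workers=4)  # local cluster
    computations = [
        (train_fold(name, i), dict(dataset_name=name, fold_i=i))
        for name in ('iris', 'wine') for i in range(10)
    ]
    compute_delayed_functions(computations, client,
                              nb_of_retries_if_erred=2,
                              error_logger_name='dask_error_logger',
                              error_logger_file_name='dask_errors.log')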
Code example #2
def learn_tree_based_greedy_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):

    logger = create_logger(
        logger_name=f'learn_greedy_model_{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(greedy_models_tree_based_dir(),
                                   f'{dataset_name}{fold_i}_greedy_model_induction_tree_derived_rules.log')
    )
    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    df_original_train = pd.read_csv(get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
                                    delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(dataset_name,
                                                                                fold_i,
                                                                                classifier_indicator,
                                                                                nb_of_trees_per_model,
                                                                                nb_of_original_targets_to_predict,
                                                                                min_support,
                                                                                max_depth)
    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)

    mids_rules: Set[MIDSRule] = {MIDSRule(mcar) for mcar in mcars}

    logger.info(f"ground set size (nb of initial MCARs): {len(mids_rules)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------

    greedy_clf = GreedyRoundRobinTargetRuleClassifier(df_original_train.columns, verbose=False)
    selected_set, selected_set_scores = greedy_clf.fit(ground_set=mids_rules, training_data=df_original_train)

    logger.info(f"Selected {len(selected_set)} out of {len(mcars)} rules "
                f"({(len(selected_set) / len(mcars) *100):.2f}%)")

    logger.info("start saving Naive greedy model")
    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_greedy_naive_classifier(tree_based_greedy_clf_abs_file_name, greedy_clf)
    logger.info(f"finished saving greedy clf to file: {tree_based_greedy_clf_abs_file_name}")
    close_logger(logger)
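The classifier above selects rules greedily from a ground set. As a reference point, here is a minimal, generic sketch of greedy forward selection, assuming a hypothetical set-scoring function `score`; the project's GreedyRoundRobinTargetRuleClassifier layers round-robin treatment of target attributes on top of this idea.

# Generic greedy forward selection over a ground set of rules.
# `score` is a hypothetical set function; higher is better.
from typing import Callable, Set, TypeVar

Rule = TypeVar('Rule')

def greedy_select(ground_set: Set[Rule],
                  score: Callable[[Set[Rule]], float],
                  max_rules: int) -> Set[Rule]:
    selected: Set[Rule] = set()
    while len(selected) < max_rules:
        candidates = ground_set - selected
        if not candidates:
            break
        # pick the rule whose addition improves the score the most
        best_rule = max(candidates, key=lambda r: score(selected | {r}))
        if score(selected | {best_rule}) <= score(selected):
            break  # no remaining rule improves the current set
        selected.add(best_rule)
    return selected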
Code example #3
def mine_cars_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        min_support: float,
        min_confidence: float,
        max_length: int,
):
    """
    1. load the required training data of the dataset fold.
    2. make sure the target attribute is the last attribute
    3. mine rules using the parameters settings
        --> check the number of rules!
    4. save the rules to file
    :return:
    """

    relative_name: str = f'{dataset_name}{fold_i}_{target_attribute}_{min_confidence}'

    logger = create_logger(
        logger_name='mine_filtered_single_target_cars_' + relative_name,
        log_file_name=os.path.join(assoc_vs_tree_based_single_target_car_dir(),
                                   f'{relative_name}_single_target_filtered_car_mining.log')
    )

    # logger.info(f"rule_cutoff={rule_cutoff}")

    # # load the required training data of the dataset fold.
    # original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
    #   dataset_name, fold_i, TrainTestEnum.train)
    # df_train_original_column_order = pd.read_csv(original_train_data_fold_abs_file_name, delimiter=',')
    # # 2. make sure the target attribute is the last attribute
    # df_train_reordered = reorder_columns(df_train_original_column_order, target_attribute)
    #
    # # REMOVE INSTANCES WITH NAN AS TARGET VALUE:
    # df_train_reordered = remove_instances_with_nans_in_column(df_train_reordered, target_attribute)
    df_train_reordered = prepare_arc_data(dataset_name, fold_i, target_attribute, TrainTestEnum.train)

    logger.info(f"start mining CARs for " + relative_name)

    filtered_st_mcars: List[MCAR]
    timings_dict: Dict[str, float]
    filtered_st_mcars, timings_dict = mine_single_target_MCARs_mlext(df_train_reordered,
                                                                     target_attribute=target_attribute,
                                                                     min_support=min_support,
                                                                     min_confidence=min_confidence,
                                                                     max_length=max_length)

    logger.info(f"finished mining CARs for {dataset_name} {fold_i}_{min_support}supp_{min_confidence}conf")
    logger.info(
        f"found {len(filtered_st_mcars)} CARs for {dataset_name} {fold_i}_{min_support}supp_{min_confidence}conf")

    filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
        dataset_name=dataset_name, fold_i=fold_i, target_attribute=target_attribute,
        confidence_boundary_val=min_confidence
    )
    store_mcars(filtered_st_mcars_abs_file_name, filtered_st_mcars)
    logger.info(f"finished writing CARs to file: {filtered_st_mcars_abs_file_name}")
    filtered_st_mcars_mining_timings_abs_file_name = get_single_target_filtered_cars_mining_timings_abs_filename(
        dataset_name=dataset_name, fold_i=fold_i, target_attribute=target_attribute,
        confidence_boundary_val=min_confidence
    )
    store_timings_dict(filtered_st_mcars_mining_timings_abs_file_name, timings_dict)

    close_logger(logger)
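The two thresholds passed to the miner follow the standard definitions: support is the fraction of rows matching both sides of a rule, and confidence is that count relative to the rows matching the antecedent. A toy computation on a made-up discretized DataFrame:

# Support and confidence of the candidate rule 'outlook=sunny -> play=no'.
import pandas as pd

df = pd.DataFrame({'outlook': ['sunny', 'sunny', 'rain', 'overcast'],
                   'play':    ['no',    'no',    'yes',  'yes']})
antecedent = df['outlook'] == 'sunny'
consequent = df['play'] == 'no'

support = (antecedent & consequent).mean()                       # 2/4 = 0.50
confidence = (antecedent & consequent).sum() / antecedent.sum()  # 2/2 = 1.00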
Code example #4
def evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    logger = create_logger(
        logger_name='evaluate_mids_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(
        f"start loading MIDS model from {tree_based_mids_classifier_abs_file_name}"
    )
    mids_classifier: MIDSClassifier = load_mids_classifier(
        tree_based_mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)
    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    mids_classifier.rule_combination_strategy = RuleCombiningStrategy.WEIGHTED_VOTE
    mids_classifier.rule_combinator = WeightedVotingRuleCombinator()

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
            reconstructed_mids,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")
    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    # mids_target_attr_to_score_info_abs_file_name: str = get_mids_target_attr_to_score_info_abs_file_name(
    #     dataset_name, fold_i)

    tree_based_mids_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_mids_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )

    store_mids_target_attr_to_score_info(
        tree_based_mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {tree_based_mids_target_attr_to_score_info_abs_file_name}"
    )

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules), df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    # mids_interpret_stats_abs_file_name: str = get_mids_interpret_stats_abs_file_name(
    #     dataset_name, fold_i)
    tree_based_mids_interpret_stats_abs_file_name: str = get_tree_based_mids_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(tree_based_mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(
        f"Wrote MIDSInterpretabilityStatistics to {tree_based_mids_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
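score_MIDS_on_its_targets_without_nans is project code, but the filter_nans=True behaviour it implements is simple: per target attribute, test rows with a missing target value are dropped before scoring. A minimal sketch of that step, with a generic `predict` callable standing in for the MIDS model:

# Per-target accuracy after dropping rows with a missing target value.
from typing import Callable

import pandas as pd
from sklearn.metrics import accuracy_score

def accuracy_without_nans(df_test: pd.DataFrame, target_attr: str,
                          predict: Callable[[pd.DataFrame], pd.Series]) -> float:
    df_kept = df_test.dropna(subset=[target_attr])  # the filter_nans step
    y_true = df_kept[target_attr]
    y_pred = predict(df_kept.drop(columns=[target_attr]))
    return accuracy_score(y_true, y_pred)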
Code example #5
def learn_single_target_tree_mids_model_for_dataset_fold(
        dataset_name: str, fold_i: int, target_attribute: str,
        nb_of_trees_per_model: int, min_support: float, max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth,
    )
    log_file_dir: str = get_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name='learn_single_target_tree_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_tree_mids.log'))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------

    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    logger.info(
        f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        targets_to_use=[target_attribute],
        use_targets_from_rule_set=False,
        class_association_rules=mcars,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_classifier(tree_based_mids_classifier_abs_file_name,
                          mids_classifier)
    logger.info(
        f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}"
    )
    close_logger(logger)
Code example #6
def one_hot_encode_dataset_fold(dataset_name: str, fold_i: int,
                                ohe_prefix_separator: str) -> None:
    """
    One-hot encodes each of the Arch-bench fold train-test splits.

    """
    logger = create_logger(
        logger_name=f'one_hot_encode{dataset_name}{fold_i}',
        log_file_name=os.path.join(
            get_one_hot_encoded_fold_data_dir(train_test=None),
            f"{dataset_name}{fold_i}.log"))
    drop_first = False

    # === For fold i ====

    # --- Read in the original train and test data from archbench -----------------------------------------------------

    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    logger.info(
        f"Loading train fold: {original_train_data_fold_abs_file_name}")
    logger.info(f"Loading test fold: {original_test_data_fold_abs_file_name}")

    original_train_df = pd.read_csv(original_train_data_fold_abs_file_name,
                                    delimiter=',')
    original_test_df = pd.read_csv(original_test_data_fold_abs_file_name,
                                   delimiter=',')

    # --- Set each column to 'object' ----------------------------------------------------------------------------------
    original_train_df = convert_to_categorical(original_train_df, dataset_name,
                                               fold_i)
    original_test_df = convert_to_categorical(original_test_df, dataset_name,
                                              fold_i)

    # --- Concatenate the train and test data for the current fold ----------------------------------------------------
    nb_of_train_examples = len(original_train_df)
    nb_of_test_examples = len(original_test_df)

    logger.info(
        f"Start concatenating train & test folds for {dataset_name}{fold_i}")
    original_concat_df = pd.concat([original_train_df, original_test_df],
                                   axis=0)
    if len(original_concat_df) != nb_of_train_examples + nb_of_test_examples:
        raise Exception(f"unexpected length: concatenated fold has {len(original_concat_df)} rows,"
                        f" expected {nb_of_train_examples + nb_of_test_examples}")

    # --- Write out the full discretized dataset of this fold to file for inspection purposes -------------------------
    original_full_data_abs_file_name = get_original_full_data_abs_file_name(
        dataset_name, fold_i)
    logger.info(
        f"Writing out UN-encoded full dataset for {dataset_name}{fold_i}: {original_full_data_abs_file_name}"
    )
    original_concat_df.to_csv(original_full_data_abs_file_name, index=False)

    # --- One-hot encoded the full data -------------------------------------------------------------------------------
    logger.info(f"Start one hot encoding {dataset_name}{fold_i}")
    one_hot_encoded_concat_df = pd.get_dummies(original_concat_df,
                                               prefix_sep=ohe_prefix_separator,
                                               drop_first=drop_first)

    one_hot_encoded_full_data_abs_file_name = get_one_hot_encoded_full_data_abs_file_name(
        dataset_name, fold_i)

    # --- Write out the one-hot encoded full data ---------------------------------------------------------------------
    logger.info(
        f"Writing out one hot encoded full dataset for {dataset_name}{fold_i}:"
        f" {one_hot_encoded_full_data_abs_file_name}")
    one_hot_encoded_concat_df.to_csv(one_hot_encoded_full_data_abs_file_name,
                                     index=False)

    # --- Create the EncodingBookKeeper and write it to file ----------------------------------------------------------
    encoding_book_keeper: EncodingBookKeeper = \
        EncodingBookKeeper.build_encoding_book_keeper_from_ohe_columns(
            one_hot_encoded_concat_df.columns,
            ohe_prefix_separator=ohe_prefix_separator)
    logger.info(
        f"Creating one hot encoding book keeper for {dataset_name}{fold_i}")
    encoding_book_keeper_abs_file_name = get_encodings_book_keeper_abs_file_name_for(
        dataset_name, fold_i)
    logger.info(
        f"Saving one hot encoding book keeper for {dataset_name}{fold_i}: {encoding_book_keeper_abs_file_name}"
    )
    store_encoding_book_keeper(encoding_book_keeper_abs_file_name,
                               encoding_book_keeper)

    # -- Split the full one-hot encoded dataset back into train and test ----------------------------------------------
    one_hot_encoded_train_df = one_hot_encoded_concat_df[:nb_of_train_examples]
    one_hot_encoded_test_df = one_hot_encoded_concat_df[nb_of_train_examples:]

    if len(one_hot_encoded_train_df) != nb_of_train_examples:
        raise Exception(f"unexpected train length: {len(one_hot_encoded_train_df)},"
                        f" expected {nb_of_train_examples}")
    if len(one_hot_encoded_test_df) != nb_of_test_examples:
        raise Exception(f"unexpected test length: {len(one_hot_encoded_test_df)},"
                        f" expected {nb_of_test_examples}")

    # -- Write out the one-hot encoded train and test -----------------------------------------------------------------
    one_hot_encoded_train_abs_file_name = get_one_hot_encoded_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    one_hot_encoded_test_abs_file_name = get_one_hot_encoded_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    logger.info(
        f"Saving one hot encoded train fold: {one_hot_encoded_train_abs_file_name}"
    )
    logger.info(
        f"Saving one hot encoded test fold: {one_hot_encoded_test_abs_file_name}"
    )

    one_hot_encoded_train_df.to_csv(one_hot_encoded_train_abs_file_name,
                                    index=False)
    one_hot_encoded_test_df.to_csv(one_hot_encoded_test_abs_file_name,
                                   index=False)
    logger.info("---")
    close_logger(logger)
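The reason the train and test folds are concatenated before pd.get_dummies is to guarantee that both splits end up with exactly the same one-hot columns, even when some category value never occurs in one of them. A toy illustration of the concat-encode-split pattern (all data made up):

# 'green' occurs only in the test split, yet both encoded frames share
# the same columns, so a model fit on one can score the other.
import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue', 'red']})
test = pd.DataFrame({'color': ['green', 'red']})

full = pd.concat([train, test], axis=0)
encoded = pd.get_dummies(full, prefix_sep='=')

encoded_train = encoded[:len(train)]  # columns: color=blue, color=green, color=red
encoded_test = encoded[len(train):]   # identical columns, same order
assert list(encoded_train.columns) == list(encoded_test.columns)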
Code example #7
def learn_and_convert_tree_model_to_rules(
        dataset_name: str, fold_i: int, nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int, nb_grouping_iterations: int,
        min_support: float, max_depth: int, seed: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    logger = create_logger(
        logger_name='mine_multi-target_cars_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=get_tree_derived_rules_logger_abs_file_name(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------
    logger.info(
        f"Start preparing data using {nb_of_original_targets_to_predict} attrs per group"
        f" with {nb_grouping_iterations} grouping iterations")

    different_attr_groupings: List[AttrGroupPartitioning] = get_attr_groupings(
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        nb_grouping_iterations=nb_grouping_iterations,
        encoding_book_keeper=encoding_book_keeper)

    complete_rule_list: List[MCAR] = []

    total_time_random_forest_learning_s = 0.0
    total_time_rf_conversion_s = 0.0
    # prepared_data_list: List[PreparedDataForTargetSet] = []
    for original_target_attribute_partitioning in different_attr_groupings:
        attr_group: AttrGroup
        for attr_group in original_target_attribute_partitioning:
            prepared_data: PreparedDataForTargetSet = get_prepared_data_for_attr_group(
                original_group_to_predict=attr_group,
                df_original=df_original,
                df_one_hot_encoded=df_one_hot_encoded,
                encoding_book_keeper=encoding_book_keeper)
            # prepared_data_list.append(prepared_data)

            start_time_decision_tree_learning_s = time.time()
            classifier: RandomForestClassifier = RandomForestClassifier(
                n_estimators=nb_of_trees_per_model,
                random_state=seed,
                min_samples_leaf=min_support,
                max_depth=max_depth)

            # --- Learn a random forest given the current number of trees -----------------------------------
            classifier.fit(
                prepared_data.df_one_hot_encoded_descriptive_attributes,
                prepared_data.df_one_hot_encoded_target_attributes)
            end_time_decision_tree_learning_s = time.time()
            elapsed_decision_tree_learning_s: float = end_time_decision_tree_learning_s - start_time_decision_tree_learning_s
            total_time_random_forest_learning_s += elapsed_decision_tree_learning_s

            tree_based_rules: List[MCAR]
            partial_time_rf_conversion_s: float
            tree_based_rules, partial_time_rf_conversion_s = convert_random_forest_to_rules(
                random_forest_clf=classifier,
                df_original_without_nans=prepared_data.df_original_without_nans_for_targets,
                descriptive_one_hot_encoded_column_names=prepared_data.descriptive_one_hot_encoded_columns,
                # target_attribute_names=df_original_target_attrs_without_nans.columns,
                target_attribute_names=prepared_data.target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper,
                logger=logger)
            total_time_rf_conversion_s += partial_time_rf_conversion_s
            complete_rule_list.extend(tree_based_rules)

    logger.info(f"Complete set size: {len(complete_rule_list)}")

    # --- Save rules to file ---------------------------------------------------------------------------------

    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(
        f"finished writing tree-derived rules to file: {tree_clf_derived_rules_abs_file_name}"
    )
    logger.info(
        "==================================================================")

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_random_forest_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_tree_derived_rules_gen_timing_info_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    close_logger(logger)
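convert_random_forest_to_rules is project code, but the core idea is standard: every root-to-leaf path of a fitted decision tree is a conjunction of threshold tests, readable directly from sklearn's tree arrays. A simplified sketch of that traversal (the real code additionally maps one-hot column names back to attribute=value literals via the EncodingBookKeeper):

# Enumerate root-to-leaf paths of a fitted sklearn tree as textual rules.
from typing import Iterator, List, Sequence, Tuple

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def iter_tree_paths(clf: DecisionTreeClassifier,
                    feature_names: Sequence[str]
                    ) -> Iterator[Tuple[List[str], np.ndarray]]:
    tree = clf.tree_

    def recurse(node: int, conditions: List[str]):
        left, right = tree.children_left[node], tree.children_right[node]
        if left == right:  # both are -1: the node is a leaf
            yield conditions, tree.value[node]
            return
        name = feature_names[tree.feature[node]]
        threshold = tree.threshold[node]
        yield from recurse(left, conditions + [f"{name} <= {threshold:.3f}"])
        yield from recurse(right, conditions + [f"{name} > {threshold:.3f}"])

    yield from recurse(0, [])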
Code example #8
def learn_and_convert_single_target_tree_ensemble_to_rules(
        dataset_name: str, fold_i: int, target_attribute: str,
        nb_of_trees_per_model: int, min_support: float, max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth,
    )

    logger_dir: str = get_single_target_tree_rule_dir()
    logger = create_logger(
        logger_name='mine_single_target_tree_rules_' + relative_name,
        log_file_name=os.path.join(
            logger_dir, f"{relative_name}_single_tree_rule_generation.log"))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    cover_checker = CoverChecker()

    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(
        f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}"
    )

    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    # --- Fit and save classifier ---------------------------------------------------------------------------------

    start_time_decision_tree_learning_s = time.time()
    classifier: RandomForestClassifier = RandomForestClassifier(
        n_estimators=nb_of_trees_per_model,
        min_samples_leaf=min_support,
        max_depth=max_depth)

    classifier.fit(X=prepared_data.df_one_hot_encoded_descriptive_attributes,
                   y=prepared_data.df_one_hot_encoded_target_attributes)
    end_time_decision_tree_learning_s = time.time()
    total_time_decision_tree_learning_s: float = end_time_decision_tree_learning_s - start_time_decision_tree_learning_s

    logger.info(
        f"Fitted a {classifier_indicator.value} model predicting {original_target_attr_set}"
        f" for {dataset_name}{fold_i}")

    total_time_rf_conversion_s: TimeDiffSec = 0

    complete_rule_list: List[MCAR] = []
    tree_classifiers = classifier.estimators_
    for tree_clf in tree_classifiers:
        list_of_dt_rules: Optional[List[MIDSRule]] = None
        try:
            start_time_clf_conversion_s = time.time()
            list_of_dt_rules = convert_decision_tree_to_mids_rule_list(
                tree_classifier=tree_clf,
                one_hot_encoded_feature_names=prepared_data.descriptive_one_hot_encoded_columns,
                target_attribute_names=prepared_data.target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper)
        except NotImplementedError as err:
            logger.error(str(err))

        if list_of_dt_rules is not None:

            # --- adding support and confidence to rules
            mids_rule: MIDSRule
            for mids_rule in list_of_dt_rules:
                add_support_and_confidence_to_MIDSRule(
                    prepared_data.df_original_without_nans_for_targets,
                    mids_rule,
                    cover_checker=cover_checker)

            # logger.info(f"found {len(list_of_dt_rules)} rules,"
            #             f" updated total rule set size: {len(complete_rule_list)}")
            mids_rules_as_mcars = [
                mids_rule.car for mids_rule in list_of_dt_rules
            ]
            complete_rule_list.extend(mids_rules_as_mcars)

            end_time_clf_conversion_s = time.time()
            total_time_clf_conversion_s = end_time_clf_conversion_s - start_time_clf_conversion_s
            total_time_rf_conversion_s += total_time_clf_conversion_s

    logger.info(f"Complete set size: {len(complete_rule_list)}")

    # log the first few rules for inspection
    for i, rule in enumerate(complete_rule_list[:10]):
        logger.info(f"rule {i}: {rule}")
    # --- Save rules to file ---------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(
        f"finished writing single-target tree rules to file: {tree_clf_derived_rules_abs_file_name}"
    )

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_decision_tree_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    logger.info(
        "==================================================================")
    close_logger(logger)
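A note on the forest hyperparameters used above: scikit-learn interprets a float min_samples_leaf as a fraction of the training samples, so passing min_support as min_samples_leaf makes every leaf (and hence every extracted rule) cover at least that fraction of the training instances. A quick demonstration with made-up data:

# float min_samples_leaf = fraction of samples per leaf (sklearn semantics):
# with 200 samples, min_samples_leaf=0.05 forces >= ceil(0.05 * 200) = 10
# samples into every leaf of every tree.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)

clf = RandomForestClassifier(n_estimators=3, min_samples_leaf=0.05,
                             max_depth=4, random_state=0)
clf.fit(X, y)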
Code example #9
def evaluate_single_target_mids_model_for_dataset_fold(
        dataset_name: str, fold_i: int, logger_name: str,
        logger_file_name: str, mids_classifier_abs_file_name: str,
        mids_target_attr_to_score_info_abs_file_name: str,
        mids_interpret_stats_abs_file_name: str):
    logger = create_logger(logger_name=logger_name,
                           log_file_name=logger_file_name)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(
        f"start loading MIDS model from {mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(
        mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)
    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
            reconstructed_mids,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    store_mids_target_attr_to_score_info(
        mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {mids_target_attr_to_score_info_abs_file_name}"
    )

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules), df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    store_mids_interpret_stats(mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(
        f"Wrote MIDSInterpretabilityStatistics to {mids_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
Code example #10
def create_single_target_tree_based_mcars(
        dataset_name: str, fold_i: int, target_attribute: str,
        classifier_indicator: SingleTargetClassifierIndicator,
        confidence_boundary_val: float, min_support: float, max_depth: int,
        seed: int):
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    logger = create_logger(
        logger_name='create_single_target_tree_rules_' + relative_name,
        log_file_name=os.path.join(
            assoc_vs_tree_based_single_target_car_dir(),
            f'{relative_name}_single_target_tree_rule_generation.log'))

    logger.info(
        f"Start reading MCARS for {dataset_name}{fold_i}_{target_attribute}"
        f" (confidence {confidence_boundary_val})")
    st_mcars_abs_file_name = get_single_target_filtered_cars_abs_filename(
        dataset_name,
        fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary_val)

    filtered_st_mcars: List[MCAR] = load_mcars(st_mcars_abs_file_name)
    logger.info(
        f"Total nb of MCARS for {dataset_name}{fold_i}_{target_attribute}"
        f" (conf {confidence_boundary_val}): {len(filtered_st_mcars)}")

    n_tree_rules_to_generate = len(filtered_st_mcars)
    logger.info(f"Generate {n_tree_rules_to_generate} tree based rules")

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")
    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------

    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(
        f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}"
    )

    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    random_forest_abs_file_name: str = get_single_target_random_forest_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    # --- Generate the required nb of tree-based rules ----------------------------------------------------------------
    logger.info(f"Start generating tree-based rules")
    tree_based_mcars: List[MCAR]
    tree_rule_gen_timing_info: TreeRuleGenTimingInfo
    tree_based_mcars, tree_rule_gen_timing_info = generate_n_single_target_tree_rules(
        n_tree_rules_to_generate=n_tree_rules_to_generate,
        prepared_data=prepared_data,
        encoding_book_keeper=encoding_book_keeper,
        min_support=min_support,
        max_depth=max_depth,
        logger=logger,
        seed=seed,
        random_forest_abs_file_name=random_forest_abs_file_name)

    # --- SAVE the generated tree-based rules
    tree_based_rules_abs_file_name: str = get_single_target_tree_rules_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_mcars(tree_based_rules_abs_file_name, tree_based_mcars)
    logger.info(
        f"finished writing tree-derived rules to file: {tree_based_rules_abs_file_name}"
    )

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    logger.info(
        "==================================================================")
    close_logger(logger)
Code example #11
def learn_tree_based_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):

    logger = create_logger(
        logger_name=f'learn_mids_model{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(get_tree_based_mids_dir(),
                                   f'{dataset_name}{fold_i}_model_induction_tree_derived_rules.log')
    )
    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    df_original_train = pd.read_csv(get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
                                    delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(dataset_name,
                                                                                fold_i,
                                                                                classifier_indicator,
                                                                                nb_of_trees_per_model,
                                                                                nb_of_original_targets_to_predict,
                                                                                min_support,
                                                                                max_depth)
    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")

    mids.fit(df_original_train,
             class_association_rules=mcars, debug=debug_mids_fitting, algorithm=algorithm,
             # lambda_array=lambda_array
             use_targets_from_rule_set=False,
             )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)
    logger.info(f"Selected {len(mids_classifier.rules)} out of {len(mcars)} rules "
                f"({(len(mids_classifier.rules) / len(mcars) *100):.2f}%)")

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_mids_classifier(tree_based_mids_classifier_abs_file_name, mids_classifier)
    logger.info(f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}")
    close_logger(logger)
Code example #12
def evaluate_greedy_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    logger = create_logger(
        logger_name='evaluate_greedy_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_greedy_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------

    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    logger.info(
        f"start loading greedy model from {tree_based_greedy_clf_abs_file_name}"
    )
    greedy_classifier: GreedyRoundRobinTargetRuleClassifier = load_greedy_naive_classifier(
        tree_based_greedy_clf_abs_file_name)
    logger.info("finished loading greedy model")
    logger.info(greedy_classifier)

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_mt_clf_on_its_targets_without_nans(
            greedy_classifier,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated greedy classifier on predictive performance")
    target_attrs: List[TargetAttr] = greedy_classifier.target_attributes
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    tree_based_greedy_clf_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_greedy_clf_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
    store_mids_target_attr_to_score_info(
        tree_based_greedy_clf_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote greedy Dict[TargetAttr, ScoreInfo] to"
        f" {tree_based_greedy_clf_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(greedy_classifier.learned_rule_set),
            df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated greedy classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_greedy_clf_interpret_stats_abs_file_name: str = get_tree_based_greedy_clf_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(
        tree_based_greedy_clf_interpret_stats_abs_file_name, interpret_stats)
    logger.info(
        f"Wrote InterpretabilityStatistics to {tree_based_greedy_clf_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
Code example #13
def learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
        dataset_name: str, fold_i: int, target_attribute: str,
        confidence_boundary: float):

    relative_name: str = get_single_target_filtered_car_mids_relative_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)

    log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name='learn_single_target_filtered_car_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_filtered_car_mids.log'
        ))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------

    filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)
    logger.info(
        f"Reading single-target CARs from file: {filtered_st_mcars_abs_file_name}"
    )
    st_mcar_list: List[MCAR] = load_mcars(filtered_st_mcars_abs_file_name)

    ground_set_size: int = len(st_mcar_list)
    if ground_set_size <= 0:
        raise Exception(
            f"Ground set size is {ground_set_size} for {dataset_name}{fold_i} {target_attribute}"
        )
    logger.info(f"ground set size (nb of initial MCARs): {len(st_mcar_list)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        use_targets_from_rule_set=True,
        class_association_rules=st_mcar_list,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")

    mids_classifier_abs_file_name: str = get_single_target_filtered_car_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)
    store_mids_classifier(mids_classifier_abs_file_name, mids_classifier)
    logger.info(
        f"finished saving MIDS model to file: {mids_classifier_abs_file_name}")
    close_logger(logger)
Code example #14
def merge_single_target_mids_models_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        nb_of_trees_to_use: int,
        min_support: float,
        max_depth: int
):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_merged_single_target_tree_mids_relative_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth,
    )
    log_file_dir: str = get_merged_single_target_mids_clf_dir()

    logger_name: str = 'merge_single_target_mids_models__' + relative_name
    logger_file_name: str = os.path.join(
        log_file_dir,
        f'{relative_name}_model_merging_single_target_tree_mids.log'
    )

    logger = create_logger(
        logger_name=logger_name,
        log_file_name=logger_file_name
    )

    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                  TrainTestEnum.train)

    target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)

    merged_st_clf = MergedSTMIDSClassifier()

    for target_attribute in target_columns:
        st_mids_classifier_abs_file_name: str = get_single_target_tree_mids_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            target_attribute=target_attribute,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_to_use,
            min_support=min_support, max_depth=max_depth
        )

        # --- load single target classifier ---------------------------------------------------------------------------
        logger.info(f"start loading MIDS model from {st_mids_classifier_abs_file_name}")
        st_mids_classifier: MIDSClassifier = load_mids_classifier(st_mids_classifier_abs_file_name)
        logger.info("finished loading MIDS model")
        logger.info(st_mids_classifier)
        merged_st_clf.add_single_target_model(st_mids_classifier)

        st_tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
            dataset_name, fold_i, target_attribute,
            classifier_indicator, nb_of_trees_to_use, min_support, max_depth
        )
        st_tree_rule_gen_timing_info: TreeRuleGenTimingInfo = load_tree_rule_gen_timing_info(
            st_tree_rule_gen_timing_info_abs_file_name)

        st_total_time_decision_tree_learning_s = st_tree_rule_gen_timing_info.total_time_decision_tree_learning_s
        st_total_time_rf_conversion_s = st_tree_rule_gen_timing_info.total_time_rf_conversion_s

        st_total_rule_gen_time_s: float = st_total_time_decision_tree_learning_s + st_total_time_rf_conversion_s
        merged_st_clf.add_rule_generation_time(st_total_rule_gen_time_s)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                 TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(original_test_data_fold_abs_file_name,
                                                delimiter=',')


    # --- Evaluate and store predictive performance  ------------------------------------------------------------------
    filter_nans: bool = True
    merged_st_clf.calculate_score_info(test_dataframe=df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MERGED MIDS classifier on predictive performance")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    merged_st_clf.calculate_ruleset_interpretability_statistics(
        test_dataframe=df_test_original_column_order, target_attributes=target_columns)
    logger.info("Evaluated MIDS classifier on interpretability")

    # --- store merged classifier ------------------------------------------------------------------------------------
    logger.info("start saving merged single target MIDS model")
    merged_st_clf_abs_file_name: str = get_merged_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth
    )
    store_merged_st_mids_model(merged_st_clf_abs_file_name, merged_st_clf)
    logger.info(f"finished saving merged single target MIDS model to file: {merged_st_clf_abs_file_name}")
    logger.info("---")

    close_logger(logger)
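Taken together, these examples form one pipeline per dataset fold. The sketch below shows a plausible calling order using only functions defined in this section; every parameter value is illustrative, not taken from the original experiments.

# Illustrative end-to-end order for one fold (parameter values made up).
dataset_name, fold_i = 'iris', 0

# 1. one-hot encode the fold (code example #6)
one_hot_encode_dataset_fold(dataset_name, fold_i, ohe_prefix_separator='=')

# 2. learn random forests and convert them to rules (code example #7)
learn_and_convert_tree_model_to_rules(
    dataset_name, fold_i, nb_of_trees_per_model=10,
    nb_of_original_targets_to_predict=2, nb_grouping_iterations=3,
    min_support=0.01, max_depth=7, seed=42)

# 3. fit a MIDS model on the tree-derived rules (code example #11)
learn_tree_based_mids_model_for_dataset_fold(
    dataset_name, fold_i,
    classifier_indicator=SingleTargetClassifierIndicator.random_forest,
    nb_of_trees_per_model=10, nb_of_original_targets_to_predict=2,
    min_support=0.01, max_depth=7)

# 4. evaluate it (code example #4)
evaluate_mids_model_for_dataset_fold_target_attribute(
    dataset_name, fold_i,
    classifier_indicator=SingleTargetClassifierIndicator.random_forest,
    nb_of_trees_per_model=10, nb_of_original_targets_to_predict=2,
    min_support=0.01, max_depth=7)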