# === Example #1 ===
def learn_tree_based_greedy_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):

    logger = create_logger(
        logger_name=f'learn_greedy_model_{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(greedy_models_tree_based_dir(),
                                   f'{dataset_name}{fold_i}_greedy_model_induction_tree_derived_rules.log')
    )
    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    df_original_train = pd.read_csv(get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
                                    delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(dataset_name,
                                                                                fold_i,
                                                                                classifier_indicator,
                                                                                nb_of_trees_per_model,
                                                                                nb_of_original_targets_to_predict,
                                                                                min_support,
                                                                                max_depth)
    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)

    mids_rules: Set[MIDSRule] = {MIDSRule(mcar) for mcar in mcars}

    logger.info(f"ground set size (nb of initial MCARs): {len(mids_rules)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------

    greedy_clf = GreedyRoundRobinTargetRuleClassifier(df_original_train.columns, verbose=False)
    selected_set, selected_set_scores = greedy_clf.fit(ground_set=mids_rules, training_data=df_original_train)

    logger.info(f"Selected {len(selected_set)} out of {len(mcars)} rules "
                f"({(len(selected_set) / len(mcars) *100):.2f}%)")

    logger.info("start saving Naive greedy model")
    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_greedy_naive_classifier(tree_based_greedy_clf_abs_file_name, greedy_clf)
    logger.info(f"finished saving greedy clf to file: {tree_based_greedy_clf_abs_file_name}")
    close_logger(logger)
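
# Hedged usage sketch (not part of the original pipeline): one way to drive
# learn_tree_based_greedy_model_for_dataset_fold over all folds of a dataset.
# The parameter values are illustrative assumptions mirroring the main()
# functions elsewhere in this file, not values taken from the experiments.
def _example_learn_greedy_models_for_all_folds():
    for fold_i in range(10):
        learn_tree_based_greedy_model_for_dataset_fold(
            dataset_name='iris',
            fold_i=fold_i,
            classifier_indicator=SingleTargetClassifierIndicator.random_forest,
            nb_of_trees_per_model=25,
            nb_of_original_targets_to_predict=2,
            min_support=0.1,
            max_depth=7,
        )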
# === Example #2 ===
def prepare_arc_data(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        train_test: TrainTestEnum
) -> pd.DataFrame:
    # read in original (discretized) training/test data
    # reorder the data so the target column is last
    original_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i, train_test)
    df_original_column_order = pd.read_csv(original_data_fold_abs_file_name, delimiter=',')
    df_reordered = reorder_columns(df_original_column_order, target_attribute)

    # REMOVE INSTANCES WITH NAN AS TARGET VALUE:
    df_reordered = remove_instances_with_nans_in_column(df_reordered, target_attribute)
    return df_reordered
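
# A minimal sketch, under assumptions, of the two pandas helpers used above.
# The real reorder_columns and remove_instances_with_nans_in_column live
# elsewhere in the code base; treat these stand-ins as illustrative only.
def _reorder_columns_sketch(df: pd.DataFrame, target_attribute: str) -> pd.DataFrame:
    # Move the target column to the last position, keeping the others in order.
    other_columns = [col for col in df.columns if col != target_attribute]
    return df[other_columns + [target_attribute]]


def _remove_instances_with_nans_in_column_sketch(df: pd.DataFrame, column: str) -> pd.DataFrame:
    # Drop every row whose value for the given column is NaN.
    return df.dropna(subset=[column])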
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    # NOTE: the imported dataset list is deliberately overridden here to run on iris only.
    datasets = [dict(filename="iris", targetvariablename="class", numerical=True)]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    min_support: float = 0.1
    max_length: int = 7
    confidence_boundary_values: List[float] = [0.75, 0.95]

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']

        for fold_i in range(nb_of_folds):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                          TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for conf_boundary_val in confidence_boundary_values:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length
                        )
                        delayed_func = delayed(mine_cars_for_dataset_fold_target_attribute)(
                            **func_args
                        )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        mine_cars_for_dataset_fold_target_attribute(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length
                        )
        if use_dask:
            log_file_dir = assoc_vs_tree_based_single_target_car_dir()

            logger_name: str = 'mine_single_target_cars_ifo_confidence_bound_ERROR_LOGGER'
            logger_file_name: str = os.path.join(
                log_file_dir,
                'ERROR_LOG_mine_single_target_cars_ifo_confidence_bound.log'
            )

            compute_delayed_functions(
                list_of_computations=list_of_computations,
                client=client,
                nb_of_retries_if_erred=5,
                error_logger_name=logger_name,
                error_logger_file_name=logger_file_name
            )
def evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    logger = create_logger(
        logger_name=f'evaluate_mids_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=
                nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(
        f"start loading MIDS model from {tree_based_mids_classifier_abs_file_name}"
    )
    mids_classifier: MIDSClassifier = load_mids_classifier(
        tree_based_mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)
    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    mids_classifier.rule_combination_strategy = RuleCombiningStrategy.WEIGHTED_VOTE
    mids_classifier.rule_combinator = WeightedVotingRuleCombinator()

    # --- Evaluate and store predictive performance --------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
            reconstructed_mids,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")
    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    # mids_target_attr_to_score_info_abs_file_name: str = get_mids_target_attr_to_score_info_abs_file_name(
    #     dataset_name, fold_i)

    tree_based_mids_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_mids_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )

    store_mids_target_attr_to_score_info(
        tree_based_mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {tree_based_mids_target_attr_to_score_info_abs_file_name}"
    )

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules), df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    # mids_interpret_stats_abs_file_name: str = get_mids_interpret_stats_abs_file_name(
    #     dataset_name, fold_i)
    tree_based_mids_interpret_stats_abs_file_name: str = get_tree_based_mids_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(tree_based_mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(
        f"Wrote MIDSInterpretabilityStatistics to {tree_based_mids_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
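
# A minimal sketch, under assumptions, of the per-target scoring performed by
# score_MIDS_on_its_targets_without_nans: for each target attribute, drop test
# rows without a ground-truth value, predict, and score. The predict() call
# signature below is an assumption for illustration; the real function returns
# ScoreInfo objects rather than bare accuracies.
def _score_per_target_sketch(mids_model, df_test: pd.DataFrame,
                             target_attrs: List[str]) -> Dict[str, float]:
    target_attr_to_accuracy: Dict[str, float] = {}
    for target_attr in target_attrs:
        # Filter out instances with NaN as the value for this target.
        df_filtered = df_test.dropna(subset=[target_attr])
        predictions = mids_model.predict(df_filtered, target_attr)  # assumed API
        accuracy = float((predictions == df_filtered[target_attr]).mean())
        target_attr_to_accuracy[target_attr] = accuracy
    return target_attr_to_accuracy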
def learn_single_target_tree_mids_model_for_dataset_fold(
        dataset_name: str, fold_i: int, target_attribute: str,
        nb_of_trees_per_model: int, min_support: float, max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth,
    )
    log_file_dir: str = get_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name=f'learn_single_target_tree_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_tree_mids.log'))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------

    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    logger.info(
        f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        targets_to_use=[target_attribute],
        use_targets_from_rule_set=False,
        class_association_rules=mcars,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_classifier(tree_based_mids_classifier_abs_file_name,
                          mids_classifier)
    logger.info(
        f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}"
    )
    close_logger(logger)
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    # NOTE: the imported dataset list is deliberately overridden here to run on iris only.
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    min_support = 0.1
    max_depth = 7

    nb_of_trees_to_use_list: List[int] = [25, 50]

    nb_of_folds: int = 10
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):

            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for nb_of_trees_to_use in nb_of_trees_to_use_list:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = \
                            delayed(
                                learn_single_target_tree_mids_model_for_dataset_fold)(
                                    **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_tree_mids_model_for_dataset_fold(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

    if use_dask:
        log_file_dir: str = get_single_target_mids_clf_dir()

        logger_name = 'learn_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
def learn_and_convert_tree_model_to_rules(
        dataset_name: str, fold_i: int, nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int, nb_grouping_iterations: int,
        min_support: float, max_depth: int, seed: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    logger = create_logger(
        logger_name=f'mine_multi-target_cars_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=get_tree_derived_rules_logger_abs_file_name(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------
    logger.info(
        f"Start preparing data using {nb_of_original_targets_to_predict} attrs per group"
        f" with {nb_grouping_iterations} grouping iterations")

    different_attr_groupings: List[AttrGroupPartitioning] = get_attr_groupings(
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        nb_grouping_iterations=nb_grouping_iterations,
        encoding_book_keeper=encoding_book_keeper)

    complete_rule_list: List[MCAR] = []
    cover_checker = CoverChecker()

    total_time_random_forest_learning_s = 0.0
    total_time_rf_conversion_s = 0.0
    # prepared_data_list: List[PreparedDataForTargetSet] = []
    for original_target_attribute_partitioning in different_attr_groupings:
        attr_group: AttrGroup
        for attr_group in original_target_attribute_partitioning:
            prepared_data: PreparedDataForTargetSet = get_prepared_data_for_attr_group(
                original_group_to_predict=attr_group,
                df_original=df_original,
                df_one_hot_encoded=df_one_hot_encoded,
                encoding_book_keeper=encoding_book_keeper)
            # prepared_data_list.append(prepared_data)

            start_time_decision_tree_learning_s = time.time()
            classifier: RandomForestClassifier = RandomForestClassifier(
                n_estimators=nb_of_trees_per_model,
                random_state=seed,
                min_samples_leaf=min_support,
                max_depth=max_depth)

            # --- Learn a random forest given the current number of trees -----------------------------------
            classifier.fit(
                prepared_data.df_one_hot_encoded_descriptive_attributes,
                prepared_data.df_one_hot_encoded_target_attributes)
            end_time_decision_tree_learning_s = time.time()
            partial_time_decision_tree_learning_s: float = end_time_decision_tree_learning_s - start_time_decision_tree_learning_s
            total_time_random_forest_learning_s += partial_time_decision_tree_learning_s

            tree_based_rules: List[MCAR]
            partial_time_rf_conversion_s: float
            tree_based_rules, partial_time_rf_conversion_s = convert_random_forest_to_rules(
                random_forest_clf=classifier,
                df_original_without_nans=prepared_data.
                df_original_without_nans_for_targets,
                descriptive_one_hot_encoded_column_names=prepared_data.
                descriptive_one_hot_encoded_columns,
                # target_attribute_names=df_original_target_attrs_without_nans.columns,
                target_attribute_names=prepared_data.
                target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper,
                logger=logger)
            total_time_rf_conversion_s += partial_time_rf_conversion_s
            complete_rule_list.extend(tree_based_rules)

    logger.info(f"Complete set size: {len(complete_rule_list)}")

    # --- Save rules to file ---------------------------------------------------------------------------------

    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(
        f"finished writing tree-derived ruled to file: {tree_clf_derived_rules_abs_file_name}"
    )
    logger.info(
        "==================================================================")

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_random_forest_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_tree_derived_rules_gen_timing_info_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    close_logger(logger)
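
# A minimal sketch of the core step inside convert_random_forest_to_rules:
# walking one fitted scikit-learn decision tree and emitting a
# (conditions, leaf class counts) pair per root-to-leaf path. The real
# conversion additionally maps one-hot encoded columns back to the original
# attributes via the EncodingBookKeeper; this sketch shows only the raw path
# extraction.
def _extract_tree_paths_sketch(tree_clf, feature_names):
    tree_ = tree_clf.tree_
    paths = []

    def _recurse(node_id, conditions):
        left = tree_.children_left[node_id]
        right = tree_.children_right[node_id]
        if left == right:  # both are -1 for a leaf node
            # tree_.value holds the class counts in this leaf.
            paths.append((list(conditions), tree_.value[node_id]))
            return
        feature = feature_names[tree_.feature[node_id]]
        threshold = tree_.threshold[node_id]
        _recurse(left, conditions + [f"{feature} <= {threshold:.3f}"])
        _recurse(right, conditions + [f"{feature} > {threshold:.3f}"])

    _recurse(0, [])
    return paths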
# === Example #8 ===
def evaluate_single_target_mids_model_for_dataset_fold(
        dataset_name: str, fold_i: int, logger_name: str,
        logger_file_name: str, mids_classifier_abs_file_name: str,
        mids_target_attr_to_score_info_abs_file_name: str,
        mids_interpret_stats_abs_file_name: str):
    logger = create_logger(logger_name=logger_name,
                           log_file_name=logger_file_name)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(
        f"start loading MIDS model from {mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(
        mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)
    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    # --- Evaluate and store predictive performance --------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
            reconstructed_mids,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    store_mids_target_attr_to_score_info(
        mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {mids_target_attr_to_score_info_abs_file_name}"
    )

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules), df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    store_mids_interpret_stats(mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(
        f"Wrote MIDSInterpretabilityStatistics to {mids_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
def create_single_target_tree_based_mcars(
        dataset_name: str, fold_i: int, target_attribute: str,
        classifier_indicator: SingleTargetClassifierIndicator,
        confidence_boundary_val: float, min_support: float, max_depth: int,
        seed: int):
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    logger = create_logger(
        logger_name='create_single_target_tree_rules_' + relative_name,
        log_file_name=os.path.join(
            assoc_vs_tree_based_single_target_car_dir(),
            f'{relative_name}_single_target_tree_rule_generation.log'))

    logger.info(
        f"Start reading MCARS for {dataset_name}{fold_i}_{target_attribute}"
        f" (confidence {confidence_boundary_val})")
    st_mcars_abs_file_name = get_single_target_filtered_cars_abs_filename(
        dataset_name,
        fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary_val)

    filtered_st_mcars: List[MCAR] = load_mcars(st_mcars_abs_file_name)
    logger.info(
        f"Total nb of MCARS for {dataset_name}{fold_i}_{target_attribute}"
        f" (conf {confidence_boundary_val}): {len(filtered_st_mcars)}")

    n_tree_rules_to_generate = len(filtered_st_mcars)
    logger.info(f"Generate {n_tree_rules_to_generate} tree based rules")

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")
    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------

    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(
        f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}"
    )

    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    random_forest_abs_file_name: str = get_single_target_random_forest_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    # --- Generate the required nb of tree-based rules ----------------------------------------------------------------
    logger.info(f"Start generating tree-based rules")
    tree_based_mcars: List[MCAR]
    tree_rule_gen_timing_info: TreeRuleGenTimingInfo
    tree_based_mcars, tree_rule_gen_timing_info = generate_n_single_target_tree_rules(
        n_tree_rules_to_generate=n_tree_rules_to_generate,
        prepared_data=prepared_data,
        encoding_book_keeper=encoding_book_keeper,
        min_support=min_support,
        max_depth=max_depth,
        logger=logger,
        seed=seed,
        random_forest_abs_file_name=random_forest_abs_file_name)

    # --- SAVE the generated tree-based rules
    tree_based_rules_abs_file_name: str = get_single_target_tree_rules_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_mcars(tree_based_rules_abs_file_name, tree_based_mcars)
    logger.info(
        f"finished writing tree-derived ruled to file: {tree_based_rules_abs_file_name}"
    )

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_absolute_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    logger.info(
        "==================================================================")
    close_logger(logger)
def learn_and_convert_single_target_tree_ensemble_to_rules(
        dataset_name: str, fold_i: int, target_attribute: str,
        nb_of_trees_per_model: int, min_support: float, max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth,
    )

    logger_dir: str = get_single_target_tree_rule_dir()
    logger = create_logger(
        logger_name=f'mine_single_target_tree_rules_' + relative_name,
        log_file_name=os.path.join(
            logger_dir, f"{relative_name}_single_tree_rule_generation.log"))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    cover_checker = CoverChecker()

    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(
        f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}"
    )

    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    # --- Fit and save classifier ---------------------------------------------------------------------------------

    start_time_decision_tree_learning_s = time.time()
    classifier: RandomForestClassifier = RandomForestClassifier(
        n_estimators=nb_of_trees_per_model,
        min_samples_leaf=min_support,
        max_depth=max_depth)

    classifier.fit(X=prepared_data.df_one_hot_encoded_descriptive_attributes,
                   y=prepared_data.df_one_hot_encoded_target_attributes)
    end_time_decision_tree_learning_s = time.time()
    total_time_decision_tree_learning_s: float = end_time_decision_tree_learning_s - start_time_decision_tree_learning_s

    logger.info(
        f"Fitted a {classifier_indicator.value} model predicting {original_target_attr_set}"
        f" for {dataset_name}{fold_i}")

    total_time_rf_conversion_s: TimeDiffSec = 0

    complete_rule_list: List[MCAR] = []
    tree_classifiers = classifier.estimators_
    for tree_clf in tree_classifiers:
        list_of_dt_rules: Optional[List[MIDSRule]] = None
        try:
            start_time_clf_conversion_s = time.time()
            list_of_dt_rules: List[
                MIDSRule] = convert_decision_tree_to_mids_rule_list(
                    tree_classifier=tree_clf,
                    one_hot_encoded_feature_names=prepared_data.
                    descriptive_one_hot_encoded_columns,
                    target_attribute_names=prepared_data.
                    target_one_hot_encoded_columns,
                    encoding_book_keeper=encoding_book_keeper)
        except NotImplementedError as err:
            logger.error(str(err))

        if list_of_dt_rules is not None:

            # --- adding support and confidence to rules
            mids_rule: MIDSRule
            for mids_rule in list_of_dt_rules:
                add_support_and_confidence_to_MIDSRule(
                    prepared_data.df_original_without_nans_for_targets,
                    mids_rule,
                    cover_checker=cover_checker)

            # logger.info(f"found {len(list_of_dt_rules)} rules,"
            #             f" updated total rule set size: {len(complete_rule_list)}")
            mids_rules_as_mcars = [
                mids_rule.car for mids_rule in list_of_dt_rules
            ]
            complete_rule_list.extend(mids_rules_as_mcars)

            end_time_clf_conversion_s = time.time()
            total_time_clf_conversion_s = end_time_clf_conversion_s - start_time_clf_conversion_s
            total_time_rf_conversion_s += total_time_clf_conversion_s

    logger.info(f"Complete set size: {len(complete_rule_list)}")

    # Log the first few rules for inspection.
    for i, rule in enumerate(complete_rule_list[:10]):
        logger.info(f"rule {i}: {rule}")
    # --- Save rules to file ---------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(
        f"finished writing single-target tree rules to file: {tree_clf_derived_rules_abs_file_name}"
    )

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_decision_tree_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    logger.info(
        "==================================================================")
    close_logger(logger)
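
# A minimal sketch, under assumptions, of the quantities that
# add_support_and_confidence_to_MIDSRule attaches to each rule: support is the
# fraction of instances matching the antecedent, confidence the fraction of
# covered instances that also match the consequent. The dict-based rule
# representation here is an illustrative assumption.
def _support_and_confidence_sketch(df: pd.DataFrame,
                                   antecedent: Dict[str, object],
                                   consequent: Dict[str, object]) -> Tuple[float, float]:
    body_mask = pd.Series(True, index=df.index)
    for attr, value in antecedent.items():
        body_mask &= (df[attr] == value)
    nb_covered = int(body_mask.sum())
    support = nb_covered / len(df)
    if nb_covered == 0:
        return support, 0.0
    head_mask = body_mask.copy()
    for attr, value in consequent.items():
        head_mask &= (df[attr] == value)
    confidence = int(head_mask.sum()) / nb_covered
    return support, confidence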
# === Example #11 ===
def learn_tree_based_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):

    logger = create_logger(
        logger_name=f'learn_mids_model_{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(get_tree_based_mids_dir(),
                                   f'{dataset_name}{fold_i}_model_induction_tree_derived_rules.log')
    )
    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    df_original_train = pd.read_csv(get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
                                    delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(dataset_name,
                                                                                fold_i,
                                                                                classifier_indicator,
                                                                                nb_of_trees_per_model,
                                                                                nb_of_original_targets_to_predict,
                                                                                min_support,
                                                                                max_depth)
    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")

    mids.fit(df_original_train,
             class_association_rules=mcars, debug=debug_mids_fitting, algorithm=algorithm,
             # lambda_array=lambda_array
             use_targets_from_rule_set=False,
             )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)
    logger.info(f"Selected {len(mids_classifier.rules)} out of {len(mcars)} rules "
                f"({(len(mids_classifier.rules) / len(mcars) *100):.2f}%)")

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_mids_classifier(tree_based_mids_classifier_abs_file_name, mids_classifier)
    logger.info(f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}")
    close_logger(logger)
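
# Hedged usage sketch: learning a multi-target tree-based MIDS model for every
# fold of a dataset. The parameter values are illustrative assumptions.
def _example_learn_tree_based_mids_models_for_all_folds():
    for fold_i in range(10):
        learn_tree_based_mids_model_for_dataset_fold(
            dataset_name='iris',
            fold_i=fold_i,
            classifier_indicator=SingleTargetClassifierIndicator.random_forest,
            nb_of_trees_per_model=25,
            nb_of_original_targets_to_predict=2,
            min_support=0.1,
            max_depth=7,
        )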
def evaluate_greedy_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    logger = create_logger(
        logger_name=f'evaluate_greedy_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=
                nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_greedy_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------

    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    logger.info(
        f"start loading greedy model from {tree_based_greedy_clf_abs_file_name}"
    )
    greedy_classifier: GreedyRoundRobinTargetRuleClassifier = load_greedy_naive_classifier(
        tree_based_greedy_clf_abs_file_name)
    logger.info("finished loading greedy model")
    logger.info(greedy_classifier)

    # --- Evaluate and store predictive performance --------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_mt_clf_on_its_targets_without_nans(
            greedy_classifier,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated greedy classifier on predictive performance")
    target_attrs: List[TargetAttr] = greedy_classifier.target_attributes
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    tree_based_greedy_clf_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_greedy_clf_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
    store_mids_target_attr_to_score_info(
        tree_based_greedy_clf_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote greedy Dict[TargetAttr, ScoreInfo] to"
        f" {tree_based_greedy_clf_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(greedy_classifier.learned_rule_set),
            df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated greedy classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_greedy_clf_interpret_stats_abs_file_name: str = get_tree_based_greedy_clf_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(
        tree_based_greedy_clf_interpret_stats_abs_file_name, interpret_stats)
    logger.info(
        f"Wrote InterpretabilityStatistics to {tree_based_greedy_clf_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    close_logger(logger)
def learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
        dataset_name: str, fold_i: int, target_attribute: str,
        confidence_boundary: float):

    relative_name: str = get_single_target_filtered_car_mids_relative_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)

    log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name=f'learn_single_target_filtered_car_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_filtered_car_mids.log'
        ))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------

    filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)
    logger.info(
        f"Reading single-target CARs from file: {filtered_st_mcars_abs_file_name}"
    )
    st_mcar_list: List[MCAR] = load_mcars(filtered_st_mcars_abs_file_name)

    ground_set_size: int = len(st_mcar_list)
    if ground_set_size <= 0:
        raise ValueError(
            f"Ground set size is {ground_set_size} for {dataset_name}{fold_i} {target_attribute}"
        )
    logger.info(f"ground set size (nb of initial MCARs): {len(st_mcar_list)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        use_targets_from_rule_set=True,
        class_association_rules=st_mcar_list,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")
    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")

    mids_classifier_abs_file_name: str = get_single_target_filtered_car_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)
    store_mids_classifier(mids_classifier_abs_file_name, mids_classifier)
    logger.info(
        f"finished saving MIDS model to file: {mids_classifier_abs_file_name}")
    close_logger(logger)
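
# Hedged usage sketch: fitting one MIDS model per confidence boundary for a
# single fold and target attribute; the boundary values mirror the
# confidence_boundary_values list used in the mining main() above.
def _example_learn_filtered_car_mids_models():
    for confidence_boundary in [0.75, 0.95]:
        learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
            dataset_name='iris',
            fold_i=0,
            target_attribute='class',
            confidence_boundary=confidence_boundary,
        )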
# === Example #14 ===
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    # NOTE: the imported dataset list is deliberately overridden here to run on iris only.
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    min_support = 0.1
    max_depth = 7

    nb_of_trees_to_use_list: List[int] = [25, 50]

    list_of_computations: List[Tuple[Delayed, Dict]] = []
    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(10):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)

                for nb_of_trees_to_use in nb_of_trees_to_use_list:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = \
                            delayed(learn_and_convert_single_target_tree_ensemble_to_rules)(
                                **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_and_convert_single_target_tree_ensemble_to_rules(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

            # Result: pairs of (association rule, tree-based rule) sets, with an increasing number of rules.

            # --- Learn an (M)IDS model for each of the two rule sets in a pair. ------------------------------

            # --- Evaluate the learned IDS models using the chosen evaluation metrics. ------------------------

            # --- Plot the evaluation metrics in function of the increasing number of rules. ------------------

    if use_dask:
        log_file_dir: str = get_single_target_tree_rule_dir()

        logger_name: str = 'mine_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids_easy.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
def merge_single_target_mids_models_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        nb_of_trees_to_use: int,
        min_support: float,
        max_depth: int
):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_merged_single_target_tree_mids_relative_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth,
    )
    log_file_dir: str = get_merged_single_target_mids_clf_dir()

    logger_name: str = f'merge_single_target_mids_models__' + relative_name
    logger_file_name: str = os.path.join(
        log_file_dir,
        f'{relative_name}_model_merging_single_target_tree_mids.log'
    )

    logger = create_logger(
        logger_name=logger_name,
        log_file_name=logger_file_name
    )

    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                  TrainTestEnum.train)

    target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)

    merged_st_clf = MergedSTMIDSClassifier()

    for target_attribute in target_columns:
        st_mids_classifier_abs_file_name: str = get_single_target_tree_mids_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            target_attribute=target_attribute,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_to_use,
            min_support=min_support, max_depth=max_depth
        )

        # --- load single target classifier ---------------------------------------------------------------------------
        logger.info(f"start loading MIDS model from {st_mids_classifier_abs_file_name}")
        st_mids_classifier: MIDSClassifier = load_mids_classifier(st_mids_classifier_abs_file_name)
        logger.info("finished loading MIDS model")
        logger.info(st_mids_classifier)
        reconstructed_mids = MIDSValueReuse()
        reconstructed_mids.classifier = st_mids_classifier
        merged_st_clf.add_single_target_model(st_mids_classifier)

        st_tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
            dataset_name, fold_i, target_attribute,
            classifier_indicator, nb_of_trees_to_use, min_support, max_depth
        )
        st_tree_rule_gen_timing_info: TreeRuleGenTimingInfo = load_tree_rule_gen_timing_info(
            st_tree_rule_gen_timing_info_abs_file_name)

        st_total_time_decision_tree_learning_s = st_tree_rule_gen_timing_info.total_time_decision_tree_learning_s
        st_total_time_rf_conversion_s = st_tree_rule_gen_timing_info.total_time_rf_conversion_s

        st_total_rule_gen_time_s: float = st_total_time_decision_tree_learning_s + st_total_time_rf_conversion_s
        merged_st_clf.add_rule_generation_time(st_total_rule_gen_time_s)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                 TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(original_test_data_fold_abs_file_name,
                                                delimiter=',')

    # --- Evaluate and store predictive performance  ------------------------------------------------------------------
    filter_nans: bool = True
    merged_st_clf.calculate_score_info(test_dataframe=df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MERGED MIDS classifier on predictive performance")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    merged_st_clf.calculate_ruleset_interpretability_statistics(
        test_dataframe=df_test_original_column_order, target_attributes=target_columns)
    logger.info("Evaluated MIDS classifier on interpretability")

    # --- store merged classifier ------------------------------------------------------------------------------------
    logger.info("start saving merged single target MIDS model")
    merged_st_clf_abs_file_name: str = get_merged_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth
    )
    store_merged_st_mids_model(merged_st_clf_abs_file_name, merged_st_clf)
    logger.info(f"finished saving merged single target MIDS model to file: {merged_st_clf_abs_file_name}")
    logger.info("---")

    close_logger(logger)
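
# Hedged usage sketch: merging the per-target single-target MIDS models for
# every fold of a dataset, with illustrative parameter values mirroring the
# main() functions above.
def _example_merge_models_for_all_folds():
    for fold_i in range(10):
        merge_single_target_mids_models_for_dataset_fold(
            dataset_name='iris',
            fold_i=fold_i,
            nb_of_trees_to_use=25,
            min_support=0.1,
            max_depth=7,
        )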