def get_tree_derived_rules_gen_timing_info_abs_file_name(dataset_name: str, fold_i: int,
                                         classifier_indicator: SingleTargetClassifierIndicator,
                                         nb_of_trees_per_model: int,
                                         nb_of_original_targets_to_predict: int,
                                         min_support: float,
                                         max_depth: int) -> str:
    """Return the absolute path of the gzipped JSON file that stores the
    rule-generation timing info for this experiment configuration.

    The file lives in the tree-derived-rules directory and its base name
    encodes every experiment parameter.
    """
    base_name: str = get_tree_derived_rules_rel_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)
    return os.path.join(get_tree_derived_rules_dir(),
                        base_name + "_timings.json.gz")
# Ejemplo n.º 2  (stray web-scrape artifact — an "Example #2" marker and a
# vote count; commented out so the module parses as valid Python)
def get_tree_based_greedy_clf_abs_file_name(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int) -> str:
    """Return the absolute path of the stored tree-based greedy (naive)
    classifier for this experiment configuration.

    The file lives in the tree-based greedy-models directory; its base name
    encodes every experiment parameter.
    """
    stem: str = get_tree_derived_rules_rel_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)
    file_name: str = stem + '_greedy_naive_clf.json.gz'
    return os.path.join(greedy_models_tree_based_dir(), file_name)
def evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    """Evaluate a stored tree-rule-based MIDS classifier on one dataset fold.

    Loads the fold's test data and the previously stored MIDS classifier,
    scores it per target attribute (rows with NaN targets are filtered out),
    computes interpretability statistics over its rule set, and writes both
    result objects to their parameter-derived file locations. Progress is
    written to a dedicated per-experiment log file.

    :param dataset_name: dataset to evaluate on
    :param fold_i: cross-validation fold index
    :param classifier_indicator: single-target classifier type the rules were
        derived from; used (like the remaining parameters) only to
        reconstruct the experiment's file names
    :param nb_of_trees_per_model: number of trees per model (file-name part)
    :param nb_of_original_targets_to_predict: number of original target
        attributes predicted per model (file-name part)
    :param min_support: minimum support used during rule generation
        (file-name part)
    :param max_depth: maximum tree depth used during rule generation
        (file-name part)
    """
    # One logger per experiment configuration: both its name and its log-file
    # path embed the full parameter combination.
    logger = create_logger(
        logger_name=f'evaluate_mids_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=
                nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    # mids_classifier_abs_file_name = get_mids_clf_abs_file_name(dataset_name, fold_i)
    logger.info(
        f"start loading MIDS model from {tree_based_mids_classifier_abs_file_name}"
    )
    mids_classifier: MIDSClassifier = load_mids_classifier(
        tree_based_mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)
    # Wrap the loaded classifier state in a fresh MIDS model object so it can
    # be used for prediction.
    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    # Override whatever combination strategy was stored with the classifier:
    # force weighted voting for this evaluation.
    mids_classifier.rule_combination_strategy = RuleCombiningStrategy.WEIGHTED_VOTE
    mids_classifier.rule_combinator = WeightedVotingRuleCombinator()

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    # Score per target attribute; NaN-target rows are dropped before scoring.
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
            reconstructed_mids,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")
    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    # mids_target_attr_to_score_info_abs_file_name: str = get_mids_target_attr_to_score_info_abs_file_name(
    #     dataset_name, fold_i)

    tree_based_mids_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_mids_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )

    store_mids_target_attr_to_score_info(
        tree_based_mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {tree_based_mids_target_attr_to_score_info_abs_file_name}"
    )

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules), df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    # mids_interpret_stats_abs_file_name: str = get_mids_interpret_stats_abs_file_name(
    #     dataset_name, fold_i)
    tree_based_mids_interpret_stats_abs_file_name: str = get_tree_based_mids_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(tree_based_mids_interpret_stats_abs_file_name,
                               interpret_stats)
    logger.info(
        f"Wrote MIDSInterpretabilityStatistics to {tree_based_mids_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    # Release the logger's file handle for this experiment run.
    close_logger(logger)
def learn_and_convert_tree_model_to_rules(
        dataset_name: str, fold_i: int, nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int, nb_grouping_iterations: int,
        min_support: float, max_depth: int, seed: int):
    """Learn random forests per target-attribute group and convert them to rules.

    For one training fold: partitions the original target attributes into
    groups (repeated ``nb_grouping_iterations`` times), fits one random forest
    per group, converts every forest into multi-target association rules
    (MCARs), and stores the combined rule list plus the accumulated learning
    and conversion timings to parameter-derived file locations.

    :param dataset_name: dataset to mine rules from
    :param fold_i: cross-validation fold index (training split is used)
    :param nb_of_trees_per_model: ``n_estimators`` for each random forest
    :param nb_of_original_targets_to_predict: group size used when
        partitioning the original target attributes
    :param nb_grouping_iterations: how many different random partitionings of
        the target attributes to generate
    :param min_support: passed to the forest as ``min_samples_leaf``
    :param max_depth: maximum depth of each tree in the forests
    :param seed: ``random_state`` for the random forests (reproducibility)
    """
    # This pipeline is fixed to random forests; the indicator is only used in
    # file/logger names.
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    # One logger per experiment configuration: both its name and its log-file
    # path embed the full parameter combination.
    logger = create_logger(
        logger_name=f'mine_multi-target_cars_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=get_tree_derived_rules_logger_abs_file_name(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth))

    # --- load train data ---------------------------------------------------------------------------------------------
    # Both the original (discretized) and the one-hot-encoded view of the same
    # training fold are needed: forests are fit on the one-hot encoding, rule
    # conversion refers back to the original representation.
    df_original = pd.read_csv(get_original_data_fold_abs_file_name(
        dataset_name, fold_i, train_test),
                              delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i,
                                                    train_test),
        delimiter=",")

    # Maps between original attributes and their one-hot-encoded columns.
    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------
    logger.info(
        f"Start preparing data using {nb_of_original_targets_to_predict} attrs per group"
        f" with {nb_grouping_iterations} grouping iterations")

    different_attr_groupings: List[AttrGroupPartitioning] = get_attr_groupings(
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        nb_grouping_iterations=nb_grouping_iterations,
        encoding_book_keeper=encoding_book_keeper)

    complete_rule_list: List[MCAR] = []
    cover_checker = CoverChecker()

    # Timing accumulators over all groups of all partitionings.
    total_time_random_forest_learning_s = 0.0
    total_time_rf_conversion_s = 0.0
    # prepared_data_list: List[PreparedDataForTargetSet] = []
    for original_target_attribute_partitioning in different_attr_groupings:
        attr_group: AttrGroup
        for attr_group in original_target_attribute_partitioning:
            # Select/clean the data slices relevant to this target group.
            prepared_data: PreparedDataForTargetSet = get_prepared_data_for_attr_group(
                original_group_to_predict=attr_group,
                df_original=df_original,
                df_one_hot_encoded=df_one_hot_encoded,
                encoding_book_keeper=encoding_book_keeper)
            # prepared_data_list.append(prepared_data)

            start_time_decision_tree_learning_s = time.time()
            # NOTE(review): min_support is passed as min_samples_leaf; a float
            # is interpreted by scikit-learn as a fraction of the samples —
            # confirm that is the intended semantics of min_support here.
            classifier: RandomForestClassifier = RandomForestClassifier(
                n_estimators=nb_of_trees_per_model,
                random_state=seed,
                min_samples_leaf=min_support,
                max_depth=max_depth)

            # --- Learn a random forest given the current number of trees -----------------------------------
            classifier.fit(
                prepared_data.df_one_hot_encoded_descriptive_attributes,
                prepared_data.df_one_hot_encoded_target_attributes)
            end_time_decision_tree_learning_s = time.time()
            total_time_decision_tree_learning_s: float = end_time_decision_tree_learning_s - start_time_decision_tree_learning_s
            total_time_random_forest_learning_s += total_time_decision_tree_learning_s

            # Annotation-only declarations (no assignment): in particular the
            # second one does NOT reset the total_time_rf_conversion_s
            # accumulator initialized before the loop.
            tree_based_rules: List[MCAR]
            total_time_rf_conversion_s: float
            tree_based_rules, partial_time_rf_conversion_s = convert_random_forest_to_rules(
                random_forest_clf=classifier,
                df_original_without_nans=prepared_data.
                df_original_without_nans_for_targets,
                descriptive_one_hot_encoded_column_names=prepared_data.
                descriptive_one_hot_encoded_columns,
                # target_attribute_names=df_original_target_attrs_without_nans.columns,
                target_attribute_names=prepared_data.
                target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper,
                logger=logger)
            total_time_rf_conversion_s += partial_time_rf_conversion_s
            complete_rule_list.extend(tree_based_rules)

    logger.info(f"Complete set size: {len(complete_rule_list)}")

    # --- Save rules to file ---------------------------------------------------------------------------------

    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(
        f"finished writing tree-derived ruled to file: {tree_clf_derived_rules_abs_file_name}"
    )
    logger.info(
        "==================================================================")

    # Persist the accumulated timings (forest learning + rule conversion).
    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_random_forest_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_tree_derived_rules_gen_timing_info_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                    tree_rule_gen_timing_info)

    # Release the logger's file handle for this experiment run.
    close_logger(logger)
def evaluate_greedy_model_for_dataset_fold_target_attribute(
        dataset_name: str, fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int, nb_of_original_targets_to_predict: int,
        min_support: float, max_depth: int):
    """Evaluate a stored tree-rule-based greedy classifier on one dataset fold.

    Mirrors the MIDS evaluation: loads the fold's test data and the stored
    greedy round-robin classifier, scores it per target attribute (rows with
    NaN targets are filtered out), computes interpretability statistics over
    its learned rule set, and writes both result objects to their
    parameter-derived file locations. Progress is written to a dedicated
    per-experiment log file.

    :param dataset_name: dataset to evaluate on
    :param fold_i: cross-validation fold index
    :param classifier_indicator: single-target classifier type the rules were
        derived from; used (like the remaining parameters) only to
        reconstruct the experiment's file names
    :param nb_of_trees_per_model: number of trees per model (file-name part)
    :param nb_of_original_targets_to_predict: number of original target
        attributes predicted per model (file-name part)
    :param min_support: minimum support used during rule generation
        (file-name part)
    :param max_depth: maximum tree depth used during rule generation
        (file-name part)
    """
    # One logger per experiment configuration: both its name and its log-file
    # path embed the full parameter combination.
    logger = create_logger(
        logger_name=f'evaluate_greedy_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name,
            fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support,
            max_depth=max_depth),
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name,
                fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=
                nb_of_original_targets_to_predict,
                min_support=min_support,
                max_depth=max_depth) +
            '_greedy_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in original (discretized) training data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------

    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)

    logger.info(
        f"start loading greedy model from {tree_based_greedy_clf_abs_file_name}"
    )
    greedy_classifier: GreedyRoundRobinTargetRuleClassifier = load_greedy_naive_classifier(
        tree_based_greedy_clf_abs_file_name)
    logger.info("finished loading greedy model")
    logger.info(greedy_classifier)

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    # Score per target attribute; NaN-target rows are dropped before scoring.
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[
        str, ScoreInfo] = score_mt_clf_on_its_targets_without_nans(
            greedy_classifier,
            df_test_original_column_order,
            filter_nans=filter_nans)
    logger.info("Evaluated greedy classifier on predictive performance")
    target_attrs: List[TargetAttr] = greedy_classifier.target_attributes
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[
            target_attr]
        logger.info(
            f"\t{target_attr}:\n {target_attr_score_info.to_str('    ')}")
        logger.info("\t---")

    tree_based_greedy_clf_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_greedy_clf_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
    store_mids_target_attr_to_score_info(
        tree_based_greedy_clf_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(
        f"Wrote greedy Dict[TargetAttr, ScoreInfo] to"
        f" {tree_based_greedy_clf_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    # Interpretability is computed with the same MIDS statistics machinery,
    # applied to the greedy classifier's learned rule set.
    interpret_stats: MIDSInterpretabilityStatistics \
        = MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(greedy_classifier.learned_rule_set),
        df_test_original_column_order, target_attributes=target_attrs)
    logger.info("Evaluated greedy classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_greedy_clf_interpret_stats_abs_file_name: str = get_tree_based_greedy_clf_interpret_stats_abs_file_name(
        dataset_name=dataset_name,
        fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support,
        max_depth=max_depth)
    store_mids_interpret_stats(
        tree_based_greedy_clf_interpret_stats_abs_file_name, interpret_stats)
    logger.info(
        f"Wrote InterpretabilityStatistics to {tree_based_greedy_clf_interpret_stats_abs_file_name}"
    )
    logger.info("---")

    # Release the logger's file handle for this experiment run.
    close_logger(logger)