def compute_delayed_functions(
        list_of_computations: List[Tuple[Delayed, Dict]],
        client: Client,
        nb_of_retries_if_erred: int,
        error_logger_name: str,
        error_logger_file_name: str
) -> None:
    print("start compute")
    print(list_of_computations)

    list_of_delayed_function_calls = [computation[0] for computation in list_of_computations]

    list_of_futures: List[Future] = client.compute(
        list_of_delayed_function_calls, retries=nb_of_retries_if_erred)
    distributed.wait(list_of_futures)
    print("end compute")

    error_logger: Logger = create_logger(logger_name=error_logger_name,
                                         log_file_name=error_logger_file_name)
    future: Future
    for future, (delayed, func_args) in zip(list_of_futures, list_of_computations):
        if future.status == 'error':
            exception = future.exception()
            error_logger.error(f"{exception.__class__}: {exception}\n"
                               f"\tfor arguments {func_args}")
    close_logger(error_logger)
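
# A minimal usage sketch for compute_delayed_functions, assuming a local Dask
# client. learn_tree_based_mids_model_for_dataset_fold (defined below) stands in
# for any per-fold task; the dataset name, fold count, and parameter values are
# illustrative placeholders. Each entry pairs the delayed call with the kwargs
# dict that is echoed in the error log if the future errs.
def _example_compute_usage() -> None:
    from dask import delayed
    from distributed import Client

    client = Client()  # local scheduler by default; pass an address for a cluster
    computations: List[Tuple[Delayed, Dict]] = []
    for fold_i in range(10):
        func_args = dict(
            dataset_name='iris', fold_i=fold_i,
            classifier_indicator=SingleTargetClassifierIndicator.random_forest,
            nb_of_trees_per_model=10, nb_of_original_targets_to_predict=2,
            min_support=0.1, max_depth=7)
        computations.append(
            (delayed(learn_tree_based_mids_model_for_dataset_fold)(**func_args),
             func_args))
    compute_delayed_functions(
        computations, client,
        nb_of_retries_if_erred=2,
        error_logger_name='fold_errors',
        error_logger_file_name='fold_errors.log')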
def learn_tree_based_greedy_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):
    logger = create_logger(
        logger_name=f'learn_greedy_model_{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            f'{dataset_name}{fold_i}_greedy_model_induction_tree_derived_rules.log')
    )

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in the original (discretized) training data
    df_original_train = pd.read_csv(
        get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
        delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)

    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    mids_rules: Set[MIDSRule] = {MIDSRule(mcar) for mcar in mcars}
    logger.info(f"ground set size (nb of initial MCARs): {len(mids_rules)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    greedy_clf = GreedyRoundRobinTargetRuleClassifier(df_original_train.columns, verbose=False)
    selected_set, selected_set_scores = greedy_clf.fit(ground_set=mids_rules,
                                                       training_data=df_original_train)
    logger.info(f"Selected {len(selected_set)} out of {len(mcars)} rules "
                f"({(len(selected_set) / len(mcars) * 100):.2f}%)")

    logger.info("start saving naive greedy model")
    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_greedy_naive_classifier(tree_based_greedy_clf_abs_file_name, greedy_clf)
    logger.info(f"finished saving greedy clf to file: {tree_based_greedy_clf_abs_file_name}")
    close_logger(logger)
def mine_cars_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        min_support: float,
        min_confidence: float,
        max_length: int,
):
    """
    1. Load the required training data of the dataset fold.
    2. Make sure the target attribute is the last attribute.
    3. Mine rules using the parameter settings --> check the number of rules!
    4. Save the rules to file.

    :return:
    """
    relative_name: str = f'{dataset_name}{fold_i}_{target_attribute}_{min_confidence}'
    logger = create_logger(
        logger_name='mine_filtered_single_target_cars_' + relative_name,
        log_file_name=os.path.join(
            assoc_vs_tree_based_single_target_car_dir(),
            f'{relative_name}_single_target_filtered_car_mining.log')
    )

    # Loads the training data of the dataset fold, reorders the columns so the
    # target attribute comes last, and removes instances with NaN target values
    # (previously done inline here).
    df_train_reordered = prepare_arc_data(dataset_name, fold_i, target_attribute, TrainTestEnum.train)

    logger.info("start mining CARs for " + relative_name)

    filtered_st_mcars: List[MCAR]
    timings_dict: Dict[str, float]
    filtered_st_mcars, timings_dict = mine_single_target_MCARs_mlext(
        df_train_reordered,
        target_attribute=target_attribute,
        min_support=min_support,
        min_confidence=min_confidence,
        max_length=max_length)

    logger.info(f"finished mining CARs for {dataset_name} {fold_i}_{min_support}supp_{min_confidence}conf")
    logger.info(f"found {len(filtered_st_mcars)} CARs for"
                f" {dataset_name} {fold_i}_{min_support}supp_{min_confidence}conf")

    filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=min_confidence)
    store_mcars(filtered_st_mcars_abs_file_name, filtered_st_mcars)
    logger.info(f"finished writing CARs to file: {filtered_st_mcars_abs_file_name}")

    filtered_st_mcars_mining_timings_abs_file_name = get_single_target_filtered_cars_mining_timings_abs_filename(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=min_confidence)
    store_timings_dict(filtered_st_mcars_mining_timings_abs_file_name, timings_dict)
    close_logger(logger)
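
# A hypothetical driver sketch: mine filtered single-target CARs for every fold
# over a range of confidence boundaries. The dataset names, fold count, target
# attribute, and parameter grid below are placeholder assumptions, not values
# taken from this project.
def _example_mine_cars_grid() -> None:
    for dataset_name in ['iris', 'vote']:
        for fold_i in range(10):
            for min_confidence in [0.75, 0.85, 0.95]:
                mine_cars_for_dataset_fold_target_attribute(
                    dataset_name=dataset_name, fold_i=fold_i,
                    target_attribute='class',
                    min_support=0.1, min_confidence=min_confidence,
                    max_length=7)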
def evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int):
    logger = create_logger(
        logger_name='evaluate_mids_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth),
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name, fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support, max_depth=max_depth)
            + '_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in the original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)

    logger.info(f"start loading MIDS model from {tree_based_mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(tree_based_mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)

    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    mids_classifier.rule_combination_strategy = RuleCombiningStrategy.WEIGHTED_VOTE
    mids_classifier.rule_combinator = WeightedVotingRuleCombinator()

    # --- Evaluate and store predictive performance -------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
        reconstructed_mids, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str(' ')}")
        logger.info("\t---")

    tree_based_mids_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_mids_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
    store_mids_target_attr_to_score_info(
        tree_based_mids_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to"
                f" {tree_based_mids_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics = \
        MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_mids_interpret_stats_abs_file_name: str = get_tree_based_mids_interpret_stats_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)
    store_mids_interpret_stats(tree_based_mids_interpret_stats_abs_file_name, interpret_stats)
    logger.info(f"Wrote MIDSInterpretabilityStatistics to {tree_based_mids_interpret_stats_abs_file_name}")
    logger.info("---")
    close_logger(logger)
def learn_single_target_tree_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        nb_of_trees_per_model: int,
        min_support: float,
        max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth,
    )

    log_file_dir: str = get_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name='learn_single_target_tree_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir, f'{relative_name}_model_induction_single_target_tree_mids.log'))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in the original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth)

    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        targets_to_use=[target_attribute],
        use_targets_from_rule_set=False,
        class_association_rules=mcars,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")

    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth)
    store_mids_classifier(tree_based_mids_classifier_abs_file_name, mids_classifier)
    logger.info(f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}")
    close_logger(logger)
def one_hot_encode_dataset_fold(dataset_name: str, fold_i: int, ohe_prefix_separator: str) -> None:
    """
    One-hot encodes each of the Arch-bench fold train-test splits.
    """
    logger = create_logger(
        logger_name=f'one_hot_encode{dataset_name}{fold_i}',
        log_file_name=os.path.join(
            get_one_hot_encoded_fold_data_dir(train_test=None),
            f"{dataset_name}{fold_i}.log"))

    drop_first = False

    # === For fold i ====
    # --- Read in the original train and test data from archbench ------------------------------------------------------
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)

    logger.info(f"Loading train fold: {original_train_data_fold_abs_file_name}")
    logger.info(f"Loading test fold: {original_test_data_fold_abs_file_name}")
    original_train_df = pd.read_csv(original_train_data_fold_abs_file_name, delimiter=',')
    original_test_df = pd.read_csv(original_test_data_fold_abs_file_name, delimiter=',')

    # --- Set each column to 'object' -----------------------------------------------------------------------------------
    original_train_df = convert_to_categorical(original_train_df, dataset_name, fold_i)
    original_test_df = convert_to_categorical(original_test_df, dataset_name, fold_i)

    # --- Concatenate the train and test data for the current fold -----------------------------------------------------
    nb_of_train_examples = len(original_train_df)
    nb_of_test_examples = len(original_test_df)

    logger.info(f"Start concatenating train & test folds for {dataset_name}{fold_i}")
    original_concat_df = pd.concat([original_train_df, original_test_df], axis=0)
    if len(original_concat_df) != nb_of_train_examples + nb_of_test_examples:
        raise Exception("unexpected length")

    # --- Write out the full discretized dataset of this fold to file for inspection purposes --------------------------
    original_full_data_abs_file_name = get_original_full_data_abs_file_name(dataset_name, fold_i)
    logger.info(f"Writing out UN-encoded full dataset for {dataset_name}{fold_i}:"
                f" {original_full_data_abs_file_name}")
    original_concat_df.to_csv(original_full_data_abs_file_name, index=False)

    # --- One-hot encode the full data ----------------------------------------------------------------------------------
    logger.info(f"Start one hot encoding {dataset_name}{fold_i}")
    one_hot_encoded_concat_df = pd.get_dummies(original_concat_df,
                                               prefix_sep=ohe_prefix_separator,
                                               drop_first=drop_first)

    # --- Write out the one-hot encoded full data -----------------------------------------------------------------------
    one_hot_encoded_full_data_abs_file_name = get_one_hot_encoded_full_data_abs_file_name(
        dataset_name, fold_i)
    logger.info(f"Writing out one hot encoded full dataset for {dataset_name}{fold_i}:"
                f" {one_hot_encoded_full_data_abs_file_name}")
    one_hot_encoded_concat_df.to_csv(one_hot_encoded_full_data_abs_file_name, index=False)

    # --- Create the EncodingBookKeeper and write it to file ------------------------------------------------------------
    logger.info(f"Creating one hot encoding book keeper for {dataset_name}{fold_i}")
    encoding_book_keeper: EncodingBookKeeper = \
        EncodingBookKeeper.build_encoding_book_keeper_from_ohe_columns(
            one_hot_encoded_concat_df.columns,
            ohe_prefix_separator=ohe_prefix_separator)

    encoding_book_keeper_abs_file_name = get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i)
    logger.info(f"Saving one hot encoding book keeper for {dataset_name}{fold_i}:"
                f" {encoding_book_keeper_abs_file_name}")
    store_encoding_book_keeper(encoding_book_keeper_abs_file_name, encoding_book_keeper)

    # --- Split the full one-hot encoded dataset back into train and test ----------------------------------------------
    one_hot_encoded_train_df = one_hot_encoded_concat_df[:nb_of_train_examples]
    one_hot_encoded_test_df = one_hot_encoded_concat_df[nb_of_train_examples:]
    if len(one_hot_encoded_train_df) != nb_of_train_examples:
        raise Exception("unexpected length")
    if len(one_hot_encoded_test_df) != nb_of_test_examples:
        raise Exception("unexpected length")

    # --- Write out the one-hot encoded train and test ------------------------------------------------------------------
    one_hot_encoded_train_abs_file_name = get_one_hot_encoded_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    one_hot_encoded_test_abs_file_name = get_one_hot_encoded_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)

    logger.info(f"Saving one hot encoded train fold: {one_hot_encoded_train_abs_file_name}")
    logger.info(f"Saving one hot encoded test fold: {one_hot_encoded_test_abs_file_name}")
    one_hot_encoded_train_df.to_csv(one_hot_encoded_train_abs_file_name, index=False)
    one_hot_encoded_test_df.to_csv(one_hot_encoded_test_abs_file_name, index=False)

    logger.info("---")
    close_logger(logger)
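
# A self-contained sketch of the encode-jointly-then-split pattern used above,
# with toy data (the frames and separator below are illustrative, not project
# values). Concatenating train and test before pd.get_dummies guarantees both
# splits end up with the same one-hot columns, even when a category value is
# absent from one of the splits.
def _example_joint_one_hot_encoding() -> None:
    import pandas as pd

    train = pd.DataFrame({'color': ['red', 'green'], 'label': ['yes', 'no']})
    test = pd.DataFrame({'color': ['blue'], 'label': ['yes']})  # 'blue' unseen in train

    concat = pd.concat([train, test], axis=0)
    encoded = pd.get_dummies(concat, prefix_sep='=')

    encoded_train = encoded[:len(train)]
    encoded_test = encoded[len(train):]
    # Both splits share the full column set, including 'color=blue':
    assert list(encoded_train.columns) == list(encoded_test.columns)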
def learn_and_convert_tree_model_to_rules(
        dataset_name: str,
        fold_i: int,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        nb_grouping_iterations: int,
        min_support: float,
        max_depth: int,
        seed: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    logger = create_logger(
        logger_name='mine_multi-target_cars_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth),
        log_file_name=get_tree_derived_rules_logger_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(
        get_original_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------
    logger.info(f"Start preparing data using {nb_of_original_targets_to_predict} attrs per group"
                f" with {nb_grouping_iterations} grouping iterations")

    different_attr_groupings: List[AttrGroupPartitioning] = get_attr_groupings(
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        nb_grouping_iterations=nb_grouping_iterations,
        encoding_book_keeper=encoding_book_keeper)

    complete_rule_list: List[MCAR] = []
    cover_checker = CoverChecker()

    total_time_random_forest_learning_s = 0.0
    total_time_rf_conversion_s = 0.0

    for original_target_attribute_partitioning in different_attr_groupings:
        attr_group: AttrGroup
        for attr_group in original_target_attribute_partitioning:
            prepared_data: PreparedDataForTargetSet = get_prepared_data_for_attr_group(
                original_group_to_predict=attr_group,
                df_original=df_original,
                df_one_hot_encoded=df_one_hot_encoded,
                encoding_book_keeper=encoding_book_keeper)

            # --- Learn a random forest given the current number of trees --------------------------------------------
            start_time_decision_tree_learning_s = time.time()
            classifier: RandomForestClassifier = RandomForestClassifier(
                n_estimators=nb_of_trees_per_model,
                random_state=seed,
                min_samples_leaf=min_support,
                max_depth=max_depth)
            classifier.fit(
                prepared_data.df_one_hot_encoded_descriptive_attributes,
                prepared_data.df_one_hot_encoded_target_attributes)
            end_time_decision_tree_learning_s = time.time()
            total_time_random_forest_learning_s += (
                end_time_decision_tree_learning_s - start_time_decision_tree_learning_s)

            # --- Convert the forest into rules -----------------------------------------------------------------------
            tree_based_rules: List[MCAR]
            partial_time_rf_conversion_s: float
            tree_based_rules, partial_time_rf_conversion_s = convert_random_forest_to_rules(
                random_forest_clf=classifier,
                df_original_without_nans=prepared_data.df_original_without_nans_for_targets,
                descriptive_one_hot_encoded_column_names=prepared_data.descriptive_one_hot_encoded_columns,
                target_attribute_names=prepared_data.target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper,
                logger=logger)
            total_time_rf_conversion_s += partial_time_rf_conversion_s

            complete_rule_list.extend(tree_based_rules)

    logger.info(f"Complete rule set size: {len(complete_rule_list)}")

    # --- Save rules to file ------------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)

    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(f"finished writing tree-derived rules to file: {tree_clf_derived_rules_abs_file_name}")
    logger.info("==================================================================")

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_random_forest_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_tree_derived_rules_gen_timing_info_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name, tree_rule_gen_timing_info)

    close_logger(logger)
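
# Why min_samples_leaf=min_support acts as a support threshold: scikit-learn
# interprets a float min_samples_leaf in (0.0, 1.0] as a fraction of the
# training set (ceil(fraction * n_samples)), so every leaf -- and thus every
# rule read off a root-to-leaf path -- covers at least min_support of the
# training data. A minimal check with toy data (shapes and values are
# illustrative):
def _example_min_samples_leaf_as_support() -> None:
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, size=(100, 5))
    y = rng.randint(0, 2, size=100)

    clf = DecisionTreeClassifier(min_samples_leaf=0.1, max_depth=3).fit(X, y)
    leaf_sizes = clf.tree_.n_node_samples[clf.tree_.children_left == -1]
    assert leaf_sizes.min() >= 10  # ceil(0.1 * 100) samples per leaf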
def learn_and_convert_single_target_tree_ensemble_to_rules(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        nb_of_trees_per_model: int,
        min_support: float,
        max_depth: int):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth,
    )

    logger_dir: str = get_single_target_tree_rule_dir()
    logger = create_logger(
        logger_name='mine_single_target_tree_rules_' + relative_name,
        log_file_name=os.path.join(
            logger_dir, f"{relative_name}_single_tree_rule_generation.log"))

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(
        get_original_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    cover_checker = CoverChecker()

    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}")
    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    # --- Fit the random forest ---------------------------------------------------------------------------------------
    start_time_decision_tree_learning_s = time.time()
    classifier: RandomForestClassifier = RandomForestClassifier(
        n_estimators=nb_of_trees_per_model,
        min_samples_leaf=min_support,
        max_depth=max_depth)
    classifier.fit(X=prepared_data.df_one_hot_encoded_descriptive_attributes,
                   y=prepared_data.df_one_hot_encoded_target_attributes)
    end_time_decision_tree_learning_s = time.time()
    total_time_decision_tree_learning_s: float = \
        end_time_decision_tree_learning_s - start_time_decision_tree_learning_s

    logger.info(f"Fitted a {classifier_indicator.value} model predicting {original_target_attr_set}"
                f" for {dataset_name}{fold_i}")

    # --- Convert each tree of the forest into rules ------------------------------------------------------------------
    total_time_rf_conversion_s: TimeDiffSec = 0.0
    complete_rule_list: List[MCAR] = []
    tree_classifiers = classifier.estimators_
    for tree_clf in tree_classifiers:
        list_of_dt_rules: Optional[List[MIDSRule]] = None
        start_time_clf_conversion_s = time.time()
        try:
            list_of_dt_rules = convert_decision_tree_to_mids_rule_list(
                tree_classifier=tree_clf,
                one_hot_encoded_feature_names=prepared_data.descriptive_one_hot_encoded_columns,
                target_attribute_names=prepared_data.target_one_hot_encoded_columns,
                encoding_book_keeper=encoding_book_keeper)
        except NotImplementedError as err:
            logger.error(str(err))

        if list_of_dt_rules is not None:
            # --- add support and confidence to the rules
            mids_rule: MIDSRule
            for mids_rule in list_of_dt_rules:
                add_support_and_confidence_to_MIDSRule(
                    prepared_data.df_original_without_nans_for_targets,
                    mids_rule, cover_checker=cover_checker)

            mids_rules_as_mcars = [mids_rule.car for mids_rule in list_of_dt_rules]
            complete_rule_list.extend(mids_rules_as_mcars)

            end_time_clf_conversion_s = time.time()
            total_time_rf_conversion_s += end_time_clf_conversion_s - start_time_clf_conversion_s

    logger.info(f"Complete rule set size: {len(complete_rule_list)}")
    # log the first few rules for inspection
    for i, rule in enumerate(complete_rule_list):
        logger.info(f"rule {i}: {str(rule)}")
        if i > 10:
            break

    # --- Save rules to file ------------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth)
    store_mcars(tree_clf_derived_rules_abs_file_name, complete_rule_list)
    logger.info(f"finished writing single-target tree rules to file: {tree_clf_derived_rules_abs_file_name}")

    tree_rule_gen_timing_info = TreeRuleGenTimingInfo(
        total_time_decision_tree_learning_s=total_time_decision_tree_learning_s,
        total_time_rf_conversion_s=total_time_rf_conversion_s)

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support, max_depth=max_depth)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name, tree_rule_gen_timing_info)

    logger.info("==================================================================")
    close_logger(logger)
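
# A rough sketch of what a decision-tree-to-rules conversion boils down to.
# convert_decision_tree_to_mids_rule_list is project code; this stand-alone
# version only shows the idea: every root-to-leaf path becomes one rule whose
# antecedent is the list of threshold tests along the path and whose consequent
# is the majority class in the leaf. It uses scikit-learn's public tree_
# attributes on a fitted DecisionTreeClassifier.
def _example_tree_paths_to_rules(tree_clf) -> List[Tuple[List[str], int]]:
    tree = tree_clf.tree_
    rules: List[Tuple[List[str], int]] = []

    def walk(node: int, conditions: List[str]) -> None:
        if tree.children_left[node] == -1:  # leaf node
            majority_class = int(tree.value[node][0].argmax())
            rules.append((conditions, majority_class))
            return
        feature_i, threshold = tree.feature[node], tree.threshold[node]
        walk(tree.children_left[node], conditions + [f"x[{feature_i}] <= {threshold:.3f}"])
        walk(tree.children_right[node], conditions + [f"x[{feature_i}] > {threshold:.3f}"])

    walk(0, [])
    return rules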
def evaluate_single_target_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        logger_name: str,
        logger_file_name: str,
        mids_classifier_abs_file_name: str,
        mids_target_attr_to_score_info_abs_file_name: str,
        mids_interpret_stats_abs_file_name: str):
    logger = create_logger(logger_name=logger_name, log_file_name=logger_file_name)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in the original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    logger.info(f"start loading MIDS model from {mids_classifier_abs_file_name}")
    mids_classifier: MIDSClassifier = load_mids_classifier(mids_classifier_abs_file_name)
    logger.info("finished loading MIDS model")
    logger.info(mids_classifier)

    reconstructed_mids = MIDSValueReuse()
    reconstructed_mids.classifier = mids_classifier

    # --- Evaluate and store predictive performance -------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_MIDS_on_its_targets_without_nans(
        reconstructed_mids, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated MIDS classifier on predictive performance")

    target_attrs: List[TargetAttr] = mids_classifier.target_attrs
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str(' ')}")
        logger.info("\t---")

    store_mids_target_attr_to_score_info(
        mids_target_attr_to_score_info_abs_file_name, target_attr_to_score_info_map)
    logger.info(f"Wrote MIDS Dict[TargetAttr, ScoreInfo] to {mids_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics = \
        MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(mids_classifier.rules),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated MIDS classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    store_mids_interpret_stats(mids_interpret_stats_abs_file_name, interpret_stats)
    logger.info(f"Wrote MIDSInterpretabilityStatistics to {mids_interpret_stats_abs_file_name}")
    logger.info("---")
    close_logger(logger)
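
# evaluate_single_target_mids_model_for_dataset_fold is the generic evaluator:
# callers supply the logger and output file names. A hedged call sketch for the
# filtered-CAR variant, reusing path helpers that appear elsewhere in this
# module; the confidence value and the two output paths are placeholders.
def _example_evaluate_single_target_car_mids(dataset_name: str, fold_i: int,
                                             target_attribute: str) -> None:
    confidence_boundary = 0.75
    evaluate_single_target_mids_model_for_dataset_fold(
        dataset_name=dataset_name,
        fold_i=fold_i,
        logger_name=f'evaluate_st_car_mids_{dataset_name}{fold_i}_{target_attribute}',
        logger_file_name=os.path.join(
            assoc_vs_tree_based_single_target_mids_clf_dir(),
            f'{dataset_name}{fold_i}_{target_attribute}_evaluation.log'),
        mids_classifier_abs_file_name=get_single_target_filtered_car_mids_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            target_attribute=target_attribute,
            confidence_boundary_val=confidence_boundary),
        mids_target_attr_to_score_info_abs_file_name='score_info.pkl',  # placeholder path
        mids_interpret_stats_abs_file_name='interpret_stats.pkl')  # placeholder path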
def create_single_target_tree_based_mcars(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        classifier_indicator: SingleTargetClassifierIndicator,
        confidence_boundary_val: float,
        min_support: float,
        max_depth: int,
        seed: int):
    train_test = TrainTestEnum.train

    relative_name: str = get_single_target_tree_rules_relative_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support, max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    logger = create_logger(
        logger_name='create_single_target_tree_rules' + relative_name,
        log_file_name=os.path.join(
            assoc_vs_tree_based_single_target_car_dir(),
            f'{relative_name}_single_target_tree_rule_generation.log'))

    # --- determine how many tree-based rules to generate --------------------------------------------------------------
    logger.info(f"Start reading MCARS for {dataset_name}{fold_i}_{target_attribute}"
                f" (confidence {confidence_boundary_val})")
    st_mcars_abs_file_name = get_single_target_filtered_cars_abs_filename(
        dataset_name, fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary_val)
    filtered_st_mcars: List[MCAR] = load_mcars(st_mcars_abs_file_name)
    logger.info(f"Total nb of MCARS for {dataset_name}{fold_i}_{target_attribute}"
                f" (conf {confidence_boundary_val}): {len(filtered_st_mcars)}")

    n_tree_rules_to_generate = len(filtered_st_mcars)
    logger.info(f"Generate {n_tree_rules_to_generate} tree-based rules")

    # --- load train data ---------------------------------------------------------------------------------------------
    df_original = pd.read_csv(
        get_original_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')
    df_one_hot_encoded = pd.read_csv(
        get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i, train_test),
        delimiter=',')

    encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
        get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

    # --- prepare data ------------------------------------------------------------------------------------------------
    original_group_to_predict: List[Attr] = [target_attribute]
    original_target_attr_set = set(original_group_to_predict)

    logger.info(f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}")
    prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
        df_original=df_original,
        df_one_hot_encoded=df_one_hot_encoded,
        encoding_book_keeper=encoding_book_keeper,
        original_target_attr_set=original_target_attr_set,
    )

    random_forest_abs_file_name: str = get_single_target_random_forest_absolute_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support, max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    # --- Generate the required nb of tree-based rules -----------------------------------------------------------------
    logger.info("Start generating tree-based rules")
    tree_based_mcars: List[MCAR]
    tree_rule_gen_timing_info: TreeRuleGenTimingInfo
    tree_based_mcars, tree_rule_gen_timing_info = generate_n_single_target_tree_rules(
        n_tree_rules_to_generate=n_tree_rules_to_generate,
        prepared_data=prepared_data,
        encoding_book_keeper=encoding_book_keeper,
        min_support=min_support,
        max_depth=max_depth,
        logger=logger,
        seed=seed,
        random_forest_abs_file_name=random_forest_abs_file_name)

    # --- Save the generated tree-based rules --------------------------------------------------------------------------
    tree_based_rules_abs_file_name: str = get_single_target_tree_rules_absolute_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support, max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_mcars(tree_based_rules_abs_file_name, tree_based_mcars)
    logger.info(f"finished writing tree-derived rules to file: {tree_based_rules_abs_file_name}")

    tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_absolute_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support, max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)
    store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name, tree_rule_gen_timing_info)

    logger.info("==================================================================")
    close_logger(logger)
def learn_tree_based_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
):
    logger = create_logger(
        logger_name=f'learn_mids_model{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(
            get_tree_based_mids_dir(),
            f'{dataset_name}{fold_i}_model_induction_tree_derived_rules.log')
    )

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in the original (discretized) training data
    df_original_train = pd.read_csv(
        get_original_data_fold_abs_file_name(dataset_name, fold_i, TrainTestEnum.train),
        delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
        dataset_name, fold_i, classifier_indicator, nb_of_trees_per_model,
        nb_of_original_targets_to_predict, min_support, max_depth)

    logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
    mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
    logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(df_original_train,
             class_association_rules=mcars,
             debug=debug_mids_fitting,
             algorithm=algorithm,
             # lambda_array=lambda_array,
             use_targets_from_rule_set=False,
             )
    logger.info("finished MIDS model induction")

    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)
    logger.info(f"Selected {len(mids_classifier.rules)} out of {len(mcars)} rules "
                f"({(len(mids_classifier.rules) / len(mcars) * 100):.2f}%)")

    logger.info("start saving MIDS model")
    tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth
    )
    store_mids_classifier(tree_based_mids_classifier_abs_file_name, mids_classifier)
    logger.info(f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}")
    close_logger(logger)
def evaluate_greedy_model_for_dataset_fold_target_attribute(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int):
    logger = create_logger(
        logger_name='evaluate_greedy_model_tree_derived_' +
        get_tree_derived_rules_rel_file_name_without_extension(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth),
        log_file_name=os.path.join(
            greedy_models_tree_based_dir(),
            get_tree_derived_rules_rel_file_name_without_extension(
                dataset_name=dataset_name, fold_i=fold_i,
                classifier_indicator=classifier_indicator,
                nb_of_trees_per_model=nb_of_trees_per_model,
                nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                min_support=min_support, max_depth=max_depth)
            + '_greedy_model_evaluation_tree_derived_rules.log'))

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in the original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(
        original_test_data_fold_abs_file_name, delimiter=',')

    # --- load classifier ---------------------------------------------------------------------------------------------
    tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)

    logger.info(f"start loading greedy model from {tree_based_greedy_clf_abs_file_name}")
    greedy_classifier: GreedyRoundRobinTargetRuleClassifier = load_greedy_naive_classifier(
        tree_based_greedy_clf_abs_file_name)
    logger.info("finished loading greedy model")
    logger.info(greedy_classifier)

    # --- Evaluate and store predictive performance -------------------------------------------------------------------
    filter_nans: bool = True
    target_attr_to_score_info_map: Dict[str, ScoreInfo] = score_mt_clf_on_its_targets_without_nans(
        greedy_classifier, df_test_original_column_order, filter_nans=filter_nans)
    logger.info("Evaluated greedy classifier on predictive performance")

    target_attrs: List[TargetAttr] = greedy_classifier.target_attributes
    for target_attr in target_attrs:
        target_attr_score_info: ScoreInfo = target_attr_to_score_info_map[target_attr]
        logger.info(f"\t{target_attr}:\n {target_attr_score_info.to_str(' ')}")
        logger.info("\t---")

    tree_based_greedy_clf_target_attr_to_score_info_abs_file_name: str = \
        get_tree_based_greedy_clf_target_attr_to_score_info_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
    store_mids_target_attr_to_score_info(
        tree_based_greedy_clf_target_attr_to_score_info_abs_file_name,
        target_attr_to_score_info_map)
    logger.info(f"Wrote greedy Dict[TargetAttr, ScoreInfo] to"
                f" {tree_based_greedy_clf_target_attr_to_score_info_abs_file_name}")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    interpret_stats: MIDSInterpretabilityStatistics = \
        MIDSInterpretabilityStatisticsCalculator.calculate_ruleset_statistics(
            MIDSRuleSet(greedy_classifier.learned_rule_set),
            df_test_original_column_order,
            target_attributes=target_attrs)
    logger.info("Evaluated greedy classifier on interpretability")
    logger.info(interpret_stats.to_str("\n"))

    tree_based_greedy_clf_interpret_stats_abs_file_name: str = get_tree_based_greedy_clf_interpret_stats_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
        min_support=min_support, max_depth=max_depth)
    store_mids_interpret_stats(tree_based_greedy_clf_interpret_stats_abs_file_name, interpret_stats)
    logger.info(f"Wrote InterpretabilityStatistics to {tree_based_greedy_clf_interpret_stats_abs_file_name}")
    logger.info("---")
    close_logger(logger)
def learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
        dataset_name: str,
        fold_i: int,
        target_attribute: str,
        confidence_boundary: float):
    relative_name: str = get_single_target_filtered_car_mids_relative_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)

    log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name='learn_single_target_filtered_car_mids_' + relative_name,
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_filtered_car_mids.log'))

    # --- load train data ---------------------------------------------------------------------------------------------
    # read in the original (discretized) training data
    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    df_train_original_column_order = pd.read_csv(
        original_train_data_fold_abs_file_name, delimiter=',')

    # --- load association rules --------------------------------------------------------------------------------------
    filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)

    logger.info(f"Reading single-target CARs from file: {filtered_st_mcars_abs_file_name}")
    st_mcar_list: List[MCAR] = load_mcars(filtered_st_mcars_abs_file_name)
    ground_set_size: int = len(st_mcar_list)
    if ground_set_size <= 0:
        raise Exception(
            f"Ground set size is {ground_set_size} for {dataset_name}{fold_i} {target_attribute}")
    logger.info(f"ground set size (nb of initial MCARs): {len(st_mcar_list)}")

    # --- Fit and save classifier -------------------------------------------------------------------------------------
    algorithm = "RDGS"
    debug_mids_fitting = False

    mids = MIDSValueReuse()
    mids.normalize = True
    logger.info("start MIDS model induction")
    mids.fit(
        df_train_original_column_order,
        use_targets_from_rule_set=True,
        class_association_rules=st_mcar_list,
        debug=debug_mids_fitting,
        algorithm=algorithm,
    )
    logger.info("finished MIDS model induction")

    mids_classifier: MIDSClassifier = mids.classifier
    logger.info(mids_classifier)

    logger.info("start saving MIDS model")
    mids_classifier_abs_file_name: str = get_single_target_filtered_car_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)
    store_mids_classifier(mids_classifier_abs_file_name, mids_classifier)
    logger.info(f"finished saving MIDS model to file: {mids_classifier_abs_file_name}")
    close_logger(logger)
def merge_single_target_mids_models_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        nb_of_trees_to_use: int,
        min_support: float,
        max_depth: int
):
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    relative_name: str = get_merged_single_target_tree_mids_relative_file_name_without_extension(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth,
    )

    log_file_dir: str = get_merged_single_target_mids_clf_dir()
    logger_name: str = 'merge_single_target_mids_models__' + relative_name
    logger_file_name: str = os.path.join(
        log_file_dir, f'{relative_name}_model_merging_single_target_tree_mids.log')
    logger = create_logger(logger_name=logger_name, log_file_name=logger_file_name)

    original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.train)
    target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)

    merged_st_clf = MergedSTMIDSClassifier()
    for target_attribute in target_columns:
        st_mids_classifier_abs_file_name: str = get_single_target_tree_mids_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            target_attribute=target_attribute,
            classifier_indicator=classifier_indicator,
            nb_of_trees_per_model=nb_of_trees_to_use,
            min_support=min_support, max_depth=max_depth
        )

        # --- load single-target classifier ---------------------------------------------------------------------------
        logger.info(f"start loading MIDS model from {st_mids_classifier_abs_file_name}")
        st_mids_classifier: MIDSClassifier = load_mids_classifier(st_mids_classifier_abs_file_name)
        logger.info("finished loading MIDS model")
        logger.info(st_mids_classifier)

        merged_st_clf.add_single_target_model(st_mids_classifier)

        st_tree_rule_gen_timing_info_abs_file_name: str = get_single_target_tree_rules_gen_timing_info_abs_file_name(
            dataset_name, fold_i, target_attribute, classifier_indicator,
            nb_of_trees_to_use, min_support, max_depth
        )
        st_tree_rule_gen_timing_info: TreeRuleGenTimingInfo = load_tree_rule_gen_timing_info(
            st_tree_rule_gen_timing_info_abs_file_name)

        st_total_time_decision_tree_learning_s = st_tree_rule_gen_timing_info.total_time_decision_tree_learning_s
        st_total_time_rf_conversion_s = st_tree_rule_gen_timing_info.total_time_rf_conversion_s
        st_total_rule_gen_time_s: float = st_total_time_decision_tree_learning_s + st_total_time_rf_conversion_s
        merged_st_clf.add_rule_generation_time(st_total_rule_gen_time_s)

    # --- load test data ----------------------------------------------------------------------------------------------
    # read in the original (discretized) test data
    original_test_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
        dataset_name, fold_i, TrainTestEnum.test)
    df_test_original_column_order = pd.read_csv(original_test_data_fold_abs_file_name, delimiter=',')

    # --- Evaluate and store predictive performance -------------------------------------------------------------------
    filter_nans: bool = True
    merged_st_clf.calculate_score_info(test_dataframe=df_test_original_column_order,
                                       filter_nans=filter_nans)
    logger.info("Evaluated MERGED MIDS classifier on predictive performance")

    # --- Evaluate and store interpretability statistics --------------------------------------------------------------
    merged_st_clf.calculate_ruleset_interpretability_statistics(
        test_dataframe=df_test_original_column_order, target_attributes=target_columns)
    logger.info("Evaluated MERGED MIDS classifier on interpretability")

    # --- store merged classifier -------------------------------------------------------------------------------------
    logger.info("start saving merged single target MIDS model")
    merged_st_clf_abs_file_name: str = get_merged_single_target_tree_mids_clf_abs_file_name(
        dataset_name=dataset_name, fold_i=fold_i,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_to_use,
        min_support=min_support, max_depth=max_depth
    )
    store_merged_st_mids_model(merged_st_clf_abs_file_name, merged_st_clf)
    logger.info(f"finished saving merged single target MIDS model to file: {merged_st_clf_abs_file_name}")
    logger.info("---")
    close_logger(logger)
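
# A hypothetical end-to-end ordering of the multi-target steps in this module
# for a single dataset fold; all parameter values are placeholders. Rule
# generation must precede model induction, which must precede evaluation.
def _example_pipeline_for_fold(dataset_name: str, fold_i: int) -> None:
    one_hot_encode_dataset_fold(dataset_name, fold_i, ohe_prefix_separator='=')
    learn_and_convert_tree_model_to_rules(
        dataset_name, fold_i,
        nb_of_trees_per_model=10,
        nb_of_original_targets_to_predict=2,
        nb_grouping_iterations=5,
        min_support=0.1, max_depth=7, seed=42)
    learn_tree_based_mids_model_for_dataset_fold(
        dataset_name, fold_i,
        classifier_indicator=SingleTargetClassifierIndicator.random_forest,
        nb_of_trees_per_model=10,
        nb_of_original_targets_to_predict=2,
        min_support=0.1, max_depth=7)
    evaluate_mids_model_for_dataset_fold_target_attribute(
        dataset_name, fold_i,
        classifier_indicator=SingleTargetClassifierIndicator.random_forest,
        nb_of_trees_per_model=10,
        nb_of_original_targets_to_predict=2,
        min_support=0.1, max_depth=7)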