Exemple #1
0
def learn_tree_based_greedy_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
) -> None:
    """Fit a greedy round-robin rule classifier on tree-derived rules and store it.

    The training data of the given dataset fold and the previously generated
    tree-derived MCARs are loaded from disk, a
    ``GreedyRoundRobinTargetRuleClassifier`` is fitted on them, and the fitted
    classifier is written to disk. Progress is written to a dedicated log file.

    Args:
        dataset_name: Name of the dataset; used to locate the data and rule
            files and to name the log and model files.
        fold_i: Index of the fold; used together with ``dataset_name``.
        classifier_indicator: Forwarded to the file-name helpers to identify
            the rule file to read and the model file to write.
        nb_of_trees_per_model: Forwarded to the file-name helpers.
        nb_of_original_targets_to_predict: Forwarded to the file-name helpers.
        min_support: Forwarded to the file-name helpers.
        max_depth: Forwarded to the file-name helpers.
    """
    logger = create_logger(
        logger_name=f'learn_greedy_model_{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(greedy_models_tree_based_dir(),
                                   f'{dataset_name}{fold_i}_greedy_model_induction_tree_derived_rules.log')
    )
    # Close the logger even when loading, fitting or storing fails, so its
    # resources are not left open after an exception.
    try:
        # --- load train data -------------------------------------------------
        # read in original (discretized) training data
        train_data_abs_file_name = get_original_data_fold_abs_file_name(
            dataset_name, fold_i, TrainTestEnum.train)
        df_original_train = pd.read_csv(train_data_abs_file_name, delimiter=',')

        # --- load association rules ------------------------------------------
        tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
            dataset_name,
            fold_i,
            classifier_indicator,
            nb_of_trees_per_model,
            nb_of_original_targets_to_predict,
            min_support,
            max_depth)
        logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
        mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)

        # The greedy classifier works on a set of MIDSRule wrappers.
        mids_rules: Set[MIDSRule] = {MIDSRule(mcar) for mcar in mcars}
        logger.info(f"ground set size (nb of initial MCARs): {len(mids_rules)}")

        # --- Fit and save classifier -----------------------------------------
        greedy_clf = GreedyRoundRobinTargetRuleClassifier(df_original_train.columns, verbose=False)
        # fit returns a pair; the second element (scores) is not needed here.
        selected_set, _ = greedy_clf.fit(ground_set=mids_rules, training_data=df_original_train)

        nb_of_mcars = len(mcars)
        # Guard against an empty rule file: avoid a ZeroDivisionError in the log line.
        selected_percentage = (len(selected_set) / nb_of_mcars * 100) if nb_of_mcars else 0.0
        logger.info(f"Selected {len(selected_set)} out of {nb_of_mcars} rules "
                    f"({selected_percentage:.2f}%)")

        logger.info("start saving Naive greedy model")
        tree_based_greedy_clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
        store_greedy_naive_classifier(tree_based_greedy_clf_abs_file_name, greedy_clf)
        logger.info(f"finished saving greedy clf to file: {tree_based_greedy_clf_abs_file_name}")
    finally:
        close_logger(logger)
def learn_single_target_tree_mids_model_for_dataset_fold(
        dataset_name: str, fold_i: int, target_attribute: str,
        nb_of_trees_per_model: int, min_support: float, max_depth: int) -> None:
    """Fit a MIDS model for one target attribute on tree-derived rules and store it.

    The training data of the given dataset fold and the single-target
    tree-derived MCARs are loaded from disk, a ``MIDSValueReuse`` model is
    fitted with ``target_attribute`` as the only target, and the resulting
    classifier is written to disk. Progress is written to a dedicated log file.

    Args:
        dataset_name: Name of the dataset; used to locate and name files.
        fold_i: Index of the fold; used together with ``dataset_name``.
        target_attribute: The single attribute passed to MIDS as target; also
            used by the file-name helpers.
        nb_of_trees_per_model: Forwarded to the file-name helpers.
        min_support: Forwarded to the file-name helpers.
        max_depth: Forwarded to the file-name helpers.
    """
    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    # Every file-name helper used below takes the same identifying arguments.
    naming_kwargs = dict(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        nb_of_trees_per_model=nb_of_trees_per_model,
        min_support=min_support,
        max_depth=max_depth,
    )

    relative_name: str = get_single_target_tree_rules_relative_file_name_without_extension(
        **naming_kwargs)
    log_file_dir: str = get_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name=f'learn_single_target_tree_mids_{relative_name}',
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_tree_mids.log'))

    # Close the logger even when loading, fitting or storing fails.
    try:
        # --- load train data -------------------------------------------------
        # read in original (discretized) training data
        original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
            dataset_name, fold_i, TrainTestEnum.train)
        df_train_original_column_order = pd.read_csv(
            original_train_data_fold_abs_file_name, delimiter=',')

        # --- load association rules ------------------------------------------
        tree_clf_derived_rules_abs_file_name: str = get_single_target_tree_rules_abs_file_name(
            **naming_kwargs)
        logger.info(
            f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
        mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
        logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

        # --- Fit and save classifier -----------------------------------------
        algorithm = "RDGS"
        debug_mids_fitting = False

        mids = MIDSValueReuse()
        mids.normalize = True
        logger.info("start MIDS model induction")
        mids.fit(
            df_train_original_column_order,
            # The target is given explicitly instead of being taken from the rules.
            targets_to_use=[target_attribute],
            use_targets_from_rule_set=False,
            class_association_rules=mcars,
            debug=debug_mids_fitting,
            algorithm=algorithm,
        )
        logger.info("finished MIDS model induction")
        mids_classifier: MIDSClassifier = mids.classifier
        logger.info(mids_classifier)

        logger.info("start saving MIDS model")
        tree_based_mids_classifier_abs_file_name = get_single_target_tree_mids_clf_abs_file_name(
            **naming_kwargs)
        store_mids_classifier(tree_based_mids_classifier_abs_file_name,
                              mids_classifier)
        logger.info(
            f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}"
        )
    finally:
        close_logger(logger)
Exemple #3
0
def learn_tree_based_mids_model_for_dataset_fold(
        dataset_name: str,
        fold_i: int,
        classifier_indicator: SingleTargetClassifierIndicator,
        nb_of_trees_per_model: int,
        nb_of_original_targets_to_predict: int,
        min_support: float,
        max_depth: int
) -> None:
    """Fit a MIDS model on tree-derived rules for one dataset fold and store it.

    The training data of the given dataset fold and the previously generated
    tree-derived MCARs are loaded from disk, a ``MIDSValueReuse`` model is
    fitted on them, and the resulting classifier is written to disk. Progress
    is written to a dedicated log file.

    Args:
        dataset_name: Name of the dataset; used to locate the data and rule
            files and to name the log and model files.
        fold_i: Index of the fold; used together with ``dataset_name``.
        classifier_indicator: Forwarded to the file-name helpers to identify
            the rule file to read and the model file to write.
        nb_of_trees_per_model: Forwarded to the file-name helpers.
        nb_of_original_targets_to_predict: Forwarded to the file-name helpers.
        min_support: Forwarded to the file-name helpers.
        max_depth: Forwarded to the file-name helpers.
    """
    logger = create_logger(
        logger_name=f'learn_mids_model{dataset_name}{fold_i}_tree_derived_rules',
        log_file_name=os.path.join(get_tree_based_mids_dir(),
                                   f'{dataset_name}{fold_i}_model_induction_tree_derived_rules.log')
    )
    # Close the logger even when loading, fitting or storing fails, so its
    # resources are not left open after an exception.
    try:
        # --- load train data -------------------------------------------------
        # read in original (discretized) training data
        train_data_abs_file_name = get_original_data_fold_abs_file_name(
            dataset_name, fold_i, TrainTestEnum.train)
        df_original_train = pd.read_csv(train_data_abs_file_name, delimiter=',')

        # --- load association rules ------------------------------------------
        tree_clf_derived_rules_abs_file_name = get_tree_derived_rules_abs_file_name(
            dataset_name,
            fold_i,
            classifier_indicator,
            nb_of_trees_per_model,
            nb_of_original_targets_to_predict,
            min_support,
            max_depth)
        logger.info(f"Reading MCARs from file: {tree_clf_derived_rules_abs_file_name}")
        mcars: List[MCAR] = load_mcars(tree_clf_derived_rules_abs_file_name)
        logger.info(f"ground set size (nb of initial MCARs): {len(mcars)}")

        # --- Fit and save classifier -----------------------------------------
        algorithm = "RDGS"
        debug_mids_fitting = False

        mids = MIDSValueReuse()
        mids.normalize = True
        logger.info("start MIDS model induction")
        mids.fit(
            df_original_train,
            class_association_rules=mcars,
            debug=debug_mids_fitting,
            algorithm=algorithm,
            use_targets_from_rule_set=False,
        )
        logger.info("finished MIDS model induction")
        mids_classifier: MIDSClassifier = mids.classifier
        logger.info(mids_classifier)

        nb_of_selected_rules = len(mids_classifier.rules)
        nb_of_mcars = len(mcars)
        # Guard against an empty rule file: avoid a ZeroDivisionError in the log line.
        selected_percentage = (nb_of_selected_rules / nb_of_mcars * 100) if nb_of_mcars else 0.0
        logger.info(f"Selected {nb_of_selected_rules} out of {nb_of_mcars} rules "
                    f"({selected_percentage:.2f}%)")

        logger.info("start saving MIDS model")
        tree_based_mids_classifier_abs_file_name = get_tree_based_mids_clf_abs_file_name(
            dataset_name=dataset_name, fold_i=fold_i,
            classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
            min_support=min_support, max_depth=max_depth
        )
        store_mids_classifier(tree_based_mids_classifier_abs_file_name, mids_classifier)
        logger.info(f"finished saving MIDS model to file: {tree_based_mids_classifier_abs_file_name}")
    finally:
        close_logger(logger)
def create_single_target_tree_based_mcars(
        dataset_name: str, fold_i: int, target_attribute: str,
        classifier_indicator: SingleTargetClassifierIndicator,
        confidence_boundary_val: float, min_support: float, max_depth: int,
        seed: int) -> None:
    """Generate tree-based single-target rules and store them with timing info.

    The number of rules to generate equals the number of filtered
    single-target MCARs stored for the same dataset fold, target attribute and
    confidence boundary. The training data (original and one-hot encoded) and
    the encoding book keeper are loaded, prepared for the target attribute and
    passed to ``generate_n_single_target_tree_rules``. The generated rules and
    the returned timing information are written to disk. Progress is written
    to a dedicated log file.

    Args:
        dataset_name: Name of the dataset; used to locate and name files.
        fold_i: Index of the fold; used together with ``dataset_name``.
        target_attribute: The single original attribute to generate rules for.
        classifier_indicator: Forwarded to the file-name helpers.
        confidence_boundary_val: Identifies which filtered MCAR file to read;
            also forwarded to the file-name helpers.
        min_support: Forwarded to the rule generator and the file-name helpers.
        max_depth: Forwarded to the rule generator and the file-name helpers.
        seed: Forwarded to the rule generator.
    """
    train_test = TrainTestEnum.train

    # Every tree-rule file-name helper used below takes the same identifying arguments.
    naming_kwargs = dict(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        classifier_indicator=classifier_indicator,
        min_support=min_support,
        max_depth=max_depth,
        confidence_boundary_val=confidence_boundary_val)

    relative_name: str = get_single_target_tree_rules_relative_file_name(**naming_kwargs)

    logger = create_logger(
        logger_name=f'create_single_target_tree_rules{relative_name}',
        log_file_name=os.path.join(
            assoc_vs_tree_based_single_target_car_dir(),
            f'{relative_name}_single_target_tree_rule_generation.log'))

    # Close the logger even when loading, generating or storing fails.
    try:
        logger.info(
            f"Start reading MCARS for {dataset_name}{fold_i}_{target_attribute}"
            f" (confidence {confidence_boundary_val})")
        st_mcars_abs_file_name = get_single_target_filtered_cars_abs_filename(
            dataset_name,
            fold_i,
            target_attribute=target_attribute,
            confidence_boundary_val=confidence_boundary_val)

        filtered_st_mcars: List[MCAR] = load_mcars(st_mcars_abs_file_name)
        logger.info(
            f"Total nb of MCARS for {dataset_name}{fold_i}_{target_attribute}"
            f" (conf {confidence_boundary_val}): {len(filtered_st_mcars)}")

        # Generate exactly as many tree-based rules as there are filtered MCARs.
        n_tree_rules_to_generate = len(filtered_st_mcars)
        logger.info(f"Generate {n_tree_rules_to_generate} tree based rules")

        # --- load train data -------------------------------------------------
        df_original = pd.read_csv(
            get_original_data_fold_abs_file_name(dataset_name, fold_i, train_test),
            delimiter=',')
        df_one_hot_encoded = pd.read_csv(
            get_one_hot_encoded_data_fold_abs_file_name(dataset_name, fold_i, train_test),
            delimiter=",")
        encoding_book_keeper: EncodingBookKeeper = load_encoding_book_keeper(
            get_encodings_book_keeper_abs_file_name_for(dataset_name, fold_i))

        # --- prepare data ----------------------------------------------------
        original_group_to_predict: List[Attr] = [target_attribute]
        original_target_attr_set = set(original_group_to_predict)

        logger.info(
            f"Fetching the necessary columns for {dataset_name}{fold_i} {original_target_attr_set}"
        )

        prepared_data: PreparedDataForTargetSet = PreparedDataForTargetSet.prepare_data_for_target_set(
            df_original=df_original,
            df_one_hot_encoded=df_one_hot_encoded,
            encoding_book_keeper=encoding_book_keeper,
            original_target_attr_set=original_target_attr_set,
        )

        random_forest_abs_file_name: str = get_single_target_random_forest_absolute_file_name(
            **naming_kwargs)

        # --- Generate the required nb of tree-based rules --------------------
        logger.info("Start generating tree-based rules")
        tree_based_mcars: List[MCAR]
        tree_rule_gen_timing_info: TreeRuleGenTimingInfo
        tree_based_mcars, tree_rule_gen_timing_info = generate_n_single_target_tree_rules(
            n_tree_rules_to_generate=n_tree_rules_to_generate,
            prepared_data=prepared_data,
            encoding_book_keeper=encoding_book_keeper,
            min_support=min_support,
            max_depth=max_depth,
            logger=logger,
            seed=seed,
            random_forest_abs_file_name=random_forest_abs_file_name)

        # --- SAVE the generated tree-based rules
        tree_based_rules_abs_file_name: str = get_single_target_tree_rules_absolute_file_name(
            **naming_kwargs)
        store_mcars(tree_based_rules_abs_file_name, tree_based_mcars)
        logger.info(
            f"finished writing tree-derived ruled to file: {tree_based_rules_abs_file_name}"
        )

        # --- SAVE the timing info of the rule generation
        tree_rule_gen_timing_info_abs_file_name: str = (
            get_single_target_tree_rules_gen_timing_info_absolute_file_name(**naming_kwargs))
        store_tree_rule_gen_timing_info(tree_rule_gen_timing_info_abs_file_name,
                                        tree_rule_gen_timing_info)

        logger.info(
            "==================================================================")
    finally:
        close_logger(logger)
def learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
        dataset_name: str, fold_i: int, target_attribute: str,
        confidence_boundary: float) -> None:
    """Fit a MIDS model on filtered single-target CARs and store it.

    The training data of the given dataset fold and the filtered single-target
    MCARs for ``target_attribute`` and ``confidence_boundary`` are loaded from
    disk, a ``MIDSValueReuse`` model is fitted using the targets found in the
    rule set, and the resulting classifier is written to disk. Progress is
    written to a dedicated log file.

    Args:
        dataset_name: Name of the dataset; used to locate and name files.
        fold_i: Index of the fold; used together with ``dataset_name``.
        target_attribute: Target attribute identifying the rule file to read
            and the model file to write.
        confidence_boundary: Passed to the file-name helpers as
            ``confidence_boundary_val``.

    Raises:
        Exception: If the loaded rule file contains no rules.
    """
    # Every file-name helper used below takes the same identifying arguments.
    naming_kwargs = dict(
        dataset_name=dataset_name,
        fold_i=fold_i,
        target_attribute=target_attribute,
        confidence_boundary_val=confidence_boundary)

    relative_name: str = get_single_target_filtered_car_mids_relative_file_name(
        **naming_kwargs)

    log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
    logger = create_logger(
        logger_name=f'learn_single_target_filtered_car_mids_{relative_name}',
        log_file_name=os.path.join(
            log_file_dir,
            f'{relative_name}_model_induction_single_target_filtered_car_mids.log'
        ))

    # Close the logger on every exit path, including the empty-ground-set
    # error raised below, so its resources are not left open.
    try:
        # --- load train data -------------------------------------------------
        # read in original (discretized) training data
        original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
            dataset_name, fold_i, TrainTestEnum.train)
        df_train_original_column_order = pd.read_csv(
            original_train_data_fold_abs_file_name, delimiter=',')

        # --- load association rules ------------------------------------------
        filtered_st_mcars_abs_file_name: str = get_single_target_filtered_cars_abs_filename(
            **naming_kwargs)
        logger.info(
            f"Reading single-target CARs from file: {filtered_st_mcars_abs_file_name}"
        )
        st_mcar_list: List[MCAR] = load_mcars(filtered_st_mcars_abs_file_name)

        # An empty ground set is rejected before any fitting is attempted.
        ground_set_size: int = len(st_mcar_list)
        if ground_set_size <= 0:
            raise Exception(
                f"Ground set size is {ground_set_size} for {dataset_name}{fold_i} {target_attribute}"
            )
        logger.info(f"ground set size (nb of initial MCARs): {ground_set_size}")

        # --- Fit and save classifier -----------------------------------------
        algorithm = "RDGS"
        debug_mids_fitting = False

        mids = MIDSValueReuse()
        mids.normalize = True
        logger.info("start MIDS model induction")
        mids.fit(
            df_train_original_column_order,
            use_targets_from_rule_set=True,
            class_association_rules=st_mcar_list,
            debug=debug_mids_fitting,
            algorithm=algorithm,
        )
        logger.info("finished MIDS model induction")
        mids_classifier: MIDSClassifier = mids.classifier
        logger.info(mids_classifier)

        logger.info("start saving MIDS model")

        mids_classifier_abs_file_name: str = get_single_target_filtered_car_mids_clf_abs_file_name(
            **naming_kwargs)
        store_mids_classifier(mids_classifier_abs_file_name, mids_classifier)
        logger.info(
            f"finished saving MIDS model to file: {mids_classifier_abs_file_name}")
    finally:
        close_logger(logger)