import os
from typing import Dict, List, Tuple

from dask import delayed
from dask.delayed import Delayed
from dask.distributed import Client

# NOTE: the project-specific helpers used below (reconnect_client_to_ssh_cluster,
# compute_delayed_functions, the various get_*_file_name/_dir helpers, the
# learn_*/merge_*/evaluate_*/create_* entry points, SingleTargetClassifierIndicator
# and TrainTestEnum) come from the surrounding `experiments` package; their exact
# module paths are omitted here.


def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    # shadow the imported benchmark list with a single dataset (iris only)
    datasets = [dict(filename="iris", targetvariablename="class", numerical=True)]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    min_support = 0.1
    max_depth = 7
    nb_of_trees_to_use_list: List[int] = [50]

    nb_of_folds: int = 10

    # run locally by default; set to True to distribute the runs over the dask cluster
    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):
            for nb_of_trees_to_use in nb_of_trees_to_use_list:
                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name, fold_i=fold_i,
                        nb_of_trees_to_use=nb_of_trees_to_use, min_support=min_support,
                        max_depth=max_depth
                    )

                    delayed_func = \
                        delayed(merge_single_target_mids_models_for_dataset_fold)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    merge_single_target_mids_models_for_dataset_fold(
                        dataset_name=dataset_name, fold_i=fold_i,
                        nb_of_trees_to_use=nb_of_trees_to_use, min_support=min_support,
                        max_depth=max_depth
                    )

    if use_dask:
        log_file_dir: str = get_merged_single_target_mids_clf_dir()

        logger_name: str = 'merge_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_merging_single_target_tree_mids.log'
        )

        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name
        )
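# `compute_delayed_functions` is project code; the sketch below shows one way it
# could be implemented, inferred from the call sites in this file. The retry and
# logging behaviour shown here is an assumption, not the project's actual code.
import logging


def compute_delayed_functions_sketch(
        list_of_computations: List[Tuple[Delayed, Dict]],
        client: Client,
        nb_of_retries_if_erred: int,
        error_logger_name: str,
        error_logger_file_name: str
) -> None:
    error_logger = logging.getLogger(error_logger_name)
    error_logger.addHandler(logging.FileHandler(error_logger_file_name))

    delayed_functions = [computation for computation, _ in list_of_computations]
    # dask retries each failed task `retries` times before marking it as erred
    futures = client.compute(delayed_functions, retries=nb_of_retries_if_erred)
    for future, (_, func_args) in zip(futures, list_of_computations):
        try:
            future.result()  # block until the task finishes (or finally errs)
        except Exception as exc:
            error_logger.error(f"computation with args {func_args} failed: {exc}")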
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [dict(filename="iris", targetvariablename="class", numerical=True)]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    nb_of_folds: int = 10
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    nb_of_original_targets_to_predict: int = 2
    nb_of_trees_per_model_list: List[int] = [5, 10]
    min_support: float = 0.1  # passed down as min_samples_leaf: must be an int >= 1 or a float in (0, 0.5]

    max_depth: int = 7 - nb_of_original_targets_to_predict

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']

        for fold_i in range(nb_of_folds):

            for nb_of_trees_per_model in nb_of_trees_per_model_list:

                clf_abs_file_name = get_tree_based_greedy_clf_abs_file_name(
                    dataset_name=dataset_name, fold_i=fold_i,
                    classifier_indicator=classifier_indicator, nb_of_trees_per_model=nb_of_trees_per_model,
                    nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                    min_support=min_support, max_depth=max_depth
                )
                n_days_in_hours = 18

                # refit only if the classifier file is missing or older than the threshold
                should_refit: bool = file_does_not_exist_or_has_been_created_earlier_than_(
                    clf_abs_file_name,
                    n_days_in_hours
                )

                if should_refit:

                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            classifier_indicator=classifier_indicator,
                            nb_of_trees_per_model=nb_of_trees_per_model,
                            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = \
                            delayed(learn_tree_based_greedy_model_for_dataset_fold)(
                                **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_tree_based_greedy_model_for_dataset_fold(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            classifier_indicator=classifier_indicator,
                            nb_of_trees_per_model=nb_of_trees_per_model,
                            nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                            min_support=min_support,
                            max_depth=max_depth
                        )

    if use_dask:
        log_file_dir: str = greedy_models_tree_based_dir()

        logger_name: str = 'greedy_model_induction_tree_derived_rules_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_greedy_model_induction_tree_derived_rules.log'
        )

        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name
        )
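# Hedged sketch of the staleness check used above: the real helper is project
# code. This version assumes the second argument is an age threshold in hours
# and uses the file's modification time as a proxy for its creation time.
import time


def file_does_not_exist_or_has_been_created_earlier_than_sketch(
        abs_file_name: str, threshold_in_hours: float) -> bool:
    if not os.path.exists(abs_file_name):
        return True
    age_in_hours = (time.time() - os.path.getmtime(abs_file_name)) / 3600.0
    return age_in_hours > threshold_in_hours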
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    min_support = 0.1
    max_depth = 7

    nb_of_trees_to_use_list: List[int] = [25, 50]

    nb_of_folds: int = 10
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):

            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for nb_of_trees_to_use in nb_of_trees_to_use_list:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = delayed(
                            learn_single_target_tree_mids_model_for_dataset_fold
                        )(**func_args)
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_tree_mids_model_for_dataset_fold(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

    if use_dask:
        log_file_dir: str = get_single_target_mids_clf_dir()

        logger_name = 'learn_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
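# Hedged sketch of `get_header_attributes`: the call site suggests it returns
# the column names of a fold's data file so that each attribute can be used as
# a prediction target in turn. The CSV file format is an assumption.
import csv


def get_header_attributes_sketch(abs_file_name: str) -> List[str]:
    with open(abs_file_name, newline='') as data_file:
        return next(csv.reader(data_file))  # first row = header attributes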
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    classifier_indicator: SingleTargetClassifierIndicator = SingleTargetClassifierIndicator.random_forest
    seed: int = 3
    min_support = 0.1
    max_depth = 7

    confidence_boundary_values: List[float] = [0.75, 0.95]

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']

        for fold_i in range(10):
            # --- Select subsets with a varying nb of rules (here: a varying confidence) --------------------------
            # --- For each of the different sets of association rules,
            #       generate a set of tree-based rules of the same size (*).
            for confidence_boundary_val in confidence_boundary_values:
                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        confidence_boundary_val=confidence_boundary_val,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)

                    delayed_func = \
                        delayed(create_single_target_tree_based_mcars)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    create_single_target_tree_based_mcars(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        confidence_boundary_val=confidence_boundary_val,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)

            # Result: pairs of (association rule, tree-based rule) sets, with an increasing number of rules.

            # --- Learn an (M)IDS model for each of the two rule sets in a pair. ------------------------------

            # --- Evaluate the learned IDS models using the chosen evaluation metrics. ------------------------

            # --- Plot the evaluation metrics in function of the increasing number of rules. ------------------

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_car_dir()

        logger_name: str = 'create_single_target_tree_rules_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir, 'ERROR_LOG_single_target_tree_rule_generation.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
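# Hedged sketch of `reconnect_client_to_ssh_cluster`: presumably it attaches a
# client to the scheduler of an already-running SSH cluster. The default dask
# scheduler port (8786) is an assumption.


def reconnect_client_to_ssh_cluster_sketch(scheduler_host: str) -> Client:
    # connect to the existing scheduler instead of spinning up a new cluster
    return Client(address=f'{scheduler_host}:8786')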
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [dict(filename="iris", targetvariablename="class", numerical=True)]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    confidence_boundary_values: List[float] = [0.75, 0.95]

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host=scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            for confidence_boundary_val in confidence_boundary_values:
                relative_name: str = get_single_target_filtered_car_mids_relative_file_name(
                    dataset_name=dataset_name, fold_i=fold_i,
                    target_attribute=target_attribute,
                    confidence_boundary_val=confidence_boundary_val
                )
                log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()

                logger_name: str = f'evaluate_single_target_car_mids_{relative_name}'
                logger_file_name: str = os.path.join(
                    log_file_dir,
                    f'{relative_name}_model_evaluation_single_target_car_mids.log'
                )

                mids_classifier_abs_file_name: str = get_single_target_filtered_car_mids_clf_abs_file_name(
                    dataset_name=dataset_name, fold_i=fold_i,
                    target_attribute=target_attribute,
                    confidence_boundary_val=confidence_boundary_val
                )

                mids_target_attr_to_score_info_abs_file_name: str = \
                    get_single_target_filtered_car_mids_target_attr_to_score_info_abs_file_name(
                        dataset_name=dataset_name, fold_i=fold_i,
                        target_attribute=target_attribute,
                        confidence_boundary_val=confidence_boundary_val
                    )

                mids_interpret_stats_abs_file_name: str = get_single_target_filtered_car_mids_interpret_stats_abs_file_name(
                    dataset_name=dataset_name, fold_i=fold_i,
                    target_attribute=target_attribute,
                    confidence_boundary_val=confidence_boundary_val
                )

                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name, fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name
                    )

                    delayed_func = \
                        delayed(evaluate_single_target_mids_model_for_dataset_fold)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    evaluate_single_target_mids_model_for_dataset_fold(
                        dataset_name=dataset_name, fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name
                    )

    if use_dask:
        logger_name: str = 'evaluate_single_target_car_mids_ERROR_LOGGER'

        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_evaluation_single_target_car_mids.log'
        )

        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name
        )
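# Each `main()` above comes from its own experiment script; the usual
# entry-point guard is assumed at the bottom of each script:
if __name__ == '__main__':
    main()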