# Imports assumed by the snippets below (they are not part of the original
# excerpt). The remaining project-specific names (TrainTestEnum,
# SingleTargetClassifierIndicator, compute_delayed_functions,
# reconnect_client_to_ssh_cluster, the learn_*/mine_*/evaluate_* functions and
# the *_dir()/*_abs_file_name() path helpers) are assumed to come from the
# surrounding `experiments` package.
import os
from typing import Dict, List, Tuple

import distributed
from dask import delayed
from dask.delayed import Delayed
from distributed import Client


# --- Example 1: learn a single-target CAR-based MIDS model per dataset fold,
# --- for each confidence boundary value.
def main():
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    confidence_boundary_values: List[float] = [0.75, 0.95]

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            # original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
            #                                                                               TrainTestEnum.train)
            # target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)
            target_columns = [target_attribute]
            for target_column in target_columns:
                target_attribute = str(target_column)
                for confidence_boundary_val in confidence_boundary_values:
                    if use_dask:

                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_column,
                            confidence_boundary=confidence_boundary_val)

                        delayed_func = \
                            delayed(learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary)(
                                **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val)
    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()

        logger_name: str = 'learn_single_target_filtered_car_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_filtered_target_car_mids.log')
        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
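

# --- Example 2: mine single-target class association rules (CARs) per dataset
# --- fold, once per confidence boundary value.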
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    # Override the imported benchmark list with a single dataset:
    datasets = [dict(filename="iris", targetvariablename="class", numerical=True)]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    min_support: float = 0.1
    max_length: int = 7
    confidence_boundary_values: List[float] = [0.75, 0.95]

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']

        for fold_i in range(nb_of_folds):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(dataset_name, fold_i,
                                                                                          TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for conf_boundary_val in confidence_boundary_values:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length
                        )
                        delayed_func = delayed(mine_cars_for_dataset_fold_target_attribute)(
                            **func_args
                        )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        mine_cars_for_dataset_fold_target_attribute(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length
                        )
    if use_dask:
        log_file_dir = assoc_vs_tree_based_single_target_car_dir()

        logger_name: str = 'mine_single_target_cars_ifo_confidence_bound_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_mine_single_target_cars_ifo_confidence_bound.log'
        )

        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name
        )
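

# --- Example 3: evaluate multi-target MIDS models built from tree-derived
# --- rules, varying the number of trees per model.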
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    nb_of_folds: int = 10
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    nb_of_original_targets_to_predict: int = 2

    nb_of_trees_per_model_list: List[int] = [5, 10]
    min_support: float = 0.1  # passed to sklearn as min_samples_leaf; as a fraction it must lie in (0, 0.5]

    max_depth: int = 7 - nb_of_original_targets_to_predict

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']

        for fold_i in range(nb_of_folds):

            for nb_of_trees_per_model in nb_of_trees_per_model_list:

                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        classifier_indicator=classifier_indicator,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        min_support=min_support,
                        max_depth=max_depth)

                    delayed_func = \
                        delayed(evaluate_mids_model_for_dataset_fold_target_attribute)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    evaluate_mids_model_for_dataset_fold_target_attribute(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        classifier_indicator=classifier_indicator,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        min_support=min_support,
                        max_depth=max_depth)

    if use_dask:
        log_file_dir: str = get_tree_based_mids_dir()

        logger_name: str = 'model_evaluation_tree_derived_rules_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir, 'ERROR_LOG_model_evaluation_tree_derived_rules.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
    if use_dask:
        nb_of_retries_if_erred = 2
        print("start compute")
        print(list_of_computations)
        # Submit only the Delayed objects; the Dict in each pair merely records
        # the kwargs the computation was built with.
        delayed_functions = [computation for computation, _ in list_of_computations]
        distributed.wait(
            client.compute(delayed_functions,
                           retries=nb_of_retries_if_erred))
        print("end compute")
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    seed: int = 3
    nb_of_folds: int = 10
    nb_of_original_targets_to_predict: int = 2
    nb_grouping_iterations = 5

    nb_of_trees_per_model_list: List[int] = [5, 10]
    min_support: float = 0.1  # passed to sklearn as min_samples_leaf; as a fraction it must lie in (0, 0.5]

    max_depth: int = 7 - nb_of_original_targets_to_predict

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']

        for fold_i in range(nb_of_folds):

            for nb_of_trees_per_model in nb_of_trees_per_model_list:

                if use_dask:

                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        nb_grouping_iterations=nb_grouping_iterations,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)

                    delayed_func = \
                        delayed(learn_and_convert_tree_model_to_rules)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    learn_and_convert_tree_model_to_rules(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        nb_grouping_iterations=nb_grouping_iterations,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)
    if use_dask:
        log_file_dir: str = get_tree_derived_rules_dir()

        logger_name: str = 'multi_target_tree_rule_generation_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir, 'ERROR_LOG_multi_target_tree_rule_generation.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
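

# --- Example 5: learn single-target tree-based MIDS models per fold and
# --- confidence boundary, refitting only when the stored classifier is stale.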
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    classifier_indicator = SingleTargetClassifierIndicator.random_forest

    confidence_boundary_values: List[float] = [0.75, 0.95]
    min_support = 0.1
    max_depth = 7

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            for confidence_boundary_val in confidence_boundary_values:

                n_days_in_hours = 24.0 * 0

                # Refit if the classifier file does not exist or is older than
                # the threshold above (0 hours here, i.e. always refit).
                tree_based_mids_classifier_abs_file_name = get_single_target_filtered_tree_mids_clf_abs_file_name(
                    dataset_name=dataset_name,
                    fold_i=fold_i,
                    target_attribute=target_attribute,
                    classifier_indicator=classifier_indicator,
                    min_support=min_support,
                    max_depth=max_depth,
                    confidence_boundary_val=confidence_boundary_val)
                should_refit: bool = file_does_not_exist_or_has_been_created_earlier_than_(
                    tree_based_mids_classifier_abs_file_name, n_days_in_hours)
                if should_refit:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = \
                            delayed(learn_single_target_tree_mids_model_for_dataset_fold_confidence)(
                                **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_tree_mids_model_for_dataset_fold_confidence(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val,
                            min_support=min_support,
                            max_depth=max_depth)

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()

        logger_name = 'learn_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
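

# The helper below is NOT part of the excerpt: a minimal sketch, assuming the
# freshness check used in Example 5 returns True when the file is missing or
# its modification time lies more than `n_hours` hours in the past.
def file_does_not_exist_or_has_been_created_earlier_than_(
        abs_file_name: str, n_hours: float) -> bool:
    import time
    if not os.path.isfile(abs_file_name):
        return True
    age_in_hours = (time.time() - os.path.getmtime(abs_file_name)) / 3600.0
    return age_in_hours > n_hours


# --- Example 6: evaluate the single-target tree-based MIDS models, writing a
# --- per-model evaluation log plus score and interpretability files.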
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    list_of_computations: List[Tuple[Delayed, Dict]] = []

    confidence_boundary_values: List[float] = [0.75, 0.95]
    min_support = 0.1
    max_depth = 7

    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            for confidence_boundary_val in confidence_boundary_values:

                classifier_indicator = SingleTargetClassifierIndicator.random_forest

                relative_name: str = get_single_target_tree_mids_clf_relative_file_name(
                    dataset_name=dataset_name,
                    fold_i=fold_i,
                    target_attribute=target_attribute,
                    classifier_indicator=classifier_indicator,
                    confidence_boundary_val=confidence_boundary_val,
                    min_support=min_support,
                    max_depth=max_depth,
                )
                log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()

                logger_name: str = f'evaluate_single_target_tree_mids_{relative_name}'
                logger_file_name: str = os.path.join(
                    log_file_dir,
                    f'{relative_name}_model_evaluation_single_target_tree_mids.log'
                )

                mids_classifier_abs_file_name: str = get_single_target_filtered_tree_mids_clf_abs_file_name(
                    dataset_name=dataset_name,
                    fold_i=fold_i,
                    target_attribute=target_attribute,
                    classifier_indicator=classifier_indicator,
                    min_support=min_support,
                    max_depth=max_depth,
                    confidence_boundary_val=confidence_boundary_val)

                mids_target_attr_to_score_info_abs_file_name: str = \
                    get_single_target_filtered_tree_mids_target_attr_to_score_info_abs_file_name(
                        dataset_name=dataset_name, fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        min_support=min_support, max_depth=max_depth,
                        confidence_boundary_val=confidence_boundary_val
                    )

                mids_interpret_stats_abs_file_name: str = get_single_target_filtered_tree_mids_interpret_stats_abs_file_name(
                    dataset_name=dataset_name,
                    fold_i=fold_i,
                    target_attribute=target_attribute,
                    classifier_indicator=classifier_indicator,
                    min_support=min_support,
                    max_depth=max_depth,
                    confidence_boundary_val=confidence_boundary_val)

                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name)

                    delayed_func = \
                        delayed(evaluate_single_target_mids_model_for_dataset_fold)(
                            **func_args
                        )
                    list_of_computations.append((delayed_func, func_args))
                else:
                    evaluate_single_target_mids_model_for_dataset_fold(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name)

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()

        logger_name: str = 'evaluate_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_evaluation_single_target_tree_mids.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
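

# --- Example 7: learn single-target tree ensembles and convert them to rules,
# --- for a varying number of trees per ensemble.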
def main():
    from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name
    min_support = 0.1
    max_depth = 7

    nb_of_trees_to_use_list: List[int] = [25, 50]

    list_of_computations: List[Tuple[Delayed, Dict]] = []
    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(10):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)

            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)

                for nb_of_trees_to_use in nb_of_trees_to_use_list:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

                        delayed_func = \
                            delayed(learn_and_convert_single_target_tree_ensemble_to_rules)(
                                **func_args
                            )
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_and_convert_single_target_tree_ensemble_to_rules(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

            # Result: pairs of (association rule, tree-based rule) sets, with an increasing number of rules.

            # --- Learn an (M)IDS model for each of the two rule sets in a pair. ------------------------------

            # --- Evaluate the learned IDS models using the chosen evaluation metrics. ------------------------

            # --- Plot the evaluation metrics in function of the increasing number of rules. ------------------

    if use_dask:
        log_file_dir: str = get_single_target_tree_rule_dir()

        logger_name: str = 'mine_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids_easy.log')

        compute_delayed_functions(list_of_computations=list_of_computations,
                                  client=client,
                                  nb_of_retries_if_erred=5,
                                  error_logger_name=logger_name,
                                  error_logger_file_name=logger_file_name)
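

# A minimal sketch, NOT part of the excerpt, of what `compute_delayed_functions`
# plausibly does given how it is called above: submit the Delayed objects to the
# Dask client with retries, wait for them, and log the kwargs of any computation
# that still failed. The signature matches the call sites; the body is an
# assumption.
def compute_delayed_functions(list_of_computations, client, nb_of_retries_if_erred,
                              error_logger_name, error_logger_file_name):
    import logging

    logger = logging.getLogger(error_logger_name)
    logger.addHandler(logging.FileHandler(error_logger_file_name))

    # Submit the Delayed objects; keep the kwarg Dicts for error reporting.
    delayed_functions = [computation for computation, _ in list_of_computations]
    futures = client.compute(delayed_functions, retries=nb_of_retries_if_erred)
    distributed.wait(futures)
    for future, (_, func_args) in zip(futures, list_of_computations):
        if future.status == 'error':
            logger.error(f"computation with args {func_args} failed: "
                         f"{future.exception()}")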