import os
from typing import Dict, List, Tuple

from dask import delayed
from dask.delayed import Delayed
from distributed import Client

# NOTE: the project-specific helpers used below (reconnect_client_to_ssh_cluster,
# compute_delayed_functions, the learn_*/mine_*/evaluate_* functions and the
# path getters) are assumed to be importable from the surrounding
# `experiments` package; their exact module paths are not shown here.


def main():
    # Full benchmark list (commented out in favor of a single-dataset run):
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    confidence_boundary_values: List[float] = [0.75, 0.95]
    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            # original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
            #     dataset_name, fold_i, TrainTestEnum.train)
            # target_columns: List[str] = get_header_attributes(original_train_data_fold_abs_file_name)
            target_columns = [target_attribute]
            for target_column in target_columns:
                target_attribute = str(target_column)
                for confidence_boundary_val in confidence_boundary_values:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val)
                        delayed_func = delayed(
                            learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary)(
                                **func_args)
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_car_mids_model_for_dataset_fold_confidence_boundary(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val)

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
        logger_name: str = 'learn_single_target_filtered_car_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_filtered_target_car_mids.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
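# The scripts in this file all funnel their Dask work through
# compute_delayed_functions. The helper below is a minimal sketch of that
# interface, inferred from the call sites in this file; it is a hypothetical
# re-implementation, not the project's actual code, and uses only the
# documented dask.distributed API.


def compute_delayed_functions_sketch(
        list_of_computations: List[Tuple[Delayed, Dict]],
        client: Client,
        nb_of_retries_if_erred: int,
        error_logger_name: str,
        error_logger_file_name: str) -> None:
    """Submit the delayed calls to the cluster and log any that still fail."""
    import logging

    # File logger for computations that err even after the configured retries.
    error_logger = logging.getLogger(error_logger_name)
    error_logger.setLevel(logging.ERROR)
    error_logger.addHandler(logging.FileHandler(error_logger_file_name))

    delayed_functions = [computation for computation, _ in list_of_computations]
    futures = client.compute(delayed_functions, retries=nb_of_retries_if_erred)
    for future, (_, func_args) in zip(futures, list_of_computations):
        try:
            future.result()  # blocks until this computation finishes or errs
        except Exception as err:
            error_logger.error("computation with args %s failed: %s", func_args, err)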
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    min_support: float = 0.1
    max_length: int = 7
    confidence_boundary_values: List[float] = [0.75, 0.95]
    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)
            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for conf_boundary_val in confidence_boundary_values:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length)
                        delayed_func = delayed(mine_cars_for_dataset_fold_target_attribute)(
                            **func_args)
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        mine_cars_for_dataset_fold_target_attribute(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            min_support=min_support,
                            min_confidence=conf_boundary_val,
                            max_length=max_length)

    if use_dask:
        log_file_dir = assoc_vs_tree_based_single_target_car_dir()
        logger_name: str = 'mine_single_target_cars_ifo_confidence_bound_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_mine_single_target_cars_ifo_confidence_bound.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
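# Each script obtains its Client via reconnect_client_to_ssh_cluster. As a
# rough sketch of what attaching to an already-running scheduler looks like
# (hypothetical; the real helper may additionally (re)start the cluster over
# SSH), a plain dask.distributed connection by host name would be:


def connect_to_running_scheduler(scheduler_host: str, scheduler_port: int = 8786) -> Client:
    """Attach to a dask scheduler at the given host; 8786 is dask's default port."""
    return Client(f"tcp://{scheduler_host}:{scheduler_port}")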
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    nb_of_folds: int = 10
    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    nb_of_original_targets_to_predict: int = 2
    nb_of_trees_per_model_list: List[int] = [5, 10]

    min_support: float = 0.1  # sklearn: min_samples_leaf must be at least 1 or in (0, 0.5], so 0 is invalid
    max_depth: int = 7 - nb_of_original_targets_to_predict

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):
            for nb_of_trees_per_model in nb_of_trees_per_model_list:
                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        classifier_indicator=classifier_indicator,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        min_support=min_support,
                        max_depth=max_depth)
                    delayed_func = delayed(evaluate_mids_model_for_dataset_fold_target_attribute)(
                        **func_args)
                    list_of_computations.append((delayed_func, func_args))
                else:
                    evaluate_mids_model_for_dataset_fold_target_attribute(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        classifier_indicator=classifier_indicator,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        min_support=min_support,
                        max_depth=max_depth)

    if use_dask:
        log_file_dir: str = get_tree_based_mids_dir()
        logger_name: str = 'model_evaluation_tree_derived_rules_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_evaluation_tree_derived_rules.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    seed: int = 3
    nb_of_folds: int = 10
    nb_of_original_targets_to_predict: int = 2
    nb_grouping_iterations = 5
    nb_of_trees_per_model_list: List[int] = [5, 10]

    min_support: float = 0.1  # sklearn: min_samples_leaf must be at least 1 or in (0, 0.5], so 0 is invalid
    max_depth: int = 7 - nb_of_original_targets_to_predict

    use_dask = False
    if use_dask:
        client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):
            for nb_of_trees_per_model in nb_of_trees_per_model_list:
                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        nb_grouping_iterations=nb_grouping_iterations,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)
                    delayed_func = delayed(learn_and_convert_tree_model_to_rules)(
                        **func_args)
                    list_of_computations.append((delayed_func, func_args))
                else:
                    learn_and_convert_tree_model_to_rules(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        nb_of_trees_per_model=nb_of_trees_per_model,
                        nb_of_original_targets_to_predict=nb_of_original_targets_to_predict,
                        nb_grouping_iterations=nb_grouping_iterations,
                        min_support=min_support,
                        max_depth=max_depth,
                        seed=seed)

    if use_dask:
        log_file_dir: str = get_tree_derived_rules_dir()
        logger_name: str = 'multi_target_tree_rule_generation_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_multi_target_tree_rule_generation.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    classifier_indicator = SingleTargetClassifierIndicator.random_forest
    confidence_boundary_values: List[float] = [0.75, 0.95]
    min_support = 0.1
    max_depth = 7
    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            for confidence_boundary_val in confidence_boundary_values:
                # Age threshold in hours: refit if the classifier file is missing
                # or older than this. With 24.0 * 0 == 0, a refit happens every run.
                n_days_in_hours = 24.0 * 0

                tree_based_mids_classifier_abs_file_name = \
                    get_single_target_filtered_tree_mids_clf_abs_file_name(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        min_support=min_support,
                        max_depth=max_depth,
                        confidence_boundary_val=confidence_boundary_val)
                should_refit: bool = file_does_not_exist_or_has_been_created_earlier_than_(
                    tree_based_mids_classifier_abs_file_name, n_days_in_hours)

                if should_refit:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val,
                            min_support=min_support,
                            max_depth=max_depth)
                        delayed_func = delayed(
                            learn_single_target_tree_mids_model_for_dataset_fold_confidence)(
                                **func_args)
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_single_target_tree_mids_model_for_dataset_fold_confidence(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            confidence_boundary=confidence_boundary_val,
                            min_support=min_support,
                            max_depth=max_depth)

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
        logger_name = 'learn_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
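# The refit guard above hinges on file_does_not_exist_or_has_been_created_earlier_than_.
# A minimal sketch of such a check, assuming the second argument is an age
# threshold in hours (hypothetical re-implementation, not the project's helper):


def file_missing_or_older_than_hours(abs_file_name: str, max_age_in_hours: float) -> bool:
    """True if the file is absent or its modification time exceeds the threshold."""
    import time

    if not os.path.isfile(abs_file_name):
        return True
    age_in_hours = (time.time() - os.path.getmtime(abs_file_name)) / 3600.0
    return age_in_hours > max_age_in_hours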
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    confidence_boundary_values: List[float] = [0.75, 0.95]
    min_support = 0.1
    max_depth = 7
    nb_of_folds: int = 10

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        target_attribute: str = dataset_info['targetvariablename']
        for fold_i in range(nb_of_folds):
            for confidence_boundary_val in confidence_boundary_values:
                classifier_indicator = SingleTargetClassifierIndicator.random_forest

                relative_name: str = get_single_target_tree_mids_clf_relative_file_name(
                    dataset_name=dataset_name,
                    fold_i=fold_i,
                    target_attribute=target_attribute,
                    classifier_indicator=classifier_indicator,
                    confidence_boundary_val=confidence_boundary_val,
                    min_support=min_support,
                    max_depth=max_depth)

                log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
                logger_name: str = 'evaluate_single_target_tree_mids_' + relative_name
                logger_file_name: str = os.path.join(
                    log_file_dir,
                    f'{relative_name}_model_evaluation_single_target_tree_mids.log')

                mids_classifier_abs_file_name: str = \
                    get_single_target_filtered_tree_mids_clf_abs_file_name(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        min_support=min_support,
                        max_depth=max_depth,
                        confidence_boundary_val=confidence_boundary_val)
                mids_target_attr_to_score_info_abs_file_name: str = \
                    get_single_target_filtered_tree_mids_target_attr_to_score_info_abs_file_name(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        min_support=min_support,
                        max_depth=max_depth,
                        confidence_boundary_val=confidence_boundary_val)
                mids_interpret_stats_abs_file_name: str = \
                    get_single_target_filtered_tree_mids_interpret_stats_abs_file_name(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        target_attribute=target_attribute,
                        classifier_indicator=classifier_indicator,
                        min_support=min_support,
                        max_depth=max_depth,
                        confidence_boundary_val=confidence_boundary_val)

                if use_dask:
                    func_args = dict(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name)
                    delayed_func = delayed(evaluate_single_target_mids_model_for_dataset_fold)(
                        **func_args)
                    list_of_computations.append((delayed_func, func_args))
                else:
                    evaluate_single_target_mids_model_for_dataset_fold(
                        dataset_name=dataset_name,
                        fold_i=fold_i,
                        logger_name=logger_name,
                        logger_file_name=logger_file_name,
                        mids_classifier_abs_file_name=mids_classifier_abs_file_name,
                        mids_target_attr_to_score_info_abs_file_name=mids_target_attr_to_score_info_abs_file_name,
                        mids_interpret_stats_abs_file_name=mids_interpret_stats_abs_file_name)

    if use_dask:
        log_file_dir: str = assoc_vs_tree_based_single_target_mids_clf_dir()
        logger_name: str = 'evaluate_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_evaluation_single_target_tree_mids.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)
def main():
    # Full benchmark list, overridden below for a single-dataset run:
    # from experiments.arcbench_data_preparation.dataset_info import datasets
    datasets = [
        dict(filename="iris", targetvariablename="class", numerical=True)
    ]
    from experiments.dask_utils.dask_initialization import scheduler_host_name
    scheduler_host: str = scheduler_host_name

    min_support = 0.1
    max_depth = 7
    nb_of_trees_to_use_list: List[int] = [25, 50]
    nb_of_folds: int = 10

    list_of_computations: List[Tuple[Delayed, Dict]] = []

    use_dask = False
    if use_dask:
        client: Client = reconnect_client_to_ssh_cluster(scheduler_host)

    for dataset_info in datasets:
        dataset_name = dataset_info['filename']
        for fold_i in range(nb_of_folds):
            original_train_data_fold_abs_file_name = get_original_data_fold_abs_file_name(
                dataset_name, fold_i, TrainTestEnum.train)
            target_columns: List[str] = get_header_attributes(
                original_train_data_fold_abs_file_name)
            for target_column in target_columns:
                target_attribute = str(target_column)
                for nb_of_trees_to_use in nb_of_trees_to_use_list:
                    if use_dask:
                        func_args = dict(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)
                        delayed_func = delayed(
                            learn_and_convert_single_target_tree_ensemble_to_rules)(
                                **func_args)
                        list_of_computations.append((delayed_func, func_args))
                    else:
                        learn_and_convert_single_target_tree_ensemble_to_rules(
                            dataset_name=dataset_name,
                            fold_i=fold_i,
                            target_attribute=target_attribute,
                            nb_of_trees_per_model=nb_of_trees_to_use,
                            min_support=min_support,
                            max_depth=max_depth)

    # Result: pairs of (association rule, tree-based rule) sets with an increasing number of rules.
    # --- Learn an (M)IDS model for each of the two rule sets in a pair. ------------------
    # --- Evaluate the learned IDS models using the chosen evaluation metrics. ------------
    # --- Plot the evaluation metrics as a function of the increasing number of rules. ----

    if use_dask:
        log_file_dir: str = get_single_target_tree_rule_dir()
        logger_name: str = 'mine_single_target_tree_mids_ERROR_LOGGER'
        logger_file_name: str = os.path.join(
            log_file_dir,
            'ERROR_LOG_model_induction_single_target_tree_mids_easy.log')
        compute_delayed_functions(
            list_of_computations=list_of_computations,
            client=client,
            nb_of_retries_if_erred=5,
            error_logger_name=logger_name,
            error_logger_file_name=logger_file_name)