def do_one_fold(
        fold_index: int,
        test_key_set: Set[Constant],
        # fd: FoldData
):
    """Run a single cross-validation fold: train, export and evaluate a tree.

    Steps, in order:

    1. Split the examples with ``split_examples_into_training_and_test_sets``
       using ``test_key_set``.
    2. Build a tree on the training part and prune it; write it to
       ``<dir><prefix>_fold<fold_index>.tree``.
    3. Convert the tree to a program; write it to the matching ``.program``
       file.
    4. Classify the test examples with that program and write the
       statistics (plus example counts, node counts and timing) to the
       matching ``.statistics`` file.

    Side effects on ``fd``: the fold's node count, inner-node count, elapsed
    time and accuracy are appended to the corresponding per-fold lists.

    NOTE(review): ``fd`` is commented out of the parameter list, so the body
    resolves ``fd`` from the surrounding scope at call time -- confirm this
    is intended rather than a missing parameter.

    :param fold_index: zero-based fold index; shown one-based in console
        messages but used unchanged in the output file names.
    :param test_key_set: key set handed to the splitter as the test keys.
    """
    fold_number = fold_index + 1
    banner = '==========================='

    print('\n' + banner)
    print(f'=== start FOLD {fold_number!s} of {fd.nb_folds!s}')
    print(banner)

    training_example_collection, test_examples = \
        split_examples_into_training_and_test_sets(
            fd.all_key_sets,
            test_key_set,
            fd.examples_collection_usable_for_training,
            fd.examples_usable_for_testing)

    training_count = len(training_example_collection.example_wrappers_sp)
    test_count = len(test_examples)
    print(f'\ttotal nb of labeled examples: {fd.total_nb_of_labeled_examples!s}')
    print(f'\tnb of TRAINING ex: {training_count!s}')
    print(f'\tnb of TEST ex: {test_count!s}')

    # The timed span covers building, pruning, exporting, converting and
    # classifying -- everything up to the end of classification.
    started_at = time.time()

    # ------------------------------------------------------------------
    # TRAIN MODEL using the training set
    print(f'\t=== start building tree for fold {fold_number!s}')
    unpruned_tree = build_tree(
        fd.internal_ex_format,
        fd.treebuilder_type,
        fd.parsed_settings.language,
        fd.possible_labels,
        training_example_collection,
        prediction_goal=fd.prediction_goal,
        full_background_knowledge_sp=fd.full_background_knowledge_sp,
        debug_printing_tree_building=fd.debug_printing_tree_building,
        engine=fd.engine)
    tree = prune_tree(
        unpruned_tree,
        debug_printing_tree_pruning=fd.debug_printing_tree_pruning)

    node_count = tree.get_nb_of_nodes()
    inner_node_count = tree.get_nb_of_inner_nodes()
    fd.total_nb_of_nodes_per_fold.append(node_count)
    fd.nb_of_inner_node_per_fold.append(inner_node_count)

    # All three output files of this fold share one stem; note that it uses
    # the zero-based index, unlike the console messages.
    output_stem = (fd.dir_output_files + fd.fname_prefix_fold + '_fold'
                   + str(fold_index))
    write_out_tree(output_stem + ".tree", tree)
    print(f'\t=== end building tree for fold {fold_number!s}')

    # ------------------------------------------------------------------
    print(f'\t=== start converting tree to program for fold {fold_number!s}')
    program = convert_tree_to_program(
        fd.kb_format,
        fd.treebuilder_type,
        tree,
        fd.parsed_settings.language,
        debug_printing=fd.debug_printing_program_conversion,
        prediction_goal=fd.prediction_goal,
        index_of_label_var=fd.index_of_label_var)
    write_out_program(output_stem + ".program", program)
    print(f'\t=== end converting tree to program for fold {fold_number!s}')

    # ------------------------------------------------------------------
    # EVALUATE MODEL using the test set
    print(f'\t=== start classifying test set{fold_number!s}')
    classifier = get_keys_classifier(
        fd.internal_ex_format,
        program,
        fd.prediction_goal,
        fd.index_of_label_var,
        fd.stripped_background_knowledge,
        debug_printing=fd.debug_printing_get_classifier,
        engine=fd.engine)
    statistics_handler = do_labeled_examples_get_correctly_classified(
        classifier,
        test_examples,
        fd.possible_labels,
        fd.debug_printing_classification
    )  # type: ClassificationStatisticsHandler

    # time in seconds
    elapsed_time = time.time() - started_at
    fd.execution_time_per_fold.append(elapsed_time)

    fold_accuracy, _ = statistics_handler.get_accuracy()
    fd.accuracies_folds.append(fold_accuracy)

    statistics_fname = output_stem + ".statistics"
    statistics_handler.write_out_statistics_to_file(statistics_fname)

    # Append the fold-level bookkeeping after the handler's own statistics.
    with open(statistics_fname, 'a') as stats_file:
        stats_file.writelines([
            f'\n\nnb of TRAINING ex: {training_count!s}\n',
            f'nb of TEST ex: {test_count!s}\n\n',
            f'total nb of nodes: {node_count!s}\n',
            f'nb of internal nodes: {inner_node_count!s}\n\n',
            f'execution time of fold: {elapsed_time!s} seconds\n',
        ])

    print(f'total nb of nodes: {node_count!s}')
    print(f'nb of internal nodes: {inner_node_count!s}')
    print("execution time of fold: ", elapsed_time, "seconds")
    print(f'\t=== end classifying test set{fold_number!s}')
    print(f'\t=== end FOLD {fold_number!s} of {fd.nb_folds!s}\n')
def run_program(settings: ProgramSettings):
    """Learn a pruned tree from the configured knowledge base and convert it.

    The input file names are derived from ``settings.filename_prefix`` plus
    the module-level suffixes ``kb_suffix``, ``s_suffix`` and ``bg_suffix``.
    The settings file is parsed with the parser matching
    ``settings.kb_format``; the examples are preprocessed according to that
    format (MODELS or KEYS); a tree is built, pruned and converted to a
    program.

    The converted program is not returned or written out by this function.

    :param settings: run configuration; the attributes read here are
        ``filename_prefix``, ``debug_parsing``, ``kb_format``,
        ``internal_examples_format`` and ``treebuilder_type``.
    :raises NotImplementedError: if ``settings.kb_format`` is ``None``.
    :raises KnowledgeBaseFormatException: if the format is neither MODELS
        nor KEYS.
    """
    prefix = settings.filename_prefix
    examples_path = prefix + kb_suffix
    settings_path = prefix + s_suffix
    background_path = prefix + bg_suffix  # BACKGROUND KNOWLEDGE

    # The parsing debug flag is reused for tree building and conversion.
    verbose = settings.debug_parsing

    kb_format = settings.kb_format
    if kb_format is None:
        raise NotImplementedError(
            'Automatic recognition of input format is not yet supported.')

    # SETTINGS FILE
    parsed_settings = SettingsParserMapper.get_settings_parser(
        kb_format).parse(settings_path)

    if kb_format is KnowledgeBaseFormat.MODELS:
        possible_labels = parsed_settings.possible_labels
        examples, background_wrapper = preprocessing_examples_models(
            examples_path,
            parsed_settings,
            settings.internal_examples_format,
            background_path)
        # The MODELS format supplies neither of these.
        prediction_goal = None
        index_of_label_var = None
    elif kb_format is KnowledgeBaseFormat.KEYS:
        (examples,
         prediction_goal,
         index_of_label_var,
         possible_labels,
         background_wrapper) = preprocessing_examples_keys(
            examples_path,
            parsed_settings,
            settings.internal_examples_format,
            background_path,
            filter_out_unlabeled_examples=False)
    else:
        raise KnowledgeBaseFormatException(
            'Only the input formats Models and Key are supported.')

    engine = DefaultEngine()
    engine.unknown = 1

    background_program = \
        background_wrapper.get_full_background_knowledge_simple_program()

    unpruned_tree = build_tree(
        settings.internal_examples_format,
        settings.treebuilder_type,
        parsed_settings.language,
        possible_labels,
        examples,
        prediction_goal=prediction_goal,
        full_background_knowledge_sp=background_program,
        debug_printing_tree_building=verbose,
        engine=engine)
    tree = prune_tree(unpruned_tree)

    program = convert_tree_to_program(
        settings.kb_format,
        settings.treebuilder_type,
        tree,
        parsed_settings.language,
        debug_printing=verbose,
        prediction_goal=prediction_goal,
        index_of_label_var=index_of_label_var)
def _mean_var_std(values):
    """Return ``(mean, variance, standard deviation)`` of ``values``."""
    average = mean(values)
    spread = variance(values, average)
    return average, spread, sqrt(spread)


def do_all_examples(fd: FoldData):
    """Learn a final tree on all training examples and report the results.

    A tree is built on ``fd.examples_collection_usable_for_training``,
    pruned, written to ``<dir><prefix>.tree``, converted to a program
    (``<dir><prefix>.program``) and used to classify all labeled examples.
    The classification statistics go to ``<dir><prefix>.statistics``,
    followed by an appended cross-validation summary (per-fold accuracies,
    node counts and execution times, with mean/variance/std and a 90%
    confidence interval on the accuracy). The same summary is printed.

    The node counts of this final tree are reported separately and are
    deliberately NOT appended to ``fd.total_nb_of_nodes_per_fold`` /
    ``fd.nb_of_inner_node_per_fold``: doing so would mix the all-examples
    tree into the "per fold" lists and skew their mean/variance/std.

    :param fd: shared fold data; its per-fold result lists are read here
        and are expected to have been filled by the per-fold runs.
    """
    print('\n=======================================')
    print('=== FINALLY, learn tree on all examples')
    print('========================================')
    print(f'\ttotal nb of labeled examples: {fd.total_nb_of_labeled_examples!s}')
    print('\t=== start building tree for ALL examples')

    # The timed span covers building, pruning, exporting, converting and
    # classifying.
    start_time = time.time()

    # TRAIN MODEL using all usable training examples
    tree = build_tree(
        fd.internal_ex_format,
        fd.treebuilder_type,
        fd.parsed_settings.language,
        fd.possible_labels,
        fd.examples_collection_usable_for_training,
        prediction_goal=fd.prediction_goal,
        full_background_knowledge_sp=fd.full_background_knowledge_sp,
        debug_printing_tree_building=fd.debug_printing_tree_building,
        engine=fd.engine)
    tree = prune_tree(
        tree, debug_printing_tree_pruning=fd.debug_printing_tree_pruning)

    nb_of_nodes = tree.get_nb_of_nodes()
    nb_inner_nodes = tree.get_nb_of_inner_nodes()

    # write out tree
    output_stem = fd.dir_output_files + fd.fname_prefix_fold
    write_out_tree(output_stem + ".tree", tree)
    print('=== end building tree for ALL examples')

    print('=== start converting tree to program for ALL examples')
    program = convert_tree_to_program(
        fd.kb_format,
        fd.treebuilder_type,
        tree,
        fd.parsed_settings.language,
        debug_printing=fd.debug_printing_program_conversion,
        prediction_goal=fd.prediction_goal,
        index_of_label_var=fd.index_of_label_var)
    write_out_program(output_stem + ".program", program)
    print('=== end converting tree to program for ALL examples')

    all_examples = \
        fd.examples_collection_usable_for_training.get_labeled_examples()

    print('\t=== start classifying total set')

    # EVALUATE MODEL on every labeled example
    classifier = get_keys_classifier(
        fd.internal_ex_format,
        program,
        fd.prediction_goal,
        fd.index_of_label_var,
        fd.stripped_background_knowledge,
        debug_printing=fd.debug_printing_get_classifier,
        engine=fd.engine)
    statistics_handler = do_labeled_examples_get_correctly_classified(
        classifier,
        all_examples,
        fd.possible_labels,
        fd.debug_printing_classification
    )  # type: ClassificationStatisticsHandler

    # time in seconds
    elapsed_time = time.time() - start_time

    # Query the accuracy once and reuse it for the file and the console.
    # NOTE(review): assumes get_accuracy() has no side effects.
    accuracy_result = statistics_handler.get_accuracy()
    accuracy, _ = accuracy_result

    statistics_fname = output_stem + ".statistics"
    statistics_handler.write_out_statistics_to_file(statistics_fname)

    mean_accuracy_of_folds, var_accuracy_of_folds, std_accuracy_of_folds = \
        _mean_var_std(fd.accuracies_folds)

    confidence = 0.9
    confidence_pct = str(confidence * 100)
    mean_acc, conf_left, conf_right, diff_from_mean = mean_confidence_interval(
        fd.accuracies_folds, confidence)

    mean_total_nb_of_nodes, var_total_nb_of_nodes, std_total_nb_of_nodes = \
        _mean_var_std(fd.total_nb_of_nodes_per_fold)
    mean_nb_of_inner_nodes, var_nb_of_inner_nodes, std_nb_of_inner_nodes = \
        _mean_var_std(fd.nb_of_inner_node_per_fold)

    total_execution_time_of_cross_validation = sum(fd.execution_time_per_fold)

    # Append the cross-validation summary after the handler's own statistics.
    with open(statistics_fname, 'a') as f:
        f.writelines([
            f"\n\ntotal nb of examples (labeled + unlabeled): {fd.total_nb_of_examples!s}\n",
            f"total nb of LABELED examples: {fd.total_nb_of_labeled_examples!s}\n\n",
            "list of accuracies per fold:\n",
            f"\t{fd.accuracies_folds!s}\n",
            f"mean accuracy: {mean_accuracy_of_folds!s}\n",
            f"var accuracy: {var_accuracy_of_folds!s}\n",
            f"std accuracy: {std_accuracy_of_folds!s}\n",
            f"accuracy of total tree: {accuracy!s}\n\n",
            f"accuracy {confidence_pct}% confidence interval: [{conf_left!s},{conf_right!s}]\n",
            f"\taccuracy {confidence_pct}% confidence interval around mean: "
            f"{mean_acc!s} +- {diff_from_mean!s}\n\n",
            f"total nb of nodes in total tree: {nb_of_nodes!s}\n",
            f"nb of internal nodes in total tree: {nb_inner_nodes!s}\n\n",
            "list of total nb of nodes per fold:\n",
            f"\t{fd.total_nb_of_nodes_per_fold!s}\n",
            f"mean total nb of nodes: {mean_total_nb_of_nodes!s}\n",
            f"var total nb of nodes: {var_total_nb_of_nodes!s}\n",
            f"std total nb of nodes: {std_total_nb_of_nodes!s}\n\n",
            "list of nb of internal nodes per fold:\n",
            f"\t{fd.nb_of_inner_node_per_fold!s}\n",
            f"mean nb of internal nodes: {mean_nb_of_inner_nodes!s}\n",
            f"var nb of internal nodes: {var_nb_of_inner_nodes!s}\n",
            f"std nb of internal nodes: {std_nb_of_inner_nodes!s}\n\n",
            "execution times of folds:\n",
            f"\t{fd.execution_time_per_fold!s}\n",
            f"total time cross (sum folds): {total_execution_time_of_cross_validation!s} seconds\n",
            f"time total tree building + verifying: {elapsed_time!s} seconds\n",
        ])

    # Console report; the text is kept identical to the historical output
    # (including the repeated node-count lines and the raw accuracy tuple).
    print(f"total nb of nodes in total tree: {nb_of_nodes!s}")
    print(f"nb of internal nodes in total tree: {nb_inner_nodes!s}")
    print()
    print("list of accuracies per fold:")
    print(f"\t{fd.accuracies_folds!s}")
    print(f"mean accuracy: {mean_accuracy_of_folds!s}")
    print(f"var accuracy: {var_accuracy_of_folds!s}")
    print(f"std accuracy {std_accuracy_of_folds!s}")
    print(f"accuracy of total tree: {accuracy_result!s}")
    print()
    print(f"accuracy {confidence_pct}% confidence interval: [{conf_left!s},{conf_right!s}]")
    print(f"\taccuracy {confidence_pct}% confidence interval around mean: "
          f"{mean_acc!s} +- {diff_from_mean!s}")
    print()
    print(f"total nb of nodes in total tree: {nb_of_nodes!s}")
    print(f"nb of internal nodes in total tree: {nb_inner_nodes!s}")
    print()
    print("list of total nb of nodes per fold:")
    print(f"\t{fd.total_nb_of_nodes_per_fold!s}")
    print(f"mean total nb of nodes: {mean_total_nb_of_nodes!s}")
    print(f"var total nb of nodes: {var_total_nb_of_nodes!s}")
    print(f"std total nb of nodes: {std_total_nb_of_nodes!s}")
    print()
    print("list of nb of internal nodes per fold:")
    print(f"\t{fd.nb_of_inner_node_per_fold!s}")
    print(f"mean nb of internal nodes: {mean_nb_of_inner_nodes!s}")
    print(f"var nb of internal nodes: {var_nb_of_inner_nodes!s}")
    print(f"std nb of internal nodes: {std_nb_of_inner_nodes!s}")
    print()
    print("execution times of folds:")
    print(f"\t{fd.execution_time_per_fold!s}")
    print("total time cross (sum folds):",
          total_execution_time_of_cross_validation, "seconds")
    print("time total tree building + verifying:", elapsed_time, "seconds")
    print('\t=== end classifying total set')