def test_tasks(self, learners, base_learners, measures, results_path,
               save_orange_data=False):
    """Repeat the following experiment self._repeats times:
    Prepare tasks' data with the _prepare_tasks_data() function.
    Test the performance of the given learning algorithms with the given
    base learning algorithms and compute the testing results using the
    given scoring measures.
    Process the obtained repetition scores with the
    _process_repetition_scores() function.
    
    Note: This function only tests some specific combinations of
    base_learners and learners as used by the binarization experiment.
    
    Arguments:
    learners -- ordered dictionary with items of the form (name, learner),
        where name is a string representing the learner's name and learner
        is an MTL method (e.g. ERM, NoMerging, ...)
    base_learners -- ordered dictionary with items of the form
        (name, learner), where name is a string representing the base
        learner's name and learner is a scikit-learn estimator object
    measures -- list of strings representing measures' names (currently,
        only CA and AUC are supported)
    results_path -- string representing the path where to save any extra
        information about the running of this test (currently, only used
        for pickling the results when there is an error in calling the
        learner)
    save_orange_data -- boolean indicating whether to save the Orange data
        tables created with the call to the self._prepare_tasks_data()
        function
    
    """
    rpt_scores = OrderedDict()
    dend_info = {bl: OrderedDict() for bl in base_learners.iterkeys()}
    for i in range(self._repeats):
        self._repetition_number = i
        self._prepare_tasks_data(**self._tasks_data_params)
        if save_orange_data:
            self._save_orange_data(i, results_path)
        rpt_scores[i] = {bl: dict() for bl in base_learners.iterkeys()}
        for bl in base_learners:
            for l in learners:
                start = time.clock()
                try:
                    if isinstance(learners[l],
                                  bin_exp.TreeMarkedAndMergedLearner):
                        R = learners[l](self._tasks.keys(),
                                        self._merged_learn_data_orange,
                                        base_learners[bl])
                    elif isinstance(base_learners[bl], Orange.core.Learner):
                        wrapped_bl = OrangeClassifierWrapper(
                            orange_learner=base_learners[bl])
                        R = learners[l](self._tasks, wrapped_bl)
                    else:
                        raise ValueError("An unexpected combination of "
                            "base_learner and learner detected: {} and "
                            "{}".format(type(base_learners[bl]),
                                        type(learners[l])))
                except Exception:
                    logger.exception("There was an error during repetition:"
                                     " {} with base learner: {} and "
                                     "learner: {}.".format(i, bl, l))
                    if i > 0:
                        logger.info("Saving the results of previous "
                                    "repetitions.")
                        # remove the scores of the last (unfinished)
                        # repetition
                        del rpt_scores[i]
                        # process the remaining repetition scores
                        self._process_repetition_scores(rpt_scores,
                                                        dend_info)
                        # pickle them to a file
                        pickle_path_fmt = os.path.join(results_path,
                                                       "bl-{}.pkl")
                        self.pickle_test_results(pickle_path_fmt)
                    # re-raise the original exception with its traceback
                    import sys
                    exc_info = sys.exc_info()
                    raise exc_info[1], None, exc_info[2]
                rpt_scores[i][bl][l] = self._test_tasks(R["task_models"],
                                                        measures)
                end = time.clock()
                logger.debug("Finished repetition: {}, base learner: {}, "
                             "learner: {} in {:.2f}s".format(i, bl, l,
                                                             end - start))
                # store dendrogram info if the results contain it
                if "dend_info" in R:
                    dend_info[bl][i] = R["dend_info"]
                # pickle and visualize the decision tree if the learner is
                # a (sub)class of TreeMarkedAndMergedLearner
                if isinstance(learners[l],
                              bin_exp.TreeMarkedAndMergedLearner):
                    tree = R["task_models"].values()[0]
                    pickle_path = os.path.join(results_path, "{}-{}-"
                                    "repeat{}.pkl".format(bl, l, i))
                    svg_path = os.path.join(results_path, "{}-{}-repeat{}"
                                            ".svg".format(bl, l, i))
                    tikz_path = os.path.join(results_path, "{}-{}-repeat{}"
                                             "-tikz.tex".format(bl, l, i))
                    pickle_obj(tree, pickle_path)
                    save_treegraph_image(tree, svg_path)
                    draw_and_save_tikz_tree_document(tree, tikz_path)
    self._process_repetition_scores(rpt_scores, dend_info)
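# A minimal usage sketch for test_tasks() (not part of the original module).
# The experiment object and the learner class names below are hypothetical;
# they only illustrate the expected argument shapes: ordered dictionaries
# for learners and base_learners (the docstring names ERM and NoMerging as
# example MTL methods) and a list of measure names.
#
#   from collections import OrderedDict
#   from sklearn.tree import DecisionTreeClassifier
#
#   learners = OrderedDict([("NoMerging", NoMergingLearner()),
#                           ("ERM", ERMLearner())])
#   base_learners = OrderedDict([("tree", DecisionTreeClassifier())])
#   experiment.test_tasks(learners, base_learners, measures=["CA", "AUC"],
#                         results_path="/tmp/results")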
# octree and fscoring are assumed to be the usual Orange 2.x aliases,
# imported elsewhere in this module as:
#   import Orange.classification.tree as octree
#   import Orange.feature.scoring as fscoring
split_const = octree.SplitConstructor_ExhaustiveBinary(
    measure=fscoring.InfoGain())
tree_learner = octree.TreeLearner(split=split_const, min_instances=10,
                                  same_majority_pruning=True,
                                  store_instances=True)
from Orange.data import Table
# TEST for equality of "original" vs. "pickled/unpickled" Orange trees
from PyMTL.util import pickle_obj, unpickle_obj
import numpy as np
for i in range(10):
    data = Table(os.path.join(results_path,
                     "bool_func-a8d4n100g2tg5nse0.0rs15"
                     "nls10-seed63-complete_test/orange_merged_learn-"
                     "repetition{}.tab".format(i)))
    tree = tree_learner(data)
    pickle_path = os.path.join(results_path, "test-pickle.pkl")
    pickle_obj(tree, pickle_path)
    unpickled_tree = unpickle_obj(pickle_path)
    print ("Repetition {} original vs. pickled/unpickled tree equality:".
           format(i)),
    # compare predictions example by example; a list comprehension is
    # needed here, since np.all() of a generator expression is always True
    print np.all([tree(e) == unpickled_tree(e) for e in data])
    os.remove(pickle_path)

data = Table(os.path.join(results_path,
                 "bool_func-a8d4n100g2tg5nse0.0rs15"
                 "nls10-seed63-complete_test/orange_merged_learn-"
                 "repetition0.tab"))
tree = tree_learner(data)
tex_file = os.path.join(results_path, "test-tree.tex")
pdf_file = os.path.join(results_path, "test-tree.pdf")
draw_and_save_tikz_tree_document(tree, tex_file)
import subprocess
# NOTE: the original shell command was truncated here; the completion below
# (compile the TikZ document and move the PDF into place) is an assumption
subprocess.call(["bash", "-c",
                 "pdflatex -interaction=batchmode {0} && "
                 "mv test-tree.pdf {1}".format(tex_file, pdf_file)])
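# For reference, a minimal sketch of the PyMTL.util pickling helpers used
# above, assuming they are thin wrappers around cPickle (the actual
# implementations in PyMTL.util may differ):
#
#   import cPickle as pickle
#
#   def pickle_obj(obj, file_path):
#       """Pickle the given object to the given file path."""
#       with open(file_path, "wb") as f:
#           pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
#
#   def unpickle_obj(file_path):
#       """Unpickle and return the object stored at the given file path."""
#       with open(file_path, "rb") as f:
#           return pickle.load(f)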
def generate_boolean_data_with_complete_test_sets(a, d, n, g, tg, noise,
                                                  random_seed=1,
                                                  n_learning_sets=1,
                                                  funcs_pickle_path=None):
    """Generate a synthetic MTL problem of learning Boolean functions
    according to the given parameters. In addition, create test sets that
    cover the complete attribute space (2**a distinct examples).
    Log the report about the generated MTL problem, which includes:
    - the Boolean function of each group,
    - the % of True values in y for each task,
    - the average % of True values in y (across all tasks).
    
    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples for each task to generate.
    g : int
        The number of task groups to generate. The tasks of each group
        share the same Boolean function.
    tg : int
        The number of tasks (with their corresponding data) to generate
        for each task group.
    noise : float
        The proportion of examples of each task that have their class
        values determined randomly.
    random_seed : int (optional)
        The random seed with which to initialize a private Random object.
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        Path where to pickle the list of generated Boolean functions.
    
    Returns
    -------
    tasks : list
        If n_learning_sets == 1, a list of Bunch objects corresponding to
        Boolean function learning tasks.
        Otherwise, a list of lists of Bunch objects, where each inner list
        corresponds to the set of different learning sets for one task.
    tasks_complete_test_sets : list
        A list of (X, y) tuples corresponding to complete testing sets for
        each task.
    
    """
    tasks, funcs, attr = _generate_boolean_data(a, d, n, g, tg, noise,
                            random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)
    tasks_complete_test_sets = []
    # generate a complete testing set for each Boolean function
    n_funcs = len(funcs)
    print ("Generating the complete test sets for {} Boolean functions".
           format(n_funcs))
    for i, func in enumerate(funcs):
        complete_test_set = _generate_complete_test_set(attr, func)
        # duplicate the generated complete testing set for each task from
        # the current task group
        for _ in range(tg):
            tasks_complete_test_sets.append(complete_test_set)
        update_progress(1. * (i + 1) / n_funcs)
    print _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets
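# A minimal usage sketch (not part of the original module). The parameter
# values mirror those encoded in the data directory name used in the test
# script above ("bool_func-a8d4n100g2tg5nse0.0rs15nls10-..."); decoding
# that name as a=8, d=4, n=100, g=2, tg=5, noise=0.0, rs=15, nls=10 is an
# assumption.
#
#   tasks, complete_test_sets = generate_boolean_data_with_complete_test_sets(
#       a=8, d=4, n=100, g=2, tg=5, noise=0.0, random_seed=15,
#       n_learning_sets=10)
#   # one complete test set per task (g * tg tasks), each covering the
#   # whole attribute space
#   assert len(complete_test_sets) == 2 * 5
#   X_test, y_test = complete_test_sets[0]
#   assert len(y_test) == 2 ** 8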