def test_tasks(self, learners, base_learners, measures, results_path,
                save_orange_data=False):
     """Repeat the following experiment self._repeats times:
     Prepare tasks' data with the _prepare_tasks_data() function.
     Test the performance of the given learning algorithms with the given
     base learning algorithms and compute the testing results using the
     given scoring measures.
     Process the obtained repetition scores with the
     _process_repetition_scores() function.
     Note: This function only test some specific combinations of
     base_learners and learners as used by the binarization experiment.
     
     Arguments:
     learners -- ordered dictionary with items of the form (name, learner),
         where name is a string representing the learner's name and
         learner is a MTL method (e.g. ERM, NoMerging, ...) 
     base learners -- ordered dictionary with items of the form (name,
         learner), where name is a string representing the base learner's
         name and learner is a scikit-learn estimator object
     measures -- list of strings representing measure's names (currently,
         only CA and AUC are supported)
     results_path -- string representing the path where to save any extra
         information about the running of this test (currently, only used
         for pickling the results when there is an error in calling the
         learner)
     save_orange_data -- boolean indicating whether to save the Orange data
         tables created with the call to self._prepare_tasks_data() function
     
     """
     rpt_scores = OrderedDict()
     dend_info = {bl : OrderedDict() for bl in base_learners.iterkeys()}
     for i in range(self._repeats):
         self._repetition_number = i
         self._prepare_tasks_data(**self._tasks_data_params)
         if save_orange_data:
             self._save_orange_data(i, results_path)
         rpt_scores[i] = {bl : dict() for bl in base_learners.iterkeys()}
         for bl in base_learners:
             for l in learners:
                 start = time.clock()
                 try: 
                     if isinstance(learners[l],
                                   bin_exp.TreeMarkedAndMergedLearner):
                         R = learners[l](self._tasks.keys(),
                                         self._merged_learn_data_orange,
                                         base_learners[bl])
                     elif isinstance(base_learners[bl], Orange.core.Learner):
                         wrapped_bl = OrangeClassifierWrapper(
                                         orange_learner=base_learners[bl])
                         R = learners[l](self._tasks, wrapped_bl)
                     else:
                         raise ValueError("An unexpected combination of "
                                 "base_learner and leaner detected: {} and "
                                 "{}".format(type(base_learners[bl]),
                                             type(learners[l])))
                 except Exception as e:
                     logger.exception("There was an error during repetition:"
                         " {} with base learner: {} and learner: {}.".\
                         format(i, bl, l))
                     if i > 0:
                         logger.info("Saving the results of previous "
                                     "repetitions.")
                         # remove the scores of the last repetition
                         del rpt_scores[i]
                         # process the remaining repetition scores
                         self._process_repetition_scores(rpt_scores,
                                                         dend_info)
                         # pickle them to a file
                         pickle_path_fmt = os.path.join(results_path,
                                                        "bl-{}.pkl")
                         self.pickle_test_results(pickle_path_fmt)
                     # re-raise the original exception
                     import sys
                     exc_info = sys.exc_info()
                     raise exc_info[1], None, exc_info[2]
                 rpt_scores[i][bl][l] = self._test_tasks(R["task_models"],
                                                         measures)
                 end = time.clock()
                 logger.debug("Finished repetition: {}, base learner: {}, "
                     "learner: {} in {:.2f}s".format(i, bl, l, end-start))
                 # store dendrogram info if the results contain it 
                 if "dend_info" in R:
                     dend_info[bl][i] = R["dend_info"]
                 # pickle and visualize the decision tree if the learner is a
                 # (sub)class of TreeMarkedAndMergedLearner
                 if isinstance(learners[l],
                               bin_exp.TreeMarkedAndMergedLearner):
                     tree = R["task_models"].values()[0]
                     pickle_path = os.path.join(results_path, "{}-{}-"
                                     "repeat{}.pkl".format(bl, l, i))
                     svg_path = os.path.join(results_path, "{}-{}-repeat{}"
                                             ".svg".format(bl, l, i))
                     tikz_path = os.path.join(results_path, "{}-{}-repeat{}"
                                              "-tikz.tex".format(bl, l, i))
                     pickle_obj(tree, pickle_path)
                     save_treegraph_image(tree, svg_path)
                     draw_and_save_tikz_tree_document(tree, tikz_path)
     self._process_repetition_scores(rpt_scores, dend_info)
 split_const = octree.SplitConstructor_ExhaustiveBinary(measure=
                                                        fscoring.InfoGain())
 tree_learner = octree.TreeLearner(split=split_const, min_instances=10,
                         same_majority_pruning=True, store_instances=True)
 from Orange.data import Table
 
 # TEST for equality of "original" vs. "pickled/unpickled" Orange trees
 from PyMTL.util import pickle_obj, unpickle_obj
 import numpy as np
 for i in range(10):
     data = Table(os.path.join(results_path, "bool_func-a8d4n100g2tg5nse0.0rs15"
                           "nls10-seed63-complete_test/orange_merged_learn-"
                           "repetition{}.tab".format(i)))
     tree = tree_learner(data)
     pickle_path = os.path.join(results_path, "test-pickle.pkl")
     pickle_obj(tree, pickle_path)
     unpickled_tree = unpickle_obj(pickle_path)
     print ("Repetition {} original vs. pickled/unpickled tree equality:".
            format(i)),
     print np.all(tree[e] == unpickled_tree[e] for e in data)
 os.remove(pickle_path)
 
 data = Table(os.path.join(results_path, "bool_func-a8d4n100g2tg5nse0.0rs15"
                           "nls10-seed63-complete_test/orange_merged_learn-"
                           "repetition0.tab"))
 tree = tree_learner(data)
 tex_file = os.path.join(results_path, "test-tree.tex")
 pdf_file = os.path.join(results_path, "test-tree.pdf")
 draw_and_save_tikz_tree_document(tree, tex_file)
 import subprocess
 subprocess.call(["-c", "pdflatex -interaction=batchmode {0} && "
Esempio n. 3
0
def generate_boolean_data_with_complete_test_sets(a, d, n, g, tg, noise,
        random_seed=1, n_learning_sets=1, funcs_pickle_path=None):
    """Create a synthetic Boolean-function MTL problem with complete test sets.

    The tasks, the Boolean functions and the attributes are obtained from
    _generate_boolean_data(). For every Boolean function one complete test
    set (covering the whole attribute space, 2**a distinct examples) is built
    with _generate_complete_test_set() and listed once for each of the tg
    tasks of that function's group. Progress is shown through
    update_progress() and a report on the generated problem is produced by
    _report_about_generated_boolean_mtl_problem().

    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples for each task to generate.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean functions.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        Random seed forwarded to _generate_boolean_data().
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        If given (and non-empty), the list of generated Boolean functions is
        pickled to this path.

    Returns
    -------
    tasks : list
        The tasks exactly as returned by _generate_boolean_data().
    tasks_complete_test_sets : list
        One complete test set per task. Tasks of the same group refer to the
        very same test set object (it is not copied).

    """
    tasks, funcs, attr = _generate_boolean_data(
        a, d, n, g, tg, noise, random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)

    total = len(funcs)
    print("Generating the complete test sets for {} Boolean functions"
          .format(total))
    tasks_complete_test_sets = []
    for done, func in enumerate(funcs, 1):
        test_set = _generate_complete_test_set(attr, func)
        # every task of the current group shares the same complete test set
        tasks_complete_test_sets.extend([test_set] * tg)
        update_progress(float(done) / total)
    # Python 2 print statement: emits a newline after the progress output
    print

    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets
 def test_tasks(self,
                learners,
                base_learners,
                measures,
                results_path,
                save_orange_data=False):
     """Repeat the following experiment self._repeats times:
     Prepare tasks' data with the _prepare_tasks_data() function.
     Test the performance of the given learning algorithms with the given
     base learning algorithms and compute the testing results using the
     given scoring measures.
     Process the obtained repetition scores with the
     _process_repetition_scores() function.
     Note: This function only test some specific combinations of
     base_learners and learners as used by the binarization experiment.
     
     Arguments:
     learners -- ordered dictionary with items of the form (name, learner),
         where name is a string representing the learner's name and
         learner is a MTL method (e.g. ERM, NoMerging, ...) 
     base learners -- ordered dictionary with items of the form (name,
         learner), where name is a string representing the base learner's
         name and learner is a scikit-learn estimator object
     measures -- list of strings representing measure's names (currently,
         only CA and AUC are supported)
     results_path -- string representing the path where to save any extra
         information about the running of this test (currently, only used
         for pickling the results when there is an error in calling the
         learner)
     save_orange_data -- boolean indicating whether to save the Orange data
         tables created with the call to self._prepare_tasks_data() function
     
     """
     rpt_scores = OrderedDict()
     dend_info = {bl: OrderedDict() for bl in base_learners.iterkeys()}
     for i in range(self._repeats):
         self._repetition_number = i
         self._prepare_tasks_data(**self._tasks_data_params)
         if save_orange_data:
             self._save_orange_data(i, results_path)
         rpt_scores[i] = {bl: dict() for bl in base_learners.iterkeys()}
         for bl in base_learners:
             for l in learners:
                 start = time.clock()
                 try:
                     if isinstance(learners[l],
                                   bin_exp.TreeMarkedAndMergedLearner):
                         R = learners[l](self._tasks.keys(),
                                         self._merged_learn_data_orange,
                                         base_learners[bl])
                     elif isinstance(base_learners[bl],
                                     Orange.core.Learner):
                         wrapped_bl = OrangeClassifierWrapper(
                             orange_learner=base_learners[bl])
                         R = learners[l](self._tasks, wrapped_bl)
                     else:
                         raise ValueError(
                             "An unexpected combination of "
                             "base_learner and leaner detected: {} and "
                             "{}".format(type(base_learners[bl]),
                                         type(learners[l])))
                 except Exception as e:
                     logger.exception("There was an error during repetition:"
                         " {} with base learner: {} and learner: {}.".\
                         format(i, bl, l))
                     if i > 0:
                         logger.info("Saving the results of previous "
                                     "repetitions.")
                         # remove the scores of the last repetition
                         del rpt_scores[i]
                         # process the remaining repetition scores
                         self._process_repetition_scores(
                             rpt_scores, dend_info)
                         # pickle them to a file
                         pickle_path_fmt = os.path.join(
                             results_path, "bl-{}.pkl")
                         self.pickle_test_results(pickle_path_fmt)
                     # re-raise the original exception
                     import sys
                     exc_info = sys.exc_info()
                     raise exc_info[1], None, exc_info[2]
                 rpt_scores[i][bl][l] = self._test_tasks(
                     R["task_models"], measures)
                 end = time.clock()
                 logger.debug("Finished repetition: {}, base learner: {}, "
                              "learner: {} in {:.2f}s".format(
                                  i, bl, l, end - start))
                 # store dendrogram info if the results contain it
                 if "dend_info" in R:
                     dend_info[bl][i] = R["dend_info"]
                 # pickle and visualize the decision tree if the learner is a
                 # (sub)class of TreeMarkedAndMergedLearner
                 if isinstance(learners[l],
                               bin_exp.TreeMarkedAndMergedLearner):
                     tree = R["task_models"].values()[0]
                     pickle_path = os.path.join(
                         results_path, "{}-{}-"
                         "repeat{}.pkl".format(bl, l, i))
                     svg_path = os.path.join(
                         results_path, "{}-{}-repeat{}"
                         ".svg".format(bl, l, i))
                     tikz_path = os.path.join(
                         results_path, "{}-{}-repeat{}"
                         "-tikz.tex".format(bl, l, i))
                     pickle_obj(tree, pickle_path)
                     save_treegraph_image(tree, svg_path)
                     draw_and_save_tikz_tree_document(tree, tikz_path)
     self._process_repetition_scores(rpt_scores, dend_info)
                                      same_majority_pruning=True,
                                      store_instances=True)
    from Orange.data import Table

    # TEST for equality of "original" vs. "pickled/unpickled" Orange trees
    from PyMTL.util import pickle_obj, unpickle_obj
    import numpy as np
    for i in range(10):
        data = Table(
            os.path.join(
                results_path, "bool_func-a8d4n100g2tg5nse0.0rs15"
                "nls10-seed63-complete_test/orange_merged_learn-"
                "repetition{}.tab".format(i)))
        tree = tree_learner(data)
        pickle_path = os.path.join(results_path, "test-pickle.pkl")
        pickle_obj(tree, pickle_path)
        unpickled_tree = unpickle_obj(pickle_path)
        print("Repetition {} original vs. pickled/unpickled tree equality:".
              format(i)),
        print np.all(tree[e] == unpickled_tree[e] for e in data)
    os.remove(pickle_path)

    data = Table(
        os.path.join(
            results_path, "bool_func-a8d4n100g2tg5nse0.0rs15"
            "nls10-seed63-complete_test/orange_merged_learn-"
            "repetition0.tab"))
    tree = tree_learner(data)
    tex_file = os.path.join(results_path, "test-tree.tex")
    pdf_file = os.path.join(results_path, "test-tree.pdf")
    draw_and_save_tikz_tree_document(tree, tex_file)
Esempio n. 6
0
def generate_boolean_data_with_complete_test_sets(a,
                                                  d,
                                                  n,
                                                  g,
                                                  tg,
                                                  noise,
                                                  random_seed=1,
                                                  n_learning_sets=1,
                                                  funcs_pickle_path=None):
    """Generate a synthetic Boolean MTL problem and its complete test sets.

    _generate_boolean_data() supplies the tasks, the Boolean functions and
    the attributes. For each Boolean function a complete test set (covering
    the complete attribute space, 2**a distinct examples) is generated by
    _generate_complete_test_set() and added tg times to the result, once per
    task of the function's task group. update_progress() is called after
    each function and _report_about_generated_boolean_mtl_problem() is
    called at the end to report on the generated problem.

    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples for each task to generate.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean functions.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        Random seed passed on to _generate_boolean_data().
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        When truthy, path to which the list of generated Boolean functions
        is pickled.

    Returns
    -------
    tasks : list
        The tasks as produced by _generate_boolean_data().
    tasks_complete_test_sets : list
        The complete test sets, one entry per task; all tasks of one group
        share a single test set object.

    """
    generated = _generate_boolean_data(
        a, d, n, g, tg, noise, random_seed, n_learning_sets=n_learning_sets)
    tasks, funcs, attr = generated
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)

    n_funcs = len(funcs)
    announcement = "Generating the complete test sets for {} Boolean functions"
    print(announcement.format(n_funcs))
    tasks_complete_test_sets = []
    for position, func in enumerate(funcs):
        shared_test_set = _generate_complete_test_set(attr, func)
        # list the same complete test set once per task of the current group
        tasks_complete_test_sets.extend(
            shared_test_set for _ in range(tg))
        update_progress((position + 1.) / n_funcs)
    # Python 2 print statement: ends the progress output with a newline
    print

    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets