def generate_boolean_data_with_complete_test_sets(a, d, n, g, tg, noise,
                                                  random_seed=1,
                                                  n_learning_sets=1,
                                                  funcs_pickle_path=None):
    """Create a synthetic MTL problem of learning Boolean functions and, in
    addition, a complete test set (all 2**a distinct examples of the
    attribute space) for every task.

    A report about the generated MTL problem is logged; it contains the
    Boolean function of each group, the % of True class values of each task
    and the average % of True class values over all tasks.

    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples for each task to generate.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean function.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        The random seed with which to initialize a private Random object.
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        Path where to pickle the list of generated Boolean functions.

    Returns
    -------
    tasks : list
        If n_learning_sets == 1, a list of Bunch objects corresponding to
        Boolean function learning tasks. Otherwise, a list of lists of Bunch
        objects, where each list corresponds to a set of different learning
        sets for each task.
    tasks_complete_test_sets : list
        A list of (X, y) tuples corresponding to complete testing sets for
        each task.

    """
    tasks, funcs, attr = _generate_boolean_data(
        a, d, n, g, tg, noise, random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)
    n_funcs = len(funcs)
    print ("Generating the complete test sets for {} Boolean functions".
           format(n_funcs))
    tasks_complete_test_sets = []
    for i, func in enumerate(funcs):
        test_set = _generate_complete_test_set(attr, func)
        # all tg tasks of a group share the group's Boolean function, so the
        # same complete test set object is reused for each of them
        tasks_complete_test_sets.extend(tg * [test_set])
        update_progress(1. * (i + 1) / n_funcs)
    print
    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets
def __call__(self, tasks, base_learner):
    """Run the merging algorithm for the given tasks.

    Perform the intelligent merging of tasks' data according to the ERM
    learning method. After the merging is complete, build a model for each
    remaining (merged) task and assign this model to each original task of
    this (merged) task.
    Return a dictionary of data structures computed within this call to ERM.
    It has the following keys:
        task_models -- dictionary mapping from each original task id to its
            model
        dend_info -- list of tuples (one for each merged task) as returned by
            the convert_merg_history_to_scipy_linkage function

    Arguments:
    tasks -- dictionary mapping from tasks' ids to their Task objects
    base_learner -- scikit-learn estimator

    """
    self._base_learner = base_learner
    # create an ordered dictionary of MergedTask objects from the given
    # dictionary of tasks (sorted by task id, so the iteration order of
    # self._tasks is deterministic)
    self._tasks = OrderedDict()
    for _, task in sorted(tasks.iteritems()):
        merg_task = MergedTask(task)
        self._tasks[merg_task.id] = merg_task
    # populate the dictionary of task pairs that are candidates for merging
    C = dict()
    pairs = list(combinations(self._tasks, 2))
    n_pairs = len(pairs)
    msg = "Computing candidate pairs for merging ({} pairs)".format(
        n_pairs)
    logger.debug(msg)
    print msg
    for i, (tid_i, tid_j) in enumerate(pairs):
        if self._prefilter(tid_i, tid_j):
            # NOTE(review): avg_pred_errs appears to be keyed as
            # [trained-on][evaluated-on] with "dataM" being the merged
            # data of both tasks -- confirm in
            # _estimate_errors_significances
            avg_pred_errs, p_values_ij = \
                self._estimate_errors_significances(tid_i, tid_j)
            # estimated error reduction obtained by merging the two
            # tasks' data, weighted by the tasks' data sizes
            er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                    avg_pred_errs["data2"]["data2"],
                                    avg_pred_errs["dataM"]["dataM"],
                                    self._tasks[tid_i].get_data_size(),
                                    self._tasks[tid_j].get_data_size())
            min_ij = min(avg_pred_errs["data1"]["dataM"],
                         avg_pred_errs["data2"]["dataM"])
            # keep the pair as a merging candidate only when merging does
            # not increase the estimated prediction error
            if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                cp = CandidatePair(tid_i, tid_j, p_values_ij)
                C[cp.key] = cp
        update_progress(1. * (i + 1) / n_pairs)
    print
    # iteratively merge the most similar pair of tasks, until such pairs
    # exist
    n_cand = len(C)
    msg = "Processing {} candidate pairs for merging".format(n_cand)
    logger.debug(msg)
    print msg
    while len(C) > 0:
        # find the task pair with the minimal maximal p-value
        maxes = [(cp_key, cp.get_max_p_value()) for cp_key, cp in
                 C.iteritems()]
        (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
        # merge the pair of tasks and update self._tasks
        task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
        tid_M = task_M.id
        del self._tasks[min_tid_i]
        del self._tasks[min_tid_j]
        self._tasks[tid_M] = task_M
        # remove task pairs that don't exist anymore from C
        # (Python 2 dict.keys() returns a list, so deleting entries while
        # looping over it is safe)
        for (tid_i, tid_j) in C.keys():
            if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                del C[(tid_i, tid_j)]
        # find new task pairs that are candidates for merging
        for tid_i in self._tasks:
            if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                avg_pred_errs, p_values_iM = \
                    self._estimate_errors_significances(tid_i, tid_M)
                er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                        avg_pred_errs["data2"]["data2"],
                                        avg_pred_errs["dataM"]["dataM"],
                                        self._tasks[tid_i].get_data_size(),
                                        self._tasks[tid_M].get_data_size())
                min_iM = min(avg_pred_errs["data1"]["dataM"],
                             avg_pred_errs["data2"]["dataM"])
                if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                    cp = CandidatePair(tid_i, tid_M, p_values_iM)
                    C[cp.key] = cp
        # inverted progress: the candidate set shrinks toward zero as
        # merging proceeds
        update_progress(1. * len(C) / n_cand, invert=True)
    print
    # build a model for each remaining (merged) task and store the info
    # for drawing a dendrogram showing the merging history
    task_models = dict()
    dend_info = []
    for merg_task in self._tasks.itervalues():
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead,
        # we have to use a dummy classifier which is subsequently augmented
        # to handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each
        # (merged) task gets its own classifier
        X, y = merg_task.get_learn_data()
        if len(np.unique(y)) < 2:
            logger.info("Learning data for merged task {} has less than 2 "
                        "class values. Using DummyClassifier.".\
                        format(merg_task))
            model = DummyClassifier()
            model.fit(X, y)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(self._base_learner)
            model.fit(X, y)
        # assign this model to each original task of this (merged) task
        original_ids = merg_task.get_original_ids()
        for tid in original_ids:
            task_models[tid] = model
        # store the dendrogram info (if the task is truly a merged task)
        if len(original_ids) > 1:
            dend_info.append(
                convert_merg_history_to_scipy_linkage(
                    merg_task.merg_history))
    # create and fill the return dictionary
    R = dict()
    R["task_models"] = task_models
    R["dend_info"] = dend_info
    return R
def __call__(self, tasks, base_learner):
    """Run the merging algorithm for the given tasks.

    Perform the intelligent merging of tasks' data according to the ERM
    learning method. After the merging is complete, build a model for each
    remaining (merged) task and assign this model to each original task of
    this (merged) task.
    Return a dictionary of data structures computed within this call to ERM.
    It has the following keys:
        task_models -- dictionary mapping from each original task id to its
            model
        dend_info -- list of tuples (one for each merged task) as returned by
            the convert_merg_history_to_scipy_linkage function

    Arguments:
    tasks -- dictionary mapping from tasks' ids to their Task objects
    base_learner -- scikit-learn estimator

    """
    self._base_learner = base_learner
    # create an ordered dictionary of MergedTask objects from the given
    # dictionary of tasks (sorted by task id, so the iteration order of
    # self._tasks is deterministic)
    self._tasks = OrderedDict()
    for _, task in sorted(tasks.iteritems()):
        merg_task = MergedTask(task)
        self._tasks[merg_task.id] = merg_task
    # populate the dictionary of task pairs that are candidates for merging
    C = dict()
    pairs = list(combinations(self._tasks, 2))
    n_pairs = len(pairs)
    msg = "Computing candidate pairs for merging ({} pairs)".format(n_pairs)
    logger.debug(msg)
    print msg
    for i, (tid_i, tid_j) in enumerate(pairs):
        if self._prefilter(tid_i, tid_j):
            # NOTE(review): avg_pred_errs appears to be keyed as
            # [trained-on][evaluated-on] with "dataM" being the merged
            # data of both tasks -- confirm in
            # _estimate_errors_significances
            avg_pred_errs, p_values_ij = \
                self._estimate_errors_significances(tid_i, tid_j)
            # estimated error reduction obtained by merging the two
            # tasks' data, weighted by the tasks' data sizes
            er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                    avg_pred_errs["data2"]["data2"],
                                    avg_pred_errs["dataM"]["dataM"],
                                    self._tasks[tid_i].get_data_size(),
                                    self._tasks[tid_j].get_data_size())
            min_ij = min(avg_pred_errs["data1"]["dataM"],
                         avg_pred_errs["data2"]["dataM"])
            # keep the pair as a merging candidate only when merging does
            # not increase the estimated prediction error
            if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                cp = CandidatePair(tid_i, tid_j, p_values_ij)
                C[cp.key] = cp
        update_progress(1.* (i + 1) / n_pairs)
    print
    # iteratively merge the most similar pair of tasks, until such pairs
    # exist
    n_cand = len(C)
    msg = "Processing {} candidate pairs for merging".format(n_cand)
    logger.debug(msg)
    print msg
    while len(C) > 0:
        # find the task pair with the minimal maximal p-value
        maxes = [(cp_key, cp.get_max_p_value()) for cp_key, cp in
                 C.iteritems()]
        (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
        # merge the pair of tasks and update self._tasks
        task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
        tid_M = task_M.id
        del self._tasks[min_tid_i]
        del self._tasks[min_tid_j]
        self._tasks[tid_M] = task_M
        # remove task pairs that don't exist anymore from C
        # (Python 2 dict.keys() returns a list, so deleting entries while
        # looping over it is safe)
        for (tid_i, tid_j) in C.keys():
            if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                del C[(tid_i, tid_j)]
        # find new task pairs that are candidates for merging
        for tid_i in self._tasks:
            if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                avg_pred_errs, p_values_iM = \
                    self._estimate_errors_significances(tid_i, tid_M)
                er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                        avg_pred_errs["data2"]["data2"],
                                        avg_pred_errs["dataM"]["dataM"],
                                        self._tasks[tid_i].get_data_size(),
                                        self._tasks[tid_M].get_data_size())
                min_iM = min(avg_pred_errs["data1"]["dataM"],
                             avg_pred_errs["data2"]["dataM"])
                if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                    cp = CandidatePair(tid_i, tid_M, p_values_iM)
                    C[cp.key] = cp
        # inverted progress: the candidate set shrinks toward zero as
        # merging proceeds
        update_progress(1.* len(C) / n_cand, invert=True)
    print
    # build a model for each remaining (merged) task and store the info
    # for drawing a dendrogram showing the merging history
    task_models = dict()
    dend_info = []
    for merg_task in self._tasks.itervalues():
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead,
        # we have to use a dummy classifier which is subsequently augmented
        # to handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each
        # (merged) task gets its own classifier
        X, y = merg_task.get_learn_data()
        if len(np.unique(y)) < 2:
            logger.info("Learning data for merged task {} has less than 2 "
                        "class values. Using DummyClassifier.".\
                        format(merg_task))
            model = DummyClassifier()
            model.fit(X, y)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(self._base_learner)
            model.fit(X, y)
        # assign this model to each original task of this (merged) task
        original_ids = merg_task.get_original_ids()
        for tid in original_ids:
            task_models[tid] = model
        # store the dendrogram info (if the task is truly a merged task)
        if len(original_ids) > 1:
            dend_info.append(convert_merg_history_to_scipy_linkage(
                merg_task.merg_history))
    # create and fill the return dictionary
    R = dict()
    R["task_models"] = task_models
    R["dend_info"] = dend_info
    return R
def generate_boolean_data_with_complete_test_sets(a, d, n, g, tg, noise,
                                                  random_seed=1,
                                                  n_learning_sets=1,
                                                  funcs_pickle_path=None):
    """Build a synthetic MTL problem of learning Boolean functions and pair
    it with complete test sets, one per task, each covering the whole
    attribute space (2**a distinct examples).

    A report about the generated MTL problem is logged: the Boolean function
    of each group, the % of True class values of each task, and the average
    % of True class values across all tasks.

    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples for each task to generate.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean function.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        The random seed with which to initialize a private Random object.
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        Path where to pickle the list of generated Boolean functions.

    Returns
    -------
    tasks : list
        If n_learning_sets == 1, a list of Bunch objects corresponding to
        Boolean function learning tasks. Otherwise, a list of lists of Bunch
        objects, where each list corresponds to a set of different learning
        sets for each task.
    tasks_complete_test_sets : list
        A list of (X, y) tuples corresponding to complete testing sets for
        each task.

    """
    tasks, funcs, attr = _generate_boolean_data(
        a, d, n, g, tg, noise, random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)
    tasks_complete_test_sets = []
    # generate one complete testing set per Boolean function and hand the
    # same set to every task of the function's group
    total = len(funcs)
    print ("Generating the complete test sets for {} Boolean functions".
           format(total))
    for idx, func in enumerate(funcs, start=1):
        shared_test_set = _generate_complete_test_set(attr, func)
        for _ in range(tg):
            tasks_complete_test_sets.append(shared_test_set)
        update_progress(1. * idx / total)
    print
    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets