def __call__(self, tasks, base_learner):
    """Run the merging algorithm for the given tasks.

    Learn a model using the given base learner for each task on its own
    data (no merging).

    Return a dictionary of data structures computed within this learner.
    It has the following keys:
        task_models -- dictionary mapping from tasks' ids to the learned
            models

    Arguments:
    tasks -- dictionary mapping from tasks' ids to their Task objects
    base_learner -- scikit-learn estimator

    """
    task_models = dict()
    for tid, task in tasks.iteritems():
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead,
        # we have to use a dummy classifier which is subsequently augmented
        # to handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each data
        # set gets its own classifier.
        learn = task.get_learn_data()
        if len(np.unique(learn[1])) < 2:
            logger.debug("Learning data for task {} has less than 2 class "
                         "values. Using DummyClassifier.".format(tid))
            model = DummyClassifier()
            model.fit(*learn)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(base_learner)
            model.fit(*learn)
        task_models[tid] = model
    # create and fill the return dictionary
    R = dict()
    R["task_models"] = task_models
    return R
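
# Usage sketch for the method above (hedged): the method appears to belong to
# a "no merging" learner class; the names NoMergingLearner and tasks below
# are illustrative assumptions, not part of this excerpt.
#
#     from sklearn.linear_model import LogisticRegression
#
#     learner = NoMergingLearner()
#     R = learner(tasks, LogisticRegression())
#     # each task gets its own independently fitted model
#     for tid, model in sorted(R["task_models"].iteritems()):
#         print "task {}: {}".format(tid, model)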
def _generalized_cross_validation_clas(learner, data1, data2, cv_folds1):
    """Perform one part of the generalized version of the cross-validation
    testing method on the given data sets.

    Perform cross-validation over data set data1. For each fold of data1,
    build models on the remaining folds of data1, the whole data set data2
    and the merged data set, and test them on the selected fold of data1.

    Return a tuple (pred_errs1, pred_errs2, pred_errsm), where:
        pred_errs1 -- numpy.array of prediction errors of the model built
            on the remaining folds of data1 for instances in data1
        pred_errs2 -- numpy.array of prediction errors of the model built
            on the whole data set data2 for instances in data1
        pred_errsm -- numpy.array of prediction errors of the model built
            on the merged data set for instances in data1

    Arguments:
    learner -- scikit-learn classification estimator
    data1 -- tuple (X, y) representing the first data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class values
    data2 -- tuple (X, y) representing the second data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class values
    cv_folds1 -- list of tuples (learn, test) to perform cross-validation
        over data1, where:
        learn -- numpy.array with a Boolean mask for selecting learning
            instances
        test -- numpy.array with a Boolean mask for selecting testing
            instances

    """
    # unpack the data1 and data2 tuples
    X1, y1 = data1
    X2, y2 = data2
    # build a model on data2
    # NOTE: The model does not change throughout cross-validation on data1.
    # NOTE: When the number of unique class values is less than 2, we
    # cannot fit an ordinary model (e.g. logistic regression). Instead, we
    # have to use a dummy classifier which is subsequently augmented to
    # handle all the other class values.
    # NOTE: The scikit-learn estimator must be cloned so that each data set
    # gets its own classifier.
    if len(np.unique(y2)) < 2:
        model2 = DummyClassifier()
        model2.fit(X2, y2)
        change_dummy_classes(model2, np.array([0, 1]))
    else:
        model2 = clone(learner)
        model2.fit(X2, y2)
    _check_classes(model2)
    # prediction errors of models computed as:
    # 1 - P_model(predicted_class == true_class)
    # (pred. errors of the model built on data2 can be computed right away)
    pred_proba2 = model2.predict_proba(X1)
    pred_errs2 = 1 - pred_proba2[np.arange(y1.shape[0]), y1]
    pred_errs1 = -np.ones(y1.shape)
    pred_errsm = -np.ones(y1.shape)
    # perform generalized cross-validation on data1
    for learn_ind, test_ind in cv_folds1:
        # create testing data arrays for the current fold
        test_X, test_y = X1[test_ind], y1[test_ind]
        # create learning data arrays for the current fold
        learn1 = X1[learn_ind], y1[learn_ind]
        learnm = (np.concatenate((X1[learn_ind], X2), axis=0),
                  np.concatenate((y1[learn_ind], y2), axis=0))
        # build models
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead,
        # we have to use a dummy classifier which is subsequently augmented
        # to handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each data
        # set gets its own classifier.
        if len(np.unique(learn1[1])) < 2:
            model1 = DummyClassifier()
            model1.fit(*learn1)
            change_dummy_classes(model1, np.array([0, 1]))
        else:
            model1 = clone(learner)
            model1.fit(*learn1)
        _check_classes(model1)
        if len(np.unique(learnm[1])) < 2:
            modelm = DummyClassifier()
            modelm.fit(*learnm)
            change_dummy_classes(modelm, np.array([0, 1]))
        else:
            modelm = clone(learner)
            modelm.fit(*learnm)
        _check_classes(modelm)
        # compute the prediction errors of both models on the current
        # testing data
        pred_proba1 = model1.predict_proba(test_X)
        pred_errs1[test_ind] = 1 - pred_proba1[np.arange(test_y.shape[0]),
                                               test_y]
        pred_probam = modelm.predict_proba(test_X)
        pred_errsm[test_ind] = 1 - pred_probam[np.arange(test_y.shape[0]),
                                               test_y]
    return pred_errs1, pred_errs2, pred_errsm
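
# A minimal usage sketch of the function above. This demo helper is
# hypothetical (not part of the original module) and assumes the module-level
# helpers (_check_classes, change_dummy_classes) behave as documented. It
# builds two tiny synthetic binary data sets, constructs Boolean-mask CV
# folds over data1 and prints the average prediction errors.
def _demo_generalized_cross_validation_clas():
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    # two small synthetic binary classification data sets
    X1, y1 = rng.rand(40, 3), rng.randint(0, 2, 40)
    X2, y2 = rng.rand(30, 3), rng.randint(0, 2, 30)
    # five cross-validation folds over data1 as (learn, test) Boolean masks
    cv_folds1 = []
    for f in range(5):
        test = np.zeros(len(y1), dtype=bool)
        test[f::5] = True
        cv_folds1.append((~test, test))
    pred_errs1, pred_errs2, pred_errsm = _generalized_cross_validation_clas(
        LogisticRegression(), (X1, y1), (X2, y2), cv_folds1)
    print "avg. prediction errors: {:.3f} {:.3f} {:.3f}".format(
        pred_errs1.mean(), pred_errs2.mean(), pred_errsm.mean())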
def __call__(self, tasks, base_learner):
    """Run the merging algorithm for the given tasks.

    Perform the intelligent merging of tasks' data according to the ERM
    learning method. After the merging is complete, build a model for each
    remaining (merged) task and assign this model to each original task of
    this (merged) task.

    Return a dictionary of data structures computed within this call to
    ERM. It has the following keys:
        task_models -- dictionary mapping from each original task id to
            its model
        dend_info -- list of tuples (one for each merged task) as returned
            by the convert_merg_history_to_scipy_linkage function

    Arguments:
    tasks -- dictionary mapping from tasks' ids to their Task objects
    base_learner -- scikit-learn estimator

    """
    self._base_learner = base_learner
    # create an ordered dictionary of MergedTask objects from the given
    # dictionary of tasks
    self._tasks = OrderedDict()
    for _, task in sorted(tasks.iteritems()):
        merg_task = MergedTask(task)
        self._tasks[merg_task.id] = merg_task
    # populate the dictionary of task pairs that are candidates for merging
    C = dict()
    pairs = list(combinations(self._tasks, 2))
    n_pairs = len(pairs)
    msg = "Computing candidate pairs for merging ({} pairs)".format(n_pairs)
    logger.debug(msg)
    print msg
    for i, (tid_i, tid_j) in enumerate(pairs):
        if self._prefilter(tid_i, tid_j):
            avg_pred_errs, p_values_ij = \
                self._estimate_errors_significances(tid_i, tid_j)
            er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                    avg_pred_errs["data2"]["data2"],
                                    avg_pred_errs["dataM"]["dataM"],
                                    self._tasks[tid_i].get_data_size(),
                                    self._tasks[tid_j].get_data_size())
            min_ij = min(avg_pred_errs["data1"]["dataM"],
                         avg_pred_errs["data2"]["dataM"])
            if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                cp = CandidatePair(tid_i, tid_j, p_values_ij)
                C[cp.key] = cp
        update_progress(1. * (i + 1) / n_pairs)
    print
    # iteratively merge the most similar pair of tasks while such pairs
    # exist
    n_cand = len(C)
    msg = "Processing {} candidate pairs for merging".format(n_cand)
    logger.debug(msg)
    print msg
    while len(C) > 0:
        # find the task pair with the minimal maximal p-value
        maxes = [(cp_key, cp.get_max_p_value())
                 for cp_key, cp in C.iteritems()]
        (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
        # merge the pair of tasks and update self._tasks
        task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
        tid_M = task_M.id
        del self._tasks[min_tid_i]
        del self._tasks[min_tid_j]
        self._tasks[tid_M] = task_M
        # remove task pairs that don't exist anymore from C
        for (tid_i, tid_j) in C.keys():
            if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                del C[(tid_i, tid_j)]
        # find new task pairs that are candidates for merging
        for tid_i in self._tasks:
            if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                avg_pred_errs, p_values_iM = \
                    self._estimate_errors_significances(tid_i, tid_M)
                er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                        avg_pred_errs["data2"]["data2"],
                                        avg_pred_errs["dataM"]["dataM"],
                                        self._tasks[tid_i].get_data_size(),
                                        self._tasks[tid_M].get_data_size())
                min_iM = min(avg_pred_errs["data1"]["dataM"],
                             avg_pred_errs["data2"]["dataM"])
                if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                    cp = CandidatePair(tid_i, tid_M, p_values_iM)
                    C[cp.key] = cp
        update_progress(1. * len(C) / n_cand, invert=True)
    print
    # build a model for each remaining (merged) task and store the info
    # for drawing a dendrogram showing the merging history
    task_models = dict()
    dend_info = []
    for merg_task in self._tasks.itervalues():
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead,
        # we have to use a dummy classifier which is subsequently augmented
        # to handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each
        # (merged) task gets its own classifier.
        X, y = merg_task.get_learn_data()
        if len(np.unique(y)) < 2:
            logger.info("Learning data for merged task {} has less than 2 "
                        "class values. Using DummyClassifier."
                        .format(merg_task))
            model = DummyClassifier()
            model.fit(X, y)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(self._base_learner)
            model.fit(X, y)
        # assign this model to each original task of this (merged) task
        original_ids = merg_task.get_original_ids()
        for tid in original_ids:
            task_models[tid] = model
        # store the dendrogram info (if the task is truly a merged task)
        if len(original_ids) > 1:
            dend_info.append(convert_merg_history_to_scipy_linkage(
                merg_task.merg_history))
    # create and fill the return dictionary
    R = dict()
    R["task_models"] = task_models
    R["dend_info"] = dend_info
    return R
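
# Usage sketch for the ERM method above (hedged): the method appears to
# belong to an ERM learner class; ERMLearner and its constructor arguments
# below are assumptions made for illustration only.
#
#     from sklearn.linear_model import LogisticRegression
#
#     erm = ERMLearner(...)  # constructor arguments elided
#     R = erm(tasks, LogisticRegression())
#     # every original task id maps to the model of its (merged) task, so
#     # tasks merged together share one fitted classifier
#     # each entry of R["dend_info"] describes one truly merged task's
#     # merging history in the form produced by
#     # convert_merg_history_to_scipy_linkage and is meant for drawing a
#     # dendrogram (e.g. with scipy.cluster.hierarchy.dendrogram)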