Ejemplo n.º 1
0
 def __call__(self, tasks, base_learner):
     """Fit one independent model per task (no merging takes place).
     Every task is learned solely from its own learning data with a fresh
     clone of the given base learner.
     Return a dictionary with the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     models_by_tid = {}
     for tid, task in tasks.iteritems():
         learn_data = task.get_learn_data()
         # NOTE: An ordinary model (e.g. logistic regression) cannot be fitted
         # when fewer than 2 distinct class values are present, so a dummy
         # classifier is fitted instead and afterwards extended to know about
         # both class values.
         single_class = len(np.unique(learn_data[1])) < 2
         if single_class:
             logger.debug("Learning data for task {} has less than 2 class "
                          "values. Using DummyClassifier.".format(tid))
             model = DummyClassifier()
         else:
             # NOTE: Clone the estimator so that no two tasks share the same
             # classifier object.
             model = clone(base_learner)
         model.fit(*learn_data)
         if single_class:
             change_dummy_classes(model, np.array([0, 1]))
         models_by_tid[tid] = model
     # pack the computed data structures into the return dictionary
     return {"task_models": models_by_tid}
Ejemplo n.º 2
0
 def __call__(self, tasks, base_learner):
     """Learn a separate model for every task without merging any of them.
     Each model is built from the task's own learning data using its own
     clone of the given base learner.
     Return a dictionary with the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     learned = {}
     for tid, task in tasks.iteritems():
         data = task.get_learn_data()
         # NOTE: With fewer than 2 distinct class values an ordinary model
         # (e.g. logistic regression) cannot be fitted. A dummy classifier
         # is used in that case and is subsequently augmented to handle both
         # class values.
         needs_dummy = len(np.unique(data[1])) < 2
         if needs_dummy:
             logger.debug("Learning data for task {} has less than 2 class "
                          "values. Using DummyClassifier.".format(tid))
             model = DummyClassifier()
         else:
             # NOTE: The estimator is cloned so that each data set gets its
             # own classifier.
             model = clone(base_learner)
         model.fit(*data)
         if needs_dummy:
             change_dummy_classes(model, np.array([0, 1]))
         learned[tid] = model
     # build the return dictionary
     return {"task_models": learned}
Ejemplo n.º 3
0
def _generalized_cross_validation_clas(learner, data1, data2, cv_folds1):
    """Perform one part of the generalized version of the cross-validation
    testing method on the given data sets.
    Perform cross-validation over data set data1. For each fold of data1,
    build models on the remaining folds of data1, the whole data set data2 and
    the merged data set and test them on the selected fold of data1.
    The prediction error of a model for an instance is computed as
    1 - P_model(predicted_class == true_class).
    Return a tuple (pred_errs1, pred_errs2, pred_errsm), where:
        pred_errs1 -- numpy.array of prediction errors of the model built on the
            remaining folds of data1 for instances in data1
        pred_errs2 -- numpy.array of prediction errors of the model built on the
            whole data set data2 for instances in data1
        pred_errsm -- numpy.array of prediction errors of the model built on
            the merged data set for instances in data1
    Entries of pred_errs1 and pred_errsm that belong to instances not selected
    by any testing mask keep their initial value of -1.
    
    Arguments:
    learner -- scikit-learn classification estimator
    data1 -- tuple (X, y) representing the first data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class value (it is used directly as
            a column index into the predicted probabilities, so it is
            assumed to hold the integer labels 0 and 1)
    data2 -- tuple (X, y) representing the second data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class value
    cv_folds1 -- list of tuples (learn, test) to perform cross-validation over
        data1, where:
        learn -- numpy.array with a Boolean mask for selecting learning
            instances
        test -- numpy.array with a Boolean mask for selecting testing instances
    
    """
    def fit_model(X, y):
        # Fit and return a checked model for the given learning data.
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead, we
        # have to use a dummy classifier which is subsequently augmented to
        # handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each data
        # set gets its own classifier
        if len(np.unique(y)) < 2:
            model = DummyClassifier()
            model.fit(X, y)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(learner)
            model.fit(X, y)
        _check_classes(model)
        return model

    # unpack the data1 and data2 tuples
    X1, y1 = data1
    X2, y2 = data2
    # build a model on data2
    # NOTE: The model does not change throughout cross-validation on data1,
    # so its prediction errors can be computed right away
    model2 = fit_model(X2, y2)
    pred_proba2 = model2.predict_proba(X1)
    pred_errs2 = 1 - pred_proba2[np.arange(y1.shape[0]), y1]
    # -1 marks instances that have not been tested (yet)
    pred_errs1 = -np.ones(y1.shape)
    pred_errsm = -np.ones(y1.shape)
    # perform generalized cross-validation on data1
    for learn_ind, test_ind in cv_folds1:
        # create testing data arrays for the current fold
        test_X, test_y = X1[test_ind], y1[test_ind]
        # create learning data arrays for the current fold
        learn_X1, learn_y1 = X1[learn_ind], y1[learn_ind]
        learn_Xm = np.concatenate((learn_X1, X2), axis=0)
        learn_ym = np.concatenate((learn_y1, y2), axis=0)
        # build models
        # NOTE: The merged model (including its dummy fallback) is always
        # fitted on the merged learning data, never on data1's folds alone.
        model1 = fit_model(learn_X1, learn_y1)
        modelm = fit_model(learn_Xm, learn_ym)
        # compute the prediction errors of both models on the current testing
        # data
        rows = np.arange(test_y.shape[0])
        pred_errs1[test_ind] = 1 - model1.predict_proba(test_X)[rows, test_y]
        pred_errsm[test_ind] = 1 - modelm.predict_proba(test_X)[rows, test_y]
    return pred_errs1, pred_errs2, pred_errsm
Ejemplo n.º 4
0
def _generalized_cross_validation_clas(learner, data1, data2, cv_folds1):
    """Perform one part of the generalized version of the cross-validation
    testing method on the given data sets.
    Perform cross-validation over data set data1. For each fold of data1,
    build models on the remaining folds of data1, the whole data set data2 and
    the merged data set and test them on the selected fold of data1.
    The prediction error of a model for an instance is computed as
    1 - P_model(predicted_class == true_class).
    Return a tuple (pred_errs1, pred_errs2, pred_errsm), where:
        pred_errs1 -- numpy.array of prediction errors of the model built on the
            remaining folds of data1 for instances in data1
        pred_errs2 -- numpy.array of prediction errors of the model built on the
            whole data set data2 for instances in data1
        pred_errsm -- numpy.array of prediction errors of the model built on
            the merged data set for instances in data1
    Entries of pred_errs1 and pred_errsm that belong to instances not selected
    by any testing mask keep their initial value of -1.
    
    Arguments:
    learner -- scikit-learn classification estimator
    data1 -- tuple (X, y) representing the first data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class value (it is used directly as
            a column index into the predicted probabilities, so it is
            assumed to hold the integer labels 0 and 1)
    data2 -- tuple (X, y) representing the second data set, where:
        X -- numpy.array which holds the attribute values
        y -- numpy.array which holds the class value
    cv_folds1 -- list of tuples (learn, test) to perform cross-validation over
        data1, where:
        learn -- numpy.array with a Boolean mask for selecting learning
            instances
        test -- numpy.array with a Boolean mask for selecting testing instances
    
    """
    def build_model(X, y):
        # Fit and return a checked model for the given learning data.
        # NOTE: When the number of unique class values is less than 2, we
        # cannot fit an ordinary model (e.g. logistic regression). Instead, we
        # have to use a dummy classifier which is subsequently augmented to
        # handle all the other class values.
        # NOTE: The scikit-learn estimator must be cloned so that each data
        # set gets its own classifier
        if len(np.unique(y)) < 2:
            model = DummyClassifier()
            model.fit(X, y)
            change_dummy_classes(model, np.array([0, 1]))
        else:
            model = clone(learner)
            model.fit(X, y)
        _check_classes(model)
        return model

    # unpack the data1 and data2 tuples
    X1, y1 = data1
    X2, y2 = data2
    # build a model on data2
    # NOTE: The model does not change throughout cross-validation on data1,
    # so its prediction errors can be computed right away
    model2 = build_model(X2, y2)
    pred_proba2 = model2.predict_proba(X1)
    pred_errs2 = 1 - pred_proba2[np.arange(y1.shape[0]), y1]
    # -1 marks instances that have not been tested (yet)
    pred_errs1 = -np.ones(y1.shape)
    pred_errsm = -np.ones(y1.shape)
    # perform generalized cross-validation on data1
    for learn_ind, test_ind in cv_folds1:
        # create testing data arrays for the current fold
        test_X, test_y = X1[test_ind], y1[test_ind]
        # create learning data arrays for the current fold
        learn_X1, learn_y1 = X1[learn_ind], y1[learn_ind]
        learn_Xm = np.concatenate((learn_X1, X2), axis=0)
        learn_ym = np.concatenate((learn_y1, y2), axis=0)
        # build models
        # NOTE: The merged model (including its dummy fallback) is always
        # fitted on the merged learning data, never on data1's folds alone.
        model1 = build_model(learn_X1, learn_y1)
        modelm = build_model(learn_Xm, learn_ym)
        # compute the prediction errors of both models on the current testing
        # data
        rows = np.arange(test_y.shape[0])
        pred_errs1[test_ind] = 1 - model1.predict_proba(test_X)[rows, test_y]
        pred_errsm[test_ind] = 1 - modelm.predict_proba(test_X)[rows, test_y]
    return pred_errs1, pred_errs2, pred_errsm
Ejemplo n.º 5
0
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
             if  er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1.* (i + 1) / n_pairs)
     print
     # iteratively merge the most similar pair of tasks, until such pairs
     # exist
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value()) for cp_key, cp in
                  C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                 (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1.* len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(convert_merg_history_to_scipy_linkage(
                                 merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R
Ejemplo n.º 6
0
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(
         n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
             if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1. * (i + 1) / n_pairs)
     print
     # iteratively merge the most similar pair of tasks, until such pairs
     # exist
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value())
                  for cp_key, cp in C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j)
                     or (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1. * len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(
                 convert_merg_history_to_scipy_linkage(
                     merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R