Example #1
0
 def reduce(self, result):
     """Derive classification scores from a true/predicted pair of results.

     Parameters
     ----------
     result: dict-like
         Stacked result. Exactly two of its keys (all keys, or only those
         matching ``self.select_regexp`` when it is set) must be selected:
         one containing ``conf.TRUE`` and one containing
         ``conf.PREDICTION``. It must also provide a "key" entry.

     Return
     ------
     A Result holding precision, recall, mean recall, F1 (as returned by
     ``precision_recall_fscore_support`` with ``average=None``) and
     accuracy. When ``self.keep`` is true, the input entries are merged in.

     Raises
     ------
     KeyError
         If the number of selected keys is not exactly two.
     """
     pattern = self.select_regexp
     if pattern:
         inputs = [name for name in result
                   if re.search(pattern, str(name))]
     else:
         inputs = result.keys()
     if len(inputs) != 2:
         message = ("Need to find exactly two results to compute a score."
                    " Found %i: %s")
         raise KeyError(message % (len(inputs), inputs))

     def first_containing(marker):
         # First selected key whose text contains the marker
         return [k for k in inputs if k.find(marker) != -1][0]

     key_true = first_containing(conf.TRUE)
     key_pred = first_containing(conf.PREDICTION)
     y_true, y_pred = result[key_true], result[key_pred]
     try:
         # Lists of arrays (CV, LOO, etc.) are flattened into one array
         y_true = np.concatenate(y_true)
         y_pred = np.concatenate(y_pred)
     except ValueError:
         pass
     out = Result(key=result["key"])
     precision, recall, f1, _support = precision_recall_fscore_support(
         y_true, y_pred, average=None)
     base_key, _ = key_pop(key_pred, -1)
     scores = (
         (conf.SCORE_PRECISION, precision),
         (conf.SCORE_RECALL, recall),
         (conf.SCORE_RECALL_MEAN, recall.mean()),
         (conf.SCORE_F1, f1),
         (conf.SCORE_ACCURACY, accuracy_score(y_true, y_pred)),
     )
     for suffix, value in scores:
         out[key_push(base_key, suffix)] = value
     if self.keep:
         out.update(result)
     return out
Example #2
0
    def reduce(self, result):
        """Compute classification scores and recall p-values.

        Parameters
        ----------
        result: dict-like
            Stacked result. Exactly two of its keys (all keys, or only those
            matching ``self.select_regexp`` when it is set) must be
            selected: one containing ``conf.TRUE`` and one containing
            ``conf.PREDICTION``. It must also provide a "key" entry.

        Return
        ------
        A Result holding precision, recall, recall p-values, mean recall,
        mean recall p-value, F1 and accuracy. When ``self.keep`` is true the
        input entries are merged in.

        Raises
        ------
        KeyError
            If the number of selected keys is not exactly two.
        """
        if self.select_regexp:
            inputs = [key3 for key3 in result
                      if re.search(self.select_regexp, str(key3))]
        else:
            inputs = result.keys()
        if len(inputs) != 2:
            raise KeyError("Need to find exactly two results to compute a "
                           "score. Found %i: %s" % (len(inputs), inputs))
        key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
        key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
        y_true = result[key_true]
        y_pred = result[key_pred]
        try:  # If list of arrays (CV, LOO, etc.) concatenate them
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
        except ValueError:
            pass
        out = Result(key=result["key"])
        p, r, f1, s = precision_recall_fscore_support(y_true,
                                                      y_pred,
                                                      average=None)

        # Compute p-value of recall for each class
        def recall_test(recall, n_trials, apriori_p):
            # recall * n_trials is a count of successes; round it so that
            # floating point error (e.g. 2.9999999) cannot shift the count
            # when it is handed to binom_test as an integer.
            n_success = int(round(recall * n_trials))
            pval = binom_test(n_success, n=n_trials, p=apriori_p)
            if recall > apriori_p:
                return (pval / 2)
            else:
                return 1 - (pval / 2)

        n_classes = len(s)  # Number of classes
        n_obs = len(y_true)
        # A priori probability of each class. The builtin float is used
        # because the np.float alias no longer exists in recent NumPy.
        prior_p = s.astype(float) / s.sum()
        r_pvalues = np.zeros_like(r)
        for class_index in range(n_classes):
            n_trials = s[class_index]
            r_pvalues[class_index] = recall_test(r[class_index],
                                                 n_trials,
                                                 prior_p[class_index])

        # Compute p-value of mean recall against a chance level of 0.5
        mean_r = r.mean()
        mean_r_pvalue = binom_test(int(mean_r * n_obs), n=n_obs, p=.5)

        key, _ = key_pop(key_pred, -1)
        out[key_push(key, conf.SCORE_PRECISION)] = p
        out[key_push(key, conf.SCORE_RECALL)] = r
        out[key_push(key, conf.SCORE_RECALL_PVALUES)] = r_pvalues
        out[key_push(key, conf.SCORE_RECALL_MEAN)] = mean_r
        out[key_push(key, conf.SCORE_RECALL_MEAN_PVALUE)] = mean_r_pvalue
        out[key_push(key, conf.SCORE_F1)] = f1
        out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true,
                                                                 y_pred)
        if self.keep:
            out.update(result)
        return out
Example #3
0
 def reduce(self, store_results=True):
     """Bottom-up aggregation of the results stored under this node.

     A node without children returns ``self.load_state(name="results")``.
     Otherwise the children are reduced first; if the node has no reducer
     their results are returned as a ResultSet. With a reducer, results are
     grouped by their key stripped of its head, each group is stacked and
     passed to ``self.reducer.reduce``.

     Parameters
     ----------
     store_results: boolean
         Not used by this method body; children are always called with
         ``store_results=False``.

     Return
     ------
     The loaded leaf results, or a ResultSet.
     """
     # Leaf: nothing to aggregate, hand back the stored results
     if not self.children:
         return self.load_state(name="results")
     # Collect what every child produces
     result_set = ResultSet(
         *[child.reduce(store_results=False) for child in self.children])
     if not self.reducer:
         return result_set
     # The head of the key (fold/permutation number) is ignored when
     # grouping; an OrderedDict keeps the groups in running order.
     from collections import OrderedDict
     by_tail = OrderedDict()
     for result in result_set:
         _head, tail = key_pop(result["key"], index=0)
         result["key"] = tail
         by_tail.setdefault(result["key"], []).append(result)
     # Stack each group and let the reducer summarise it
     reduced = ResultSet()
     for members in by_tail.values():
         reduced.add(self.reducer.reduce(Result.stack(*members)))
     return reduced
Example #4
0
 def reduce(self, result_set):
     """Return the result with the highest accuracy among ``result_set``.

     Parameters
     ----------
     result_set: iterable of results
         Each result is the dictionary returned by "transform" plus a
         unique "key" entry, for example "MySVC(C=1.0)" or "MySVC(C=2.0)".

     Return
     ------
     A single Result carrying the key of the best input and its accuracy:
     "acc/y" when the result has no train/test split, otherwise
     "acc/y/test" and "acc/y/train". Inputs are ranked on the test
     accuracy when a split exists. On ties the first one met is kept.

     Raises
     ------
     ValueError
         If ``result_set`` is empty, so no best result can be selected.
     """
     from epac.utils import train_test_split
     best_result = None
     max_accuracy = -1
     for result in result_set:
         # train_test_split splits the result into two sub-dicts with the
         # /test or /train suffix removed. It returns two references to
         # the same dict if no such suffix was found.
         output = dict()
         result_train, result_test = train_test_split(result)
         if result_train is result_test:  # No CV in the EPAC workflow
             accuracy = accuracy_score(result['y/true'], result['y/pred'])
             output["acc/y"] = accuracy
         else:  # there was a CV in the EPAC workflow
             accuracy = accuracy_score(result_test['y/true'],
                                       result_test['y/pred'])
             output["acc/y/test"] = accuracy
             output["acc/y/train"] = accuracy_score(result_train['y/true'],
                                                    result_train['y/pred'])
         if accuracy > max_accuracy:
             # Remember the new best score so later, worse results cannot
             # replace the current best one.
             max_accuracy = accuracy
             # keep the key in the reduced result
             best_result = Result(key=result['key'], **output)
     if best_result is None:
         raise ValueError("Cannot select a best result: result_set is empty")
     return best_result  # reducer returns a single result
Example #5
0
    def reduce(self, store_results=True):
        """Bottom-up aggregation of the results stored under this node.

        A node without children returns ``self.load_results()``. Otherwise
        the children are reduced first; without a reducer their results are
        returned as a ResultSet. With a reducer and ``self.need_group_key``
        false, the reducer receives the whole ResultSet at once. Otherwise
        results are grouped by their key stripped of its head, each group
        is stacked and passed to ``self.reducer.reduce``.

        Parameters
        ----------
        store_results: boolean
            Not used by this method body; children are always called with
            ``store_results=False``.

        Return
        ------
        The loaded leaf results, or a ResultSet.
        """
        # Leaf: nothing to aggregate, hand back the stored results
        if not self.children:
            return self.load_results()
        # Collect what every child produces
        result_set = ResultSet(
            *[child.reduce(store_results=False) for child in self.children])
        if not self.reducer:
            return result_set

        if not self.need_group_key:
            # The reducer works on the full set, no grouping required
            ungrouped = ResultSet()
            ungrouped.add(self.reducer.reduce(result_set))
            return ungrouped

        # The head of the key (fold/permutation number) is ignored when
        # grouping; an OrderedDict keeps the groups in running order.
        from collections import OrderedDict
        by_tail = OrderedDict()
        for result in result_set:
            _head, tail = key_pop(result["key"], index=0)
            result["key"] = tail
            by_tail.setdefault(tail, []).append(result)
        # Stack each group and let the reducer summarise it
        reduced = ResultSet()
        for members in by_tail.values():
            reduced.add(self.reducer.reduce(Result.stack(*members)))
        return reduced
Example #6
0
 def reduce(self, result):
     """Compute, for each selected key, a p-value against stacked values.

     For every selected key the first entry of ``result[key]`` is taken as
     the reference value and stored under the same key. The remaining
     entries are stacked row-wise; the p-value is the proportion of rows
     strictly greater than the reference (computed along axis 0) and is
     stored under ``key_push(key, "pval")``.
     NOTE(review): the remaining entries are presumably scores obtained
     under permutation -- confirm against the caller.

     Parameters
     ----------
     result: dict-like
         Stacked result; keys are selected with ``self.select_regexp``
         when it is set, otherwise all keys are used.

     Return
     ------
     A Result; when ``self.keep`` is true the input entries are merged in.
     """
     if self.select_regexp:
         select_keys = [name for name in result
                        if re.search(self.select_regexp, str(name))]
     else:
         select_keys = result.keys()
     out = Result(key=result.key())
     for name in select_keys:
         stacked = result[name]
         reference = stacked[0]
         out[name] = reference
         others = np.vstack(stacked[1:])
         n_greater = np.sum(others > reference, axis=0).astype("float")
         out[key_push(name, "pval")] = n_greater / (others.shape[0])
     if self.keep:
         out.update(result)
     return out
Example #7
0
 def transform(self, **Xy):
     """Save the incoming data as a result and pass data downstream.

     The whole ``Xy`` dictionary is wrapped in a Result keyed by
     ``self.get_signature()`` and saved through ``self.save_results``. A
     ``StoreMem`` is installed as ``self.store`` first when no store is set.

     Return
     ------
     ``Xy`` unchanged when ``train_test_split`` returns the same object
     twice, otherwise only the train part.
     """
     train_part, test_part = train_test_split(Xy)
     record = Result(key=self.get_signature(), **Xy)
     if not self.store:
         self.store = StoreMem()
     self.save_results(ResultSet(record))
     # Identical objects mean the data carried no train/test split
     return Xy if train_part is test_part else train_part
Example #8
0
 def reduce(self, store_results=True):
     """Reduce the children, then re-run the selected node on stored data.

     Children are reduced into a ResultSet. Without a reducer that set is
     returned directly. Otherwise the reducer yields a node to refit and
     its best parameters; both are stored on ``self`` (``refited`` and
     ``best_params``), the refit node is run top-down on the data loaded
     from ``self.load_results()``, and its output, completed with the best
     parameters under ``conf.BEST_PARAMS``, is returned.

     Parameters
     ----------
     store_results: boolean
         Not used by this method body; children are always called with
         ``store_results=False``.

     Return
     ------
     A ResultSet.
     """
     results = ResultSet(
         *[child.reduce(store_results=False) for child in self.children])
     if not self.reducer:
         return results
     to_refit, best_params = self.reducer.reduce(results)
     Xy = self._results2dict(**self.load_results())
     self.refited = to_refit
     self.best_params = best_params
     out = self.refited.top_down(**Xy)
     out[conf.BEST_PARAMS] = best_params
     return ResultSet(Result(key=self.get_signature(), **out))
Example #9
0
 def reduce(self, result_set):
     """Compute the accuracy of every result in ``result_set``.

     Parameters
     ----------
     result_set: iterable of results
         Each result provides 'y/true' and 'y/pred' entries (possibly
         split into train/test parts) and a 'key' entry.

     Return
     ------
     A ResultSet with one Result per input, keyed like the input and
     holding "acc/y" when there is no train/test split, otherwise
     "acc/y/test" and "acc/y/train".
     """
     # Imports are kept inside the method so that the code can also be
     # executed remotely.
     from epac.utils import train_test_split
     from epac.map_reduce.results import ResultSet

     def accuracies(result):
         # Accuracy entries for one result, split-aware
         part_train, part_test = train_test_split(result)
         if part_train is part_test:
             return {"acc/y": accuracy_score(result['y/true'],
                                             result['y/pred'])}
         return {
             "acc/y/test": accuracy_score(part_test['y/true'],
                                          part_test['y/pred']),
             "acc/y/train": accuracy_score(part_train['y/true'],
                                           part_train['y/pred']),
         }

     collected = []  # one Result per input result
     for result in result_set:
         scores = accuracies(result)
         collected.append(Result(key=result['key'], **scores))
     return ResultSet(*collected)
Example #10
0
    def top_down(self, **Xy):
        """Top-down data processing method

            The node transforms the data with ``self.transform`` and, when
            it has children, recursively calls ``top_down`` on each child
            returned by ``self.get_children_top_down()``. A node without
            children wraps the transformed data in a Result and saves it
            with ``self.save_state`` under the name "result_set". A node
            without parent first calls ``self.initialization``.
            Most of time, it should be re-defined.

            Parameters
            ----------
            **Xy: dict
                the keyword dictionary of data-flow

            Return
            ------
            A dictionary of processed data: the transformed data for a
            node without children, the output of the child when there is
            exactly one, otherwise the list of the children outputs.
        """
        if conf.TRACE_TOPDOWN:
            # Call form of print: valid on both Python 2 and Python 3
            print(self.get_key())
        if debug.DEBUG:
            debug.current = self
            debug.Xy = Xy
        if not self.parent:
            self.initialization(**Xy)  # Perform some initialization
        Xy = self.transform(**Xy)
        if self.children:
            # Call children top_down down to leaves
            ret = [child.top_down(**Xy)
                   for child in self.get_children_top_down()]
            Xy = ret[0] if len(ret) == 1 else ret
        else:
            result = Result(key=self.get_signature(), **Xy)
            self.save_state(ResultSet(result), name="result_set")
        return Xy
Example #11
0
    def top_down(self, **Xy):
        """Top-down data processing method

        The node transforms the data with ``self.transform``. Unless
        ``self.stop_top_down`` is set, it then either recursively calls
        ``top_down`` on each child returned by
        ``self.get_children_top_down()`` or, when it has no children, saves
        the transformed data as a Result through ``self.save_results``.
        A node without parent first calls ``self.initialization``.
        Most of time, it should be re-defined.

        Parameters
        ----------
        **Xy: dict
            the keyword dictionary of data-flow

        Return
        ------
        A dictionary of processed data: the transformed data when the
        descent is stopped or the node has no children, the output of the
        child when there is exactly one, otherwise the list of the
        children outputs.

        Example
        -------
        >>> from epac import Methods
        >>> from sklearn.svm import SVC
        >>> from sklearn import datasets
        >>> X, y = datasets.make_classification(n_samples=12,
        ...                                     n_features=10,
        ...                                     n_informative=2,
        ...                                     random_state=1)
        >>> methods = Methods(*[SVC(C=1), SVC(C=2)])
        >>> methods.top_down(X=X, y=y)
        [{'y/true': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1]), 'y/pred': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1])}, {'y/true': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1]), 'y/pred': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1])}]

        """
        if conf.TRACE_TOPDOWN:
            print(self.get_key())
        if debug.DEBUG:
            debug.current = self
            debug.Xy = Xy
        if not self.parent:
            # A node without parent prepares itself before transforming
            self.initialization(**Xy)
        outputs = self.transform(**Xy)

        # Descent explicitly stopped at this node
        if self.stop_top_down:
            return outputs

        # No children: record the transformed data and return it
        if not self.children:
            leaf_result = Result(key=self.get_signature(), **outputs)
            self.save_results(ResultSet(leaf_result))
            return outputs

        # Propagate the transformed data down to every child
        child_outputs = [
            child.top_down(**outputs)
            for child in self.get_children_top_down()
        ]
        if len(child_outputs) == 1:
            return child_outputs[0]
        return child_outputs