Example #1
def train_test_split(Xy):
    """Split Xy into two dictonaries. If input dictonnary whas not build
    with train_test_merge(Xy1, Xy2) then return twice the input
    dictonnary.

    Parameters
    ----------
    Xy: dict

    Returns
    -------
    dict1, dict2 : split dictionaries

    Examples
    --------
    >>> train_test_merged = train_test_merge(dict(a=1, b=2), dict(a=33, b=44, c=55))
    >>> print(train_test_merged)
    {'c/test': 55, 'a/test': 33, 'b/test': 44, 'a/train': 1, 'b/train': 2}
    >>> print(train_test_split(train_test_merged))
    ({'a': 1, 'b': 2}, {'a': 33, 'c': 55, 'b': 44})
    >>> print(train_test_split(dict(a=1, b=2)))
    ({'a': 1, 'b': 2}, {'a': 1, 'b': 2})
    """
    keys_train = [k for k in Xy if (key_pop(k)[1] == conf.TRAIN)]
    keys_test = [k for k in Xy if (key_pop(k)[1] == conf.TEST)]
    if not keys_train and not keys_test:
        return Xy, Xy
    if keys_train and keys_test:
        Xy_train = {key_pop(k)[0]: Xy[k] for k in keys_train}
        Xy_test = {key_pop(k)[0]: Xy[k] for k in keys_test}
        return Xy_train, Xy_test
    raise KeyError("data-flow could not be splitted")
Example #2
def train_test_split(Xy):
    """Split Xy into two dictonaries. If input dictonnary whas not build
    with train_test_merge(Xy1, Xy2) then return twice the input
    dictonnary.

    Parameters
    ----------
    Xy: dict

    Returns
    -------
    dict1, dict2 : split dictionaries

    Examples
    --------
    >>> train_test_merged = train_test_merge(dict(a=1, b=2), dict(a=33, b=44, c=55))
    >>> print(train_test_merged)
    {'c/test': 55, 'a/test': 33, 'b/test': 44, 'a/train': 1, 'b/train': 2}
    >>> print(train_test_split(train_test_merged))
    ({'a': 1, 'b': 2}, {'a': 33, 'c': 55, 'b': 44})
    >>> print(train_test_split(dict(a=1, b=2)))
    ({'a': 1, 'b': 2}, {'a': 1, 'b': 2})
    """
    keys_train = [k for k in Xy if (key_pop(k)[1] == conf.TRAIN)]
    keys_test = [k for k in Xy if (key_pop(k)[1] == conf.TEST)]
    if not keys_train and not keys_test:
        return Xy, Xy
    if keys_train and keys_test:
        Xy_train = {key_pop(k)[0]: Xy[k] for k in keys_train}
        Xy_test = {key_pop(k)[0]: Xy[k] for k in keys_test}
        return Xy_train, Xy_test
    raise KeyError("data-flow could not be splitted")
Example #3
    def reduce(self, store_results=True):
        # Termination (leaf) node: return results
        if not self.children:
            return self.load_state(name="results")
        # 1) Build sub-aggregates over children
        children_results = [child.reduce(store_results=False)
                            for child in self.children]
        result_set = ResultSet(*children_results)
        if not self.reducer:
            return result_set
        # Group by key, disregarding the fold/permutation number,
        # which is the head of the key.
        # Use OrderedDict to preserve running order.
        from collections import OrderedDict
        groups = OrderedDict()
        for result in result_set:
            # Remove the head of the key
            _, key_tail = key_pop(result["key"], index=0)
            result["key"] = key_tail
            if key_tail not in groups:
                groups[key_tail] = list()
            groups[key_tail].append(result)
        # For each key, stack results
        reduced = ResultSet()
        for key in groups:
            result_stacked = Result.stack(*groups[key])
            reduced.add(self.reducer.reduce(result_stacked))
        return reduced
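The grouping step in this reduce() can be shown in isolation: strip the leading fold/permutation index from each key, then bucket results that now share the same tail. The result dicts and keys below are illustrative stand-ins, not the epac Result/ResultSet API:

from collections import OrderedDict

results = [{"key": "CV(0)/SVC/pred", "val": 1},
           {"key": "CV(1)/SVC/pred", "val": 2},
           {"key": "CV(0)/LDA/pred", "val": 3}]

groups = OrderedDict()
for result in results:
    _, key_tail = result["key"].split("/", 1)  # drop the fold head
    result["key"] = key_tail
    groups.setdefault(key_tail, []).append(result)

print(list(groups))             # ['SVC/pred', 'LDA/pred']
print(len(groups["SVC/pred"]))  # 2 folds gathered under the same key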
Example #4
    def reduce(self, result):
        if self.select_regexp:
            inputs = [key3 for key3 in result
                      if re.search(self.select_regexp, str(key3))]
        else:
            inputs = result.keys()
        if len(inputs) != 2:
            raise KeyError("Need to find exactly two results to compute a "
                           "score. Found %i: %s" % (len(inputs), inputs))
        key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
        key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
        y_true = result[key_true]
        y_pred = result[key_pred]
        try:  # If list of arrays (CV, LOO, etc.), concatenate them
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
        except ValueError:
            pass
        out = Result(key=result["key"])
        p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                      average=None)
        key, _ = key_pop(key_pred, -1)
        out[key_push(key, conf.SCORE_PRECISION)] = p
        out[key_push(key, conf.SCORE_RECALL)] = r
        out[key_push(key, conf.SCORE_RECALL_MEAN)] = r.mean()
        out[key_push(key, conf.SCORE_F1)] = f1
        out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true, y_pred)
        if self.keep:
            out.update(result)
        return out
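The scoring itself is plain scikit-learn. A minimal sketch of the metric calls used above, with two made-up CV folds concatenated the same way:

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = np.concatenate([[0, 1, 1], [0, 0, 1]])  # two folds, stacked
y_pred = np.concatenate([[0, 1, 0], [0, 1, 1]])
p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, average=None)
print(r, r.mean())                     # per-class recall and its mean
print(accuracy_score(y_true, y_pred))  # 4 of 6 correct -> ~0.67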
Example #5
    def reduce(self, store_results=True):
        # Termination (leaf) node: return results
        if not self.children:
            return self.load_results()
        # 1) Build sub-aggregates over children
        children_results = [
            child.reduce(store_results=False) for child in self.children
        ]
        result_set = ResultSet(*children_results)
        if not self.reducer:
            return result_set

        if not self.need_group_key:
            reduced = ResultSet()
            reduced.add(self.reducer.reduce(result_set))
            return reduced

        # Group by key, disregarding the fold/permutation number,
        # which is the head of the key.
        # Use OrderedDict to preserve running order.
        from collections import OrderedDict
        groups = OrderedDict()
        for result in result_set:
            # remove the head of the key
            _, key_tail = key_pop(result["key"], index=0)
            result["key"] = key_tail
            if key_tail not in groups:
                groups[key_tail] = list()
            groups[key_tail].append(result)
        # For each key, stack results
        reduced = ResultSet()
        for key in groups:
            result_stacked = Result.stack(*groups[key])
            reduced.add(self.reducer.reduce(result_stacked))
        return reduced
Example #6
    def reduce(self, result):
        if self.select_regexp:
            inputs = [key3 for key3 in result
                      if re.search(self.select_regexp, str(key3))]
        else:
            inputs = result.keys()
        if len(inputs) != 2:
            raise KeyError("Need to find exactly two results to compute a "
                           "score. Found %i: %s" % (len(inputs), inputs))
        key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
        key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
        y_true = result[key_true]
        y_pred = result[key_pred]
        try:  # If list of arrays (CV, LOO, etc.) concatenate them
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
        except ValueError:
            pass
        out = Result(key=result["key"])
        p, r, f1, s = precision_recall_fscore_support(y_true,
                                                      y_pred,
                                                      average=None)

        # Compute p-value of recall for each class
        def recall_test(recall, n_trials, apriori_p):
            n_success = recall * n_trials
            pval = binom_test(n_success, n=n_trials, p=apriori_p)
            if recall > apriori_p:
                return (pval / 2)
            else:
                return 1 - (pval / 2)

        n_classes = len(s)  # Number of classes
        n_obs = len(y_true)
        prior_p = s.astype(float) / s.sum()  # A priori probability of each class
        r_pvalues = np.zeros_like(r)
        for class_index in range(n_classes):
            n_trials = s[class_index]
            r_pvalues[class_index] = recall_test(r[class_index],
                                                 n_trials,
                                                 prior_p[class_index])

        # Compute p-value of mean recall
        mean_r = r.mean()
        mean_r_pvalue = binom_test(int(mean_r * n_obs), n=n_obs, p=.5)

        key, _ = key_pop(key_pred, -1)
        out[key_push(key, conf.SCORE_PRECISION)] = p
        out[key_push(key, conf.SCORE_RECALL)] = r
        out[key_push(key, conf.SCORE_RECALL_PVALUES)] = r_pvalues
        out[key_push(key, conf.SCORE_RECALL_MEAN)] = mean_r
        out[key_push(key, conf.SCORE_RECALL_MEAN_PVALUE)] = mean_r_pvalue
        out[key_push(key, conf.SCORE_F1)] = f1
        out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true,
                                                                 y_pred)
        if self.keep:
            out.update(result)
        return out
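The recall test above, in isolation: binom_test returns a two-sided p-value, which the helper halves to get a one-sided value in the observed direction. Note that scipy.stats.binom_test has been removed from recent SciPy; this sketch uses its replacement, scipy.stats.binomtest, with illustrative numbers:

from scipy.stats import binomtest

def recall_test(recall, n_trials, apriori_p):
    n_success = int(round(recall * n_trials))
    pval = binomtest(n_success, n=n_trials, p=apriori_p).pvalue  # two-sided
    # Halve for a one-sided p-value in the direction of the effect.
    return pval / 2 if recall > apriori_p else 1 - pval / 2

print(recall_test(0.8, 50, 0.5))  # far below .05: recall beats chance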
Example #7
    def reduce(self, result):
        diff_perm_nbs = self.get_diff_perm_nbs(result)
        max_r2 = {}
        for perm_nb in diff_perm_nbs:
            max_r2[perm_nb] = self.get_max_r2_with_perm_nb(result, perm_nb)
        r2_no_perms = max_r2[0]
        count = 0
        for i in max_r2:
            if i == 0:
                continue
            if r2_no_perms < max_r2[i]:
                count += 1
        p_value = float(count) / float(len(max_r2))
        # dict views are not indexable in Python 3; materialize the keys
        _, res_key = key_pop(list(result.keys())[0], index=-1)
        out = Result(key=res_key)
        out["pval"] = p_value
        return out
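This permutation p-value reduces to counting how many permuted runs (perm_nb != 0) beat the unpermuted best R² (perm_nb == 0), divided by the total number of runs, mirroring the divisor above. A self-contained sketch with made-up scores:

max_r2 = {0: 0.42, 1: 0.10, 2: 0.55, 3: 0.30, 4: 0.48}  # perm_nb -> best R²
count = sum(1 for perm_nb in max_r2
            if perm_nb != 0 and max_r2[0] < max_r2[perm_nb])
p_value = float(count) / float(len(max_r2))
print(p_value)  # 2 scores beat the unpermuted one, over 5 runs -> 0.4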
Example #8
    def load(self, key=""):
        """Load everything that is prefixed with key.

        Parameters
        ----------
        key: str
            If key points to a file (without the extension), return the
            file. If key points to a directory, return a dictionary whose
            values are the objects corresponding to all files found in all
            sub-directories, indexed by their keys. If key is an empty
            string, assume dirpath is a tree root.

        See Also
        --------
        BaseNode.save()
        """
        from epac.configuration import conf
        from epac.workflow.base import key_pop
        path = os.path.join(self.dirpath, key)
        if os.path.isfile(path + conf.STORE_FS_PICKLE_SUFFIX):
            return self.load_pickle(path + conf.STORE_FS_PICKLE_SUFFIX)
        if os.path.isfile(path + conf.STORE_FS_JSON_SUFFIX):
            return self.load_json(path + conf.STORE_FS_JSON_SUFFIX)
        if os.path.isdir(path):
            filepaths = []
            for base, dirs, files in os.walk(self.dirpath):
                for basename in files:
                    filepaths.append(os.path.join(base, basename))
            loaded = dict()
            dirpath = os.path.join(self.dirpath, "")
            for filepath in filepaths:
                _, ext = os.path.splitext(filepath)
                if ext == conf.STORE_FS_JSON_SUFFIX:
                    key1 = filepath.replace(dirpath, "").\
                        replace(conf.STORE_FS_JSON_SUFFIX, "")
                    obj = self.load_json(filepath)
                    loaded[key1] = obj
                elif ext == conf.STORE_FS_PICKLE_SUFFIX:
                    key1 = filepath.replace(dirpath, "").\
                        replace(conf.STORE_FS_PICKLE_SUFFIX, "")
                    loaded[key1] = self.load_pickle(filepath)
                else:
                    raise IOError('File %s has an unknown extension: %s' %
                                  (filepath, ext))
            if key == "":  # No key provided assume a whole tree to load
                tree = loaded.pop(conf.STORE_EXECUTION_TREE_PREFIX)
                for key1 in loaded:
                    key, attrname = key_pop(key1)
                    if attrname != conf.STORE_STORE_PREFIX:
                        raise ValueError('Do not know what to do with %s'
                                         % key1)
                    node = tree.get_node(key)
                    if not node.store:
                        node.store = loaded[key1]
                    else:
                        keys_local = node.store.dict.keys()
                        keys_disk = loaded[key1].dict.keys()
                        if set(keys_local).intersection(set(keys_disk)):
                            raise KeyError("Merge store with same keys")
                        node.store.dict.update(loaded[key1].dict)
                loaded = tree
            return loaded
Example #9
    def load(self, key=""):
        """Load everything that is prefixed with key.

        Parameters
        ----------
        key: str
            If key points to a file (without the extension), return the
            file. If key points to a directory, return a dictionary whose
            values are the objects corresponding to all files found in all
            sub-directories, indexed by their keys. If key is an empty
            string, assume dirpath is a tree root.

        See Also
        --------
        BaseNode.save()
        """
        from epac.configuration import conf
        from epac.workflow.base import key_pop
        path = os.path.join(self.dirpath, key)
        if os.path.isfile(path + conf.STORE_FS_PICKLE_SUFFIX):
            return self.load_pickle(path + conf.STORE_FS_PICKLE_SUFFIX)
        if os.path.isfile(path + conf.STORE_FS_JSON_SUFFIX):
            return self.load_json(path + conf.STORE_FS_JSON_SUFFIX)
        if os.path.isdir(path):
            filepaths = []
            for base, dirs, files in os.walk(self.dirpath):
                for basename in files:
                    filepaths.append(os.path.join(base, basename))
            loaded = dict()
            dirpath = os.path.join(self.dirpath, "")
            for filepath in filepaths:
                _, ext = os.path.splitext(filepath)
                if ext == conf.STORE_FS_JSON_SUFFIX:
                    key1 = filepath.replace(dirpath, "").\
                        replace(conf.STORE_FS_JSON_SUFFIX, "")
                    obj = self.load_json(filepath)
                    loaded[key1] = obj
                elif ext == conf.STORE_FS_PICKLE_SUFFIX:
                    key1 = filepath.replace(dirpath, "").\
                        replace(conf.STORE_FS_PICKLE_SUFFIX, "")
                    loaded[key1] = self.load_pickle(filepath)
                else:
                    raise IOError('File %s has an unknown extension: %s' %
                                  (filepath, ext))
            if key == "":  # No key provided assume a whole tree to load
                tree = loaded.pop(conf.STORE_EXECUTION_TREE_PREFIX)
                for key1 in loaded:
                    key, attrname = key_pop(key1)
                    if attrname != conf.STORE_STORE_PREFIX:
                        raise ValueError('Do not know what to do with %s'
                                         % key1)
                    node = tree.get_node(key)
                    if not node.store:
                        node.store = loaded[key1]
                    else:
                        keys_local = node.store.dict.keys()
                        keys_disk = loaded[key1].dict.keys()
                        if set(keys_local).intersection(set(keys_disk)):
                            raise KeyError("Merge store with same keys")
                        node.store.dict.update(loaded[key1].dict)
                loaded = tree
            return loaded
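The suffix dispatch at the top of load() can be exercised on its own. A minimal sketch, where load_by_suffix is a hypothetical helper and '.pkl'/'.json' are assumed values of conf.STORE_FS_PICKLE_SUFFIX and conf.STORE_FS_JSON_SUFFIX:

import json
import os
import pickle

PICKLE_SUFFIX, JSON_SUFFIX = ".pkl", ".json"  # assumed conf values

def load_by_suffix(path):
    # Prefer the pickle file if both exist, mirroring the order above.
    if os.path.isfile(path + PICKLE_SUFFIX):
        with open(path + PICKLE_SUFFIX, "rb") as f:
            return pickle.load(f)
    if os.path.isfile(path + JSON_SUFFIX):
        with open(path + JSON_SUFFIX) as f:
            return json.load(f)
    raise IOError("No stored object found for prefix %s" % path)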