def reduce(self, result):
    """Turn one true/predicted pair stored in ``result`` into scores.

    Candidate entries are the keys of ``result``, restricted to those
    matching ``self.select_regexp`` when that attribute is set. Among them,
    the entry whose name contains ``conf.TRUE`` is used as the true values
    and the one whose name contains ``conf.PREDICTION`` as the predictions.

    Parameters
    ----------
    result: dict-like
        Holds a "key" entry plus the true and predicted values.

    Return
    ------
    A ``Result`` holding the per-class precision, recall and F1
    (``average=None``), the mean recall and the accuracy. When ``self.keep``
    is true, the entries of ``result`` are copied into it as well.

    Raises
    ------
    KeyError
        If the number of candidate entries is not exactly two.
    """
    if self.select_regexp:
        candidates = [name for name in result
                      if re.search(self.select_regexp, str(name))]
    else:
        candidates = result.keys()
    n_candidates = len(candidates)
    if n_candidates != 2:
        raise KeyError("Need to find exactly two results to compute a score. "
                       "Found %i: %s" % (n_candidates, candidates))

    key_true = [name for name in candidates
                if name.find(conf.TRUE) >= 0][0]
    key_pred = [name for name in candidates
                if name.find(conf.PREDICTION) >= 0][0]
    y_true, y_pred = result[key_true], result[key_pred]

    # Lists of arrays (CV, LOO, etc.) are flattened; values that cannot be
    # concatenated are used as they are.
    try:
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
    except ValueError:
        pass

    out = Result(key=result["key"])
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average=None)

    # Output names are built from the prediction key without its last item.
    prefix, _ = key_pop(key_pred, -1)
    scores = (
        (conf.SCORE_PRECISION, precision),
        (conf.SCORE_RECALL, recall),
        (conf.SCORE_RECALL_MEAN, recall.mean()),
        (conf.SCORE_F1, f1),
        (conf.SCORE_ACCURACY, accuracy_score(y_true, y_pred)),
    )
    for suffix, value in scores:
        out[key_push(prefix, suffix)] = value

    if self.keep:
        out.update(result)
    return out
def reduce(self, result):
    """Turn one true/predicted pair stored in ``result`` into scores.

    Candidate entries are the keys of ``result``, restricted to those
    matching ``self.select_regexp`` when that attribute is set. Among them,
    the entry whose name contains ``conf.TRUE`` is used as the true values
    and the one whose name contains ``conf.PREDICTION`` as the predictions.

    Parameters
    ----------
    result: dict-like
        Holds a "key" entry plus the true and predicted values.

    Return
    ------
    A ``Result`` holding the per-class precision, recall and F1
    (``average=None``), a p-value for each per-class recall, the mean recall
    with its p-value, and the accuracy. When ``self.keep`` is true, the
    entries of ``result`` are copied into it as well.

    Raises
    ------
    KeyError
        If the number of candidate entries is not exactly two.
    """
    if self.select_regexp:
        inputs = [key3 for key3 in result
                  if re.search(self.select_regexp, str(key3))]
    else:
        inputs = result.keys()
    if len(inputs) != 2:
        raise KeyError("Need to find exactly two results to compute a "
                       "score. Found %i: %s" % (len(inputs), inputs))
    key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
    key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
    y_true = result[key_true]
    y_pred = result[key_pred]
    try:  # If list of arrays (CV, LOO, etc.) concatenate them
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
    except ValueError:
        pass
    out = Result(key=result["key"])
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  average=None)

    def recall_test(recall, n_trials, apriori_p):
        """Return a p-value for ``recall`` against ``apriori_p``.

        The value returned by ``binom_test`` is halved when the recall is
        above ``apriori_p``; otherwise one minus that half is returned.
        """
        # Round rather than pass the raw float product: a product landing
        # just below an integer would otherwise lose one success if it is
        # truncated to an integer downstream.
        # NOTE(review): assumes recall * n_trials is meant to be a whole
        # number of correctly recalled samples - confirm.
        n_success = int(round(recall * n_trials))
        pval = binom_test(n_success, n=n_trials, p=apriori_p)
        if recall > apriori_p:
            return pval / 2
        return 1 - (pval / 2)

    n_obs = len(y_true)
    # A priori probability of each class: its share of the support counts.
    # The builtin ``float`` replaces the ``np.float`` alias, which recent
    # NumPy releases no longer provide.
    prior_p = s.astype(float) / s.sum()

    # Compute p-value of recall for each class
    r_pvalues = np.zeros_like(r)
    for class_index, n_trials in enumerate(s):
        r_pvalues[class_index] = recall_test(r[class_index], n_trials,
                                             prior_p[class_index])

    # Compute p-value of mean recall
    mean_r = r.mean()
    mean_r_pvalue = binom_test(int(mean_r * n_obs), n=n_obs, p=.5)

    key, _ = key_pop(key_pred, -1)
    out[key_push(key, conf.SCORE_PRECISION)] = p
    out[key_push(key, conf.SCORE_RECALL)] = r
    out[key_push(key, conf.SCORE_RECALL_PVALUES)] = r_pvalues
    out[key_push(key, conf.SCORE_RECALL_MEAN)] = mean_r
    out[key_push(key, conf.SCORE_RECALL_MEAN_PVALUE)] = mean_r_pvalue
    out[key_push(key, conf.SCORE_F1)] = f1
    out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true, y_pred)
    if self.keep:
        out.update(result)
    return out
def reduce(self, store_results=True):
    """Bottom-up aggregation of the results stored below this node.

    A node without children returns its state saved under the name
    "results". Any other node collects what its children's ``reduce``
    return into a ``ResultSet``. Without a reducer that set is returned as
    is; with one, results are grouped by their key stripped of its head
    item, each group is stacked with ``Result.stack`` and passed to
    ``self.reducer.reduce``.

    Parameters
    ----------
    store_results: boolean
        Accepted for interface compatibility; this method does not read it.
        Children are always called with ``store_results=False``.

    Return
    ------
    The loaded state for a leaf, otherwise a ``ResultSet``.
    """
    # Leaf node: simply hand back what was stored.
    if not self.children:
        return self.load_state(name="results")

    # 1) Collect the sub-aggregates of the children
    gathered = ResultSet(*[node.reduce(store_results=False)
                           for node in self.children])
    if not self.reducer:
        return gathered

    # 2) Group results sharing the same key tail (the key without its head
    # item), keeping the order in which the tails were first met.
    from collections import OrderedDict
    by_tail = OrderedDict()
    for item in gathered:
        _, tail = key_pop(item["key"], index=0)
        item["key"] = tail
        # Group on the key as it is now stored in the result.
        tail = item["key"]
        by_tail.setdefault(tail, []).append(item)

    # 3) Stack every group and reduce it
    merged = ResultSet()
    for members in by_tail.values():
        merged.add(self.reducer.reduce(Result.stack(*members)))
    return merged
def reduce(self, result_set):
    """Keep only the result with the highest accuracy.

    Each item of ``result_set`` is passed to ``train_test_split``. When the
    two returned objects are the same object, the accuracy is computed from
    the item's own 'y/true' and 'y/pred' entries and stored as "acc/y".
    Otherwise the test accuracy is stored as "acc/y/test" and the train
    accuracy as "acc/y/train", and the test accuracy is the one compared.

    Parameters
    ----------
    result_set: iterable of dict-like
        Each item holds a "key" entry plus 'y/true' and 'y/pred' entries,
        either directly or in the sub-dictionaries returned by
        ``train_test_split``.

    Return
    ------
    A single ``Result`` built from the key and accuracies of the best item.
    On ties the first item met is kept.

    Raises
    ------
    ValueError
        If no item could be selected, e.g. ``result_set`` is empty.
    """
    # Imported here, as in the original, rather than at module level.
    from epac.utils import train_test_split

    best_result = None
    max_accuracy = -1
    for result in result_set:
        output = dict()  # accuracies reported for this result
        result_train, result_test = train_test_split(result)
        if result_train is result_test:
            # Same object returned twice: score the result itself.
            accuracy = accuracy_score(result['y/true'], result['y/pred'])
            output["acc/y"] = accuracy
        else:
            # Distinct train and test parts: rank on the test accuracy.
            accuracy = accuracy_score(result_test['y/true'],
                                      result_test['y/pred'])
            output["acc/y/test"] = accuracy
            output["acc/y/train"] = accuracy_score(result_train['y/true'],
                                                   result_train['y/pred'])
        if accuracy > max_accuracy:
            # Remember the new maximum, otherwise every later result would
            # replace the current best one.
            max_accuracy = accuracy
            best_result = Result(key=result['key'], **output)
    if best_result is None:
        raise ValueError("No result could be selected: result_set is empty "
                         "or holds no comparable accuracy")
    return best_result  # the reducer returns a single result
def reduce(self, store_results=True):
    """Bottom-up aggregation of the results stored below this node.

    A node without children returns ``self.load_results()``. Any other node
    collects what its children's ``reduce`` return into a ``ResultSet``,
    then:

    - without a reducer, returns that set unchanged;
    - with a reducer and a false ``self.need_group_key``, reduces the whole
      set in one call;
    - otherwise groups results by their key stripped of its head item,
      stacks each group with ``Result.stack`` and reduces it.

    Parameters
    ----------
    store_results: boolean
        Accepted for interface compatibility; this method does not read it.
        Children are always called with ``store_results=False``.

    Return
    ------
    What ``load_results`` gives for a leaf, otherwise a ``ResultSet``.
    """
    # Leaf node: simply hand back what was stored.
    if not self.children:
        return self.load_results()

    # 1) Collect the sub-aggregates of the children
    gathered = ResultSet(*[node.reduce(store_results=False)
                           for node in self.children])
    if not self.reducer:
        return gathered

    merged = ResultSet()
    if not self.need_group_key:
        # No grouping requested: the reducer receives the full set at once.
        merged.add(self.reducer.reduce(gathered))
        return merged

    # 2) Group results sharing the same key tail (the key without its head
    # item), keeping the order in which the tails were first met.
    from collections import OrderedDict
    by_tail = OrderedDict()
    for item in gathered:
        _, tail = key_pop(item["key"], index=0)
        item["key"] = tail
        by_tail.setdefault(tail, []).append(item)

    # 3) Stack every group and reduce it
    for members in by_tail.values():
        merged.add(self.reducer.reduce(Result.stack(*members)))
    return merged
def reduce(self, result):
    """Compare the first value of each selected entry to the remaining ones.

    For every selected key, the first element of ``result[key]`` is kept
    under the same key, and the fraction of the remaining elements that are
    strictly greater than it (computed along axis 0 after stacking them) is
    stored under ``key_push(key, "pval")``.
    NOTE(review): the remaining elements are presumably scores obtained on
    randomized data - confirm against the caller.

    Parameters
    ----------
    result: dict-like
        Provides ``key()`` and, for each selected key, an indexable
        sequence of values. Keys are filtered with ``self.select_regexp``
        when that attribute is set; otherwise all keys are used.

    Return
    ------
    A ``Result`` with the values described above. When ``self.keep`` is
    true, the entries of ``result`` are copied into it afterwards.
    """
    if self.select_regexp:
        names = [name for name in result
                 if re.search(self.select_regexp, str(name))]
    else:
        names = result.keys()

    out = Result(key=result.key())
    for name in names:
        reference = result[name][0]
        out[name] = reference
        others = np.vstack(result[name][1:])
        n_greater = np.sum(others > reference, axis=0).astype("float")
        out[key_push(name, "pval")] = n_greater / (others.shape[0])

    if self.keep:
        out.update(result)
    return out
def transform(self, **Xy):
    """Save the incoming data as a result and pass the data downstream.

    The keyword arguments are wrapped in a ``Result`` keyed by
    ``self.get_signature()`` and saved with ``self.save_results``. A
    ``StoreMem`` is created first when the node has no store yet.

    Parameters
    ----------
    **Xy: dict
        the keyword dictionary of data-flow

    Return
    ------
    ``Xy`` itself when ``train_test_split`` returns the same object twice,
    otherwise the first (train) part of the split.
    """
    train_part, test_part = train_test_split(Xy)
    snapshot = Result(key=self.get_signature(), **Xy)
    if not self.store:
        self.store = StoreMem()
    self.save_results(ResultSet(snapshot))
    return Xy if train_part is test_part else train_part
def reduce(self, store_results=True):
    """Reduce the children, then re-run what the reducer selected.

    The children's reduced results are collected into a ``ResultSet``.
    Without a reducer that set is returned. With one, the reducer returns
    an object to refit and its best parameters; both are stored on the node
    (``self.refited``, ``self.best_params``), the object's ``top_down`` is
    run on the node's own stored results, and its output, completed with
    the best parameters under ``conf.BEST_PARAMS``, is returned.

    Parameters
    ----------
    store_results: boolean
        Accepted for interface compatibility; this method does not read it.
        Children are always called with ``store_results=False``.

    Return
    ------
    A ``ResultSet``: the children's results when there is no reducer,
    otherwise a set holding the single refit result.
    """
    results = ResultSet(*[child.reduce(store_results=False)
                          for child in self.children])
    if not self.reducer:
        return results

    to_refit, best_params = self.reducer.reduce(results)
    # Data previously saved on this node, converted to keyword arguments.
    Xy = self._results2dict(**self.load_results())
    self.refited = to_refit
    self.best_params = best_params

    out = self.refited.top_down(**Xy)
    out[conf.BEST_PARAMS] = best_params
    return ResultSet(Result(key=self.get_signature(), **out))
def reduce(self, result_set):
    """Replace every result of ``result_set`` by its accuracy score(s).

    Each item is passed to ``train_test_split``. When the two returned
    objects are the same object, a single accuracy is computed from the
    item's own 'y/true' and 'y/pred' entries and stored as "acc/y".
    Otherwise the test and train accuracies are stored as "acc/y/test" and
    "acc/y/train".

    Parameters
    ----------
    result_set: iterable of dict-like
        Each item holds a "key" entry plus 'y/true' and 'y/pred' entries,
        either directly or in the sub-dictionaries returned by
        ``train_test_split``.

    Return
    ------
    A ``ResultSet`` with one ``Result`` per input item, carrying the item's
    key and its accuracies.
    """
    # Imports are kept inside the method so that the code can be executed
    # remotely.
    from epac.utils import train_test_split
    from epac.map_reduce.results import ResultSet

    def _accuracy(part):
        return accuracy_score(part['y/true'], part['y/pred'])

    outputs = []
    for result in result_set:
        result_train, result_test = train_test_split(result)
        if result_train is result_test:
            output = {"acc/y": _accuracy(result)}
        else:
            output = {"acc/y/test": _accuracy(result_test),
                      "acc/y/train": _accuracy(result_train)}
        outputs.append(Result(key=result['key'], **output))
    return ResultSet(*outputs)
def top_down(self, **Xy):
    """Top-down data processing method

    Apply ``transform`` on the current node, then recursively call
    ``top_down`` on the children returned by ``get_children_top_down``.
    A node without children saves the transformed data as a ``Result``
    under the state name "result_set". A node without parent first calls
    ``initialization`` with the incoming data.

    Parameters
    ----------
    **Xy: dict
        the keyword dictionary of data-flow

    Return
    ------
    For a node without children, the dictionary returned by ``transform``.
    Otherwise the output of the single child, or the list of the
    children's outputs when there is more than one.
    """
    if conf.TRACE_TOPDOWN:
        # Function-call form: valid with a single argument on Python 2 and
        # on Python 3, where the print statement is a syntax error.
        print(self.get_key())
    if debug.DEBUG:
        # Expose the current node and its input for inspection.
        debug.current = self
        debug.Xy = Xy
    if not self.parent:
        self.initialization(**Xy)  # Perform some initialization
    Xy = self.transform(**Xy)
    if self.children:
        # Call children top_down down to leaves
        ret = [child.top_down(**Xy)
               for child in self.get_children_top_down()]
        Xy = ret[0] if len(ret) == 1 else ret
    else:
        result = Result(key=self.get_signature(), **Xy)
        self.save_state(ResultSet(result), name="result_set")
    return Xy
def top_down(self, **Xy):
    """Top-down data processing method

    Apply ``transform`` on the current node and, unless
    ``self.stop_top_down`` is set, continue downwards: a node with children
    calls ``top_down`` on each child returned by
    ``get_children_top_down``, while a node without children saves the
    transformed data as a ``Result`` with ``save_results``. A node without
    parent first calls ``initialization`` with the incoming data.

    Parameters
    ----------
    **Xy: dict
        the keyword dictionary of data-flow

    Return
    ------
    The dictionary returned by ``transform`` when the descent stops at this
    node (no children, or ``stop_top_down`` set). Otherwise the output of
    the single child, or the list of the children's outputs when there is
    more than one.

    Example
    -------
    >>> from epac import Methods
    >>> from sklearn.svm import SVC
    >>> from sklearn import datasets
    >>> X, y = datasets.make_classification(n_samples=12,
    ...                                     n_features=10,
    ...                                     n_informative=2,
    ...                                     random_state=1)
    >>> methods = Methods(*[SVC(C=1), SVC(C=2)])
    >>> methods.top_down(X=X, y=y)
    [{'y/true': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1]), 'y/pred': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1])}, {'y/true': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1]), 'y/pred': array([1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1])}]
    """
    if conf.TRACE_TOPDOWN:
        print(self.get_key())
    if debug.DEBUG:
        # Expose the current node and its input for inspection.
        debug.current = self
        debug.Xy = Xy
    if not self.parent:
        # Root node: perform some initialization before transforming.
        self.initialization(**Xy)

    Xy = self.transform(**Xy)
    if self.stop_top_down:
        return Xy

    if not self.children:
        # Leaf: keep a copy of the output in the node's results.
        leaf_result = Result(key=self.get_signature(), **Xy)
        self.save_results(ResultSet(leaf_result))
        return Xy

    # Propagate the transformed data down to the leaves.
    outputs = [child.top_down(**Xy)
               for child in self.get_children_top_down()]
    return outputs[0] if len(outputs) == 1 else outputs