def _get_feature_names_at_node(self, node, input_features=None, entry=False):
    """Walk the features down the graph pipeline and retrieve the features at a given node.

    Parameters
    ----------
    node : string or ..
        name of the node

    input_features : None or list
        if not None, the list of features (at the input of the graphpipeline)

    entry : boolean, default = False
        if True will retrieve the features at the ENTRY of the given model,
        otherwise the features at the EXIT of the given model

    Returns
    -------
    list of features for the given node, or None

    Raises
    ------
    NotFittedError
        if the pipeline was never fitted
    ValueError
        if `node` is not present in the graph
    """
    if not self._already_fitted:
        raise NotFittedError("Please fit the model before")

    if input_features is None:
        # fall back on the input features memorized during fit
        input_features = self._Xinput_features

    feature_dico = {}  # features at the EXIT of each node processed so far
    for n in self._nodes_order:
        predecessors = list(self.complete_graph.predecessors(n))
        if len(predecessors) == 0:
            # entry node of the graph: features are the pipeline's input features
            last_features = input_features
        elif len(predecessors) == 1:
            last_features = feature_dico[predecessors[0]]
        else:
            # several predecessors: concatenate their features in the same
            # order that was used for the data concatenation during fit
            predecessors = self._all_concat_order[n]
            all_last_features = [feature_dico[predecessor] for predecessor in predecessors]
            if None in all_last_features:
                # at least one predecessor has unknown features => unknown here too
                last_features = None
            else:
                last_features = unlist(all_last_features)

        model = self._models[n]

        if last_features is None or None in last_features:
            last_features = None

        if n == node and entry:
            # requested node, entry=True: features at the ENTRY of the node
            return last_features

        # features at the EXIT of node `n` (None if they can't be retrieved)
        features = try_to_find_features_names(model, input_features=last_features)
        if features is not None:
            features = list(features)
        feature_dico[n] = features

        if n == node:
            # requested node, entry=False: features at the EXIT of the node
            return features

    raise ValueError("node %s isn't in the graph" % node)
def test_try_to_find_features_names():
    corpus = ["aa bb", "bb bb cc", "dd aa cc", "ee"]
    word_features = ["aa", "bb", "cc", "dd", "ee"]
    union_features = [
        "bagword__aa",
        "bagword__bb",
        "bagword__cc",
        "bagword__dd",
        "bagword__ee",
        "bagchar__ ",
        "bagchar__a",
        "bagchar__b",
        "bagchar__c",
        "bagchar__d",
        "bagchar__e",
    ]

    # plain vectorizer: its vocabulary is returned directly
    vec = CountVectorizer()
    vec.fit_transform(corpus)
    assert try_to_find_features_names(vec) == word_features

    # vectorizer at the end of a Pipeline: features of the last step
    pipe = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer())])
    pipe.fit_transform(corpus)
    assert try_to_find_features_names(pipe) == word_features

    # FeatureUnion of two vectorizers: names are prefixed by the transformer name
    union = FeatureUnion(
        transformer_list=[("bagword", CountVectorizer()), ("bagchar", CountVectorizer(analyzer="char"))]
    )
    union.fit_transform(corpus)
    assert try_to_find_features_names(union) == union_features

    # FeatureUnion of two Pipelines: same prefixing behavior
    pipe1 = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer())])
    pipe2 = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer(analyzer="char"))])
    union = FeatureUnion(transformer_list=[("bagword", pipe1), ("bagchar", pipe2)])
    union.fit_transform(corpus)
    assert try_to_find_features_names(union) == union_features

    class DummyModelAcceptInputFeature(object):
        # get_feature_names that accepts an 'input_features' argument
        def get_feature_names(self, input_features=None):
            if input_features is None:
                return [0, 1, 2, 3]
            return input_features

    class DummyModelDontInputFeature(object):
        # get_feature_names without any argument
        def get_feature_names(self):
            return [0, 1, 2, 3]

    class DummyModelDoesntHaveGetFeatures(object):
        # no get_feature_names at all
        pass

    names = ["a", "b", "c", "d"]

    model = DummyModelAcceptInputFeature()
    assert try_to_find_features_names(model) == [0, 1, 2, 3]
    assert try_to_find_features_names(model, input_features=names) == names

    model = DummyModelDontInputFeature()
    assert try_to_find_features_names(model) == [0, 1, 2, 3]
    assert try_to_find_features_names(model, input_features=names) == [0, 1, 2, 3]

    model = DummyModelDoesntHaveGetFeatures()
    assert try_to_find_features_names(model) is None
    assert try_to_find_features_names(model, input_features=names) is None
def _fit_transform(self, X, y=None, groups=None, method=None, fit_params=None):
    """ main method of GraphPipeline, handles the fit and predict of object

    Walks the graph in `self._nodes_order`, feeding each node the (possibly
    concatenated) outputs of its predecessors, fit/transform-ing intermediate
    nodes and applying `method` on the terminal node.

    Parameters
    ----------
    X : data
        input data (if it has a 'columns' attribute, those are memorized as input features)
    y : target, optional
    groups : optional
        forwarded to models whose fit-like method accepts a 'groups' argument
    method : string
        one of 'fit', 'fit_transform', 'fit_predict', 'transform', 'predict',
        'predict_proba', 'predict_log_proba', 'decision_function', 'score'
    fit_params : dict or None
        'step__param' keyed parameters, dispatched per node

    Returns
    -------
    result of `method` applied on the terminal node (`self` for 'fit')
    """
    # Fitting happens only for the fit-like methods
    do_fit = method in ("fit", "fit_transform", "fit_predict")

    if not self._already_fitted and not do_fit:
        raise NotFittedError("Please fit the model before")

    # Split fit_params into a 'step-by-step' dictionnary: 'node__param' -> fit_params_step[node][param]
    fit_params_step = {name: {} for name in self.complete_graph.nodes}
    if fit_params is not None:
        for key, value in fit_params.items():
            step, param = key.split("__", 1)
            fit_params_step[step][param] = value

    data_dico = {}  # Will contain transformed blocks at each node
    feature_dico = {}  # Will contain the get_feature_names() of each node

    if do_fit:
        # memorize the input feature names (DataFrame columns) at fit time
        input_features = getattr(X, "columns", None)
        if input_features is not None:
            input_features = list(input_features)
        self._Xinput_features = input_features
    else:
        # at predict time, reuse the features memorized during fit
        input_features = self._Xinput_features

    nodes_done = set()
    for node in self._nodes_order:
        nodes_done.add(node)

        if self.verbose:
            print("start processing node %s ..." % node)

        ### Debugging Help ###
        # if '_return_before_node' is set, short-circuit and return the raw per-node data
        if (getattr(self, "_return_before_node", None) is not None
                and getattr(self, "_return_before_node", None) == node):
            return data_dico

        model = self._models[node]

        predecessors = list(self.complete_graph.predecessors(node))
        # Carefull : here it is not necessary always in the same order
        #### I'll use the order in which the edges were given

        # Concatenation : should the predecessors' outputs be concatenated at this node?
        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

        if len(predecessors) == 0:
            #########################
            ###  No predecessors  ###
            #########################
            # ==> Apply on original data
            if concat_at_this_node:
                lastX = X
            else:
                lastX = {"_data": X}
            last_features = input_features

        elif len(predecessors) == 1:
            ########################
            ###  One predecessor ###
            ########################
            # ==> Apply on data coming out of last node
            if concat_at_this_node:
                lastX = data_dico[predecessors[0]]
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

            last_features = feature_dico[predecessors[0]]

        elif len(predecessors) > 1:
            #######################
            ###  More than one  ###
            #######################
            # ==> concat all the predecessors node and apply it

            ### Fix concatenation order ###
            if do_fit:
                # sort by edge number (then name) and memorize the order for predict time
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors
            else:
                predecessors = self._all_concat_order[node]

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]
            all_last_features = [feature_dico[predecessor] for predecessor in predecessors]

            # if any predecessor's features are unknown, the concatenated features are unknown
            if all_last_features is None or None in all_last_features:
                last_features = None
            else:
                last_features = unlist(all_last_features)

            # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
            #                      for predecessor, input_features in zip(predecessors, all_last_features)]
            # for predecessor, input_features in zip(predecessors,all_last_features):
            #     try_to_find_features_names( self._models[predecessor], input_features = input_features)

            if self.verbose:
                print("start aggregation...")

            if do_fit:
                # guess how to hstack the blocks (and memorize it for predict time)
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
            else:
                output_type = self._all_concat_type[node]

            if concat_at_this_node:
                lastX = generic_hstack(all_lastX, output_type=output_type, all_columns_names=all_last_features)
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

        if node != self._terminal_node:
            # This is not the end of the graph: fit_transform (or transform) and store the block
            if do_fit:
                # pass 'groups' only to models that accept it
                if groups is not None and function_has_named_argument(model.fit_transform, "groups"):
                    data_dico[node] = model.fit_transform(lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])

                # NOTE(review): here we could skip the fit for some models given in the fit params
                # Something like:
                # if node in preffited_models:
                #     # self._model[node] = preffited_models[node]
                #     model = preffited_models[node]
                #     + copy model into pipeline
                #     data_dico[node] = model.transform(lastX, y)
                # else:
                #     data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

            else:
                data_dico[node] = model.transform(lastX)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)

        else:
            # This is the last node of the Graph: apply the requested method and return
            if method == "fit":
                if groups is not None and function_has_named_argument(model.fit, "groups"):
                    model.fit(lastX, y, groups, **fit_params_step[node])
                else:
                    model.fit(lastX, y, **fit_params_step[node])
                result = self

            elif method == "fit_predict":
                if groups is not None and function_has_named_argument(model.fit_predict, "groups"):
                    result = model.fit_predict(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_predict(lastX, y, **fit_params_step[node])

            elif method == "fit_transform":
                if groups is not None and function_has_named_argument(model.fit_transform, "groups"):
                    result = model.fit_transform(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_transform(lastX, y, **fit_params_step[node])

            elif method == "transform":
                result = model.transform(lastX)

            elif method == "predict":
                result = model.predict(lastX)

            elif method == "predict_proba":
                result = model.predict_proba(lastX)

            elif method == "predict_log_proba":
                result = model.predict_log_proba(lastX)

            elif method == "decision_function":
                result = model.decision_function(lastX)

            elif method == "score":
                result = model.score(lastX, y)

            else:
                raise ValueError("I don't know that kind of method '%s' " % method)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)

            return result

        #######################
        #### Dico cleaning ####
        #######################
        # I'll do a step of cleaning to remove useless blocks in memory
        # I need to remove data in nodes that wont be accessed anymore
        # (a node's output stays useful only while some not-yet-processed node has it as predecessor)
        still_usefull = set()
        for n in self.complete_graph.nodes:
            if n in nodes_done:
                continue
            p = list(self.complete_graph.predecessors(n))
            still_usefull.update(p)

        for n in data_dico.keys():
            if data_dico[n] is None:
                continue
            if n not in still_usefull:
                if self.verbose:
                    print("deleting useless node %s" % n)
                data_dico[n] = None