def edges_from_graph(G):
    """Return the edges of ``G`` merged into maximal path tuples.

    Two chainable edges ``(a, b)`` and ``(b, c)`` are collapsed into a single
    tuple ``(a, b, c)``; nodes that appear in no edge are appended as
    1-tuples. The result is asserted to rebuild exactly the same graph.
    """
    # Work on a sorted, de-duplicated copy so the outcome does not depend on
    # the order in which G stores its edges.
    remaining = sorted(set(G.edges))

    # Repeatedly merge the first chainable pair found, restarting the scan
    # after each merge, until a full scan finds nothing to merge.
    while True:
        for first, second in itertools.product(remaining, remaining):
            if first != second and first[-1] == second[0]:
                remaining = [e for e in remaining if e not in (first, second)]
                remaining.append(tuple(first[0:-1]) + tuple(second))
                break  # edge list changed: restart the scan
        else:
            break  # no merge happened during a complete scan: done

    # Re-add nodes that do not appear in any edge, as singleton tuples.
    covered_nodes = unlist(remaining)
    isolated = [n for n in sorted(G.nodes) if n not in covered_nodes]
    remaining += [(n, ) for n in isolated]

    # Sanity check: the merged edges must re-create the original graph.
    G2 = graph_from_edges(*remaining)
    assert set(G.nodes) == set(G2.nodes)
    assert set(G.edges) == set(G2.edges)

    return remaining
def _get_feature_names_at_node(self, node, input_features=None, entry=False):
    """Walk the GraphPipeline and retrieve the feature names at a given node.

    Replays the same feature-propagation logic as ``_fit_transform`` (same
    predecessor handling, same concatenation order) but only for names.

    Parameters
    ----------
    node : string
        name of the node whose features are requested
    input_features : None or list
        if not None, the list of features at the input of the GraphPipeline;
        if None, the features memorized at fit time are used
    entry : boolean, default = False
        if True, return the features at the ENTRY of the given model,
        otherwise the features at its EXIT

    Returns
    -------
    list of features for the given node, or None when they can't be determined

    Raises
    ------
    NotFittedError if the pipeline was never fitted;
    ValueError if ``node`` is not in the graph.
    """
    if not self._already_fitted:
        raise NotFittedError("Please fit the model before")

    if input_features is None:
        input_features = self._Xinput_features

    feature_dico = {}
    # Walk nodes in topological order, propagating feature names forward.
    for n in self._nodes_order:
        predecessors = list(self.complete_graph.predecessors(n))

        if len(predecessors) == 0:
            # Entry node: features are the pipeline's input features.
            last_features = input_features
        elif len(predecessors) == 1:
            # Single predecessor: inherit its output features.
            last_features = feature_dico[predecessors[0]]
        else:
            # Several predecessors: use the concatenation order fixed at fit
            # time so names line up with the concatenated data blocks.
            predecessors = self._all_concat_order[n]
            all_last_features = [
                feature_dico[predecessor] for predecessor in predecessors
            ]
            # NOTE(review): 'all_last_features is None' is always False here
            # (it is a fresh list); the real test is 'None in ...'.
            if all_last_features is None or None in all_last_features:
                last_features = None
            else:
                last_features = unlist(all_last_features)

        model = self._models[n]

        # If any upstream name is unknown, give up on names entirely.
        if last_features is None or None in last_features:
            last_features = None

        if n != node:
            # Intermediate node: compute its output names and keep walking.
            feature_dico[n] = try_to_find_features_names(
                model, input_features=last_features)
            if feature_dico[n] is not None:
                feature_dico[n] = list(feature_dico[n])
        else:
            if entry:
                # Entry requested: return the features at the entry of the node.
                return last_features
            else:
                # Otherwise return the features at the exit of the node.
                feature_dico[n] = try_to_find_features_names(
                    model, input_features=last_features)
                if feature_dico[n] is not None:
                    feature_dico[n] = list(feature_dico[n])
                return feature_dico[n]

    raise ValueError("node %s isn't in the graph" % node)
def _fit_transform(self, X, y=None, groups=None, method=None, fit_params=None):
    """Main method of GraphPipeline: handles the fit and predict of the object.

    Walks the graph in topological order (``self._nodes_order``), feeding each
    node the (possibly concatenated) outputs of its predecessors, then applies
    ``method`` on the terminal node.

    Parameters
    ----------
    X : data
        input data, fed to the node(s) without predecessor
    y : target or None, default = None
        forwarded to the fit/fit_transform/... calls
    groups : groups or None, default = None
        forwarded to models whose fitting method accepts a 'groups' argument
    method : string
        one of "fit", "fit_transform", "fit_predict", "transform", "predict",
        "predict_proba", "predict_log_proba", "decision_function", "score"
    fit_params : dict or None
        parameters keyed "stepname__paramname", dispatched per step

    Returns
    -------
    ``self`` when method == "fit"; otherwise the result of ``method`` applied
    on the terminal node (or the intermediate ``data_dico`` when the debugging
    attribute ``_return_before_node`` is set).
    """
    do_fit = method in ("fit", "fit_transform", "fit_predict")
    if not self._already_fitted and not do_fit:
        raise NotFittedError("Please fit the model before")

    # Split fit_params into a 'step-by-step' dictionary:
    # "step__param" -> fit_params_step[step][param].
    # NOTE(review): an unknown step name raises KeyError here — confirm that
    # is the intended validation.
    fit_params_step = {name: {} for name in self.complete_graph.nodes}
    if fit_params is not None:
        for key, value in fit_params.items():
            step, param = key.split("__", 1)
            fit_params_step[step][param] = value

    data_dico = {}  # Will contain transformed blocks at each node
    feature_dico = {}  # Will contain the get_feature_names() of each node

    if do_fit:
        # Memorize the input feature names (if X has columns) for later reuse
        # by transform/predict and by _get_feature_names_at_node.
        input_features = getattr(X, "columns", None)
        if input_features is not None:
            input_features = list(input_features)
        self._Xinput_features = input_features
    else:
        input_features = self._Xinput_features

    nodes_done = set()
    for node in self._nodes_order:
        nodes_done.add(node)
        if self.verbose:
            print("start processing node %s ..." % node)

        ### Debugging Help ###
        # If '_return_before_node' is set, stop just before processing that
        # node and return the blocks computed so far.
        if (getattr(self, "_return_before_node", None) is not None
                and getattr(self, "_return_before_node", None) == node):
            return data_dico

        model = self._models[node]
        predecessors = list(self.complete_graph.predecessors(node))
        # Careful: 'predecessors' is not necessarily always in the same order.
        # The order in which the edges were given is used (fixed below for the
        # multi-predecessor case).

        # Should the predecessors' outputs be concatenated into one block at
        # this node (True) or passed as a dict of separate blocks (False)?
        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

        if len(predecessors) == 0:
            #########################
            ### No predecessors ###
            #########################
            # ==> Apply on original data
            if concat_at_this_node:
                lastX = X
            else:
                lastX = {"_data": X}
            last_features = input_features

        elif len(predecessors) == 1:
            ########################
            ### One predecessor ###
            ########################
            # ==> Apply on data coming out of the previous node
            if concat_at_this_node:
                lastX = data_dico[predecessors[0]]
            else:
                lastX = {
                    predecessor: data_dico[predecessor]
                    for predecessor in predecessors
                }
            last_features = feature_dico[predecessors[0]]

        elif len(predecessors) > 1:
            #######################
            ### More than one ###
            #######################
            # ==> concatenate all the predecessors' blocks and apply on that

            ### Fix concatenation order ###
            if do_fit:
                # Sort by the number attached to each incoming edge, then
                # alphabetically; memorize so transform replays the same order.
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors
            else:
                predecessors = self._all_concat_order[node]

            all_lastX = [
                data_dico[predecessor] for predecessor in predecessors
            ]
            all_last_features = [
                feature_dico[predecessor] for predecessor in predecessors
            ]
            # If any predecessor couldn't provide names, give up on names.
            if all_last_features is None or None in all_last_features:
                last_features = None
            else:
                last_features = unlist(all_last_features)

            # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
            #                      for predecessor, input_features in zip(predecessors, all_last_features)]
            # for predecessor, input_features in zip(predecessors,all_last_features):
            #     try_to_find_features_names( self._models[predecessor], input_features = input_features)

            if self.verbose:
                print("start aggregation...")

            if do_fit:
                # Guess (and memorize) the output type used for concatenation.
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
            else:
                output_type = self._all_concat_type[node]

            if concat_at_this_node:
                lastX = generic_hstack(all_lastX, output_type=output_type, all_columns_names=all_last_features)
            else:
                lastX = {
                    predecessor: data_dico[predecessor]
                    for predecessor in predecessors
                }

        if node != self._terminal_node:
            # This is not the end of the graph: fit/transform and store block.
            if do_fit:
                if groups is not None and function_has_named_argument(
                        model.fit_transform, "groups"):
                    data_dico[node] = model.fit_transform(
                        lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = model.fit_transform(
                        lastX, y, **fit_params_step[node])

                # Here we could skip the fit for some models given via
                # fit_params. Something like:
                # if node in prefitted_models:
                #     self._model[node] = prefitted_models[node]
                #     model = prefitted_models[node]  # + copy model into pipeline
                #     data_dico[node] = model.transform(lastX, y)
                # else:
                #     data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])
            else:
                data_dico[node] = model.transform(lastX)

            feature_dico[node] = try_to_find_features_names(
                model, input_features=last_features)

        else:
            # This is the last node of the graph: apply the requested method.
            if method == "fit":
                if groups is not None and function_has_named_argument(
                        model.fit, "groups"):
                    model.fit(lastX, y, groups, **fit_params_step[node])
                else:
                    model.fit(lastX, y, **fit_params_step[node])
                result = self
            elif method == "fit_predict":
                if groups is not None and function_has_named_argument(
                        model.fit_predict, "groups"):
                    result = model.fit_predict(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_predict(lastX, y, **fit_params_step[node])
            elif method == "fit_transform":
                if groups is not None and function_has_named_argument(
                        model.fit_transform, "groups"):
                    result = model.fit_transform(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_transform(lastX, y, **fit_params_step[node])
            elif method == "transform":
                result = model.transform(lastX)
            elif method == "predict":
                result = model.predict(lastX)
            elif method == "predict_proba":
                result = model.predict_proba(lastX)
            elif method == "predict_log_proba":
                result = model.predict_log_proba(lastX)
            elif method == "decision_function":
                result = model.decision_function(lastX)
            elif method == "score":
                result = model.score(lastX, y)
            else:
                raise ValueError("I don't know that kind of method '%s' " % method)

            feature_dico[node] = try_to_find_features_names(
                model, input_features=last_features)
            return result

        #######################
        #### Dico cleaning ####
        #######################
        # Free memory: after each node, drop the stored blocks of nodes that
        # no remaining (unprocessed) node will ever read again.
        still_usefull = set()
        for n in self.complete_graph.nodes:
            if n in nodes_done:
                continue
            p = list(self.complete_graph.predecessors(n))
            still_usefull.update(p)

        for n in data_dico.keys():
            if data_dico[n] is None:
                continue
            if n not in still_usefull:
                if self.verbose:
                    print("deleting useless node %s" % n)
                data_dico[n] = None
def _fit_transform(self, X, y, do_fit, do_transform):
    """Fit and/or apply the char-ngram Word2Vec embedding on the text columns of X.

    Parameters
    ----------
    X : pd.DataFrame
        text data, one column per text feature
    y : ignored
        kept for API consistency
    do_fit : boolean
        if True, (re)build the text preprocessor and train the embedding(s)
    do_transform : boolean
        if True, return the embedded data; otherwise return ``self``

    Returns
    -------
    pd.DataFrame of shape (n_rows, size * nb_text_columns) when
    ``do_transform`` is True, otherwise ``self``.
    """
    ###############################
    ### 1) Create preprocessing ###
    ###############################
    if do_fit:
        if self.text_preprocess is None:
            self._text_preprocessor = None
        elif self.text_preprocess == "default":
            self._text_preprocessor = TextDefaultProcessing()
        elif self.text_preprocess == "digit":
            self._text_preprocessor = TextDigitAnonymizer()
        elif self.text_preprocess == "nltk":
            self._text_preprocessor = TextNltkProcessing()
        # NOTE(review): any other 'text_preprocess' value silently keeps the
        # previous preprocessor — presumably validated upstream; confirm.

    ##############################
    ### 2) Apply preprocessing ###
    ##############################
    if self._text_preprocessor is not None:
        if do_fit:
            newX = self._text_preprocessor.fit_transform(X)
        else:
            newX = self._text_preprocessor.transform(X)
    else:
        newX = X

    if do_fit:
        self._nbcols = newX.shape[1]
    else:
        if newX.shape[1] != self._nbcols:
            # BUGFIX: the '%' operator was missing, so the string literal was
            # *called* with the tuple and raised TypeError instead of the
            # intended ValueError with a readable message.
            raise ValueError(
                "I don't have the correct number of columns %d, expected %d"
                % (newX.shape[1], self._nbcols))

    ########################################################
    ### 3) get all sub-strings of length 'self.ngram'    ###
    ########################################################
    Xsplitted = [[
        _retrieve_all_rolling_string_parts(string, ngram=self.ngram)
        for string in newX.iloc[:, j]
    ] for j in range(self._nbcols)]

    #################################
    ### 4) fit Word2Vec embedding ###
    #################################
    if do_fit:
        other_params = {} if self.other_params is None else self.other_params

        if self.same_embedding_all_columns:
            ##############################################
            ### One embedding for ALL the text columns ###
            ##############################################
            Xsplitted_all = []
            for Xs in Xsplitted:
                Xsplitted_all += unlist(Xs)

            model = Word2Vec(size=self.size,
                             window=self.window,
                             seed=self.random_state,
                             **other_params)
            model.build_vocab(Xsplitted_all)
            model.train(Xsplitted_all,
                        total_examples=model.corpus_count,
                        epochs=model.epochs)

            # The same model, trained on everything, is shared by every column.
            self.models = [model for _ in range(self._nbcols)]
        else:
            ######################################
            ### One embedding PER text column  ###
            ######################################
            self.models = []
            for jj, Xs in enumerate(Xsplitted):
                # NOTE(review): 'if self.random_state' also maps 0 to None;
                # use 'is not None' if a 0 seed should be honored.
                seed = self.random_state + jj if self.random_state else None
                uXs = unlist(Xs)
                model = Word2Vec(size=self.size,
                                 window=self.window,
                                 seed=seed,
                                 **other_params)
                model.build_vocab(uXs)
                model.train(uXs,
                            total_examples=model.corpus_count,
                            epochs=model.epochs)
                self.models.append(model)

        # One feature name per embedding dimension, per input column.
        self._features_names = []
        for j in range(self._nbcols):
            self._features_names += [
                "%s__EMB__%d" % (X.columns[j], w) for w in range(self.size)
            ]

    if not do_transform:
        return self

    if self.models is None:
        raise NotFittedError("You must fit the model first")

    #########################
    ### 5) Apply Word2Vec ###
    #########################
    # Rmk: this could be vectorized, or sped up with numba.
    # Each cell's embedding is the mean of the embeddings of its in-vocabulary
    # ngram parts (left at zero when no ngram is known).
    XXres = np.zeros((X.shape[0], self.size * self._nbcols), dtype=np.float32)
    for j, (modelj, Xs) in enumerate(zip(self.models, Xsplitted)):
        cols = slice(self.size * j, self.size * (j + 1))  # hoisted, loop-invariant
        for i, sentence in enumerate(Xs):
            count = 0
            for sub_sentence in sentence:
                for word in sub_sentence:
                    try:
                        emb = modelj.wv[word]
                    except KeyError:
                        emb = None  # out-of-vocabulary ngram: skip it
                    if emb is not None:
                        count += 1
                        XXres[i, cols] += emb
            if count > 0:
                XXres[i, cols] /= count

    return pd.DataFrame(XXres, columns=self._features_names, index=X.index)
def test_unlist():
    """unlist flattens one level of nesting, preserving order and skipping empties."""
    cases = [
        ([[1, 10], [32]], [1, 10, 32]),
        ([[10], [11], [], [45]], [10, 11, 45]),
    ]
    for nested, flat in cases:
        assert unlist(nested) == flat