def CalculatePi(self):
    """Compute ``self.Pi`` from the stored predictions and the current front.

    The front is ``self.y`` indexed by ``self.pareto_front_point()``.  For
    each element of ``self.predict_y_all`` every row is flagged when all of
    its components are strictly smaller than those of at least one front
    point.  ``Pi`` is the number of *unflagged* rows (summed over axis 1)
    divided by ``self.number``.

    Side effects
    ------------
    ``self.predict_y_all`` is deleted from the instance (followed by an
    explicit ``gc.collect()``), so it must be repopulated before this
    method can be called again.  The result is stored in ``self.Pi``.

    Returns
    -------
    numpy.ndarray
        The same array that is stored in ``self.Pi``.
    """
    njobs = self.n_jobs
    front_index = self.pareto_front_point()
    front_y = self.y[front_index, :].T

    # Keep a local reference, then drop the instance attribute so the
    # (potentially large) array is not held in two places longer than needed.
    predict_y_all = self.predict_y_all
    del self.predict_y_all
    gc.collect()

    def tile_func(i):
        # ``front_y`` is captured from the enclosing scope.  The original
        # call handed it over as the keyword ``front_y=``, which does not
        # match the worker's former parameter name ``front_y0``; binding it
        # through the closure removes that mismatch entirely.
        tile = 0
        for front_y_i in front_y.T:
            big = i - front_y_i
            # True where every component of the row is below the front point.
            big_bool = np.max(big, axis=1) < 0
            tile |= big_bool
        return tile

    tile_all = parallelize(n_jobs=njobs, func=tile_func, iterable=predict_y_all)
    pi = np.sum(1 - np.array(tile_all), axis=1) / self.number
    self.Pi = pi
    return pi
def parallelize_score(self, inds):
    """Score a batch of individuals through ``calculate_collect``.

    Parameters
    ----------
    inds : list of SymbolTree
        Individuals to evaluate; only their ``capsule`` attribute is sent
        to the workers.

    Returns
    -------
    list of (score, dim, dim_score)
        Whatever ``parallelize`` collects from ``calculate_collect`` for
        each capsule.
    """
    # Settings shared by every evaluation, taken from the instance.
    shared_kwargs = dict(
        context=self.context,
        x=self.data_x,
        y=self.y,
        terminals_and_constants_repr=self.terminals_and_constants_repr,
        gro_ter_con=self.gro_ter_con,
        cv=self.cv,
        refit=self.refit,
        dim_ter_con_list=self.dim_ter_con_list,
        dim_type=self.dim_type,
        fuzzy=self.fuzzy,
        scoring=self.scoring,
        score_pen=self.score_pen,
        vector_add=self.vector_add,
        add_coef=self.add_coef,
        inter_add=self.inter_add,
        inner_add=self.inner_add,
        np_maps=self.np_map,
        filter_warning=self.filter_warning,
        dim_maps=self.dim_map,
        cal_dim=self.cal_dim,
    )
    capsules = [ind.capsule for ind in inds]
    worker = functools.partial(calculate_collect, **shared_kwargs)

    return parallelize(
        func=worker,
        iterable=capsules,
        n_jobs=self.n_jobs,
        respective=False,
        tq=self.tq,
        batch_size=self.batch_size,
    )
def cal_binary_distance_all(self, slices=None, estimator_i=0):
    """Build the pairwise distance matrix over ``slices``.

    Every ordered pair from ``itertools.product(slices, repeat=2)`` is passed
    to ``self.cal_binary_distance``; the flat result is reshaped to
    ``(len(slices), len(slices))`` in column-major (``'F'``) order.  The
    matrix is cached through ``check_prop`` / ``add_prop`` under the key
    ``"cal_binary_distance_all"``.

    Parameters
    ----------
    slices : list or None
        Nodes to compare; a falsy value falls back to ``self.slices``.
    estimator_i : int
        Stored on ``self.estimator_i`` when it is an ``int``; otherwise the
        current ``self.estimator_i`` is kept.

    Returns
    -------
    The cached value if one exists, else the freshly computed matrix.
    """
    if isinstance(estimator_i, int):
        self.estimator_i = estimator_i
    slices = slices or self.slices

    distances = self.check_prop("cal_binary_distance_all", estimator_i=self.estimator_i, slices=slices)
    if distances is None:
        # Nothing cached for this (estimator, slices) combination: compute it.
        pair_func = partial(self.cal_binary_distance)
        pairs = list(itertools.product(slices, repeat=2))
        flat = parallelize(n_jobs=self.n_jobs, func=pair_func, iterable=pairs, respective=True)
        side = len(slices)
        distances = np.reshape(flat, (side, side), order='F')
        self.add_prop("cal_binary_distance_all", estimator_i=self.estimator_i, slices=slices,
                      values=distances)

    self.slices = slices
    return distances
def cal_y_distance_all(self, slices=None, estimator_i=0):
    """Apply ``self.cal_y_distance`` to every node in ``slices``.

    Results are cached through ``check_prop`` / ``add_prop`` under the key
    ``"cal_y_distance_all"``.

    Parameters
    ----------
    slices : list or None
        Nodes to evaluate; a falsy value falls back to ``self.slices``.
    estimator_i : int
        Stored on ``self.estimator_i`` when it is an ``int``; otherwise the
        current ``self.estimator_i`` is kept.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the cached or freshly computed per-node results.
    """
    if isinstance(estimator_i, int):
        self.estimator_i = estimator_i
    slices = slices or self.slices

    distances = self.check_prop("cal_y_distance_all", estimator_i=self.estimator_i, slices=slices)
    if distances is None:
        # Cache miss: evaluate every node and remember the outcome.
        node_func = partial(self.cal_y_distance)
        distances = parallelize(n_jobs=self.n_jobs, func=node_func, iterable=slices)
        self.add_prop("cal_y_distance_all", estimator_i=self.estimator_i, slices=slices,
                      values=distances)

    self.slices = slices
    return np.array(distances)
def _fit(self, x, y):
    """Score every candidate feature combination and keep the best one.

    Candidate combinations are built from the folded feature list
    (``self.feature_fold``), excluding entries found in ``self.check_must``
    when it is truthy, for every size in ``self.n_select``; each combination
    is then passed through ``self.feature_must_fold``.  A combination is
    scored by fitting ``self.estimator`` on the unfolded columns and taking
    the mean of its ``best_score_``; an empty combination scores ``-inf``.

    Parameters
    ----------
    x, y
        Training data and targets; validated with ``check_X_y(x, y, "csc")``
        before any scoring takes place.

    Attributes set
    --------------
    score_ : per-combination scores, in candidate order.
    best_score_ : the maximum of ``score_``.
    support_ : boolean mask over the columns of ``x`` for the best combination.
    estimator_ : clone of ``self.estimator``; fitted on the selected columns
        only when ``self.refit`` is truthy.
    n_feature_ : number of selected (unfolded) features.
    score_ex : ``(features, score)`` pairs sorted by score, best first.
    scatter : ``(combination length, score)`` pairs in candidate order.

    Returns
    -------
    self
    """

    def score_pri(slices, x0, y0):
        slices = list(slices)
        if len(slices) < 1:
            return -np.inf
        slices = self.feature_unfold(slices)
        self.estimator.fit(x0[:, slices], y0)
        return np.mean(self.estimator.best_score_)

    self.score_ = []
    x, y = check_X_y(x, y, "csc")
    # Bind the scorer only after validation so scoring and the final refit
    # operate on the same validated arrays (the partial used to capture the
    # raw, unvalidated inputs).
    score = partial(score_pri, x0=x, y0=y)

    # NOTE: a former ``assert all((self.check_must, self.check_muti)) in
    # [True, False]`` was dropped here; ``all`` always returns a bool, so the
    # assertion could never fail.

    feature_list = list(range(x.shape[1]))
    fold_feature_list = self.feature_fold(feature_list)
    if self.check_must:
        fold_feature_list = [i for i in fold_feature_list if i not in self.check_must]

    slice_all = [
        list(self.feature_must_fold(combo))
        for size in self.n_select
        for combo in combinations(fold_feature_list, size)
    ]

    scores = parallelize(n_jobs=self.n_jobs, func=score, iterable=slice_all)

    feature_combination = [self.feature_unfold(candidate) for candidate in slice_all]
    index = np.argmax(scores)
    select_feature = feature_combination[index]

    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` is the
    # equivalent dtype on every NumPy version.
    su = np.zeros(x.shape[1], dtype=bool)
    su[select_feature] = True

    self.best_score_ = max(scores)
    self.score_ = scores
    self.support_ = su
    self.estimator_ = clone(self.estimator)
    if self.refit:
        self.estimator_.fit(x[:, select_feature], y)
    self.n_feature_ = len(select_feature)
    self.score_ex = list(zip(feature_combination, scores))
    self.scatter = list(zip([len(candidate) for candidate in slice_all], scores))
    self.score_ex.sort(key=lambda pair: pair[1], reverse=True)
    return self
def _fit(self, x, y, searchspace0, regclf0):
    """Predict ``searchspace0`` from models fitted on resampled data.

    For each seed in ``range(self.number)`` the pair ``(x, y)`` is resampled
    with replacement (``sklearn.utils.resample``), ``regclf0`` is fitted on
    the resample and its prediction for ``searchspace0`` is collected.

    Parameters
    ----------
    x, y
        Training data and targets to resample.
    searchspace0
        Points passed to ``regclf0.predict``.
    regclf0
        Estimator providing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the collected predictions, one entry per seed.
    """

    def bootstrap_predict(seed):
        x_boot, y_boot = sklearn.utils.resample(
            x, y, n_samples=None, replace=True, random_state=seed)
        regclf0.fit(x_boot, y_boot)
        prediction = regclf0.predict(searchspace0)
        # NOTE(review): the result of ravel() is discarded, so the returned
        # prediction keeps its original shape. Kept as-is to preserve the
        # output shape callers currently receive - confirm the intent.
        prediction.ravel()
        return prediction

    seeds = range(self.number)
    collected = parallelize(n_jobs=self.n_jobs, func=bootstrap_predict, iterable=seeds)
    return np.array(collected)
def cv_score_all(self, slices=None, estimator_i=0):
    """Score every node with ``self.cv_score``.

    Results are cached through ``check_prop`` / ``add_prop`` under the key
    ``"cv_score_all"``.

    Parameters
    ----------
    slices : list, or None, default spath.slices
        The lists of the index of feature subsets. Each feature subset is a
        node and each int is the index of X; a falsy value falls back to
        ``self.slices``.
        Example, 3 nodes:
        [[1,4,5],[1,4,6],[1,2,7]]
    estimator_i : int, default spath.estimator_i
        Estimator index to use. Stored on ``self.estimator_i`` when it is an
        ``int``; otherwise the current ``self.estimator_i`` is kept.

    Returns
    -------
    score_mean_std : numpy.ndarray
        ``np.array`` of the per-node results (documented upstream as the
        2D mean and std).
    """
    if isinstance(estimator_i, int):
        self.estimator_i = estimator_i
    slices = slices or self.slices

    node_scores = self.check_prop("cv_score_all", estimator_i=self.estimator_i, slices=slices)
    if node_scores is None:
        # Cache miss: score every node and remember the outcome.
        score_func = partial(self.cv_score)
        node_scores = parallelize(n_jobs=self.n_jobs, func=score_func, iterable=slices)
        self.add_prop("cv_score_all", estimator_i=self.estimator_i, slices=slices,
                      values=node_scores)

    self.slices = slices
    return np.array(node_scores)
def _cv_predict_all(self, slices=None, estimator_i=0):
    """Run ``self.predict`` on every node in ``slices``.

    Results are cached through ``check_prop`` / ``add_prop`` under the key
    ``"cv_predict_all"``.

    Parameters
    ----------
    slices : list or None
        Nodes to predict; a falsy value falls back to ``self.slices``.
    estimator_i : int
        Stored on ``self.estimator_i`` when it is an ``int``; otherwise the
        current ``self.estimator_i`` is kept.

    Returns
    -------
    The cached value if one exists, else the list collected by
    ``parallelize`` (returned without conversion).
    """
    if isinstance(estimator_i, int):
        self.estimator_i = estimator_i
    slices = slices or self.slices

    predictions = self.check_prop("cv_predict_all", estimator_i=self.estimator_i, slices=slices)
    if predictions is None:
        # Cache miss: predict every node and remember the outcome.
        predict_func = partial(self.predict)
        predictions = parallelize(n_jobs=self.n_jobs, func=predict_func, iterable=slices,
                                  batch_size=self.batch_size)
        self.add_prop("cv_predict_all", estimator_i=self.estimator_i, slices=slices,
                      values=predictions)

    self.slices = slices
    return predictions
def fit(self, X, y, groups=None):
    """Fit the baf model and automatically tune the number of selected feature.

    One ``BackForward`` selection is run per cross-validation split (through
    ``_baf_single_fit``); the support of the best-scoring split is kept and
    a clone of ``self.estimator`` is fitted on those columns of the whole
    data set.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_feature]
        Training vector, where `n_samples` is the number of samples and
        `n_feature` is the total number of feature.

    y : array-like, shape = [n_samples]
        Target values (integers for classification, real numbers for
        regression).

    groups : array-like, shape = [n_samples], optional
        cal_group labels for the samples used while splitting the dataset into
        train/test set.

    Attributes set
    --------------
    support_step : per-split step information returned by the workers.
    support_cv : per-split supports.
    support_ : support of the best-scoring split.
    score_cv : per-split scores.
    score_ : best score over the splits.
    estimator_ : clone of ``self.estimator`` fitted on ``X[:, support_]``.
    n_feature_ : number of features selected by ``support_``.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y, "csr")

    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    ran = check_random_state(self.random_state)
    baf = BackForward(
        estimator=self.estimator,
        n_type_feature_to_select=self.n_type_feature_to_select,
        verbose=self.verbose,
        primary_feature=self.primary_feature,
        muti_grade=self.muti_grade,
        muti_index=self.muti_index,
        must_index=self.must_index,
        random_state=ran)

    func = partial(_baf_single_fit, baf=baf, estimator=self.estimator, X=X, y=y,
                   scorer=scorer, random_state=ran)

    # One worker call per (train, test) split.
    results = parallelize(n_jobs=self.n_jobs, func=func, iterable=cv.split(X, y, groups),
                          respective=True)
    support, scores, score_step = zip(*results)

    best_support = support[np.argmax(scores)]
    best_score = max(scores)

    # Set final attributes
    self.support_step = score_step
    self.support_cv = support
    self.support_ = best_support
    self.score_cv = scores
    self.score_ = best_score
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, self.support_], y)

    # Count the features of the *selected* support only.  The previous
    # ``np.count_nonzero(support)`` ran over the supports of every split.
    # NOTE(review): supports are presumably boolean masks; an integer index
    # array is handled as well by taking its length - confirm against
    # ``_baf_single_fit``.
    selected = np.asarray(best_support)
    if selected.dtype == bool:
        self.n_feature_ = int(np.count_nonzero(selected))
    else:
        self.n_feature_ = int(selected.size)
    return self