class CustomRegressor(BaseEstimator, RegressorMixin):
    """Feature selector followed by a LightGBM regressor, with hard overrides.

    The estimator chains a ``SelectKBest`` (scored with
    ``mutual_info_regression``) and an ``LGBMRegressor``.  Hyper-parameters are
    exposed as one flat mapping: the key ``"k"`` belongs to the selector, every
    other key is forwarded to the LightGBM model.
    """

    def __init__(self, **estimator_params):
        super().__init__()
        self.selector = SelectKBest(score_func=mutual_info_regression, k="all")
        self.base_model = lightgbm.LGBMRegressor(
            random_state=24,
            objective="regression_l1",
        )
        # Route the caller's overrides through set_params so "k" reaches the
        # selector and the remaining keys reach the regressor.
        self.set_params(**estimator_params)

    def fit(self, X, y=None):
        """Fit the selector on ``(X, y)``, then the model on the selected columns."""
        selected = self.selector.fit_transform(X, y)
        self.base_model.fit(selected, y)
        return self

    def predict(self, X):
        """Predict, apply the rule-based overrides and round to whole numbers.

        ``X`` must expose the attributes ``false_positive``, ``false_negative``,
        ``area_model`` and ``area_expert`` (read below to build the masks).
        """
        predictions = self.base_model.predict(self.selector.transform(X))

        # The masks depend only on X; they are applied in the same order as
        # before, so a later rule wins wherever masks overlap.
        overrides = (
            (X.false_positive == 1, 1),
            (X.false_negative == 1, 1),
            ((X.area_model + X.area_expert) == 0, 5),
        )
        for mask, forced_value in overrides:
            predictions[mask] = forced_value

        return np.round(predictions, 0)

    def get_params(self, **params):
        """Return the model's parameters plus the selector's ``k``.

        Keyword arguments (e.g. ``deep``) are accepted and ignored.
        """
        merged = dict(self.base_model.get_params())
        merged["k"] = self.selector.k
        return merged

    def set_params(self, **params):
        """Send ``k`` to the selector and all other parameters to the model."""
        model_params = {
            name: value for name, value in params.items() if name != "k"
        }
        if "k" in params:
            self.selector = self.selector.set_params(k=params["k"])
        self.base_model = self.base_model.set_params(**model_params)
        return self
class FeatureSel(BaseEstimator, TransformerMixin):
    """Concatenate PCA components with the columns kept by ``SelectKBest``.

    Parameters
    ----------
    k_best : int, default 5
        ``k`` passed to ``SelectKBest``.  A value ``<= 0`` disables that branch.
    pca_comp : int, default 8
        ``n_components`` passed to ``PCA``.  A value ``<= 0`` disables that
        branch.
    """

    def __init__(self, k_best=5, pca_comp=8):
        self.k_best = k_best
        self.pca_comp = pca_comp
        if pca_comp > 0:
            self.pca = PCA(n_components=self.pca_comp)
        if k_best > 0:
            self.skb = SelectKBest(k=self.k_best)

    def set_params(self, **parameters):
        """Set attributes by name and propagate them to the sub-estimators.

        The previous implementation returned before reaching the propagation
        step, so ``pca`` / ``skb`` kept their stale ``n_components`` / ``k``.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        self._sync_sub_estimators()
        return self

    def _sync_sub_estimators(self):
        """Push ``pca_comp`` / ``k_best`` into ``pca`` / ``skb``, creating them if needed."""
        if self.pca_comp > 0:
            if hasattr(self, "pca"):
                self.pca.set_params(n_components=self.pca_comp)
            else:
                # Branch was disabled at construction time and is enabled now.
                self.pca = PCA(n_components=self.pca_comp)
        if self.k_best > 0:
            if hasattr(self, "skb"):
                self.skb.set_params(k=self.k_best)
            else:
                self.skb = SelectKBest(k=self.k_best)

    @staticmethod
    def _stack(parts):
        """Horizontally stack the outputs of the enabled branches."""
        if not parts:
            raise ValueError(
                "FeatureSel has no active branch: both pca_comp and k_best are <= 0"
            )
        return np.hstack(tuple(parts))

    def transform(self, X):
        """Return ``[PCA(X) | SelectKBest(X)]`` using the fitted sub-estimators."""
        parts = []
        if self.pca_comp > 0:
            parts.append(self.pca.transform(X))
        if self.k_best > 0:
            parts.append(self.skb.transform(X))
        return self._stack(parts)

    def fit_transform(self, X, y):
        """Fit every enabled branch on ``(X, y)`` and return the stacked output."""
        parts = []
        if self.pca_comp > 0:
            parts.append(self.pca.fit_transform(X, y))
        if self.k_best > 0:
            parts.append(self.skb.fit_transform(X, y))
        return self._stack(parts)

    def fit(self, X, y):
        """Fit every enabled branch on ``(X, y)``; returns ``self`` for chaining."""
        if self.pca_comp > 0:
            self.pca.fit(X, y)
        if self.k_best > 0:
            self.skb.fit(X, y)
        return self
class FeatureSel(BaseEstimator, TransformerMixin):
    """Concatenate PCA components with the columns kept by ``SelectKBest``.

    Parameters
    ----------
    k_best : int, default 5
        ``k`` passed to ``SelectKBest``.  A value ``<= 0`` disables that branch.
    pca_comp : int, default 8
        ``n_components`` passed to ``PCA``.  A value ``<= 0`` disables that
        branch.
    """

    def __init__(self, k_best=5, pca_comp=8):
        self.k_best = k_best
        self.pca_comp = pca_comp
        if pca_comp > 0:
            self.pca = PCA(n_components=self.pca_comp)
        if k_best > 0:
            self.skb = SelectKBest(k=self.k_best)

    def set_params(self, **parameters):
        """Set attributes by name and propagate them to the sub-estimators.

        The previous implementation returned before reaching the propagation
        step, so ``pca`` / ``skb`` kept their stale ``n_components`` / ``k``.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        self._sync_sub_estimators()
        return self

    def _sync_sub_estimators(self):
        """Push ``pca_comp`` / ``k_best`` into ``pca`` / ``skb``, creating them if needed."""
        if self.pca_comp > 0:
            if hasattr(self, "pca"):
                self.pca.set_params(n_components=self.pca_comp)
            else:
                # Branch was disabled at construction time and is enabled now.
                self.pca = PCA(n_components=self.pca_comp)
        if self.k_best > 0:
            if hasattr(self, "skb"):
                self.skb.set_params(k=self.k_best)
            else:
                self.skb = SelectKBest(k=self.k_best)

    @staticmethod
    def _stack(parts):
        """Horizontally stack the outputs of the enabled branches."""
        if not parts:
            raise ValueError(
                "FeatureSel has no active branch: both pca_comp and k_best are <= 0"
            )
        return np.hstack(tuple(parts))

    def transform(self, X):
        """Return ``[PCA(X) | SelectKBest(X)]`` using the fitted sub-estimators."""
        parts = []
        if self.pca_comp > 0:
            parts.append(self.pca.transform(X))
        if self.k_best > 0:
            parts.append(self.skb.transform(X))
        return self._stack(parts)

    def fit_transform(self, X, y):
        """Fit every enabled branch on ``(X, y)`` and return the stacked output."""
        parts = []
        if self.pca_comp > 0:
            parts.append(self.pca.fit_transform(X, y))
        if self.k_best > 0:
            parts.append(self.skb.fit_transform(X, y))
        return self._stack(parts)

    def fit(self, X, y):
        """Fit every enabled branch on ``(X, y)``; returns ``self`` for chaining."""
        if self.pca_comp > 0:
            self.pca.fit(X, y)
        if self.k_best > 0:
            self.skb.fit(X, y)
        return self
def runXVal(X1, Y1, n_ds=None, CLF=None, cv_nfolds=10, DS_top=pd.DataFrame()):
    """Run one stratified k-fold cross-validation pass over every classifier.

    For each fold the ``n_ds`` best columns are chosen on the training split
    with ``SelectKBest(f_classif)``, every classifier returned by
    ``getClassifiers()`` is fitted on those columns and evaluated on the test
    split with ``evalPred``.

    Parameters
    ----------
    X1 : DataFrame of descriptors (rows are observations).
    Y1 : labels, indexed like ``X1``.
    n_ds : value used as ``k`` for ``SelectKBest``.
    CLF : accepted for interface compatibility; NOTE(review): the value is
        ignored, a fresh set from ``getClassifiers()`` is used in every fold.
    cv_nfolds : number of folds.
    DS_top : per-column selection counter.  When non-empty it is incremented
        in place for every column selected in a fold; when empty a local
        zero-filled counter is created instead (and is not returned).

    Returns
    -------
    list of dict
        One record per (fold, classifier) whose ``predict`` call succeeded.
    """
    Perf = []
    FS = SelectKBest(f_classif)  # Feature selection
    # NOTE(review): legacy scikit-learn cross_validation signature (labels are
    # passed to the constructor and the object itself is iterated).
    SKF = StratifiedKFold(Y1, n_folds=cv_nfolds, shuffle=True)
    if DS_top.shape[0] == 0:
        DS_top = pd.DataFrame(np.zeros(X1.shape[1]), index=X1.columns)

    for I1, I2 in SKF:
        # Translate positional fold indices into index labels.
        I_train, I_test = X1.index[I1], X1.index[I2]

        # Only the support mask is needed, so fit() is enough here.
        FS.set_params(k=n_ds)
        FS.fit(X1.ix[I_train], Y1.ix[I_train])
        DS = X1.columns[FS.get_support()]
        DS_top.ix[DS] += 1

        X_train, Y_train = X1.ix[I_train, DS], Y1.ix[I_train]
        X_test, Y_test = X1.ix[I_test, DS], Y1.ix[I_test]

        CLF = getClassifiers()
        for Nm, Clf in CLF.items():
            Clf.fit(X_train, Y_train)
            try:
                Y_pred = Clf.predict(X_test)
            except Exception:
                # Best effort: report the failing classifier and keep going,
                # but let KeyboardInterrupt / SystemExit propagate.
                print(" >> Failed %s" % Nm)
            else:
                P = dict(lr=Nm, pt='cvt', n_train=len(I_train),
                         n_test=len(I_test), n_ds=n_ds, cv_kfold=cv_nfolds)
                P.update(evalPred(Y_test, Y_pred))
                Perf.append(P)
    return Perf
def feature_sweep(features, boolean_labels, labels, seed, save_plot, show_plot, n_features=20):
    """
    Performs a sweep of top 'n_features' bayesian features per label,
    top 'n_features' bayesian features overall and the top 'n_features'
    selected by the k-best selector using the f_classifier function.

    Each candidate feature subset is clustered with ``KMeans(n_clusters=10)``
    and scored with ``score_clustering``.

    Args:
        features: column-indexable feature table (indexed below with lists of
            column names and passed to ``SelectKBest``).
        boolean_labels: forwarded to ``fit_labels``.
        labels: ground-truth labels used for scoring and for fitting the
            selector.
        seed: state handed to ``numpy.random.set_state`` before every
            clustering run so the runs are comparable.
        save_plot: directory in which the figures are written; ``None``
            disables saving.
        show_plot: when truthy the figures are displayed with ``plt.show()``.
        n_features: upper bound of the overall and k-best sweeps.

    Returns:
        The number of features ``k`` (1-based) for which the k-best sweep
        achieved the highest ``v_score + rand`` sum.
    """
    model = KMeans(n_clusters=10)

    # Finds the best performing features per label from the naive bayesian net
    print("Performing sweep of top bayesian features per label...")
    bayes_classifiers = fit_labels(features, boolean_labels)
    bayes_per_label_analysis = []
    with click.progressbar(range(n_features // 10)) as feature_range:
        for n in feature_range:
            top_n_features = set(
                itertools.chain.from_iterable(
                    x.top_features[:n + 1] for x in bayes_classifiers.values()))
            numpy.random.set_state(seed)
            # Index with a real list: a bare generator is not a portable
            # column selector.
            predictions = model.fit_predict(
                features[[str(x) for x in top_n_features]])
            bayes_per_label_analysis.append(
                (len(top_n_features), score_clustering(labels, predictions)))

    # Finds the best performing features overall from the naive bayesian net
    print("Performing sweep of top bayesian features overall...")
    bayes_classifier = bayesian_classification(features, labels,
                                               n_correlated=n_features + 1)
    bayes_overall_analysis = []
    with click.progressbar(range(1, n_features + 1)) as feature_range:
        for n in feature_range:
            top_n_features = bayes_classifier.top_features[:n]
            numpy.random.set_state(seed)
            predictions = model.fit_predict(
                features[[str(x) for x in top_n_features]])
            bayes_overall_analysis.append(
                (n, score_clustering(labels, predictions)))

    # Finds the best performing features from the K-Best algorithm using the
    # f_classifier scoring function
    print("Performing sweep of K-best features...")
    k_best_analysis = []
    selector = SelectKBest()
    selector.fit(features, column_or_1d(labels))
    with click.progressbar(range(1, n_features + 1)) as feature_range:
        for n in feature_range:
            selector.set_params(k=n)
            selected_features = selector.transform(features)
            numpy.random.set_state(seed)
            predictions = model.fit_predict(selected_features)
            k_best_analysis.append((n, score_clustering(labels, predictions)))

    # Plots the data gathered from the sweeps above
    if show_plot or save_plot:
        handles = []
        plot_info = [(bayes_per_label_analysis, 'r', "Bayes per Label"),
                     (bayes_overall_analysis, 'g', "Bayes Overall"),
                     (k_best_analysis, 'b', "K Best")]
        for data, colour, name in plot_info:
            # Transpose [(n, {scores})] into columns: data[0] is the feature
            # count, data[3] / data[4] are the third / fourth score values.
            data = list(zip(*[(x, *y.values()) for x, y in data]))
            handles += plt.plot(data[0], data[3], '-' + colour,
                                label=name + " V Score")
            handles += plt.plot(data[0], data[4], '--' + colour,
                                label=name + " Rand")
        # A single positional argument is interpreted by matplotlib as the
        # label list, so the artists must be passed by keyword.
        plt.legend(handles=handles, loc="lower left")
        plt.xlabel("Number of Features")
        plt.title("Comparison of Feature Selection Algorithms")
        if save_plot is not None:
            path = os.path.join(save_plot, "feature_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

        # Plots a heatmap of the results of the f_classifier scoring function
        # NOTE(review): the 48-row reshape requires the feature count to be a
        # multiple of 48 - confirm against the data layout.
        plt.imshow(selector.scores_.reshape(48, -1), cmap='hot',
                   interpolation='lanczos')
        plt.title("K-Best Feature Heatmap")
        if save_plot is not None:
            path = os.path.join(save_plot, "k_best_heatmap.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Returns the best performing number of features for the K-best selector
    score = [
        v_score + rand for _, _, v_score, rand in
        [scores.values() for (_, scores) in k_best_analysis]
    ]
    # The sweep starts at k=1, so list position i corresponds to k = i + 1.
    score = numpy.argmax(score) + 1
    print(f"Best performance out of {n_features} features: {score}")
    return score
def runML1(Data_cv, y, dt_out=None, dt_in=None, n_ds=10, cv_nfolds=10, cv_iters=5, CLF=None, DS_top=None, save=False, Col_ds=None, Col_lr=None):
    """Cross-validate all classifiers, then refit them on the full data set.

    Data_cv: Data for cross validation
    y      : Column of Data_cv with classification attribute
    n_ds   : number of top descriptors to use
    dt_in, dt_out : stored verbatim in the result records.
    cv_nfolds, cv_iters : folds per pass and number of repeated passes of
        ``runXVal``.
    CLF    : accepted for interface compatibility; NOTE(review): the value is
        ignored, ``getClassifiers()`` is always used.
    DS_top : optional per-descriptor selection counter with a column ``'N'``;
        created zero-filled when ``None``.  It is handed to ``runXVal`` and
        sorted in place afterwards.
    save   : unused.
    Col_ds : optional collection; when given, the data-set record is stored
        with ``Col_ds.save``.
    Col_lr : optional collection; when given, one record per classifier is
        stored with ``Col_lr.insert``.

    Nothing is returned; results are only persisted through the collections.
    """
    # Data For CV
    X1, Y1 = Data_cv.drop(y, axis=1), Data_cv[y]
    Neg = Data_cv.index[Data_cv[y] == 0]
    Pos = Data_cv.index[Data_cv[y] == 1]
    n_n = len(Neg)
    n_p = len(Pos)

    # `not DataFrame` raises "truth value is ambiguous", so test for None.
    if DS_top is None:
        DS_top = pd.DataFrame(np.zeros(X1.shape[1]), index=X1.columns,
                              columns=['N'])

    # The dataset
    R_ds = dict(pred=y, dt_in=dt_in, dt_out=dt_out, n_ds=n_ds,
                n_obs=X1.shape[0], n_neg=n_n, n_pos=n_p,
                cvt=dict(pos=list(Pos), neg=list(Neg)))
    if Col_ds is not None:
        Col_ds.save(R_ds)

    # Iterate through cross-validation
    Perf = []
    for _ in range(cv_iters):
        Perf += runXVal(X1, Y1, n_ds=n_ds, cv_nfolds=cv_nfolds, DS_top=DS_top)

    P1 = summarizePerf(Perf)
    P1['n_iter'] = cv_iters
    P1a = {i['lr']: i for i in P1.to_dict('records')}

    # Store the full classifiers
    CLF = getClassifiers()
    # Descriptors for the final models are the first n_ds rows of the counter
    # after sorting by 'N'.  (A SelectKBest fit on the full data used to be
    # computed here as well, but its result was overwritten before being
    # used, so it has been dropped.)
    # NOTE(review): DataFrame.sort / .ix are legacy pandas APIs.
    DS_top.sort('N', ascending=False, inplace=True)
    DS = DS_top.ix[:n_ds].index

    LR_db = {}
    for Nm, Clf in CLF.items():
        Clf.fit(X1[DS], Y1)
        try:
            Y = Clf.predict(X1[DS])
        except Exception:
            # Best effort: report and continue with the next classifier.
            print(" >> Training evaluation failed %s" % Nm)
        else:
            R = dict(pred=y, dt_in=dt_in, dt_out=dt_out, n_ds=n_ds, lr=Nm,
                     n_pos=n_p, n_neg=n_n, n_obs=X1.shape[0])
            P = dict(pt='all_obs')
            P.update(evalPred(Y1, Y))
            P['ds'] = list(DS)
            R['perf_trn'] = P
            R['perf_cvt'] = P1a[Nm]
            #R['clf_pkl']=pickle.dumps(Clf)
            #R['qmrf'] = 'TBD'
            #R['ds_id'] = ds_id,
            # Col_lr defaults to None; only persist when a collection is given.
            if Col_lr is not None:
                LR_db[Nm] = Col_lr.insert(R)
""" ### INITIAL TUNING OF PARAMETERS, including selection of features. optimum k found to be 8. ### set params for gridsearch params = dict(feat_select__k = range(4, len(features_list)-1), svm__gamma=[0.1, 0.5, 1], svm__C=[1,10,100], svm__kernel=['rbf', 'poly', 'sigmoid']) estimator = grid_search.GridSearchCV(pipe, param_grid=params, scoring = 'f1', cv=cv) estimator.fit(features_train, labels_train) print estimator.best_params_ #pprint.pprint(estimator.grid_scores_) """ ### identify best k features (k=8) used in POI identifier and save to updated_features list feat_select.set_params(k=13) feat_select.fit(features_train, labels_train) array1 = feat_select.get_support() array2 = feat_select.scores_ print "Scores for features selected using SelectKBest: " updated_features = ['poi'] for i in range(0,len(array1)): if array1[i]==True: print features_list[i+1], array2[i] updated_features.append(features_list[i+1]) ### store updated_features in features_list for export features_list = updated_features print "Features used in classifier: ", features_list
class SelectKBest(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using selection of k best features according to used score function.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select K Best'

    def __init__(self, **kwargs):
        r"""Initialize SelectKBest feature selection algorithm.

        Notes:
            _params['k'] is initialized to None as it is included in the optimization process later since we cannot determine a proper value range until length of the feature vector becomes known.
        """
        self._params = dict(
            score_func=ParameterDefinition([chi2, f_classif, mutual_info_classif]),
            k=None,
        )
        self.__k = None
        self.__select_k_best = SelectKB()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.

        Arguments:
            **kwargs: Forwarded unchanged to the wrapped selector's ``set_params``.
        """
        self.__select_k_best.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        On the first call the number of columns of ``x`` becomes the upper
        bound for ``k``: the ``k`` parameter definition is created and a random
        ``k`` in that range is applied to the wrapped selector.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        if self.__k is None:
            self.__k = x.shape[1]
            # ``np.int`` was only an alias of the builtin ``int`` and has been
            # removed from NumPy, so the builtin is used directly.
            self._params['k'] = ParameterDefinition(MinMax(1, self.__k), int)
            val = int(np.around(np.random.uniform(1, self.__k)))
            self.__select_k_best.set_params(k=val)

        self.__select_k_best.fit(x, y)
        return self.__select_k_best.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__select_k_best.get_params()))