def plot_transfer_graph_fitted(path, name, analysis_folder):
    """Plot a ridge-regularised polynomial fit of each run's estimates.

    Loads the pickled results object stored at
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``,
    splits ``results.estimates`` into 6 consecutive runs of equal length
    and draws one fitted curve per run on a newly created figure.

    Parameters
    ----------
    path : str
        Root directory containing the ``0_results`` folder.
    name : str
        Analysis name; used as sub-folder and as file-name prefix.
    analysis_folder : str
        Sub-folder of ``0_results`` that holds the analysis.

    Returns
    -------
    None
        The figure is created and drawn on, but neither shown nor saved
        by this function.
    """
    from sklearn.linear_model import Ridge

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')
    # NOTE: unpickling can execute arbitrary code; only load result files
    # that were produced by a trusted analysis run.
    # Pickles are binary data, so the file is opened in 'rb' mode and is
    # closed as soon as loading is done.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 6
    values = results.estimates
    # Integer division: the result is used as a slice bound. Samples left
    # over when len(values) is not a multiple of `runs` are ignored.
    run_length = len(values) // runs

    ridge = Ridge()
    figure = plt.figure()
    axes = figure.add_subplot(111)

    for i in range(runs):
        v = values[i * run_length:(i + 1) * run_length]
        yy = v.copy()
        xx = np.linspace(0, len(v), len(v))

        # The estimator has to be fitted before predict() can be used.
        # Start with a degree-11 polynomial basis and fall back to
        # degree 8 if the solver fails on the larger system.
        design = np.vander(xx, 12)
        try:
            ridge.fit(design, yy)
        except np.linalg.LinAlgError:
            design = np.vander(xx, 9)
            ridge.fit(design, yy)

        axes.plot(ridge.predict(design))
def plot_transfer_graph_fitted(path, name, analysis_folder):
    """Draw one smoothed (ridge polynomial) curve per run of estimates.

    The pickled results object is read from
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``.
    Its ``estimates`` attribute is cut into 6 consecutive, equally long
    runs; each run is fitted with a ridge regression on a Vandermonde
    (polynomial) basis and the fitted values are plotted on a new figure.

    Parameters
    ----------
    path : str
        Root directory containing the ``0_results`` folder.
    name : str
        Analysis name; used as sub-folder and as file-name prefix.
    analysis_folder : str
        Sub-folder of ``0_results`` that holds the analysis.

    Returns
    -------
    None
        Nothing is returned; the figure is not shown or saved here.
    """
    from sklearn.linear_model import Ridge

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')
    # NOTE: pickle.load runs arbitrary code embedded in the file; only
    # use this on trusted result files.
    # Binary mode is required for pickles; the context manager makes sure
    # the handle is released.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 6
    values = results.estimates
    # Floor division keeps the slice bounds integral; any remainder at
    # the end of `values` is not plotted.
    run_length = len(values) // runs

    ridge = Ridge()
    f = plt.figure()
    a = f.add_subplot(111)

    for i in range(runs):
        start, stop = i * run_length, (i + 1) * run_length
        v = values[start:stop]
        yy = v.copy()
        xx = np.linspace(0, len(v), len(v))

        # Fit first (predict() needs a fitted model); retry with a
        # smaller polynomial basis if the linear solver gives up.
        basis = np.vander(xx, 12)
        try:
            ridge.fit(basis, yy)
        except np.linalg.LinAlgError:
            basis = np.vander(xx, 9)
            ridge.fit(basis, yy)

        y_fit = ridge.predict(basis)
        a.plot(y_fit)
class Regressor():
    """Wraps scikitlearn regressors.

    Parameters
    ----------
    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost"
        or "Linear".

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        """Build the wrapped regressor and apply the given parameters."""
        if "strategy" in params:
            self.__strategy = params["strategy"]
        elif lgbm_installed:
            self.__strategy = "LightGBM"
        else:
            self.__strategy = "XGBoost"

        self.__regress_params = {}
        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        """Return the strategy plus every parameter set through set_params.

        ``deep`` is accepted for scikit-learn API compatibility and is
        not used.
        """
        params = {"strategy": self.__strategy}
        params.update(self.__regress_params)
        return params

    def set_params(self, **params):
        """Set the strategy and/or parameters of the wrapped regressor.

        Calling this method marks the Regressor as not fitted. Unknown
        parameter names are ignored with a warning.
        """
        self.__fitOK = False

        if 'strategy' in params:
            self.__set_regressor(params['strategy'])

            # Carry the previously stored parameters over to the freshly
            # created regressor.
            # NOTE(review): get_params() always contains every key of
            # __regress_params, so the warning branch below can never
            # trigger; the check was probably meant to use
            # self.__regressor.get_params(). Behaviour kept as is.
            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn(
                        "Invalid parameter for regressor "
                        + str(self.__strategy)
                        + ". Parameter IGNORED. Check the list of "
                        "available parameters with "
                        "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if k == "strategy":
                continue
            if k not in self.__regressor.get_params().keys():
                warnings.warn(
                    "Invalid parameter for regressor "
                    + str(self.__strategy)
                    + ". Parameter IGNORED. Check the list of "
                    "available parameters with "
                    "`regressor.get_params().keys()`")
            else:
                setattr(self.__regressor, k, v)
                self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        """Instantiate the regressor matching ``strategy``.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported names.
        """
        self.__strategy = strategy

        if strategy == 'RandomForest':
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'XGBoost':
            self.__regressor = XGBRegressor(
                n_estimators=500, max_depth=6, learning_rate=0.05,
                colsample_bytree=0.8, colsample_bylevel=1.,
                subsample=0.9, nthread=-1, seed=0)

        elif strategy == "LightGBM":
            if lgbm_installed:
                self.__regressor = LGBMRegressor(
                    n_estimators=500, learning_rate=0.05,
                    colsample_bytree=0.8, subsample=0.9,
                    nthread=-1, seed=0)
            else:
                # Fall back to XGBoost when lightgbm is unavailable.
                warnings.warn("Package lightgbm is not installed. Model "
                              "LightGBM will be replaced by XGBoost")
                self.__strategy = "XGBoost"
                self.__regressor = XGBRegressor(
                    n_estimators=500, max_depth=6, learning_rate=0.05,
                    colsample_bytree=0.8, colsample_bylevel=1.,
                    subsample=0.9, nthread=-1, seed=0)

        elif strategy == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'Tree':
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif strategy == "Bagging":
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False,
                bootstrap_features=False, n_jobs=-1, random_state=0)

        elif strategy == "AdaBoost":
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400,
                learning_rate=.05, random_state=0)

        elif strategy == "Linear":
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False,
                copy_X=True, max_iter=None, tol=0.001, solver='auto',
                random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``df_train`` is not a DataFrame or ``y_train`` is not a
            Series.
        """
        # sanity checks
        # isinstance() is used instead of comparing against
        # pd.SparseDataFrame: that class no longer exists in pandas >= 1.0
        # and, where it does exist, it is a DataFrame subclass anyway.
        if not isinstance(df_train, pd.DataFrame):
            raise ValueError("df_train must be a DataFrame")
        if not isinstance(y_train, pd.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """Computes feature importances. Regressor must be fitted before.

        Returns
        -------
        dict
            Maps each training column to its importance: absolute
            coefficients for "Linear", ``feature_importances_`` for the
            tree based strategies, a weighted average over the base
            estimators for "AdaBoost" and, for "Bagging", the mean of the
            non-zero importances over the base estimators that used the
            feature. Empty for any other strategy.

        Raises
        ------
        ValueError
            If the Regressor has not been fitted.
        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        strategy = self.get_params()["strategy"]
        estimator = self.get_estimator()
        importance = {}

        if strategy in ["Linear"]:
            f = np.abs(estimator.coef_)
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["LightGBM", "XGBoost", "RandomForest",
                          "ExtraTrees", "Tree"]:
            f = estimator.feature_importances_
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["AdaBoost"]:
            weights = estimator.estimator_weights_
            norm = weights.sum()
            try:
                # XGB, RF, ET, Tree and AdaBoost
                f = sum(w * est.feature_importances_
                        for w, est in zip(weights,
                                          estimator.estimators_)) / norm
            except AttributeError:
                # Linear base estimators only expose coef_
                f = sum(w * np.abs(est.coef_)
                        for w, est in zip(weights,
                                          estimator.estimators_)) / norm
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["Bagging"]:
            importance_bag = []
            for i, b in enumerate(estimator.estimators_):
                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    f = b.feature_importances_
                except AttributeError:
                    # Linear base estimators only expose coef_
                    f = np.abs(b.coef_)

                # Each base estimator saw a subset of the columns;
                # estimators_features_[i] gives their positions.
                d = {}
                for j, c in enumerate(estimator.estimators_features_[i]):
                    d[self.__col[c]] = f[j]
                importance_bag.append(d)

            for col in self.__col:
                # A real list is required: on Python 3 filter() returns a
                # lazy iterator that np.mean cannot reduce.
                non_zero = [d[col] for d in importance_bag
                            if col in d and d[col] != 0]
                importance[col] = np.mean(non_zero)

        return importance

    def predict(self, df):
        """Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        y : array of shape = (n, )
            The target to be predicted.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``predict`` attribute.
        ValueError
            If ``predict`` is not callable, the Regressor is not fitted
            or ``df`` is not a DataFrame.
        """
        if not callable(getattr(self.__regressor, "predict")):
            raise ValueError("predict attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.predict(df.values)

    def transform(self, df):
        """Transforms df with the ``transform`` method of the regressor.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        df_transform
            Whatever the wrapped regressor's ``transform`` returns for
            ``df.values``.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If ``transform`` is not callable, the Regressor is not fitted
            or ``df`` is not a DataFrame.
        """
        if not callable(getattr(self.__regressor, "transform")):
            raise ValueError("transform attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.transform(df.values)

    def score(self, df, y, sample_weight=None):
        """Returns the score computed by the wrapped regressor.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target for regression tasks.

        sample_weight : optional
            Forwarded unchanged to the regressor's ``score`` method.

        Returns
        -------
        score : float
            Result of the regressor's ``score`` method (R^2 for
            scikit-learn regressors).

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``score`` attribute.
        ValueError
            If ``score`` is not callable, the Regressor is not fitted,
            ``df`` is not a DataFrame or ``y`` is not a Series.
        """
        if not callable(getattr(self.__regressor, "score")):
            raise ValueError("score attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")
        if not isinstance(y, pd.Series):
            raise ValueError("y must be a Series")

        return self.__regressor.score(df.values, y, sample_weight)

    def get_estimator(self):
        """Return a (shallow) copy of the wrapped regressor."""
        return copy(self.__regressor)
class Regressor():
    """Wrap scikitlearn regressors.

    Parameters
    ----------
    strategy : str, default = "LightGBM"
        The choice for the regressor.
        Available strategies = {"LightGBM", "RandomForest", "ExtraTrees",
        "Tree", "Bagging", "AdaBoost" or "Linear"}

    **params : default = None
        Parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        """Init Regressor object where user can pass a strategy."""
        self.__strategy = params.get("strategy", "LightGBM")

        self.__regress_params = {}
        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        """Get parameters of Regressor object.

        Returns the strategy plus every parameter that was set through
        ``set_params``. ``deep`` is accepted for scikit-learn API
        compatibility and is not used.
        """
        params = {"strategy": self.__strategy}
        params.update(self.__regress_params)
        return params

    def set_params(self, **params):
        """Set parameters of Regressor object.

        Calling this method marks the Regressor as not fitted. Unknown
        parameter names are ignored with a warning.
        """
        self.__fitOK = False

        if 'strategy' in params:
            self.__set_regressor(params['strategy'])

            # Carry the previously stored parameters over to the freshly
            # created regressor.
            # NOTE(review): get_params() always contains every key of
            # __regress_params, so the warning branch below can never
            # trigger; the check was probably meant to use
            # self.__regressor.get_params(). Behaviour kept as is.
            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor "
                                  + str(self.__strategy)
                                  + ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if k == "strategy":
                continue
            if k not in self.__regressor.get_params().keys():
                warnings.warn("Invalid parameter for regressor "
                              + str(self.__strategy)
                              + ". Parameter IGNORED. Check the list of "
                              "available parameters with "
                              "`regressor.get_params().keys()`")
            else:
                setattr(self.__regressor, k, v)
                self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        """Set strategy of a regressor object.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported names.
        """
        self.__strategy = strategy

        if strategy == 'RandomForest':
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == "LightGBM":
            self.__regressor = LGBMRegressor(
                n_estimators=500, learning_rate=0.05,
                colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)

        elif strategy == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif strategy == 'Tree':
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif strategy == "Bagging":
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False,
                bootstrap_features=False, n_jobs=-1, random_state=0)

        elif strategy == "AdaBoost":
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400,
                learning_rate=.05, random_state=0)

        elif strategy == "Linear":
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False,
                copy_X=True, max_iter=None, tol=0.001, solver='auto',
                random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM'"
                ", 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        object
            self

        Raises
        ------
        ValueError
            If ``df_train`` is not a DataFrame or ``y_train`` is not a
            Series.
        """
        # sanity checks
        # isinstance() is used instead of comparing against
        # pd.SparseDataFrame: that class no longer exists in pandas >= 1.0
        # and, where it does exist, it is a DataFrame subclass anyway.
        if not isinstance(df_train, pd.DataFrame):
            raise ValueError("df_train must be a DataFrame")
        if not isinstance(y_train, pd.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """Computes feature importances.

        Regressor must be fitted before.

        Returns
        -------
        dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key): absolute coefficients for "Linear",
            ``feature_importances_`` for the tree based strategies, a
            weighted average over the base estimators for "AdaBoost" and,
            for "Bagging", the mean of the non-zero importances over the
            base estimators that used the feature. Empty for any other
            strategy.

        Raises
        ------
        ValueError
            If the Regressor has not been fitted.
        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        strategy = self.get_params()["strategy"]
        estimator = self.get_estimator()
        importance = {}

        if strategy in ["Linear"]:
            f = np.abs(estimator.coef_)
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["LightGBM", "RandomForest", "ExtraTrees", "Tree"]:
            f = estimator.feature_importances_
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["AdaBoost"]:
            weights = estimator.estimator_weights_
            norm = weights.sum()
            try:
                # LGB, RF, ET, Tree and AdaBoost
                f = sum(w * est.feature_importances_
                        for w, est in zip(weights,
                                          estimator.estimators_)) / norm
            except AttributeError:
                # Linear base estimators only expose coef_
                f = sum(w * np.abs(est.coef_)
                        for w, est in zip(weights,
                                          estimator.estimators_)) / norm
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["Bagging"]:
            importance_bag = []
            for i, b in enumerate(estimator.estimators_):
                try:
                    # LGB, RF, ET, Tree and AdaBoost
                    f = b.feature_importances_
                except AttributeError:
                    # Linear base estimators only expose coef_
                    f = np.abs(b.coef_)

                # Each base estimator saw a subset of the columns;
                # estimators_features_[i] gives their positions.
                d = {}
                for j, c in enumerate(estimator.estimators_features_[i]):
                    d[self.__col[c]] = f[j]
                importance_bag.append(d)

            for col in self.__col:
                non_zero = [k[col] for k in importance_bag
                            if col in k and k[col] != 0]
                importance[col] = np.mean(non_zero)

        return importance

    def predict(self, df):
        """Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        array of shape = (n, )
            The target to be predicted.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``predict`` attribute.
        ValueError
            If ``predict`` is not callable, the Regressor is not fitted
            or ``df`` is not a DataFrame.
        """
        if not callable(getattr(self.__regressor, "predict")):
            raise ValueError("predict attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.predict(df.values)

    def transform(self, df):
        """Transform dataframe df with the regressor's ``transform``.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        object
            Whatever the wrapped regressor's ``transform`` returns for
            ``df.values``.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If ``transform`` is not callable, the Regressor is not fitted
            or ``df`` is not a DataFrame.
        """
        if not callable(getattr(self.__regressor, "transform")):
            raise ValueError("transform attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.transform(df.values)

    def score(self, df, y, sample_weight=None):
        """Return the score computed by the wrapped regressor.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The target for regression tasks.

        sample_weight : optional
            Forwarded unchanged to the regressor's ``score`` method.

        Returns
        -------
        float
            Result of the regressor's ``score`` method (R^2 for
            scikit-learn regressors).

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``score`` attribute.
        ValueError
            If ``score`` is not callable, the Regressor is not fitted,
            ``df`` is not a DataFrame or ``y`` is not a Series.
        """
        if not callable(getattr(self.__regressor, "score")):
            raise ValueError("score attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")
        if not isinstance(y, pd.Series):
            raise ValueError("y must be a Series")

        return self.__regressor.score(df.values, y, sample_weight)

    def get_estimator(self):
        """Return a (shallow) copy of the wrapped regressor."""
        return copy(self.__regressor)
def plot_transfer_graph_prob_fitted(path, name, analysis_folder):
    """Plot smoothed per-class prediction probabilities for each run.

    Loads the pickled results object stored at
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``
    and splits its ``probabilities`` into 12 consecutive runs of equal
    length. For every run and every predicted class, the probability of
    the predicted class (zeroed where another class was predicted) is
    smoothed twice: with a ridge regression on a polynomial basis (drawn
    on the first figure) and with a univariate spline (drawn on the
    second figure). The first 6 runs go to the left ('_pre') column of
    the 3x2 subplot grid, the remaining ones to the right ('_post')
    column.

    Parameters
    ----------
    path : str
        Root directory containing the ``0_results`` folder.
    name : str
        Analysis name; used as sub-folder and as file-name prefix.
    analysis_folder : str
        Sub-folder of ``0_results`` that holds the analysis.

    Returns
    -------
    None
        The two figures are created and drawn on, but neither shown nor
        saved by this function.
    """
    from sklearn.linear_model import Ridge
    from scipy.interpolate import UnivariateSpline

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')
    # NOTE: unpickling can execute arbitrary code; only load result files
    # that were produced by a trusted analysis run.
    # Pickles are binary data, so the file is opened in 'rb' mode and is
    # closed as soon as loading is done.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 12
    probabilities = results.probabilities

    # Each entry p is used as (predicted class index, per-class values):
    # p[1][p[0]] is the value attached to the predicted class.
    prob = np.array([p[1][p[0]] for p in probabilities])
    pred = np.array([p[0] for p in probabilities])
    lab = np.unique(results.predictions)

    # Integer division: the result is used as a slice bound.
    run_length = len(prob) // runs

    ridge = Ridge()

    f = plt.figure(figsize=(11, 8))
    f2 = plt.figure(figsize=(11, 8))

    # Smoothed / original curves collected per class label. They are not
    # returned; kept so the bookkeeping matches the plotted curves.
    data_sm = dict()
    data_or = dict()
    for c in np.unique(lab):
        data_sm[c] = []
        data_or[c] = []

    for i in range(runs):
        # First half of the runs -> column 1, second half -> column 2.
        if i < runs // 2:
            aggregate = 1
            suffix = '_pre'
        else:
            aggregate = 2
            suffix = '_post'

        run = slice(i * run_length, (i + 1) * run_length)

        for c in np.unique(pred):
            a = f.add_subplot(3, 2, (c * 2) + aggregate)
            a2 = f2.add_subplot(3, 2, (c * 2) + aggregate)
            a.set_title(lab[c] + suffix)

            # Keep the probability only where class c was predicted.
            v = prob[run] * (pred[run] == c)
            v[-1] = 0
            yy = v.copy()

            xx = np.linspace(0, len(v), len(v))

            s = UnivariateSpline(xx, yy, s=5)
            ys = s(xx)

            # The estimator has to be fitted before predict() can be
            # used. If the solver fails, retry with the alternative
            # (order 9) polynomial basis.
            design = np.vander(xx, 7)
            try:
                ridge.fit(design, yy)
            except np.linalg.LinAlgError:
                design = np.vander(xx, 9)
                ridge.fit(design, yy)
            y_fit = ridge.predict(design)

            data_sm[lab[c]].append(ys)
            data_or[lab[c]].append(v)

            a.plot(y_fit)
            a2.plot(ys)

            a.set_ybound(upper=1.1, lower=-0.1)
            a2.set_ybound(upper=1.1, lower=-0.1)
class Regressor(object):
    """Wraps scikit regressors"""

    def __init__(self, modelname='Linear', num_bagged_est=None,
                 random_state=None, **kwargs):
        """Construct a regressor

        Parameters
        ----------
        modelname : str, model name to be used as regressor
            Available models:
            - "XGBoost",
            - "LightGBM",
            - "Keras",
            - "RandomForest",
            - "ExtraTrees",
            - "Tree",
            - "Bagging",
            - "AdaBoost"
            - "Linear"

        num_bagged_est: int or None
            Number of estimators to be averaged after bagged fitting.
            If None then bagged fitting is not performed.

        random_state: int or None, optional, default=None
            If int, random_state is the base seed: it is passed to the
            model and, in bagged fitting, incremented for each bagged
            estimator. If None, no seed is fixed.

        **kwargs : default = None
            Parameters of the corresponding regressor.
            Examples : n_estimators, max_depth, ...

        Raises
        ------
        ValueError
            If scikit-learn or the package backing ``modelname`` is not
            installed, if ``modelname`` is unknown, or if
            ``num_bagged_est`` / ``random_state`` is neither None nor an
            int.
        """
        if not _IS_SKLEARN_INSTALLED:
            raise ValueError('Scikit-learn is required for this module')

        self.__modelname = modelname
        self.__check_backend_installed(self.__modelname)

        self.__regressor = None
        self.__set_regressor(self.__modelname)
        self.set_params(**kwargs)

        self.__num_bagged_est = num_bagged_est
        if (type(self.__num_bagged_est) != int
                and self.__num_bagged_est is not None):
            raise ValueError(
                "num_bagged_est must be either None or an integer.")

        self.__random_state = random_state
        if (type(self.__random_state) != int
                and self.__random_state is not None):
            raise ValueError(
                "random_state must be either None or an integer.")
        self.set_params(random_state=self.__random_state)

        self.__fitOK = False
        self.__bagged_est = None

    @staticmethod
    def __check_backend_installed(modelname):
        """Raise ValueError if the optional package for modelname is missing."""
        if modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
            raise ValueError('Package XGBoost is not installed.')
        elif modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
            raise ValueError('Package LightGBM is not installed.')
        elif modelname == "Keras" and not _IS_KERAS_INSTALLED:
            raise ValueError('Package Keras is not installed.')

    def get_params(self, deep=True):
        """Return wrapper settings merged with the regressor's parameters.

        ``deep`` is accepted for scikit-learn API compatibility and is
        not used.
        """
        params = {"modelname": self.__modelname,
                  "num_bagged_est": self.__num_bagged_est,
                  "random_state": self.__random_state}
        params.update(self.__regressor.get_params())
        return params

    def set_params(self, **params):
        """Set wrapper settings and parameters of the wrapped regressor.

        ``modelname``, ``num_bagged_est`` and ``random_state`` are
        handled by the wrapper; everything else is forwarded to the
        regressor's ``set_params``. Calling this method marks the
        Regressor as not fitted and discards bagged estimators.

        Raises
        ------
        ValueError
            If a required package is missing, the model name is unknown,
            or ``num_bagged_est`` / ``random_state`` is neither None nor
            an int.
        """
        self.__fitOK = False
        self.__bagged_est = None

        if 'modelname' in params:
            modelname = params.pop('modelname')
            # Check availability before trying to build the model.
            self.__check_backend_installed(modelname)
            self.__set_regressor(modelname)

        if 'num_bagged_est' in params:
            num_bagged_est = params.pop('num_bagged_est')
            if type(num_bagged_est) != int and num_bagged_est is not None:
                raise ValueError(
                    "num_bagged_est must be either None or an integer.")
            self.__num_bagged_est = num_bagged_est

        if 'random_state' in params:
            random_state = params['random_state']
            if type(random_state) != int and random_state is not None:
                raise ValueError(
                    "random_state must be either None or an integer.")
            self.__random_state = random_state
            # Only forward the seed to models that actually accept it.
            if 'random_state' not in self.__regressor.get_params().keys():
                del params['random_state']

        if 'build_fn' in params and self.get_estimator_name == 'Keras':
            setattr(self.__regressor, 'build_fn', params.pop('build_fn'))

        self.__regressor.set_params(**params)

    def __set_regressor(self, modelname):
        """Instantiate the regressor matching ``modelname``.

        Raises
        ------
        ValueError
            If ``modelname`` is not one of the supported names.
        """
        self.__modelname = modelname

        if modelname == 'XGBoost':
            self.__regressor = XGBRegressor()
        elif modelname == "LightGBM":
            self.__regressor = LGBMRegressor()
        elif modelname == "Keras":
            self.__regressor = KerasRegressor(build_fn=Sequential())
        elif modelname == 'RandomForest':
            self.__regressor = RandomForestRegressor()
        elif modelname == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor()
        elif modelname == 'Tree':
            self.__regressor = DecisionTreeRegressor()
        elif modelname == "Bagging":
            self.__regressor = BaggingRegressor()
        elif modelname == "AdaBoost":
            self.__regressor = AdaBoostRegressor()
        elif modelname == "Linear":
            self.__regressor = Ridge()
        else:
            raise ValueError(
                "Model name invalid. Please choose between LightGBM " +
                "(if installed), XGBoost(if installed), Keras(if installed)," +
                "RandomForest, ExtraTrees, Tree, Bagging, AdaBoost or Linear")

    def fit(self, X, y, **kwargs):
        """Fit model.

        In case num_bagged_est is not None then additionally performing
        a type of bagging ensemble - ensemble from the same models, but
        with different seed values/reshuffled data which aims to
        decrease variance of the predictions. In that mode only the
        bagged copies are fitted; the wrapped regressor itself is left
        untouched.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.

        Returns
        -------
        object
            self
        """
        y = self.__process_target(y)

        if self.__num_bagged_est is None:
            self.__regressor.fit(X, y, **kwargs)
        else:
            if not hasattr(self.__regressor, 'random_state'):
                warnings.warn("The regressor " + str(self.__modelname) +
                              " has no random_state attribute and only random " +
                              " shuffling will be used.")

            self.__bagged_est = []
            for i in range(0, self.__num_bagged_est):
                # Without a base seed there is nothing to increment;
                # leave shuffling and model unseeded in that case.
                if self.__random_state is None:
                    seed = None
                else:
                    seed = self.__random_state + i

                X_shuff, y_shuff = shuffle(X, y, random_state=seed)

                # Every bagged member must be its own estimator object,
                # otherwise all list entries would be the same model.
                est = self.get_estimator_copy()
                if hasattr(est, 'random_state'):
                    est.set_params(random_state=seed)

                est.fit(X_shuff, y_shuff, **kwargs)
                self.__bagged_est.append(est)

        self.__fitOK = True

        return self

    def predict(self, X):
        """Predicts the target.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array of shape = [n_samples, ]
            The target to be predicted. With bagged fitting this is the
            sum of the bagged estimators' predictions divided by
            num_bagged_est.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``predict`` attribute.
        ValueError
            If ``predict`` is not callable or the Regressor is not
            fitted.
        """
        if not callable(getattr(self.__regressor, "predict")):
            raise ValueError("predict attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        if self.__num_bagged_est is None:
            return self.__regressor.predict(X)

        bagged_pred = np.zeros(X.shape[0])
        for est in self.__bagged_est:
            bagged_pred += est.predict(X) / self.__num_bagged_est
        return bagged_pred

    def transform(self, X):
        """Transforms X with the ``transform`` method of the regressor.

        NOTE(review): this always delegates to the wrapped regressor,
        which is not fitted when bagged fitting is used.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array-like or sparse matrix
            The transformed X, as returned by the wrapped regressor.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If ``transform`` is not callable or the Regressor is not
            fitted.
        """
        if not callable(getattr(self.__regressor, "transform")):
            raise ValueError("transform attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        return self.__regressor.transform(X)

    def score(self, X, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        Without bagging the wrapped regressor's own ``score`` is used.
        With bagged fitting R^2 is computed from the averaged
        predictions of ``self.predict``.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        sample_weight : array-like of shape = [n_samples, ] or None
            Optional per-sample weights.

        Returns
        -------
        float
            R^2 of self.predict(X) wrt. y.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``score`` attribute.
        ValueError
            If ``score`` is not callable or the Regressor is not fitted.
        """
        if not callable(getattr(self.__regressor, "score")):
            raise ValueError("score attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        if self.__num_bagged_est is None:
            return self.__regressor.score(X, y, sample_weight)

        # Bagged mode: the wrapped regressor itself is not fitted, so
        # R^2 = 1 - SS_res / SS_tot is evaluated on the bagged output.
        y_true = self.__process_target(y)
        residual = y_true - self.predict(X)
        if sample_weight is None:
            weights = np.ones(y_true.shape[0])
        else:
            weights = np.asarray(sample_weight, dtype='float')
        y_mean = np.average(y_true, weights=weights)
        ss_res = float(np.sum(weights * residual ** 2))
        ss_tot = float(np.sum(weights * (y_true - y_mean) ** 2))
        if ss_tot == 0.0:
            # Constant target: perfect if reproduced exactly, else 0.
            return 1.0 if ss_res == 0.0 else 0.0
        return 1.0 - ss_res / ss_tot

    def cross_val_predict(self, X, y, cv=None, scoring=None, **kwargs):
        """Performing cross validation hold out predictions for stacking.

        For every split a fresh copy of the wrapped regressor is fitted
        on the training part and used to predict the held-out part.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv. Must support
            indexing with an array of row indices.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy; passed
            to ``check_cv(cv, y, classifier=False)``.

        scoring : callable, default: None
            Currently unused; accepted for interface compatibility.

        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.

        Returns
        -------
        array of shape = [n_samples, ]
            The hold out predictions (0 for samples that never appear in
            a test split).
        """
        y = self.__process_target(y)
        y_pred = np.zeros(X.shape[0])

        cv = check_cv(cv, y, classifier=False)
        n_splits = cv.get_n_splits(X, y)

        print("Starting hold out prediction with {} splits.".format(n_splits))
        for train_index, cv_index in cv.split(X, y):
            # A copy keeps the wrapped regressor untouched by the folds.
            est = self.get_estimator_copy()
            est.fit(X[train_index], y[train_index], **kwargs)
            y_pred[cv_index] = est.predict(X[cv_index])

        return y_pred

    def cross_validate(self, X, y, cv=None, scoring=None, **kwargs):
        """Performing a cross validation method.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy. For
            LightGBM and XGBoost it is first normalised with
            ``check_cv(cv, y, classifier=False)``.

        scoring :
            For scikit learn models:
                string, callable, list/tuple, dict or None, default: None
                A single string or a callable to evaluate the
                predictions on the test set.
                None, the estimator's default scorer (if available) is
                used.
            For LightGBM:
                callable or None, optional (default=None)
                Customized evaluation function, passed as ``feval``.
            For XGBoost:
                callable or None, optional (default=None)
                Customized evaluation function, passed as ``feval``.

        **kwargs : default = None
            Additional arguments forwarded to ``lgb.cv`` / ``xgb.cv``.
            Not used for the other models.

        Returns
        -------
        object
            The result of ``lgb.cv``, ``xgb.cv`` or scikit-learn's
            ``cross_validate``, depending on the model.
        """
        y = self.__process_target(y)

        if self.get_estimator_name == 'LightGBM':
            params = self.__regressor.get_params()
            data = lgb.Dataset(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = lgb.cv(params, data, feval=scoring, folds=cv, **kwargs)
        elif self.get_estimator_name == 'XGBoost':
            params = self.__regressor.get_xgb_params()
            data = xgb.DMatrix(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = xgb.cv(params, data, feval=scoring, folds=cv, **kwargs)
        else:
            ret = cross_validate(self.__regressor, X, y, cv=cv,
                                 scoring=scoring)

        return ret

    def __process_target(self, y):
        """Convert the target to a float numpy array."""
        y = np.array(y, dtype='float')
        return y

    def get_estimator(self):
        """Return the wrapped regressor itself (not a copy)."""
        return self.__regressor

    def get_estimator_copy(self):
        """Return a copy of the wrapped regressor made with make_copy."""
        return make_copy(self.__regressor)

    @property
    def feature_importances_(self):
        """Feature importances of the fitted model.

        With bagged fitting the importances of the bagged estimators are
        averaged.

        Raises
        ------
        ValueError
            If the Regressor is not fitted or the model exposes no
            ``feature_importances_`` attribute.
        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        if self.__bagged_est:
            if all(hasattr(est, 'feature_importances_')
                   for est in self.__bagged_est):
                return np.mean([est.feature_importances_
                                for est in self.__bagged_est], axis=0)
        elif hasattr(self.__regressor, 'feature_importances_'):
            return self.__regressor.feature_importances_

        raise ValueError('The regressor ' + self.get_estimator_name +
                         ' does not have feature_importances_ attribute.')

    @property
    def get_estimator_name(self):
        """Name of the model currently wrapped."""
        return self.__modelname
def plot_transfer_graph_prob_fitted(path, name, analysis_folder):
    """Draw smoothed per-class probability curves, one per run.

    The pickled results object is read from
    ``<path>/0_results/<analysis_folder>/<name>/<name>_classifier.pyobj``.
    Its ``probabilities`` are cut into 12 consecutive, equally long
    runs. For each run and each predicted class, the probability of the
    predicted class (set to zero where another class was predicted) is
    smoothed with a ridge regression on a polynomial basis and with a
    univariate spline. Ridge curves go to the first figure, spline
    curves to the second; runs 0-5 are drawn in the left ('_pre')
    column of the 3x2 grid and runs 6-11 in the right ('_post') column.

    Parameters
    ----------
    path : str
        Root directory containing the ``0_results`` folder.
    name : str
        Analysis name; used as sub-folder and as file-name prefix.
    analysis_folder : str
        Sub-folder of ``0_results`` that holds the analysis.

    Returns
    -------
    None
        Nothing is returned; the figures are not shown or saved here.
    """
    from sklearn.linear_model import Ridge
    from scipy.interpolate import UnivariateSpline

    result_path = os.path.join(path, '0_results', analysis_folder, name,
                               name + '_classifier.pyobj')
    # NOTE: pickle.load runs arbitrary code embedded in the file; only
    # use this on trusted result files.
    # Binary mode is required for pickles; the context manager makes sure
    # the handle is released.
    with open(result_path, 'rb') as result_file:
        results = pickle.load(result_file)

    runs = 12
    half = runs // 2
    probabilities = results.probabilities

    # Every entry p is indexed as p[1][p[0]]: p[0] is treated as the
    # predicted class index and selects the matching value from p[1].
    pred = np.array([p[0] for p in probabilities])
    prob = np.array([p[1][p[0]] for p in probabilities])
    lab = np.unique(results.predictions)

    # Floor division keeps the slice bounds integral.
    run_length = len(prob) // runs

    ridge = Ridge()

    f = plt.figure(figsize=(11, 8))
    f2 = plt.figure(figsize=(11, 8))

    # Per-label lists of the spline-smoothed and the raw curves. Not
    # returned; they mirror what has been plotted.
    data_sm = {c: [] for c in np.unique(lab)}
    data_or = {c: [] for c in np.unique(lab)}

    for i in range(runs):
        column, suffix = (1, '_pre') if i < half else (2, '_post')
        lo, hi = i * run_length, (i + 1) * run_length

        for c in np.unique(pred):
            position = (c * 2) + column
            a = f.add_subplot(3, 2, position)
            a2 = f2.add_subplot(3, 2, position)
            a.set_title(lab[c] + suffix)

            # Probability of class c where it was predicted, 0 elsewhere.
            v = prob[lo:hi] * (pred[lo:hi] == c)
            v[-1] = 0
            yy = v.copy()
            xx = np.linspace(0, len(v), len(v))

            ys = UnivariateSpline(xx, yy, s=5)(xx)

            # Fit first (predict() needs a fitted model); on a solver
            # failure retry with the alternative order-9 basis.
            basis = np.vander(xx, 7)
            try:
                ridge.fit(basis, yy)
            except np.linalg.LinAlgError:
                basis = np.vander(xx, 9)
                ridge.fit(basis, yy)
            y_fit = ridge.predict(basis)

            data_sm[lab[c]].append(ys)
            data_or[lab[c]].append(v)

            a.plot(y_fit)
            a2.plot(ys)

            a.set_ybound(upper=1.1, lower=-0.1)
            a2.set_ybound(upper=1.1, lower=-0.1)