class Stacker:
    '''Stacking ensemble: base models feed their predictions to a meta model.

    Base models are fitted on the train/validation splits produced by
    ``helper.split_train_validation_data``. Their validation predictions,
    placed side by side, form the training matrix of the meta model.
    '''

    def __init__(self,
                 meta_model,
                 base_models,
                 num_splits,
                 feature_builder,
                 meta_model_params='',
                 base_model_params=''):
        '''Initializes a meta stacking model:
        -----------
        meta_model: str
            Name of the meta model to be used. Follows a strict convention
            of TYPE-NAME where TYPE must be either "c" for classification or
            "r" for regression and NAME represents either xgb for XGBoost,
            rf for randomforest or dt for decisiontrees
        base_models: sequence of str
            Names of the base models. The prediction method used for a base
            model is chosen from its ``name`` (first character "c" and the
            substring "xgb"), so the names are expected to follow the same
            TYPE-NAME convention.
        num_splits: int
            The amount of splits that have to be created to create the meta
            training data.
        feature_builder: FeatureBuilder
            Instance of FeatureBuilder class already initialized with all
            feature functions.
        meta_model_params: str
            Path to the params file of the meta model
        base_model_params: str or sequence of str
            Either a single params path that is handed to every base model,
            or one entry per base model, paired positionally with
            ``base_models``.

        Returns:
        --------
        -
        '''
        self.meta_model = BaseModel(meta_model, meta_model_params)

        base_models = list(base_models)
        if isinstance(base_model_params, str):
            # zip() over a string would pair every model with one
            # *character* of the path (and the default '' would produce no
            # base models at all), so a single string is shared instead.
            base_model_params = [base_model_params] * len(base_models)
        self.base_models = [
            BaseModel(model, params)
            for model, params in zip(base_models, base_model_params)
        ]
        self.num_splits = num_splits
        self.feature_builder = feature_builder

    @staticmethod
    def _predict_base(model, X):
        '''Return the predictions of one base model as a 2-D array.

        Classifiers that are not xgb models are queried through
        ``predict_proba``; every other model is queried through ``predict``.
        '''
        is_xgb = 'xgb' in model.name
        is_classifier = model.name.startswith('c')
        if is_classifier and not is_xgb:
            predictions = model.predict_proba(X)
        else:
            predictions = model.predict(X)
        predictions = np.asarray(predictions)
        if predictions.ndim == 1:
            # One column per model keeps the later vstack/hstack aligned
            # row-by-row with the input samples.
            predictions = predictions.reshape(-1, 1)
        return predictions

    def generate_base_model_predictions(self, X, y, df=None):
        '''Split the training data and create predictions for each model to
        create the complete meta training data.
        -----------
        X: numpy.array
            Data matrix
        y: labels
        df: pandas.DataFrame
            Raw DataFrame to be used if historical features need to be
            created. Historical features are only built when this is a
            DataFrame.

        Returns:
        --------
        Meta training data that contains all base model predictions for
        each training instance
        '''
        use_history = isinstance(df, pd.DataFrame)
        model_predictions = []
        for model in self.base_models:
            split_predictions = []
            splits = helper.split_train_validation_data(
                X, y, self.num_splits)
            for X_train, y_train, X_valid, y_valid, index1, index2 in splits:
                if use_history:
                    # index1:index2 is used as the row range of the
                    # validation fold inside df.
                    # NOTE(review): the previous code also built the
                    # training-fold frame (df without rows index1:index2)
                    # but never used it; the validation slice is what gets
                    # passed as history. Confirm this is intended.
                    historical_df = df[index1:index2]
                    historical_features = (
                        self.feature_builder.create_historical_features(
                            df, historical_df))
                    train_historical_features = np.concatenate(
                        [historical_features[:index1],
                         historical_features[index2:]],
                        axis=0)
                    validation_historical_features = historical_features[
                        index1:index2]
                    X_train = np.hstack([X_train, train_historical_features])
                    X_valid = np.hstack(
                        [X_valid, validation_historical_features])
                model.fit(X_train, y_train)
                split_predictions.append(self._predict_base(model, X_valid))
            # Rows: validation folds in split order; columns: this model.
            model_predictions.append(np.vstack(split_predictions))
        return np.hstack(model_predictions)

    def generate_new_base_model_predictions(self, X, df, historical_df):
        '''Create the meta feature matrix for unseen data.
        -----------
        X: numpy.array
            Data matrix
        df: pandas.DataFrame or None
            Raw DataFrame matching X. When it is a DataFrame, historical
            features are built from it and appended to X; otherwise X is
            used unchanged.
        historical_df: pandas.DataFrame
            Raw DataFrame holding the historical knowledge. Only used when
            ``df`` is a DataFrame.

        Returns:
        --------
        Matrix that contains the predictions of all base models for each
        row of X, placed side by side.
        '''
        if isinstance(df, pd.DataFrame):
            # The features do not depend on the model, so build them once.
            historical_features = (
                self.feature_builder.create_historical_features(
                    df, historical_df))
            X_model = np.hstack([X, historical_features])
        else:
            # Without a raw DataFrame there is nothing to append.
            X_model = X
        return np.hstack([
            self._predict_base(model, X_model) for model in self.base_models
        ])

    def fit(self, X, y, df=None):
        '''Fit the meta model on the predictions made by all base models.
        -----------
        X: numpy.array
            Data matrix
        y: numpy.array
            labels
        df: pd.DataFrame
            Raw DataFrame to be used for creating the features that have to
            be created using historical knowledge.

        Returns:
        --------
        -
        '''
        X_train_meta = self.generate_base_model_predictions(X, y, df=df)
        self.meta_model.fit(X_train_meta, y)

    def predict(self, X, df=None, historical_df=None):
        '''Predict with the meta model on the predictions made by all base
        models.
        -----------
        X: numpy.array
            Data matrix
        df: pd.DataFrame
            Raw DataFrame to be used for creating the features that have to
            be created using historical knowledge.
        historical_df: pd.DataFrame
            Raw DataFrame containing the historical knowledge to create the
            predictions.

        Returns:
        --------
        The predictions of the meta model for X.
        '''
        X_meta = self.generate_new_base_model_predictions(
            X, df, historical_df)
        return self.meta_model.predict(X_meta)
def test_fixed_init_xgb(self):
    '''An 'xgb' model built from 'test_params_set' predicts one row per
    sample and one column per class.'''
    model = BaseModel('xgb', 'test_params_set')
    model.fit(X, y)

    row_count = model.predict(X).shape[0]
    self.assertEqual(row_count, n_samples)

    column_count = model.predict(X).shape[1]
    self.assertEqual(column_count, num_classes)
def test_fixed_init_rf(self):
    '''A 'c-rf' model built from 'test_params_rf_set' yields probabilities
    with one row per sample and one column per class.'''
    model = BaseModel('c-rf', 'test_params_rf_set')
    model.fit(X, y)

    row_count = model.predict_proba(X).shape[0]
    self.assertEqual(row_count, n_samples)

    column_count = model.predict_proba(X).shape[1]
    self.assertEqual(column_count, num_classes)
def fit(self):
    '''Run the base-class fit step, then fit the wrapped model.

    Calls ``BaseModel.fit(self)`` first and afterwards fits ``self.model``
    on ``self.X_train`` / ``self.y_train``.
    NOTE(review): presumably ``BaseModel.fit`` is what populates
    ``self.X_train`` and ``self.y_train`` - confirm against BaseModel.
    '''
    # First fit individual batches to create train matrix
    BaseModel.fit(self)
    # Finally, fit using model
    # Call form of print: prints the same text on Python 2 and is valid
    # syntax on Python 3.
    print("Calling GradientBoostingClassifier.fit ")
    self.model.fit(self.X_train, self.y_train)