def predict(df: pd.DataFrame, model: Model, tail: int = None, samples: int = 1, **kwargs) -> pd.DataFrame:
    min_required_samples = model.features_and_labels.min_required_samples

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning("could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    columns, features, targets = extract(model.features_and_labels, df, extract_features, **kwargs)

    if samples > 1:
        print(f"draw {samples} samples")

    sampler = DataGenerator(DummySplitter(samples), features, None, targets, None).complete_samples()
    predictions = model.predict(sampler, **kwargs)
    y_hat = to_pandas(predictions, index=features.index, columns=columns)

    return _assemble_result_frame(targets, y_hat, None, None, None, features)
def predict(df: pd.DataFrame, model: Model, tail: int = None, samples: int = 1, **kwargs) -> pd.DataFrame:
    min_required_samples = model.features_and_labels.min_required_samples

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning("could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    columns, features, targets = extract(model.features_and_labels, df, extract_features, **kwargs)

    if samples > 1:
        print(f"draw {samples} samples")
        predictions = np.array([model.predict(features.ml.values) for _ in range(samples)]).swapaxes(0, 1)
    else:
        # assumption: the single-sample path was lost in the flattened source; a plain
        # forward pass keeps `predictions` defined when samples == 1
        predictions = model.predict(features.ml.values)

    y_hat = to_pandas(predictions, index=features.index, columns=columns)

    return assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        FEATURE_COLUMN_NAME: features
    })
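# A minimal, self-contained sketch of the tail arithmetic used by both predict variants
# above (assumption: features are engineered from `min_required_samples` lagged rows, so
# predicting the last `tail` rows needs tail + min_required_samples - 1 raw rows):
import pandas as pd

df_demo = pd.DataFrame({"price": range(10)})
tail, min_required_samples = 2, 3

window = df_demo[-(abs(tail) + (min_required_samples - 1)):]
print(len(window))  # 4 rows: enough history to build features for the last 2 predictions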
def __call__(self, *args, **kwargs):
    new_model = KerasModel(self.keras_model_provider,
                           self.features_and_labels,
                           self.summary_provider,
                           self.epochs,
                           deepcopy(self.callbacks),
                           **merge_kwargs(deepcopy(self.kwargs), kwargs))

    # copy weights before return
    new_model.set_weights(self.get_weights())
    return new_model
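# A minimal sketch of the clone-and-copy-weights pattern implemented by
# KerasModel.__call__ above; the class below is illustrative only, not the library's API:
from copy import deepcopy

class CloneableModel:
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.weights = [0.0]  # stands in for trained network weights

    def get_weights(self):
        return deepcopy(self.weights)

    def set_weights(self, weights):
        self.weights = weights

    def __call__(self, **kwargs):
        # merge old and new kwargs (new ones win), then carry the trained weights over
        new_model = CloneableModel(**{**self.kwargs, **kwargs})
        new_model.set_weights(self.get_weights())
        return new_model

retuned = CloneableModel(lr=0.01)(lr=0.001)  # reconfigured copy keeps the trained weights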
def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.DataFrame], Summary] = None, **kwargs) -> Summary:
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    (features, _), labels, targets, weights, gross_loss = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    sampler = DataGenerator(DummySplitter(1), features, labels, targets, None).complete_samples()
    predictions = model.predict(sampler, **kwargs)
    y_hat = to_pandas(predictions, index=features.index, columns=labels.columns)

    df_backtest = _assemble_result_frame(targets, y_hat, labels, gross_loss, weights, features)
    return (summary_provider or model.summary_provider)(df_backtest, model, **kwargs)
def __call__(self, *args, **kwargs):
    """
    Returns a copy of the model with a possibly different configuration (kwargs). This is
    useful for hyper parameter tuning or for MultiModels.

    :param args:
    :param kwargs: arguments which may be provided by hyperopt or by different targets
    :return: a reconfigured copy of this model
    """
    copy = deepcopy(self)
    copy.kwargs = merge_kwargs(copy.kwargs, kwargs)
    return copy
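# merge_kwargs itself is not part of this listing; everywhere above it is assumed to act
# as a left-to-right dict merge where later mappings win, so user-supplied kwargs override
# model.kwargs, which in turn override features_and_labels.kwargs. A minimal sketch:
def merge_kwargs_sketch(*mappings):
    merged = {}
    for m in mappings:
        merged.update(m or {})
    return merged

assert merge_kwargs_sketch({"a": 1, "b": 2}, {"b": 3}, {"c": 4}) == {"a": 1, "b": 3, "c": 4}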
def feature_selection(self,
                      features_and_labels: FeaturesAndLabels,
                      top_features: int = 5,
                      correlation_threshold: float = 0.5,
                      minimum_features: int = 1,
                      lags: Iterable[int] = range(100),
                      show_plots: bool = True,
                      figsize: Tuple[int, int] = (12, 10),
                      **kwargs):
    # extract pandas objects
    kwargs = merge_kwargs(features_and_labels.kwargs, kwargs)
    (features, _), label, _, _, _ = extract_feature_labels_weights(self.df, features_and_labels, **kwargs)

    # try to estimate good features
    return feature_selection(features, label, top_features, correlation_threshold,
                             minimum_features, lags, show_plots, figsize)
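# A self-contained sketch in the spirit of the correlation_threshold parameter above:
# drop features that correlate too strongly with an already kept feature (simplified;
# the library's actual selection heuristic may differ):
import pandas as pd

def prune_correlated(features: pd.DataFrame, threshold: float = 0.5, minimum: int = 1) -> list:
    corr = features.corr().abs()
    keep = list(features.columns)
    for col in list(features.columns):
        if len(keep) <= minimum:
            break
        others = [c for c in keep if c != col]
        if col in keep and others and corr.loc[col, others].max() > threshold:
            keep.remove(col)  # too redundant with a kept feature
    return keep

demo = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
print(prune_correlated(demo))  # ['b', 'c'] -- "a" is perfectly correlated with "b"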
def predict(self,
            model: MlModel,
            tail: int = None,
            samples: int = 1,
            forecast_provider: Callable[[Typing.PatchedDataFrame], Forecast] = None,
            **kwargs) -> Union[Typing.PatchedDataFrame, Forecast]:
    min_required_samples = model.features_and_labels.min_required_samples
    df = self.df

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning("could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    typemap_pred = {SubModelFeature: lambda df, model, **kwargs: model.predict(df, **kwargs), **self._type_mapping}
    frames: FeaturesWithTargets = model.features_and_labels(df, extract_features, type_map=typemap_pred, **kwargs)

    predictions = call_callable_dynamic_args(
        model.predict,
        features=frames.features, targets=frames.targets, latent=frames.latent,
        samples=samples, df=df, **kwargs)

    fc_provider = forecast_provider or model.forecast_provider
    res_df = assemble_result_frame(predictions, frames.targets, None, None, None, frames.features)

    return res_df if fc_provider is None else call_callable_dynamic_args(fc_provider, res_df, **kwargs)
def backtest(self,
             model: MlModel,
             summary_provider: Callable[[Typing.PatchedDataFrame], Summary] = None,
             tail: int = None,
             **kwargs) -> Summary:
    min_required_samples = model.features_and_labels.min_required_samples
    df = self.df

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning("could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    typemap_pred = {SubModelFeature: lambda df, model, **kwargs: model.predict(df, **kwargs), **self._type_mapping}
    frames: FeaturesWithLabels = model.features_and_labels(
        df, extract_feature_labels_weights, type_map=typemap_pred, **kwargs)

    predictions = model.predict(frames.features, **kwargs)
    df_backtest = assemble_result_frame(predictions, frames.targets, frames.labels,
                                        frames.gross_loss, frames.sample_weights, frames.features)

    return call_callable_dynamic_args(summary_provider or model.summary_provider, df_backtest, model, **kwargs)
def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.DataFrame], Summary] = Summary, **kwargs) -> Summary:
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    (features, _), labels, targets, _ = extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)
    y_hat = to_pandas(model.predict(features.ml.values), index=features.index, columns=labels.columns)

    df_backtest = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        LABEL_COLUMN_NAME: labels,
        FEATURE_COLUMN_NAME: features
    })

    return (summary_provider or model.summary_provider)(df_backtest)
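# Hypothetical usage of the backtest variants above (`prices` and `fitted_model` are
# assumed to exist). Note the subtlety of the default summary_provider=Summary in this
# last variant: the model's own summary_provider is only used when None is passed
# explicitly, because (summary_provider or model.summary_provider) short-circuits:
#
#     summary = backtest(prices, fitted_model)        # uses the Summary default
#     summary = backtest(prices, fitted_model, None)  # falls back to fitted_model.summary_provider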
def fetch_yahoo(*args: str, period: str = 'max', multi_index: bool = False, **kwargs: str):
    df = None

    if len(args) == 1:
        df = __download_yahoo_data(args[0], period)
    else:
        # convert args to kwargs
        if len(args) > 0:
            kwargs = merge_kwargs({arg: arg for arg in args}, kwargs)

        for k, v in kwargs.items():
            px = f'{k}_'
            df_ = __download_yahoo_data(v, period)

            if multi_index:
                df_.columns = pd.MultiIndex.from_product([[k], df_.columns])
                if df is None:
                    df = df_
                else:
                    df = inner_join(df, df_)
            else:
                if df is None:
                    df = df_.add_prefix(px)
                else:
                    df = inner_join(df, df_, prefix=px)

    # log some statistics
    if df is None:
        logging.warning("nothing downloaded")
    else:
        logging.info(f'number of rows for joined dataframe = {len(df)}, from {df.index[0]} to {df.index[-1]}')

    return df
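# Hypothetical usage of fetch_yahoo above (ticker symbols are examples only):
#
#     df = fetch_yahoo("SPY")                                   # single symbol, plain columns
#     df = fetch_yahoo(spy="SPY", gld="GLD", multi_index=True)  # one top-level column group per key
#     df = fetch_yahoo("SPY", "GLD")                            # args become {symbol: symbol} kwargs,
#                                                               # columns get a "<symbol>_" prefix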
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        test_size: float = 0.4,
        youngest_size: float = None,
        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
        test_validate_split_seed=42,
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """
    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance, i.e. for each hyper
           parameter if hyper parameter tuning is enforced. Usually all Model subclasses implement
           __call__ and are thus providers of themselves
    :param test_size: the fraction [0, 1] of random samples which are used for the test set
    :param youngest_size: the fraction [0, 1] of the test samples which are not random but the youngest
    :param cross_validation: tuple of the number of epochs for each fold and a cross validation provider
    :param test_validate_split_seed: seed if the train/test split needs to be reproducible. A magic seed
           'youngest' is available, which just uses the youngest data as test data
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """
    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)

    (features, min_required_samples), labels, targets, weights = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    train_idx, test_idx = train_test_split(features.index, test_size, youngest_size, test_validate_split_seed)
    train = (features.loc[train_idx], labels.loc[train_idx], loc_if_not_none(weights, train_idx))
    test = (features.loc[test_idx], labels.loc[test_idx], loc_if_not_none(weights, test_idx))

    # perform a hyper parameter optimization first if requested
    if hyper_parameter_space is not None:
        # isolate hyperopt parameters and constants only used for hyper parameter tuning, like early stopping
        constants = {}
        hyperopt_params = {}
        for k, v in list(hyper_parameter_space.items()):
            if k.startswith("__"):
                hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
            elif isinstance(v, (int, float, bool)):
                constants[k] = hyper_parameter_space.pop(k)

        # optimize hyper parameters
        model, trails = __hyper_opt(hyper_parameter_space, hyperopt_params, constants,
                                    model_provider, cross_validation, train, test)

    # finally train the model with the eventually tuned hyper parameters
    __train_loop(model, cross_validation, train, test)
    _log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!")

    # assemble result objects
    prediction_train = to_pandas(model.predict(train[0].ml.values), train_idx, labels.columns)
    prediction_test = to_pandas(model.predict(test[0].ml.values), test_idx, labels.columns)
    targets = (loc_if_not_none(targets, train_idx), loc_if_not_none(targets, test_idx))

    df_train = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[0],
        PREDICTION_COLUMN_NAME: prediction_train,
        LABEL_COLUMN_NAME: train[1],
        FEATURE_COLUMN_NAME: train[0]
    })
    df_test = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[1],
        PREDICTION_COLUMN_NAME: prediction_test,
        LABEL_COLUMN_NAME: test[1],
        FEATURE_COLUMN_NAME: test[0]
    })

    # update model properties and return the fit
    model._validation_indices = test_idx
    model.features_and_labels._min_required_samples = min_required_samples
    model.features_and_labels._label_columns = labels.columns

    return Fit(model, model.summary_provider(df_train), model.summary_provider(df_test), trails)
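# A hedged sketch of a hyper_parameter_space for the fit above. The "__" prefix convention
# is taken directly from the code (such keys are stripped of the prefix and collected into
# hyperopt_params), and plain int/float/bool values are popped as constants. hyperopt's
# `hp` module and `max_evals` are real hyperopt concepts, but the concrete keys and the
# provider name below are assumptions:
#
#     from hyperopt import hp
#
#     fit(df, MyModelProvider,
#         hyper_parameter_space={
#             "learning_rate": hp.loguniform("learning_rate", -7, -2),  # searched by hyperopt
#             "batch_size": 64,     # int constant, popped before the search
#             "__max_evals": 25,    # becomes hyperopt_params["max_evals"]
#         })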
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        training_data_splitter: Splitter = RandomSplits(),
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """
    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance, i.e. for each hyper
           parameter if hyper parameter tuning is enforced. Usually all Model subclasses implement
           __call__ and are thus providers of themselves
    :param training_data_splitter: a :class:`pandas_ml_utils.ml.data.splitting.Splitter` object which
           provides training and test data splits (eventually multiple folds)
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """
    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)

    (features, min_required_samples), labels, targets, weights, gross_loss = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    # train_idx, test_idx = training_data_splitter.train_test_split(features.index)
    # train = (features.loc[train_idx], labels.loc[train_idx], loc_if_not_none(weights, train_idx))
    # test = (features.loc[test_idx], labels.loc[test_idx], loc_if_not_none(weights, test_idx))

    # FIXME eventually perform a hyper parameter optimization first
    # if hyper_parameter_space is not None:
    #     # next isolate hyperopt parameters and constants only used for hyper parameter tuning like early stopping
    #     constants = {}
    #     hyperopt_params = {}
    #     for k, v in list(hyper_parameter_space.items()):
    #         if k.startswith("__"):
    #             hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
    #         elif isinstance(v, (int, float, bool)):
    #             constants[k] = hyper_parameter_space.pop(k)
    #
    #     # optimize hyper parameters
    #     model, trails = __hyper_opt(hyper_parameter_space,
    #                                 hyperopt_params,
    #                                 constants,
    #                                 model_provider,
    #                                 None,  # FIXME cross_validation
    #                                 train,
    #                                 test)

    # finally train the model with the eventually tuned hyper parameters
    sampler = DataGenerator(training_data_splitter, features, labels, targets, weights, gross_loss).train_test_sampler()
    model.fit(sampler, **kwargs)
    _log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!")

    # assemble result objects
    train_sampler, train_idx = sampler.training()
    test_sampler, test_idx = sampler.validation()

    try:
        prediction = (to_pandas(model.predict(train_sampler, **kwargs), train_idx, labels.columns),
                      to_pandas(model.predict(test_sampler, **kwargs), test_idx, labels.columns))

        # get training and test data tuples of the provided frames
        features, labels, targets, weights, gross_loss = sampler[0], sampler[1], sampler[2], sampler[3], sampler[4]
        df_train, df_test = [
            _assemble_result_frame(targets[i], prediction[i], labels[i], gross_loss[i], weights[i], features[i])
            for i in range(2)
        ]

        # update model properties and return the fit
        model._validation_indices = test_idx
        model.features_and_labels.set_min_required_samples(min_required_samples)
        model.features_and_labels.set_label_columns(labels[0].columns.tolist())
        return Fit(model, model.summary_provider(df_train, **kwargs), model.summary_provider(df_test, **kwargs), trails, **kwargs)
    except Exception as e:
        raise FitException(e, model)
def fit(self,
        model_provider: Callable[[], MlModel],
        fitting_parameter: FittingParameter = FittingParameter(),
        verbose: int = 0,
        callbacks: Union[Callable, List[Callable]] = None,
        fail_silent: bool = False,
        **kwargs) -> Fit:
    df = self.df
    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)

    def fit_submodel(df, model, **kwargs):
        return model.fit(df, **kwargs)

    typemap_fitting = {SubModelFeature: fit_submodel, **self._type_mapping}
    frames: FeaturesWithLabels = model.features_and_labels(
        df, extract_feature_labels_weights, type_map=typemap_fitting,
        fitting_parameter=fitting_parameter, verbose=verbose, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    df_train_prediction, df_test_prediction = model.fit(
        XYWeight(frames.features, frames.labels, frames.sample_weights),
        fitting_parameter, verbose, callbacks, **kwargs)

    _log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!")

    # assemble result objects
    # get training and test data tuples of the provided frames.
    # NOTE due to event boosting there might be duplicate events in the test data which we need to filter away
    ext_frames = frames.targets, frames.labels, frames.gross_loss, frames.sample_weights, frames.features
    df_train = assemble_result_frame(df_train_prediction, *ext_frames)
    df_test = assemble_result_frame(df_test_prediction[~df_test_prediction.index.duplicated()], *ext_frames)

    # update model properties and return the fit
    model.features_and_labels.set_min_required_samples(frames.features_with_required_samples.min_required_samples)
    model.features_and_labels._kwargs = {k: a for k, a in kwargs.items() if k in model.features_and_labels.kwargs}

    def assemble_fit():
        return Fit(model,
                   model.summary_provider(df_train, model, is_test=False, **kwargs),
                   model.summary_provider(df_test, model, is_test=True, **kwargs),
                   trails, **kwargs)

    return call_silent(assemble_fit, lambda e: FitException(e, model)) if fail_silent else assemble_fit()
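# Hypothetical usage of the method-style fit above. Per the docstrings earlier in this
# listing, Model subclasses implement __call__ and can therefore act as their own
# model_provider; the `df.model` accessor and `my_model` below are assumptions:
#
#     fit = df.model.fit(my_model, FittingParameter(), verbose=1,
#                        fail_silent=True)  # returns a FitException from the summary
#                                           # assembly instead of raising it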