コード例 #1
0
def predict(df: pd.DataFrame,
            model: Model,
            tail: int = None,
            samples: int = 1,
            **kwargs) -> pd.DataFrame:
    """Run *model* over (optionally only the tail of) *df* and return a result frame."""
    fnl = model.features_and_labels
    required = fnl.min_required_samples

    # Optionally trim the frame so feature engineering only sees the tail plus
    # the minimum history the model needs.
    if tail is not None:
        if required is None:
            _log.warning(
                "could not determine the minimum required data from the model")
        else:
            df = df[-(abs(tail) + (required - 1)):]

    merged_kwargs = merge_kwargs(fnl.kwargs, model.kwargs, kwargs)
    columns, features, targets = extract(fnl, df, extract_features,
                                         **merged_kwargs)

    if samples > 1:
        print(f"draw {samples} samples")

    # A dummy splitter repeated `samples` times feeds the complete data set
    # to the model once per requested sample.
    generator = DataGenerator(DummySplitter(samples), features, None, targets,
                              None)
    raw_predictions = model.predict(generator.complete_samples(),
                                    **merged_kwargs)

    y_hat = to_pandas(raw_predictions, index=features.index, columns=columns)
    return _assemble_result_frame(targets, y_hat, None, None, None, features)
コード例 #2
0
def predict(df: pd.DataFrame,
            model: Model,
            tail: int = None,
            samples: int = 1,
            **kwargs) -> pd.DataFrame:
    """Predict over *df* with *model*, optionally drawing several samples per row."""
    fnl = model.features_and_labels
    required = fnl.min_required_samples

    if tail is not None:
        if required is None:
            _log.warning(
                "could not determine the minimum required data from the model")
        else:
            # keep only as much history as the feature engineering needs
            df = df[-(abs(tail) + (required - 1)):]

    merged_kwargs = merge_kwargs(fnl.kwargs, model.kwargs, kwargs)
    columns, features, targets = extract(fnl, df, extract_features,
                                         **merged_kwargs)

    if samples > 1:
        print(f"draw {samples} samples")

    # stack the per-sample predictions, then swap axes so the row dimension
    # comes first again: (samples, rows, ...) -> (rows, samples, ...)
    drawn = [model.predict(features.ml.values) for _ in range(samples)]
    predictions = np.array(drawn).swapaxes(0, 1)

    y_hat = to_pandas(predictions, index=features.index, columns=columns)
    return assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        FEATURE_COLUMN_NAME: features
    })
コード例 #3
0
    def __call__(self, *args, **kwargs):
        """Return a fresh KerasModel clone (with merged kwargs) carrying over this model's weights."""
        merged = merge_kwargs(deepcopy(self.kwargs), kwargs)
        clone = KerasModel(self.keras_model_provider,
                           self.features_and_labels,
                           self.summary_provider,
                           self.epochs,
                           deepcopy(self.callbacks),
                           **merged)

        # copy weights before return
        clone.set_weights(self.get_weights())
        return clone
コード例 #4
0
def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.DataFrame], Summary] = None, **kwargs) -> Summary:
    """Re-run *model* over *df* and summarize the predictions against the extracted labels."""
    fnl = model.features_and_labels
    merged = merge_kwargs(fnl.kwargs, model.kwargs, kwargs)
    extracted = extract(fnl, df, extract_feature_labels_weights, **merged)
    (features, _), labels, targets, weights, gross_loss = extracted

    # a single dummy split yields the complete data set for prediction
    generator = DataGenerator(DummySplitter(1), features, labels, targets, None)
    predictions = model.predict(generator.complete_samples(), **merged)

    y_hat = to_pandas(predictions, index=features.index, columns=labels.columns)
    result = _assemble_result_frame(targets, y_hat, labels, gross_loss, weights, features)
    summarize = summary_provider or model.summary_provider
    return summarize(result, model, **merged)
コード例 #5
0
ファイル: base_model.py プロジェクト: KIC/pandas-ml-quant
    def __call__(self, *args, **kwargs):
        """
        Return a copy of the model with an eventually different configuration (kwargs). This is useful for hyper
        parameter tuning or for MultiModels.

        :param args: ignored
        :param kwargs: arguments which are eventually provided by hyperopt or by different targets
        :return: a deep copy of this model with *kwargs* merged into its configuration
        """
        copy = deepcopy(self)
        copy.kwargs = merge_kwargs(copy.kwargs, kwargs)
        return copy
コード例 #6
0
ファイル: model.py プロジェクト: moguli/pandas-ml-quant
    def feature_selection(self,
                          features_and_labels: FeaturesAndLabels,
                          top_features: int = 5,
                          correlation_threshold: float = 0.5,
                          minimum_features: int = 1,
                          lags: Iterable[int] = range(100),
                          show_plots: bool = True,
                          figsize: Tuple[int, int] = (12, 10),
                          **kwargs):
        """
        Extract features and labels from ``self.df`` and estimate the most useful features.

        :param features_and_labels: definition of the features/labels to extract
        :param top_features: number of features to report as most relevant
        :param correlation_threshold: correlation cutoff forwarded to the helper
               — presumably used to drop mutually correlated features; verify in helper
        :param minimum_features: minimum number of features to keep
        :param lags: lags to evaluate (forwarded to the helper)
        :param show_plots: whether the underlying helper should render plots
        :param figsize: figure size forwarded to the plots
        :param kwargs: merged with the kwargs of *features_and_labels*
        :return: whatever the module-level ``feature_selection`` helper returns
        """
        # extract pandas objects
        kwargs = merge_kwargs(features_and_labels.kwargs, kwargs)
        (features, _), label, _, _, _ = extract_feature_labels_weights(
            self.df, features_and_labels, **kwargs)

        # try to estimate good features
        # NOTE: this call resolves to the module-level `feature_selection`
        # function, not recursively to this method.
        return feature_selection(features, label, top_features,
                                 correlation_threshold, minimum_features, lags,
                                 show_plots, figsize)
コード例 #7
0
ファイル: model_patch.py プロジェクト: KIC/pandas-ml-quant
    def predict(self,
                model: MlModel,
                tail: int = None,
                samples: int = 1,
                forecast_provider: Callable[[Typing.PatchedDataFrame],
                                            Forecast] = None,
                **kwargs) -> Union[Typing.PatchedDataFrame, Forecast]:
        """
        Run *model* over this accessor's DataFrame and assemble a result frame.

        :param model: the model used to generate predictions
        :param tail: if given, only the last ``abs(tail) + min_required_samples - 1``
                     rows of the frame are used for feature engineering
        :param samples: number of samples forwarded to ``model.predict``
        :param forecast_provider: optional callable wrapping the result frame;
                                  falls back to ``model.forecast_provider``
        :param kwargs: merged with the kwargs of the model and its features
        :return: the assembled result frame, or the forecast built from it
        """
        min_required_samples = model.features_and_labels.min_required_samples
        df = self.df

        if tail is not None:
            if min_required_samples is not None:
                # just use the tail for feature engineering
                df = df[-(abs(tail) + (min_required_samples - 1)):]
            else:
                _log.warning(
                    "could not determine the minimum required data from the model"
                )

        # merge kwargs from the features definition, the model and the caller
        kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                              kwargs)
        # nested SubModelFeature columns are materialized by *predicting* with
        # the nested model (during fit they would be trained instead)
        typemap_pred = {
            SubModelFeature:
            lambda df, model, **kwargs: model.predict(df, **kwargs),
            **self._type_mapping
        }
        frames: FeaturesWithTargets = model.features_and_labels(
            df, extract_features, type_map=typemap_pred, **kwargs)

        # NOTE(review): call_callable_dynamic_args presumably forwards only the
        # arguments model.predict actually accepts — confirm in the helper
        predictions = call_callable_dynamic_args(model.predict,
                                                 features=frames.features,
                                                 targets=frames.targets,
                                                 latent=frames.latent,
                                                 samples=samples,
                                                 df=df,
                                                 **kwargs)

        fc_provider = forecast_provider or model.forecast_provider
        res_df = assemble_result_frame(predictions, frames.targets, None, None,
                                       None, frames.features)

        # return the raw frame unless a forecast provider is available
        return res_df if fc_provider is None else call_callable_dynamic_args(
            fc_provider, res_df, **kwargs)
コード例 #8
0
ファイル: model_patch.py プロジェクト: KIC/pandas-ml-quant
    def backtest(self,
                 model: MlModel,
                 summary_provider: Callable[[Typing.PatchedDataFrame],
                                            Summary] = None,
                 tail: int = None,
                 **kwargs) -> Summary:
        """
        Re-run *model* over this accessor's DataFrame and summarize predictions
        against the extracted labels.

        :param model: the model to back-test
        :param summary_provider: optional Summary factory; falls back to
               ``model.summary_provider``
        :param tail: if given, only the last ``abs(tail) + min_required_samples - 1``
                     rows are used for feature engineering
        :param kwargs: merged with the kwargs of the model and its features
        :return: the Summary built from the back-test result frame
        """

        min_required_samples = model.features_and_labels.min_required_samples
        df = self.df

        if tail is not None:
            if min_required_samples is not None:
                # just use the tail for feature engineering
                df = df[-(abs(tail) + (min_required_samples - 1)):]
            else:
                _log.warning(
                    "could not determine the minimum required data from the model"
                )

        # merge kwargs from the features definition, the model and the caller
        kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                              kwargs)
        # nested SubModelFeature columns are materialized by predicting with
        # the nested model
        typemap_pred = {
            SubModelFeature:
            lambda df, model, **kwargs: model.predict(df, **kwargs),
            **self._type_mapping
        }
        frames: FeaturesWithLabels = model.features_and_labels(
            df,
            extract_feature_labels_weights,
            type_map=typemap_pred,
            **kwargs)

        predictions = model.predict(frames.features, **kwargs)
        df_backtest = assemble_result_frame(predictions, frames.targets,
                                            frames.labels, frames.gross_loss,
                                            frames.sample_weights,
                                            frames.features)

        return call_callable_dynamic_args(
            summary_provider or model.summary_provider, df_backtest, model,
            **kwargs)
コード例 #9
0
def backtest(df: pd.DataFrame,
             model: Model,
             summary_provider: Callable[[pd.DataFrame], Summary] = Summary,
             **kwargs) -> Summary:
    """Predict over *df* with *model* and wrap targets, predictions, labels and features into a Summary."""
    fnl = model.features_and_labels
    merged = merge_kwargs(fnl.kwargs, model.kwargs, kwargs)
    (features, _), labels, targets, _ = extract(
        fnl, df, extract_feature_labels_weights, **merged)

    raw_predictions = model.predict(features.ml.values)
    y_hat = to_pandas(raw_predictions,
                      index=features.index,
                      columns=labels.columns)

    frame = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        LABEL_COLUMN_NAME: labels,
        FEATURE_COLUMN_NAME: features
    })
    summarize = summary_provider or model.summary_provider
    return summarize(frame)
コード例 #10
0
def fetch_yahoo(*args: str,
                period: str = 'max',
                multi_index: bool = False,
                **kwargs: str):
    """
    Download yahoo data for one or more symbols and join them into one frame.

    Symbols can be passed positionally (the symbol doubles as its name) or as
    keyword arguments mapping a name to a symbol. With ``multi_index`` the
    per-symbol columns go under a MultiIndex level, otherwise they get a
    ``<name>_`` prefix.
    """
    df = None

    if len(args) == 1:
        df = __download_yahoo_data(args[0], period)
    else:
        # convert args to kwargs
        if args:
            kwargs = merge_kwargs({symbol: symbol for symbol in args}, kwargs)

        for name, symbol in kwargs.items():
            prefix = f'{name}_'
            frame = __download_yahoo_data(symbol, period)

            if multi_index:
                frame.columns = pd.MultiIndex.from_product([[name],
                                                            frame.columns])
                df = frame if df is None else inner_join(df, frame)
            else:
                df = (frame.add_prefix(prefix) if df is None
                      else inner_join(df, frame, prefix=prefix))

    # print some statistics
    if df is None:
        logging.warning("nothing downloaded")
    else:
        logging.info(
            f'number of rows for joined dataframe = {len(df)}, from {df.index[0]} to {df.index[-1]}'
        )

    return df
コード例 #11
0
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        test_size: float = 0.4,
        youngest_size: float = None,
        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray],
                                              Tuple[np.ndarray,
                                                    np.ndarray]]] = None,
        test_validate_split_seed=42,
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """
    Fit a model to the DataFrame, optionally tuning hyper parameters first.

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance i.e. for each hyper parameter if
                           hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__
                           thus they are a provider of itself
    :param test_size: the fraction [0, 1] of random samples which are used for a test set
    :param youngest_size: the fraction [0, 1] of the test samples which are not random but are the youngest
    :param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
    :param test_validate_split_seed: seed if train, test splitting needs to be reproducible. A magic seed 'youngest' is
                                     available, which just uses the youngest data as test data
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trails = None
    model = model_provider()
    # merge kwargs coming from the features definition, the model and the caller
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, min_required_samples), labels, targets, weights = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    train_idx, test_idx = train_test_split(features.index, test_size,
                                           youngest_size,
                                           test_validate_split_seed)
    train = (features.loc[train_idx], labels.loc[train_idx],
             loc_if_not_none(weights, train_idx))
    test = (features.loc[test_idx], labels.loc[test_idx],
            loc_if_not_none(weights, test_idx))

    # eventually perform a hyper parameter optimization first
    if hyper_parameter_space is not None:
        # next isolate hyperopt parameters and constants only used for hyper parameter tuning like early stopping
        constants = {}
        hyperopt_params = {}
        for k, v in list(hyper_parameter_space.items()):
            if k.startswith("__"):
                # "__"-prefixed keys configure the optimizer itself (prefix is stripped)
                hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
            elif isinstance(v, (int, float, bool)):
                # plain scalars are constants, not searchable dimensions
                constants[k] = hyper_parameter_space.pop(k)

        # optimize hyper parameters
        model, trails = __hyper_opt(hyper_parameter_space, hyperopt_params,
                                    constants, model_provider,
                                    cross_validation, train, test)

    # finally train the model with eventually tuned hyper parameters
    __train_loop(model, cross_validation, train, test)
    _log.info(
        f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
    )

    # assemble result objects: predictions for both the train and test split
    prediction_train = to_pandas(model.predict(train[0].ml.values), train_idx,
                                 labels.columns)
    prediction_test = to_pandas(model.predict(test[0].ml.values), test_idx,
                                labels.columns)

    targets = (loc_if_not_none(targets,
                               train_idx), loc_if_not_none(targets, test_idx))
    df_train = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[0],
        PREDICTION_COLUMN_NAME: prediction_train,
        LABEL_COLUMN_NAME: train[1],
        FEATURE_COLUMN_NAME: train[0]
    })
    df_test = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[1],
        PREDICTION_COLUMN_NAME: prediction_test,
        LABEL_COLUMN_NAME: test[1],
        FEATURE_COLUMN_NAME: test[0]
    })

    # update model properties (private attributes of model/features) and return the fit
    model._validation_indices = test_idx
    model.features_and_labels._min_required_samples = min_required_samples
    model.features_and_labels._label_columns = labels.columns
    return Fit(model, model.summary_provider(df_train),
               model.summary_provider(df_test), trails)
コード例 #12
0
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        training_data_splitter: Splitter = RandomSplits(),
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """
    Fit a model to the DataFrame using a pluggable train/test splitter.

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance i.e. for each hyper parameter if
           hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__ thus they are a
           provider of itself
    :param training_data_splitter: a :class:`pandas_ml_utils.ml.data.splitting.Splitter` object
           which provides training and test data splits (eventually multiple folds)
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
           (NOTE: currently unused — the hyperopt path below is disabled, see FIXME)
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trails = None
    model = model_provider()
    # merge kwargs coming from the features definition, the model and the caller
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, min_required_samples), labels, targets, weights, gross_loss = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    #train_idx, test_idx = training_data_splitter.train_test_split(features.index)
    #train = (features.loc[train_idx], labels.loc[train_idx], loc_if_not_none(weights, train_idx))
    #test = (features.loc[test_idx], labels.loc[test_idx], loc_if_not_none(weights, test_idx))

    # FIXME eventually perform a hyper parameter optimization first
    #if hyper_parameter_space is not None:
    #    # next isolate hyperopt parameters and constants only used for hyper parameter tuning like early stopping
    #    constants = {}
    #    hyperopt_params = {}
    #    for k, v in list(hyper_parameter_space.items()):
    #        if k.startswith("__"):
    #            hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
    #        elif isinstance(v, (int, float, bool)):
    #            constants[k] = hyper_parameter_space.pop(k)
    #
    #    # optimize hyper parameters
    #    model, trails = __hyper_opt(hyper_parameter_space,
    #                                hyperopt_params,
    #                                constants,
    #                                model_provider,
    #                                None, # FIXME Ecross_validation,
    #                                train,
    #                                test)

    # finally train the model with eventually tuned hyper parameters
    sampler = DataGenerator(training_data_splitter, features, labels, targets,
                            weights, gross_loss).train_test_sampler()
    model.fit(sampler, **kwargs)
    _log.info(
        f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
    )

    # assemble result objects
    train_sampler, train_idx = sampler.training()
    test_sampler, test_idx = sampler.validation()

    try:
        # predictions for the train and test partitions, as pandas frames
        prediction = (to_pandas(model.predict(train_sampler, **kwargs),
                                train_idx, labels.columns),
                      to_pandas(model.predict(test_sampler, **kwargs),
                                test_idx, labels.columns))

        # get training and test data tuples of the provided frames
        features, labels, targets, weights, gross_loss = sampler[0], sampler[
            1], sampler[2], sampler[3], sampler[4]
        # index 0 = training split, index 1 = test split
        df_train, df_test = [
            _assemble_result_frame(targets[i], prediction[i], labels[i],
                                   gross_loss[i], weights[i], features[i])
            for i in range(2)
        ]

        # update model properties and return the fit
        model._validation_indices = test_idx
        model.features_and_labels.set_min_required_samples(
            min_required_samples)
        model.features_and_labels.set_label_columns(labels[0].columns.tolist())
        return Fit(model, model.summary_provider(df_train, **kwargs),
                   model.summary_provider(df_test, **kwargs), trails, **kwargs)
    except Exception as e:
        # wrap any assembly/prediction failure together with the trained model
        raise FitException(e, model)
コード例 #13
0
ファイル: model_patch.py プロジェクト: KIC/pandas-ml-quant
    def fit(self,
            model_provider: Callable[[], MlModel],
            fitting_parameter: FittingParameter = FittingParameter(),
            verbose: int = 0,
            callbacks: Union[Callable, List[Callable]] = None,
            fail_silent: bool = False,
            **kwargs) -> Fit:
        """
        Fit a freshly provided model to this accessor's DataFrame.

        :param model_provider: callable returning a new model instance to train
        :param fitting_parameter: training configuration passed to ``model.fit``
        :param verbose: verbosity level forwarded to feature extraction
        :param callbacks: callback or list of callbacks forwarded to ``model.fit``
        :param fail_silent: if True, errors while assembling the Fit are wrapped
               into a :class:`FitException` instead of raising directly
        :param kwargs: merged with the kwargs of the model and its features
        :return: a :class:`Fit` with the model and the train/test summaries
        """
        df = self.df
        trails = None
        model = model_provider()
        # merge kwargs coming from the features definition, the model and the caller
        kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                              kwargs)

        def fit_submodel(df, model, **kwargs):
            # nested SubModelFeature columns are *trained* during fit
            # (at prediction time they would only be predicted)
            return model.fit(df, **kwargs)

        typemap_fitting = {SubModelFeature: fit_submodel, **self._type_mapping}
        frames: FeaturesWithLabels = model.features_and_labels(
            df,
            extract_feature_labels_weights,
            type_map=typemap_fitting,
            fitting_parameter=fitting_parameter,
            verbose=verbose,
            **kwargs)

        start_performance_count = perf_counter()
        _log.info("create model")

        # model.fit returns the in-sample and out-of-sample prediction frames
        df_train_prediction, df_test_prediction = model.fit(
            XYWeight(frames.features, frames.labels, frames.sample_weights),
            fitting_parameter, verbose, callbacks, **kwargs)

        _log.info(
            f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
        )

        # assemble result objects
        # get training and test data tuples of the provided frames.
        # NOTE due to event boosting there might be duplicate events in the test data which we need to filter away
        ext_frames = frames.targets, frames.labels, frames.gross_loss, frames.sample_weights, frames.features
        df_train = assemble_result_frame(df_train_prediction, *ext_frames)
        df_test = assemble_result_frame(
            df_test_prediction[~df_test_prediction.index.duplicated()],
            *ext_frames)

        # update model properties and return the fit
        model.features_and_labels.set_min_required_samples(
            frames.features_with_required_samples.min_required_samples)
        # remember only those kwargs the features definition actually declares
        model.features_and_labels._kwargs = {
            k: a
            for k, a in kwargs.items() if k in model.features_and_labels.kwargs
        }

        def assemble_fit():
            # builds train/test summaries; may raise, hence the optional wrapping below
            return Fit(
                model,
                model.summary_provider(df_train,
                                       model,
                                       is_test=False,
                                       **kwargs),
                model.summary_provider(df_test, model, is_test=True, **kwargs),
                trails, **kwargs)

        return call_silent(assemble_fit, lambda e: FitException(e, model)
                           ) if fail_silent else assemble_fit()