Example #1
def predict(df: pd.DataFrame,
            model: Model,
            tail: int = None,
            samples: int = 1,
            **kwargs) -> pd.DataFrame:
    min_required_samples = model.features_and_labels.min_required_samples

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning(
                "could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    columns, features, targets = extract(model.features_and_labels, df,
                                         extract_features, **kwargs)

    if samples > 1:
        print(f"draw {samples} samples")

    predictions = np.array([
        model.predict(features.ml.values) for _ in range(samples)
    ]).swapaxes(0, 1)

    y_hat = to_pandas(predictions, index=features.index, columns=columns)
    return assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        FEATURE_COLUMN_NAME: features
    })
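The loop above stacks one model.predict result per Monte Carlo sample and then uses swapaxes(0, 1) to move the sample axis behind the row axis, so every row of the resulting frame carries its own set of sampled predictions. A minimal sketch of that shape manipulation, with the shapes assumed purely for illustration:

import numpy as np

# assumed shapes: each model.predict call returns an array of (n_rows, n_outputs)
samples, n_rows, n_outputs = 3, 5, 2
stacked = np.array([np.zeros((n_rows, n_outputs)) for _ in range(samples)])
print(stacked.shape)                 # (3, 5, 2) -> (samples, rows, outputs)
print(stacked.swapaxes(0, 1).shape)  # (5, 3, 2) -> (rows, samples, outputs)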
Example #2
def predict(df: pd.DataFrame,
            model: Model,
            tail: int = None,
            samples: int = 1,
            **kwargs) -> pd.DataFrame:
    min_required_samples = model.features_and_labels.min_required_samples

    if tail is not None:
        if min_required_samples is not None:
            # just use the tail for feature engineering
            df = df[-(abs(tail) + (min_required_samples - 1)):]
        else:
            _log.warning(
                "could not determine the minimum required data from the model")

    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    columns, features, targets = extract(model.features_and_labels, df,
                                         extract_features, **kwargs)

    if samples > 1:
        print(f"draw {samples} samples")

    sampler = DataGenerator(DummySplitter(samples), features, None, targets,
                            None).complete_samples()
    predictions = model.predict(sampler, **kwargs)

    y_hat = to_pandas(predictions, index=features.index, columns=columns)
    return _assemble_result_frame(targets, y_hat, None, None, None, features)
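This variant draws the Monte Carlo samples through a DataGenerator with a DummySplitter instead of looping over model.predict directly, but the call pattern stays the same. A hedged usage sketch, where df and model are assumed to be an input DataFrame and an already fitted Model:

# hedged usage sketch: df is the input DataFrame, model an already fitted Model;
# tail=100 restricts feature engineering to the most recent rows and
# samples=10 draws ten predictions per row
prediction_df = predict(df, model, tail=100, samples=10)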
Example #3
def backtest(df: pd.DataFrame, model: Model, summary_provider: Callable[[pd.DataFrame], Summary] = None, **kwargs) -> Summary:
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    (features, _), labels, targets, weights, gross_loss =\
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    sampler = DataGenerator(DummySplitter(1), features, labels, targets, None).complete_samples()
    predictions = model.predict(sampler, **kwargs)

    y_hat = to_pandas(predictions, index=features.index, columns=labels.columns)
    df_backtest = _assemble_result_frame(targets, y_hat, labels, gross_loss, weights, features)
    return (summary_provider or model.summary_provider)(df_backtest, model, **kwargs)
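Note that while the annotation reads Callable[[pd.DataFrame], Summary], the provider is invoked here as summary_provider(df_backtest, model, **kwargs), so a custom provider has to accept the model and the extra kwargs as well. A hedged sketch of such a provider; the Summary constructor arguments are only assumed from this call site:

# hypothetical custom summary provider matching the call site above
def my_summary_provider(df_backtest: pd.DataFrame, model: Model, **kwargs) -> Summary:
    # inspect or post-process the assembled back-test frame here, then delegate
    # to the library's Summary (constructor arguments assumed, not verified)
    return Summary(df_backtest, model, **kwargs)

summary = backtest(df, model, summary_provider=my_summary_provider)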
Example #4
def backtest(df: pd.DataFrame,
             model: Model,
             summary_provider: Callable[[pd.DataFrame], Summary] = Summary,
             **kwargs) -> Summary:
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, _), labels, targets, _ = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    y_hat = to_pandas(model.predict(features.ml.values),
                      index=features.index,
                      columns=labels.columns)

    df_backtest = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets,
        PREDICTION_COLUMN_NAME: y_hat,
        LABEL_COLUMN_NAME: labels,
        FEATURE_COLUMN_NAME: features
    })
    return (summary_provider or model.summary_provider)(df_backtest)
Example #5
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        test_size: float = 0.4,
        youngest_size: float = None,
        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray],
                                              Tuple[np.ndarray,
                                                    np.ndarray]]] = None,
        test_validate_split_seed=42,
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance, e.g. one per hyper parameter set
                           if hyper parameter tuning is enforced. Usually all Model subclasses implement __call__ and
                           are therefore providers of themselves
    :param test_size: the fraction [0, 1] of randomly chosen samples used as the test set
    :param youngest_size: the fraction [0, 1] of the test samples which are not chosen at random but are the youngest
    :param cross_validation: tuple of the number of epochs per fold and a cross validation provider
    :param test_validate_split_seed: seed if the train/test split needs to be reproducible. A magic seed 'youngest' is
                                     available, which simply uses the youngest data as test data
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, min_required_samples), labels, targets, weights = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    train_idx, test_idx = train_test_split(features.index, test_size,
                                           youngest_size,
                                           test_validate_split_seed)
    train = (features.loc[train_idx], labels.loc[train_idx],
             loc_if_not_none(weights, train_idx))
    test = (features.loc[test_idx], labels.loc[test_idx],
            loc_if_not_none(weights, test_idx))

    # optionally perform a hyper parameter optimization first
    if hyper_parameter_space is not None:
        # isolate hyperopt parameters and constants that are only used for hyper parameter tuning (e.g. early stopping)
        constants = {}
        hyperopt_params = {}
        for k, v in list(hyper_parameter_space.items()):
            if k.startswith("__"):
                hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
            elif isinstance(v, (int, float, bool)):
                constants[k] = hyper_parameter_space.pop(k)

        # optimize hyper parameters
        model, trails = __hyper_opt(hyper_parameter_space, hyperopt_params,
                                    constants, model_provider,
                                    cross_validation, train, test)

    # finally train the model with the (possibly tuned) hyper parameters
    __train_loop(model, cross_validation, train, test)
    _log.info(
        f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
    )

    # assemble result objects
    prediction_train = to_pandas(model.predict(train[0].ml.values), train_idx,
                                 labels.columns)
    prediction_test = to_pandas(model.predict(test[0].ml.values), test_idx,
                                labels.columns)

    targets = (loc_if_not_none(targets, train_idx),
               loc_if_not_none(targets, test_idx))
    df_train = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[0],
        PREDICTION_COLUMN_NAME: prediction_train,
        LABEL_COLUMN_NAME: train[1],
        FEATURE_COLUMN_NAME: train[0]
    })
    df_test = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[1],
        PREDICTION_COLUMN_NAME: prediction_test,
        LABEL_COLUMN_NAME: test[1],
        FEATURE_COLUMN_NAME: test[0]
    })

    # update model properties and return the fit
    model._validation_indices = test_idx
    model.features_and_labels._min_required_samples = min_required_samples
    model.features_and_labels._label_columns = labels.columns
    return Fit(model, model.summary_provider(df_train),
               model.summary_provider(df_test), trails)
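The hyper_parameter_space handling above splits the dictionary into three parts: keys prefixed with "__" are stripped of the prefix and forwarded to the optimizer itself, plain scalar values become constants, and everything else is treated as a searchable dimension whose sampled value is passed as a kwarg to the model provider. A hedged call sketch, assuming the optimizer behind __hyper_opt is hyperopt (as the trails/Trials naming suggests) and using hypothetical parameter names:

from hyperopt import hp

# hedged sketch: "learning_rate" is searched, "batch_size" becomes a constant
# (plain scalar), and "__max_evals" is stripped to "max_evals" and handed to the
# optimizer itself; MyModelProvider is a hypothetical model provider callable
fit_result = fit(
    df,
    MyModelProvider,
    test_size=0.3,
    hyper_parameter_space={
        "learning_rate": hp.uniform("learning_rate", 1e-4, 1e-1),
        "batch_size": 64,
        "__max_evals": 50,
    },
)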
Example #6
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        training_data_splitter: Splitter = RandomSplits(),
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance, e.g. one per hyper parameter set
           if hyper parameter tuning is enforced. Usually all Model subclasses implement __call__ and are therefore
           providers of themselves
    :param training_data_splitter: a :class:`pandas_ml_utils.ml.data.splitting.Splitter` object
           which provides training and test data splits (possibly multiple folds)
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs,
                          kwargs)
    (features, min_required_samples), labels, targets, weights, gross_loss = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    #train_idx, test_idx = training_data_splitter.train_test_split(features.index)
    #train = (features.loc[train_idx], labels.loc[train_idx], loc_if_not_none(weights, train_idx))
    #test = (features.loc[test_idx], labels.loc[test_idx], loc_if_not_none(weights, test_idx))

    # FIXME eventually perform a hyper parameter optimization first
    #if hyper_parameter_space is not None:
    #    # next isolate hyperopt parameters and constants only used for hyper parameter tuning like early stopping
    #    constants = {}
    #    hyperopt_params = {}
    #    for k, v in list(hyper_parameter_space.items()):
    #        if k.startswith("__"):
    #            hyperopt_params[k[2:]] = hyper_parameter_space.pop(k)
    #        elif isinstance(v, (int, float, bool)):
    #            constants[k] = hyper_parameter_space.pop(k)
    #
    #    # optimize hyper parameters
    #    model, trails = __hyper_opt(hyper_parameter_space,
    #                                hyperopt_params,
    #                                constants,
    #                                model_provider,
    #                                None, # FIXME Ecross_validation,
    #                                train,
    #                                test)

    # finally train the model with the (possibly tuned) hyper parameters
    sampler = DataGenerator(training_data_splitter, features, labels, targets,
                            weights, gross_loss).train_test_sampler()
    model.fit(sampler, **kwargs)
    _log.info(
        f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!"
    )

    # assemble result objects
    train_sampler, train_idx = sampler.training()
    test_sampler, test_idx = sampler.validation()

    try:
        prediction = (to_pandas(model.predict(train_sampler, **kwargs),
                                train_idx, labels.columns),
                      to_pandas(model.predict(test_sampler, **kwargs),
                                test_idx, labels.columns))

        # get training and test data tuples of the provided frames
        features, labels, targets, weights, gross_loss = \
            sampler[0], sampler[1], sampler[2], sampler[3], sampler[4]
        df_train, df_test = [
            _assemble_result_frame(targets[i], prediction[i], labels[i],
                                   gross_loss[i], weights[i], features[i])
            for i in range(2)
        ]

        # update model properties and return the fit
        model._validation_indices = test_idx
        model.features_and_labels.set_min_required_samples(
            min_required_samples)
        model.features_and_labels.set_label_columns(labels[0].columns.tolist())
        return Fit(model, model.summary_provider(df_train, **kwargs),
                   model.summary_provider(df_test, **kwargs), trails, **kwargs)
    except Exception as e:
        raise FitException(e, model)
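A hedged usage sketch for this splitter based variant; RandomSplits() is simply the default from the signature above, and df as well as MyModelProvider (any callable returning a fresh Model) are assumed to exist. Any other Splitter implementation plugs in the same way, since fit only hands it to the DataGenerator:

# hedged usage sketch: RandomSplits() is the default Splitter from the signature
# above; df and MyModelProvider are assumed / hypothetical
fit_result = fit(df, MyModelProvider, training_data_splitter=RandomSplits())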