Example #1
0
def sparsen(df,
            aggregator=mean_or_last,
            date=None,
            time=None,
            datetime=None,
            bucket='date',
            new_bucket_column=None,
            fix_kind='last',
            fix_time=None,
            fix_points=10,
            min_fix_point_count=None,
            max_fix_point_count=None,
            min_min_fix_point_time=None,
            max_min_fix_point_time=None,
            min_max_fix_point_time=None,
            max_max_fix_point_time=None,
            already_sorted=False,
            aggregators_apply_to_df=False,
            exclude_original_temporal_columns=True,
            columns_to_exclude=None,
            return_extra_info=False):
    checks.is_at_least_one_not_none(datetime, date, time)

    if bucket == 'date':
        bucket = lambda x: conv.to_python_date(x, allow_datetimes=True)
    elif bucket == 'week':
        bucket = lambda x: tsatimes.first_day_of_week(x)

    columns_to_exclude = set() if columns_to_exclude is None else set(
        columns_to_exclude)

    if datetime is not None:
        checks.check_all_none(date, time)
        if isinstance(datetime, str):
            if exclude_original_temporal_columns:
                columns_to_exclude.add(datetime)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = datetime
            datetime = df[datetime].values
        temporals = datetime
    else:
        if isinstance(date, str):
            if exclude_original_temporal_columns: columns_to_exclude.add(date)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = date
            date = df[date].values
        if isinstance(time, str):
            if exclude_original_temporal_columns: columns_to_exclude.add(time)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = time
            time = df[time].values

        if date is not None and time is not None:
            temporals = [dt.datetime.combine(d, t) for d, t in zip(date, time)]
        elif date is not None:
            temporals = date
        else:  # time is not None
            temporals = time

    if new_bucket_column is None: new_bucket_column = 'bucket'

    if fix_kind in ('first', 'after'): comparison = 'ge'
    elif fix_kind == 'after_exclusive': comparison = 'gt'
    elif fix_kind in ('last', 'before'): comparison = 'le'
    elif fix_kind == 'before_exclusive': comparison = 'lt'
    else: raise ValueError('Unfamiliar fix_kind: "%s"' % str(fix_kind))

    if fix_kind in ('first', 'last'): checks.check_none(fix_time)
    else: checks.check_not_none(fix_time)

    numeric_fix_points = checks.is_some_number(fix_points)
    if not numeric_fix_points:
        fix_points = conv.to_python_timedelta(fix_points)

    grouping_df = pd.DataFrame({'temporals': temporals})

    grouped_df = grouping_df.groupby(bucket(temporals))

    columns = [new_bucket_column]
    data = {new_bucket_column: []}
    aggs = {}

    if checks.is_some_dict(aggregator): column_agg_pairs = aggregator.items()
    elif checks.is_iterable(aggregator): column_agg_pairs = aggregator
    else: column_agg_pairs = zip(df.columns, utils.xconst(aggregator))
    for column, agg in column_agg_pairs:
        if column not in columns_to_exclude:
            columns.append(column)
            data[column] = []
            aggs[column] = agg

    dates_with_no_points = []
    dates_with_fix_point_limits_breached = col.OrderedDict()
    fix_point_counts = col.OrderedDict()

    for bucket, group_df in grouped_df:
        if len(group_df) == 0: dates_with_no_points.append(bucket)
        if not already_sorted:
            group_df = group_df.copy()
            group_df.sort_values('temporals', inplace=True)
        if fix_kind == 'first': fix_time = group_df['temporals'].values[0]
        elif fix_kind == 'last': fix_time = group_df['temporals'].values[-1]

        if numeric_fix_points:
            if comparison == 'ge':
                fix_point_indices = group_df.index[tsatimes.temporal_ge(
                    group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[tsatimes.temporal_gt(
                    group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'le':
                fix_point_indices = group_df.index[tsatimes.temporal_le(
                    group_df['temporals'], fix_time)][-fix_points:]
            else:  # comparison == 'lt'
                fix_point_indices = group_df.index[tsatimes.temporal_lt(
                    group_df['temporals'], fix_time)][-fix_points:]
        else:
            if comparison == 'ge':
                fix_point_indices = group_df.index[(tsatimes.temporal_ge(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[(tsatimes.temporal_gt(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'le':
                fix_point_indices = group_df.index[(tsatimes.temporal_le(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]
            else:  # comparison == 'lt':
                fix_point_indices = group_df.index[(tsatimes.temporal_lt(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]

        fix_point_limits_breached = set()

        if min_fix_point_count is not None and len(
                fix_point_indices) < min_fix_point_count:
            fix_point_limits_breached.add('min_fix_point_count')
        if max_fix_point_count is not None and len(
                fix_point_indices) > max_fix_point_count:
            fix_point_limits_breached.add('max_fix_point_count')
        if min_min_fix_point_time is not None:
            if checks.is_some_timedelta(min_min_fix_point_time):
                the_min_min_fix_point_time = fix_time + min_min_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - min_min_fix_point_time
            else:
                the_min_min_fix_point_time = min_min_fix_point_time
            if tsatimes.temporal_lt(
                    min(grouping_df['temporals'].values[fix_point_indices]),
                    the_min_min_fix_point_time):
                fix_point_limits_breached.add('min_min_fix_point_time')
        if max_min_fix_point_time is not None:
            if checks.is_some_timedelta(max_min_fix_point_time):
                the_max_min_fix_point_time = fix_time + max_min_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - max_min_fix_point_time
            else:
                the_max_min_fix_point_time = max_min_fix_point_time
            if tsatimes.temporal_gt(
                    min(grouping_df['temporals'].values[fix_point_indices]),
                    the_max_min_fix_point_time):
                fix_point_limits_breached.add('max_min_fix_point_time')
        if min_max_fix_point_time is not None:
            if checks.is_some_timedelta(min_max_fix_point_time):
                the_min_max_fix_point_time = fix_time + min_max_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - min_max_fix_point_time
            else:
                the_min_max_fix_point_time = min_max_fix_point_time
            if tsatimes.temporal_lt(
                    max(grouping_df['temporals'].values[fix_point_indices]),
                    the_min_max_fix_point_time):
                fix_point_limits_breached.add('min_max_fix_point_time')
        if max_max_fix_point_time is not None:
            if checks.is_some_timedelta(max_max_fix_point_time):
                the_max_max_fix_point_time = fix_time + max_max_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - max_max_fix_point_time
            else:
                the_max_max_fix_point_time = max_max_fix_point_time
            if tsatimes.temporal_gt(
                    max(grouping_df['temporals'].values[fix_point_indices]),
                    the_max_max_fix_point_time):
                fix_point_limits_breached.add('max_max_fix_point_time')

        if len(fix_point_limits_breached) > 0:
            dates_with_fix_point_limits_breached[
                bucket] = fix_point_limits_breached
        else:
            data[new_bucket_column].append(bucket)
            for column in columns[1:]:
                if column not in columns_to_exclude:
                    arg = df.iloc[
                        fix_point_indices] if aggregators_apply_to_df else df.iloc[
                            fix_point_indices][column].values
                    data[column].append(aggs[column](arg))
            fix_point_counts[bucket] = len(fix_point_indices)

    df = pd.DataFrame(data, columns=columns)

    if return_extra_info:
        return {
            'df': df,
            'dates_with_no_points': dates_with_no_points,
            'dates_with_fix_point_limits_breached':
            dates_with_fix_point_limits_breached,
            'fix_point_counts': fix_point_counts
        }
    else:
        return df
Example #2
0
def run(observable,
        obss=None,
        times=None,
        obs_covs=None,
        true_values=None,
        df=None,
        fun=None,
        return_df=False):
    if df is not None:
        if obss is not None and (checks.is_string(obss)
                                 or checks.is_int(obss)):
            obss = df[obss]

        if times is None:
            if isinstance(obss, pd.Series): times = obss.index.values
        elif (checks.is_string(times) or checks.is_int(times)):
            times = df[times].values

        if isinstance(obss, pd.Series): obss = obss.values

        if obs_covs is not None and (checks.is_string(obs_covs)
                                     or checks.is_int(obs_covs)):
            obs_covs = df[obs_covs].values

        if true_values is not None and (checks.is_string(true_values)
                                        or checks.is_int(true_values)):
            true_values = df[true_values].values

    checks.check_not_none(obss)

    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss): obss = [obss]
    if not checks.is_iterable_not_string(times): times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)

    obs_result = None
    cumulative_log_likelihood = 0.

    if return_df:
        time = []
        filter_name = []
        filter_type = []
        observable_name = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        cross_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        true_value = []
        log_likelihood = []
        gain = []

    last_time = None

    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(
            observable, obss, times, obs_covs, true_values):
        if a_time is None:
            if last_time is None: a_time = 0
            else: a_time = last_time + 1
        last_time = a_time

        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None: an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError(
                    'An observation covariance is provided while the observation is given by a distribution --- conflicting arguments'
                )
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)

        if return_df and len(time) == 0:
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(None)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            cross_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(
                npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(
                npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(
                npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(
                npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            true_value.append(None)
            log_likelihood.append(None)
            gain.append(None)

        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time,
                                            an_observable.filter.time)

        predicted_obs = an_observable.predict(time=a_time,
                                              true_value=a_true_value)
        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov
        obs_result = an_observable.observe(obs=an_obs,
                                           time=a_time,
                                           true_value=a_true_value,
                                           predicted_obs=predicted_obs)
        if obs_result.accepted:
            cumulative_log_likelihood += obs_result.log_likelihood
        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov

        if return_df:
            time.append(obs_result.obs.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(an_observable.name)
            accepted.append(obs_result.accepted)
            obs_mean.append(
                npu.to_scalar(obs_result.obs.distr.mean,
                              raise_value_error=False))
            obs_cov.append(
                npu.to_scalar(obs_result.obs.distr.cov,
                              raise_value_error=False))
            predicted_obs_mean.append(
                npu.to_scalar(obs_result.predicted_obs.distr.mean,
                              raise_value_error=False))
            predicted_obs_cov.append(
                npu.to_scalar(obs_result.predicted_obs.distr.cov,
                              raise_value_error=False))
            cross_cov.append(
                npu.to_scalar(obs_result.predicted_obs.cross_cov,
                              raise_value_error=False))
            innov_mean.append(
                npu.to_scalar(obs_result.innov_distr.mean,
                              raise_value_error=False))
            innov_cov.append(
                npu.to_scalar(obs_result.innov_distr.cov,
                              raise_value_error=False))
            prior_state_mean.append(
                npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(
                npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(
                npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(
                npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            true_value.append(
                npu.to_scalar(a_true_value, raise_value_error=False))
            log_likelihood.append(
                npu.to_scalar(obs_result.log_likelihood,
                              raise_value_error=False))
            gain.append(
                obs_result.gain if hasattr(obs_result, 'gain') else None)

    df = None
    if return_df:
        df = pd.DataFrame(
            {
                'time': time,
                'filter_name': filter_name,
                'filter_type': filter_type,
                'observable_name': observable_name,
                'accepted': accepted,
                'obs_mean': obs_mean,
                'obs_cov': obs_cov,
                'predicted_obs_mean': predicted_obs_mean,
                'predicted_obs_cov': predicted_obs_cov,
                'cross_cov': cross_cov,
                'innov_mean': innov_mean,
                'innov_cov': innov_cov,
                'prior_state_mean': prior_state_mean,
                'prior_state_cov': prior_state_cov,
                'posterior_state_mean': prior_state_mean,
                'posterior_state_cov': prior_state_cov,
                'true_value': true_value,
                'log_likelihood': log_likelihood,
                'gain': gain
            },
            columns=('time', 'filter_name', 'filter_type', 'observable_name',
                     'accepted', 'obs_mean', 'obs_cov', 'predicted_obs_mean',
                     'predicted_obs_cov', 'cross_cov', 'innov_mean',
                     'innov_cov', 'prior_state_mean', 'prior_state_cov',
                     'posterior_state_mean', 'posterior_state_cov',
                     'true_value', 'log_likelihood', 'gain'))

    return FilterRunResult(obs_result, cumulative_log_likelihood, df)
Example #3
0
def run(observable,
        obss=None,
        times=None,
        obs_covs=None,
        true_values=None,
        df=None,
        fun=None,
        return_df=False):
    if df is not None:
        if obss is not None and checks.is_string(obss):
            obss = df[obss].values
        if times is not None and checks.is_string(times):
            times = df[times].values
        if obs_covs is not None and checks.is_string(obs_covs):
            obs_covs = df[obs_covs].values
        if true_values is not None and checks.is_string(true_values):
            true_values = df[true_values].values

    checks.check_not_none(obss)

    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss): obss = [obss]
    if not checks.is_iterable_not_string(times): times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)

    obs_result = None

    if return_df:
        time = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        log_likelihood = []

    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(
            observable, obss, times, obs_covs, true_values):
        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None: an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError(
                    'An observation covariance is provided while the observation is given by a distribution --- conflicting arguments'
                )
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)

        if return_df and len(time) == 0:
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(
                npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(
                npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(
                npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(
                npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            log_likelihood.append(None)

        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time,
                                            an_observable.filter.time)

        predicted_obs = an_observable.predict(time=a_time,
                                              true_value=a_true_value)
        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov
        obs_result = an_observable.observe(obs=an_obs,
                                           time=a_time,
                                           true_value=a_true_value,
                                           predicted_obs=predicted_obs)
        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov

        if return_df:
            time.append(obs_result.obs.time)
            accepted.append(obs_result.accepted)
            obs_mean.append(
                npu.to_scalar(obs_result.obs.distr.mean,
                              raise_value_error=False))
            obs_cov.append(
                npu.to_scalar(obs_result.obs.distr.cov,
                              raise_value_error=False))
            predicted_obs_mean.append(
                npu.to_scalar(obs_result.predicted_obs.distr.mean,
                              raise_value_error=False))
            predicted_obs_cov.append(
                npu.to_scalar(obs_result.predicted_obs.distr.cov,
                              raise_value_error=False))
            innov_mean.append(
                npu.to_scalar(obs_result.innov_distr.mean,
                              raise_value_error=False))
            innov_cov.append(
                npu.to_scalar(obs_result.innov_distr.cov,
                              raise_value_error=False))
            prior_state_mean.append(
                npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(
                npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(
                npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(
                npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            log_likelihood.append(
                npu.to_scalar(obs_result.log_likelihood,
                              raise_value_error=False))

    if return_df:
        return pd.DataFrame(
            {
                'time': time,
                'accepted': accepted,
                'obs_mean': obs_mean,
                'obs_cov': obs_cov,
                'predicted_obs_mean': predicted_obs_mean,
                'predicted_obs_cov': predicted_obs_cov,
                'innov_mean': innov_mean,
                'innov_cov': innov_cov,
                'prior_state_mean': prior_state_mean,
                'prior_state_cov': prior_state_cov,
                'posterior_state_mean': prior_state_mean,
                'posterior_state_cov': prior_state_cov,
                'log_likelihood': log_likelihood
            },
            columns=('time', 'accepted', 'obs_mean', 'obs_cov',
                     'predicted_obs_mean', 'predicted_obs_cov', 'innov_mean',
                     'innov_cov', 'prior_state_mean', 'prior_state_cov',
                     'posterior_state_mean', 'posterior_state_cov',
                     'log_likelihood'))

    return obs_result