def sparsen(df, aggregator=mean_or_last, date=None, time=None, datetime=None,
            bucket='date', new_bucket_column=None, fix_kind='last', fix_time=None,
            fix_points=10, min_fix_point_count=None, max_fix_point_count=None,
            min_min_fix_point_time=None, max_min_fix_point_time=None,
            min_max_fix_point_time=None, max_max_fix_point_time=None,
            already_sorted=False, aggregators_apply_to_df=False,
            exclude_original_temporal_columns=True, columns_to_exclude=None,
            return_extra_info=False):
    """Sparsen df to one row per bucket (e.g. per date or per week).

    Within each bucket a "fix time" is determined according to fix_kind, the
    points at or around it are selected according to fix_points (a count or a
    timedelta window), and each remaining column is reduced with its
    aggregator. Buckets breaching any of the fix-point limits are dropped and,
    if return_extra_info is True, reported alongside the resulting DataFrame.
    """
    if not checks.is_at_least_one_not_none(datetime, date, time):
        raise ValueError('At least one of datetime, date, time must not be None')

    if bucket == 'date':
        bucket = lambda x: conv.to_python_date(x, allow_datetimes=True)
    elif bucket == 'week':
        bucket = lambda x: tsatimes.first_day_of_week(x)

    columns_to_exclude = set() if columns_to_exclude is None else set(columns_to_exclude)

    # Resolve the temporal inputs: either a single datetime column/array, or
    # separate date and time columns/arrays (combined when both are present).
    if datetime is not None:
        checks.check_all_none(date, time)
        if isinstance(datetime, str):
            if exclude_original_temporal_columns:
                columns_to_exclude.add(datetime)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = datetime
            datetime = df[datetime].values
        temporals = datetime
    else:
        if isinstance(date, str):
            if exclude_original_temporal_columns:
                columns_to_exclude.add(date)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = date
            date = df[date].values
        if isinstance(time, str):
            if exclude_original_temporal_columns:
                columns_to_exclude.add(time)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = time
            time = df[time].values
        if date is not None and time is not None:
            temporals = [dt.datetime.combine(d, t) for d, t in zip(date, time)]
        elif date is not None:
            temporals = date
        else:  # time is not None
            temporals = time

    if new_bucket_column is None:
        new_bucket_column = 'bucket'

    # Map fix_kind onto the comparison used to select fix points.
    if fix_kind in ('first', 'after'):
        comparison = 'ge'
    elif fix_kind == 'after_exclusive':
        comparison = 'gt'
    elif fix_kind in ('last', 'before'):
        comparison = 'le'
    elif fix_kind == 'before_exclusive':
        comparison = 'lt'
    else:
        raise ValueError('Unfamiliar fix_kind: "%s"' % str(fix_kind))

    # 'first' and 'last' infer the fix time from the data; the other kinds
    # require an explicit fix_time.
    if fix_kind in ('first', 'last'):
        checks.check_none(fix_time)
    else:
        checks.check_not_none(fix_time)

    # fix_points is either a count of points or a timedelta window.
    numeric_fix_points = checks.is_some_number(fix_points)
    if not numeric_fix_points:
        fix_points = conv.to_python_timedelta(fix_points)

    grouping_df = pd.DataFrame({'temporals': temporals})
    grouped_df = grouping_df.groupby(bucket(temporals))

    columns = [new_bucket_column]
    data = {new_bucket_column: []}
    aggs = {}
    if checks.is_some_dict(aggregator):
        column_agg_pairs = aggregator.items()
    elif checks.is_iterable(aggregator):
        column_agg_pairs = aggregator
    else:
        column_agg_pairs = zip(df.columns, utils.xconst(aggregator))
    for column, agg in column_agg_pairs:
        if column not in columns_to_exclude:
            columns.append(column)
            data[column] = []
            aggs[column] = agg

    dates_with_no_points = []
    dates_with_fix_point_limits_breached = col.OrderedDict()
    fix_point_counts = col.OrderedDict()

    # Note: from here on, `bucket` refers to the group key, shadowing the
    # bucketing function, which is no longer needed.
    for bucket, group_df in grouped_df:
        if len(group_df) == 0:
            dates_with_no_points.append(bucket)
        if not already_sorted:
            group_df = group_df.copy()
            group_df.sort_values('temporals', inplace=True)
        if fix_kind == 'first':
            fix_time = group_df['temporals'].values[0]
        elif fix_kind == 'last':
            fix_time = group_df['temporals'].values[-1]
        if numeric_fix_points:
            # Take up to fix_points points on the relevant side of fix_time.
            if comparison == 'ge':
                fix_point_indices = group_df.index[tsatimes.temporal_ge(group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[tsatimes.temporal_gt(group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'le':
                fix_point_indices = group_df.index[tsatimes.temporal_le(group_df['temporals'], fix_time)][-fix_points:]
            else:  # comparison == 'lt'
                fix_point_indices = group_df.index[tsatimes.temporal_lt(group_df['temporals'], fix_time)][-fix_points:]
        else:
            # Take the points within a fix_points-long window starting or
            # ending at fix_time.
            if comparison == 'ge':
                fix_point_indices = group_df.index[
                    (tsatimes.temporal_ge(group_df['temporals'], fix_time)) &
                    (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[
                    (tsatimes.temporal_gt(group_df['temporals'], fix_time)) &
                    (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'le':
                fix_point_indices = group_df.index[
                    (tsatimes.temporal_le(group_df['temporals'], fix_time)) &
                    (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]
            else:  # comparison == 'lt'
                fix_point_indices = group_df.index[
                    (tsatimes.temporal_lt(group_df['temporals'], fix_time)) &
                    (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]

        fix_point_limits_breached = set()
        if min_fix_point_count is not None and len(fix_point_indices) < min_fix_point_count:
            fix_point_limits_breached.add('min_fix_point_count')
        if max_fix_point_count is not None and len(fix_point_indices) > max_fix_point_count:
            fix_point_limits_breached.add('max_fix_point_count')
        # Each *_fix_point_time limit may be given as an absolute temporal or
        # as a timedelta applied to fix_time.
        if min_min_fix_point_time is not None:
            if checks.is_some_timedelta(min_min_fix_point_time):
                the_min_min_fix_point_time = fix_time + min_min_fix_point_time \
                        if comparison in ('ge', 'gt') else fix_time - min_min_fix_point_time
            else:
                the_min_min_fix_point_time = min_min_fix_point_time
            if tsatimes.temporal_lt(min(grouping_df['temporals'].values[fix_point_indices]), the_min_min_fix_point_time):
                fix_point_limits_breached.add('min_min_fix_point_time')
        if max_min_fix_point_time is not None:
            if checks.is_some_timedelta(max_min_fix_point_time):
                the_max_min_fix_point_time = fix_time + max_min_fix_point_time \
                        if comparison in ('ge', 'gt') else fix_time - max_min_fix_point_time
            else:
                the_max_min_fix_point_time = max_min_fix_point_time
            if tsatimes.temporal_gt(min(grouping_df['temporals'].values[fix_point_indices]), the_max_min_fix_point_time):
                fix_point_limits_breached.add('max_min_fix_point_time')
        if min_max_fix_point_time is not None:
            if checks.is_some_timedelta(min_max_fix_point_time):
                the_min_max_fix_point_time = fix_time + min_max_fix_point_time \
                        if comparison in ('ge', 'gt') else fix_time - min_max_fix_point_time
            else:
                the_min_max_fix_point_time = min_max_fix_point_time
            if tsatimes.temporal_lt(max(grouping_df['temporals'].values[fix_point_indices]), the_min_max_fix_point_time):
                fix_point_limits_breached.add('min_max_fix_point_time')
        if max_max_fix_point_time is not None:
            if checks.is_some_timedelta(max_max_fix_point_time):
                the_max_max_fix_point_time = fix_time + max_max_fix_point_time \
                        if comparison in ('ge', 'gt') else fix_time - max_max_fix_point_time
            else:
                the_max_max_fix_point_time = max_max_fix_point_time
            if tsatimes.temporal_gt(max(grouping_df['temporals'].values[fix_point_indices]), the_max_max_fix_point_time):
                fix_point_limits_breached.add('max_max_fix_point_time')

        if len(fix_point_limits_breached) > 0:
            dates_with_fix_point_limits_breached[bucket] = fix_point_limits_breached
        else:
            data[new_bucket_column].append(bucket)
            for column in columns[1:]:
                if column not in columns_to_exclude:
                    arg = df.iloc[fix_point_indices] if aggregators_apply_to_df \
                            else df.iloc[fix_point_indices][column].values
                    data[column].append(aggs[column](arg))
            fix_point_counts[bucket] = len(fix_point_indices)

    df = pd.DataFrame(data, columns=columns)

    if return_extra_info:
        return {
            'df': df,
            'dates_with_no_points': dates_with_no_points,
            'dates_with_fix_point_limits_breached': dates_with_fix_point_limits_breached,
            'fix_point_counts': fix_point_counts
        }
    else:
        return df
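# Usage sketch (illustrative, not part of the original module; the DataFrame
# and its column names are made up): reduce an intraday price series to one
# row per calendar date. With fix_kind='last' (and hence fix_time=None), the
# fix time of each date is that date's last temporal, and the default
# fix_points=10 aggregates up to the last 10 points of the day with the
# default aggregator.
def _sparsen_usage_sketch():
    import datetime as _dt
    import pandas as _pd
    df = _pd.DataFrame({
        'time': [_dt.datetime(2024, 1, 1, 15, 58), _dt.datetime(2024, 1, 1, 16, 0),
                 _dt.datetime(2024, 1, 2, 15, 59), _dt.datetime(2024, 1, 2, 16, 30)],
        'price': [100., 101., 102., 103.]})
    # One row per date; the original 'time' column is excluded and replaced by
    # the bucket column (also named 'time', since new_bucket_column defaults
    # to the original temporal column's name).
    return sparsen(df, datetime='time', bucket='date', fix_kind='last')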
def run(observable, obss=None, times=None, obs_covs=None, true_values=None,
        df=None, fun=None, return_df=False):
    """Run one or more observables over a sequence of observations.

    Returns a FilterRunResult holding the last observation result, the
    cumulative log-likelihood of the accepted observations and, if return_df
    is True, a DataFrame of per-step diagnostics whose first row records the
    initial filter state.
    """
    if df is not None:
        # Resolve column names (or positions) against df.
        if obss is not None and (checks.is_string(obss) or checks.is_int(obss)):
            obss = df[obss]
        if times is None:
            if isinstance(obss, pd.Series):
                times = obss.index.values
        elif checks.is_string(times) or checks.is_int(times):
            times = df[times].values
        if isinstance(obss, pd.Series):
            obss = obss.values
        if obs_covs is not None and (checks.is_string(obs_covs) or checks.is_int(obs_covs)):
            obs_covs = df[obs_covs].values
        if true_values is not None and (checks.is_string(true_values) or checks.is_int(true_values)):
            true_values = df[true_values].values

    checks.check_not_none(obss)

    # Broadcast scalar arguments so that they can be zipped with obss.
    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss):
        obss = [obss]
    if not checks.is_iterable_not_string(times):
        times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)

    obs_result = None
    cumulative_log_likelihood = 0.

    if return_df:
        time = []
        filter_name = []
        filter_type = []
        observable_name = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        cross_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        true_value = []
        log_likelihood = []
        gain = []

    last_time = None

    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(
            observable, obss, times, obs_covs, true_values):
        # When no time is given, use consecutive integers starting at 0.
        if a_time is None:
            a_time = 0 if last_time is None else last_time + 1
        last_time = a_time

        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None:
            an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError(
                    'Conflicting arguments: an observation covariance is '
                    'provided while the observation is given by a distribution')
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)

        if return_df and len(time) == 0:
            # The first row records the initial filter state.
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(None)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            cross_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            true_value.append(None)
            log_likelihood.append(None)
            gain.append(None)

        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time, an_observable.filter.time)

        predicted_obs = an_observable.predict(time=a_time, true_value=a_true_value)

        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov

        obs_result = an_observable.observe(obs=an_obs, time=a_time,
                                           true_value=a_true_value,
                                           predicted_obs=predicted_obs)

        if obs_result.accepted:
            cumulative_log_likelihood += obs_result.log_likelihood

        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov

        if return_df:
            time.append(obs_result.obs.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(an_observable.name)
            accepted.append(obs_result.accepted)
            obs_mean.append(npu.to_scalar(obs_result.obs.distr.mean, raise_value_error=False))
            obs_cov.append(npu.to_scalar(obs_result.obs.distr.cov, raise_value_error=False))
            predicted_obs_mean.append(npu.to_scalar(obs_result.predicted_obs.distr.mean, raise_value_error=False))
            predicted_obs_cov.append(npu.to_scalar(obs_result.predicted_obs.distr.cov, raise_value_error=False))
            cross_cov.append(npu.to_scalar(obs_result.predicted_obs.cross_cov, raise_value_error=False))
            innov_mean.append(npu.to_scalar(obs_result.innov_distr.mean, raise_value_error=False))
            innov_cov.append(npu.to_scalar(obs_result.innov_distr.cov, raise_value_error=False))
            prior_state_mean.append(npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            true_value.append(npu.to_scalar(a_true_value, raise_value_error=False))
            log_likelihood.append(npu.to_scalar(obs_result.log_likelihood, raise_value_error=False))
            gain.append(obs_result.gain if hasattr(obs_result, 'gain') else None)

    df = None
    if return_df:
        df = pd.DataFrame(
            {
                'time': time,
                'filter_name': filter_name,
                'filter_type': filter_type,
                'observable_name': observable_name,
                'accepted': accepted,
                'obs_mean': obs_mean,
                'obs_cov': obs_cov,
                'predicted_obs_mean': predicted_obs_mean,
                'predicted_obs_cov': predicted_obs_cov,
                'cross_cov': cross_cov,
                'innov_mean': innov_mean,
                'innov_cov': innov_cov,
                'prior_state_mean': prior_state_mean,
                'prior_state_cov': prior_state_cov,
                'posterior_state_mean': posterior_state_mean,
                'posterior_state_cov': posterior_state_cov,
                'true_value': true_value,
                'log_likelihood': log_likelihood,
                'gain': gain
            },
            columns=('time', 'filter_name', 'filter_type', 'observable_name',
                     'accepted', 'obs_mean', 'obs_cov', 'predicted_obs_mean',
                     'predicted_obs_cov', 'cross_cov', 'innov_mean', 'innov_cov',
                     'prior_state_mean', 'prior_state_cov',
                     'posterior_state_mean', 'posterior_state_cov',
                     'true_value', 'log_likelihood', 'gain'))

    return FilterRunResult(obs_result, cumulative_log_likelihood, df)
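# Usage sketch (illustrative; the DataFrame and its column names are made up):
# obss and obs_covs may be given as column names, resolved against df; when
# times is omitted, observation times default to 0, 1, 2, ... The result is a
# FilterRunResult carrying the last observation result, the cumulative
# log-likelihood over the accepted observations and, since return_df=True, a
# per-step diagnostics DataFrame.
def _run_usage_sketch(observable, observations_df):
    return run(observable, obss='price', obs_covs='price_var',
               df=observations_df, return_df=True)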
def run(observable, obss=None, times=None, obs_covs=None, true_values=None,
        df=None, fun=None, return_df=False):
    """Run one or more observables over a sequence of observations.

    Returns a DataFrame of per-step diagnostics if return_df is True,
    otherwise the last observation result.
    """
    if df is not None:
        # Resolve column names against df.
        if obss is not None and checks.is_string(obss):
            obss = df[obss].values
        if times is not None and checks.is_string(times):
            times = df[times].values
        if obs_covs is not None and checks.is_string(obs_covs):
            obs_covs = df[obs_covs].values
        if true_values is not None and checks.is_string(true_values):
            true_values = df[true_values].values

    checks.check_not_none(obss)

    # Broadcast scalar arguments so that they can be zipped with obss.
    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss):
        obss = [obss]
    if not checks.is_iterable_not_string(times):
        times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)

    obs_result = None

    if return_df:
        time = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        log_likelihood = []

    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(
            observable, obss, times, obs_covs, true_values):
        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None:
            an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError(
                    'Conflicting arguments: an observation covariance is '
                    'provided while the observation is given by a distribution')
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)

        if return_df and len(time) == 0:
            # The first row records the initial filter state.
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            log_likelihood.append(None)

        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time, an_observable.filter.time)

        predicted_obs = an_observable.predict(time=a_time, true_value=a_true_value)

        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov

        obs_result = an_observable.observe(obs=an_obs, time=a_time,
                                           true_value=a_true_value,
                                           predicted_obs=predicted_obs)

        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov

        if return_df:
            time.append(obs_result.obs.time)
            accepted.append(obs_result.accepted)
            obs_mean.append(npu.to_scalar(obs_result.obs.distr.mean, raise_value_error=False))
            obs_cov.append(npu.to_scalar(obs_result.obs.distr.cov, raise_value_error=False))
            predicted_obs_mean.append(npu.to_scalar(obs_result.predicted_obs.distr.mean, raise_value_error=False))
            predicted_obs_cov.append(npu.to_scalar(obs_result.predicted_obs.distr.cov, raise_value_error=False))
            innov_mean.append(npu.to_scalar(obs_result.innov_distr.mean, raise_value_error=False))
            innov_cov.append(npu.to_scalar(obs_result.innov_distr.cov, raise_value_error=False))
            prior_state_mean.append(npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            log_likelihood.append(npu.to_scalar(obs_result.log_likelihood, raise_value_error=False))

    if return_df:
        return pd.DataFrame(
            {
                'time': time,
                'accepted': accepted,
                'obs_mean': obs_mean,
                'obs_cov': obs_cov,
                'predicted_obs_mean': predicted_obs_mean,
                'predicted_obs_cov': predicted_obs_cov,
                'innov_mean': innov_mean,
                'innov_cov': innov_cov,
                'prior_state_mean': prior_state_mean,
                'prior_state_cov': prior_state_cov,
                'posterior_state_mean': posterior_state_mean,
                'posterior_state_cov': posterior_state_cov,
                'log_likelihood': log_likelihood
            },
            columns=('time', 'accepted', 'obs_mean', 'obs_cov',
                     'predicted_obs_mean', 'predicted_obs_cov', 'innov_mean',
                     'innov_cov', 'prior_state_mean', 'prior_state_cov',
                     'posterior_state_mean', 'posterior_state_cov',
                     'log_likelihood'))
    return obs_result
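# Usage sketch (illustrative; the observation variance below is made up):
# this variant takes plain sequences rather than requiring a DataFrame. With
# return_df=False (the default) it returns the last observation result; with
# return_df=True it returns the diagnostics DataFrame instead. A scalar
# obs_covs is broadcast across all observations.
def _simple_run_usage_sketch(observable, observations):
    return run(observable, obss=observations, obs_covs=1e-4, return_df=True)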