def split(self, purpose=('training', 'validation', 'test'), fraction=(.5, .25, .25)):
    """Partition the working input rows into consecutive purpose-labelled slices.

    Parameters
    ----------
    purpose : str or iterable of str
        One of 'training', 'validation', 'test' per slice; a scalar is
        treated as a single-element list.
    fraction : float or iterable of float
        Fraction of the full row count assigned to each slice, paired
        positionally with `purpose`.

    Raises
    ------
    ValueError
        If a purpose is not one of the three recognised names.
    """
    logger = logging.getLogger()
    if not checks.is_iterable_not_string(purpose):
        purpose = [purpose]
    if not checks.is_iterable(fraction):
        fraction = [fraction]
    split_purposes = []
    split_starts_inclusive = []
    split_ends_exclusive = []
    count_remaining = len(self.input_working)
    fraction_done = 0.
    count_done = 0
    for p, f in zip(purpose, fraction):
        # Explicit validation (not assert) so the check survives `python -O`.
        if p not in ('training', 'validation', 'test'):
            raise ValueError('Invalid purpose: %s' % p)
        split_purposes.append(p)
        # Rescale by the remaining fraction so each f is interpreted relative
        # to the full data set rather than to what is left over.
        next_count = int(count_remaining * f / (1. - fraction_done))
        split_starts_inclusive.append(count_done)
        count_done += next_count
        split_ends_exclusive.append(count_done)
        count_remaining -= next_count
        fraction_done += f
        logger.info('A %s set: [%d, %d)' % (split_purposes[-1], split_starts_inclusive[-1], split_ends_exclusive[-1]))
    self.__is_split = True
    self.__split_purposes = tuple(split_purposes)
    self.__split_starts_inclusive = tuple(split_starts_inclusive)
    self.__split_ends_exclusive = tuple(split_ends_exclusive)
def add_diff(self, column=None, prefix='diff(', suffix=')', exclude_column_re=None, include_column_re=None):
    """Append a first-difference column for each selected input column.

    Columns may be filtered by `include_column_re`/`exclude_column_re`.
    Also advances the truncate-from-above marker past leading NaNs that
    differencing introduces.
    """
    logger = logging.getLogger()
    if column is None:
        column = self.__input_df.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    include_re = re.compile(include_column_re) if include_column_re is not None else None
    exclude_re = re.compile(exclude_column_re) if exclude_column_re is not None else None
    for col in column:
        if include_re is not None and not include_re.match(col):
            logger.info('- Excluding column due to include_column_re: %s' % col)
            continue
        if exclude_re is not None and exclude_re.match(col):
            logger.info('- Excluding column due to exclude_column_re: %s' % col)
            continue
        diff_name = prefix + col + suffix
        logger.info('- Adding new diff column: %s' % diff_name)
        self.__input_df[diff_name] = self.__input_df[col].diff()
        null_flags = list(self.__input_df[diff_name].isnull().values)
        try:
            # Position of the first non-null value in the new column.
            cutoff = null_flags.index(False)
        except ValueError:
            # Entirely null: truncate everything.
            cutoff = len(self.__input_df)
        self.__truncate_from_above = max(self.__truncate_from_above, cutoff)
def add_ln(self, column=None, prefix='ln(', suffix=')', exclude_column_re=None, include_column_re=None, exclude_columns_with_negative_values=True):
    """Append a natural-logarithm column for each selected input column.

    Columns may be filtered by `include_column_re`/`exclude_column_re`;
    columns containing negative values are skipped by default.
    """
    logger = logging.getLogger()
    if column is None:
        column = self.__input_df.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    include_re = re.compile(include_column_re) if include_column_re is not None else None
    exclude_re = re.compile(exclude_column_re) if exclude_column_re is not None else None
    for col in column:
        if include_re is not None and not include_re.match(col):
            logger.info('- Excluding column due to include_column_re: %s' % col)
            continue
        if exclude_re is not None and exclude_re.match(col):
            logger.info('- Excluding column due to exclude_column_re: %s' % col)
            continue
        if exclude_columns_with_negative_values and any(self.__input_df[col] < 0.):
            logger.info('- Excluding column since it contains negative values: %s' % col)
            continue
        ln_name = prefix + col + suffix
        logger.info('- Adding new ln column: %s' % ln_name)
        self.__input_df[ln_name] = self.__input_df[col].apply(np.log)
def add_ma(self, window, column=None, prefix='ma(${WINDOW},', suffix=')', exclude_column_re=None, include_column_re=None):
    """Append rolling-mean (moving-average) columns for each (column, window) pair.

    The placeholder '${WINDOW}' in `prefix`/`suffix` is substituted with
    the window size. Also advances the truncate-from-above marker past the
    leading NaNs the rolling window introduces.
    """
    logger = logging.getLogger()
    checks.check_not_none(window)
    windows = window if checks.is_iterable(window) else [window]
    if column is None:
        column = self.__input_df.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    include_re = re.compile(include_column_re) if include_column_re is not None else None
    exclude_re = re.compile(exclude_column_re) if exclude_column_re is not None else None
    for col in column:
        if include_re is not None and not include_re.match(col):
            logger.info('- Excluding column due to include_column_re: %s' % col)
            continue
        if exclude_re is not None and exclude_re.match(col):
            logger.info('- Excluding column due to exclude_column_re: %s' % col)
            continue
        for w in windows:
            ma_name = prefix.replace('${WINDOW}', str(w)) + col + suffix.replace('${WINDOW}', str(w))
            logger.info('- Adding new MA column: %s' % ma_name)
            self.__input_df[ma_name] = self.__input_df[col].rolling(window=w, center=False).mean()
            null_flags = list(self.__input_df[ma_name].isnull().values)
            try:
                # Position of the first non-null value in the new column.
                cutoff = null_flags.index(False)
            except ValueError:
                # Entirely null: truncate everything.
                cutoff = len(self.__input_df)
            self.__truncate_from_above = max(self.__truncate_from_above, cutoff)
def add_lag(self, lag, column=None, prefix='lag(${LAG},', suffix=')', exclude_column_re=None, include_column_re=None):
    """Append lagged (shifted) columns for each (column, lag) pair.

    The placeholder '${LAG}' in `prefix`/`suffix` is substituted with the
    lag value. Also advances the truncate-from-above marker past the
    leading NaNs shifting introduces.
    """
    logger = logging.getLogger()
    checks.check_not_none(lag)
    lags = lag if checks.is_iterable(lag) else [lag]
    if column is None:
        column = self.__input_df.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    include_re = re.compile(include_column_re) if include_column_re is not None else None
    exclude_re = re.compile(exclude_column_re) if exclude_column_re is not None else None
    for col in column:
        if include_re is not None and not include_re.match(col):
            logger.info('- Excluding column due to include_column_re: %s' % col)
            continue
        if exclude_re is not None and exclude_re.match(col):
            logger.info('- Excluding column due to exclude_column_re: %s' % col)
            continue
        for a_lag in lags:
            lag_name = prefix.replace('${LAG}', str(a_lag)) + col + suffix.replace('${LAG}', str(a_lag))
            logger.info('- Adding new lag column: %s' % lag_name)
            self.__input_df[lag_name] = self.__input_df[col].shift(a_lag)
            null_flags = list(self.__input_df[lag_name].isnull().values)
            try:
                # Position of the first non-null value in the new column.
                cutoff = null_flags.index(False)
            except ValueError:
                # Entirely null: truncate everything.
                cutoff = len(self.__input_df)
            self.__truncate_from_above = max(self.__truncate_from_above, cutoff)
def evaluate_metrics_by_forecast_horizon(ds, column=None, model=None, metric=sklearn.metrics.r2_score, fit_set='training', predict_set='test'):
    """Evaluate `metric` per forecast horizon, fitting on `fit_set` and scoring on `predict_set`.

    Parameters
    ----------
    ds : data set object exposing training/validation/test sets and `forecast_horizon`.
    column : column name(s) used as model input; defaults to all input columns.
    model : estimator with fit/predict; a fresh LinearRegression if None.
    metric : callable(y_true, y_pred) -> float, applied per horizon column.
    fit_set, predict_set : one of 'training', 'validation', 'test'.

    Returns
    -------
    numpy array with one metric value per forecast horizon, averaged over
    the predict sets.

    Raises
    ------
    ValueError : if `fit_set` or `predict_set` is not a recognised name.
    """
    logger = logging.getLogger()
    # Construct a fresh estimator per call: a default-argument instance would
    # be a single shared object, silently re-fitted across calls.
    if model is None:
        model = sklearn.linear_model.LinearRegression()
    if column is None:
        column = ds.input_all.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    # Explicit validation (not assert) so the checks survive `python -O`.
    if fit_set not in ('training', 'validation', 'test'):
        raise ValueError('Invalid fit_set: %s' % fit_set)
    if predict_set not in ('training', 'validation', 'test'):
        raise ValueError('Invalid predict_set: %s' % predict_set)
    if fit_set == 'training':
        fit_sets = ds.training_set
        all_fit_sets = ds.all_training_sets
    elif fit_set == 'validation':
        fit_sets = ds.validation_set
        all_fit_sets = ds.all_validation_sets
    else:
        fit_sets = ds.test_set
        all_fit_sets = ds.all_test_sets
    if predict_set == 'training':
        predict_sets = ds.training_set
    elif predict_set == 'validation':
        predict_sets = ds.validation_set
    else:
        predict_sets = ds.test_set
    logger.info('Evaluating the metric for column(s) %s' % ', '.join(['"%s"' % c for c in column]))
    metrics = []
    if len(fit_sets) == len(predict_sets):
        # One-to-one pairing: refit the model for each (fit, predict) pair.
        for fs, ps in zip(fit_sets, predict_sets):
            model.fit(fs.input[column].values, fs.output.values)
            y_predict = ps.output.values
            y_predict_pred = model.predict(ps.input[column].values)
            metrics.append([metric(y_predict[:, i], y_predict_pred[:, i])
                            for i in range(len(ds.forecast_horizon))])
    else:
        # Mismatched set counts: fit once on the union of the fit sets.
        model.fit(all_fit_sets.input[column].values, all_fit_sets.output.values)
        for ps in predict_sets:
            y_predict = ps.output.values
            y_predict_pred = model.predict(ps.input[column].values)
            metrics.append([metric(y_predict[:, i], y_predict_pred[:, i])
                            for i in range(len(ds.forecast_horizon))])
    # The mean is taken over the predict sets
    return np.mean(metrics, axis=0)
def _temporal_comparison(x, y, comp, return_pandas_series=True):
    """Apply `comp` to temporal values, broadcasting element-wise over iterables.

    Scalar operands are normalised to plain Python time/date/datetime before
    comparison; iterable operands recurse element-wise and (by default) the
    result is wrapped in a pandas Series.
    """
    import pandas as pd

    def _wrap(values):
        # Callers may opt out of the pandas dependency on the result.
        return pd.Series(values) if return_pandas_series else values

    x_is_iterable = checks.is_iterable_not_string(x)
    y_is_iterable = checks.is_iterable_not_string(y)
    if x_is_iterable and y_is_iterable:
        return _wrap([_temporal_comparison(ex, ey, comp) for ex, ey in zip(x, y)])
    if x_is_iterable:
        return _wrap([_temporal_comparison(ex, y, comp) for ex in x])
    if y_is_iterable:
        return _wrap([_temporal_comparison(x, ey, comp) for ey in y])
    if checks.is_some_time(x) or checks.is_some_time(y):
        return comp(conv.to_python_time(x), conv.to_python_time(y))
    if checks.is_some_date(x) or checks.is_some_date(y):
        return comp(conv.to_python_date(x), conv.to_python_date(y))
    return comp(conv.to_python_datetime(x), conv.to_python_datetime(y))
def load_df_from_file(path, cusip=None, first_report_date=None, last_report_date=None):
    """Load the zipped CSV at `path`, optionally filtered by CUSIP and report-date range.

    `cusip` may be a scalar (equality filter) or an iterable (membership
    filter); the date bounds are inclusive and parsed with `conv.str_to_date`.
    """
    predicates = []
    if cusip is not None:
        cusip_predicate = (pdutils.isin('cusip_id', cusip)
                           if checks.is_iterable_not_string(cusip)
                           else pdutils.eq('cusip_id', cusip))
        predicates.append(cusip_predicate)
    if first_report_date is not None:
        predicates.append(pdutils.ge('trans_dt', first_report_date, conv.str_to_date))
    if last_report_date is not None:
        predicates.append(pdutils.le('trans_dt', last_report_date, conv.str_to_date))
    return pdutils.load_df_from_zipped_csv(path, predicates=predicates)
def evaluate_metrics_by_forecast_horizon(ds, column=None, model=None, metric=sklearn.metrics.r2_score):
    """Evaluate `metric` per forecast horizon, fitting on training set(s) and scoring on validation set(s).

    Parameters
    ----------
    ds : data set object exposing training/validation sets and `forecast_horizon`.
    column : column name(s) used as model input; defaults to all input columns.
    model : estimator with fit/predict; a fresh LinearRegression if None.
    metric : callable(y_true, y_pred) -> float, applied per horizon column.

    Returns
    -------
    numpy array with one metric value per forecast horizon, averaged over
    the validation sets.
    """
    logger = logging.getLogger()
    # Construct a fresh estimator per call: a default-argument instance would
    # be a single shared object, silently re-fitted across calls.
    if model is None:
        model = sklearn.linear_model.LinearRegression()
    if column is None:
        column = ds.input_all.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    logger.info('Evaluating the metric for column(s) %s' % ', '.join(['"%s"' % c for c in column]))
    metrics = []
    if len(ds.training_set) == len(ds.validation_set):
        # One-to-one pairing: refit per (training, validation) pair.
        for ts, vs in zip(ds.training_set, ds.validation_set):
            model.fit(ts.input[column].values, ts.output.values)
            y_validation = vs.output.values
            y_validation_pred = model.predict(vs.input[column].values)
            metrics.append([metric(y_validation[:, i], y_validation_pred[:, i])
                            for i in range(len(ds.forecast_horizon))])
    else:
        # Mismatched set counts: fit once on the union of the training sets.
        model.fit(ds.all_training_sets.input[column].values, ds.all_training_sets.output.values)
        for vs in ds.validation_set:
            y_validation = vs.output.values
            y_validation_pred = model.predict(vs.input[column].values)
            metrics.append([metric(y_validation[:, i], y_validation_pred[:, i])
                            for i in range(len(ds.forecast_horizon))])
    # The mean is taken over the validation sets
    return np.mean(metrics, axis=0)
def visualize_metrics_by_forecast_horizon(ds, column=None, model=None, metric=sklearn.metrics.r2_score, fit_set='training', predict_set='test', figure=None):
    """Plot per-column and combined metric curves against the forecast horizon.

    Parameters
    ----------
    ds : data set object exposing training/validation/test sets and `forecast_horizon`.
    column : column name(s) to evaluate; defaults to all input columns.
    model : estimator with fit/predict; a fresh LinearRegression if None.
    metric : callable(y_true, y_pred) -> float.
    fit_set, predict_set : one of 'training', 'validation', 'test'.
    figure : matplotlib figure to draw on; a new 12x12 figure if None.

    Returns
    -------
    dict with keys 'metrics_by_column' and 'combined_metrics'.

    Raises
    ------
    ValueError : if `fit_set` or `predict_set` is not a recognised name.
    """
    # Construct a fresh estimator per call: a default-argument instance would
    # be a single shared object, silently re-fitted across calls.
    if model is None:
        model = sklearn.linear_model.LinearRegression()
    if column is None:
        column = ds.input_all.columns
    if not checks.is_iterable_not_string(column):
        column = [column]
    # Explicit validation (not assert) so the checks survive `python -O`.
    if fit_set not in ('training', 'validation', 'test'):
        raise ValueError('Invalid fit_set: %s' % fit_set)
    if predict_set not in ('training', 'validation', 'test'):
        raise ValueError('Invalid predict_set: %s' % predict_set)
    if figure is None:
        figure = plt.figure(figsize=(12, 12))
    metrics_by_column = {}
    for c in column:
        metrics_by_column[c] = evaluate_metrics_by_forecast_horizon(ds, c, model, metric, fit_set, predict_set)
    combined_metrics = evaluate_metrics_by_forecast_horizon(ds, column, model, metric, fit_set, predict_set)
    ax = figure.add_subplot(111)
    # Cycle through distinct markers/linestyles so many curves stay readable.
    markers = itertools.cycle(('.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', '8', 's', 'p', 'P', '*', 'h', 'H', '+', 'x', 'X', 'D', 'd', '|', '_'))
    linestyles = itertools.cycle((':', '-.', '--', '-'))
    for c in column:
        ax.plot(ds.forecast_horizon, metrics_by_column[c], marker=next(markers), linestyle=next(linestyles), linewidth=1, alpha=.5, label=c)
    ax.plot(ds.forecast_horizon, combined_metrics, 'o-', linewidth=2, label='combined')
    # Shrink current axis's height by 10% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])
    # Put a legend below current axis
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5)
    return {'metrics_by_column': metrics_by_column, 'combined_metrics': combined_metrics}
def keep_input_column(self, column):
    """Drop every input column that is not listed in `column` (scalar or iterable)."""
    if not checks.is_iterable_not_string(column):
        column = [column]
    # Snapshot the columns to drop, then delete them one by one.
    doomed = [c for c in self.__input_df.columns if c not in column]
    for name in doomed:
        del self.__input_df[name]
def remove_input_column(self, column):
    """Delete the named input column(s) from the input data frame."""
    names = column if checks.is_iterable_not_string(column) else [column]
    for name in names:
        del self.__input_df[name]
def run(observable, obss=None, times=None, obs_covs=None, true_values=None, df=None, fun=None, return_df=False):
    """Run one or more observables over a sequence of observations.

    Parameters
    ----------
    observable : an observable (or callable returning one, or an iterable of
        them); scalars are broadcast across the observation sequence.
    obss : observations, or a column name/index resolved against `df`.
    times : observation times, or a column name/index resolved against `df`;
        if None, consecutive integer times are generated.
    obs_covs : optional observation covariances (scalar/column/iterable);
        incompatible with observations already given as distributions.
    true_values : optional ground-truth values for diagnostics.
    df : optional pandas DataFrame providing the named columns above.
    fun : optional transformation applied to each observation.
    return_df : when True, collect a per-step diagnostics DataFrame.

    Returns
    -------
    FilterRunResult(last observation result, cumulative log-likelihood over
    accepted observations, diagnostics DataFrame or None).
    """
    if df is not None:
        # Resolve column names/indices against the data frame.
        if obss is not None and (checks.is_string(obss) or checks.is_int(obss)):
            obss = df[obss]
        if times is None:
            if isinstance(obss, pd.Series):
                times = obss.index.values
        elif checks.is_string(times) or checks.is_int(times):
            times = df[times].values
        if isinstance(obss, pd.Series):
            obss = obss.values
        if obs_covs is not None and (checks.is_string(obs_covs) or checks.is_int(obs_covs)):
            obs_covs = df[obs_covs].values
        if true_values is not None and (checks.is_string(true_values) or checks.is_int(true_values)):
            true_values = df[true_values].values
    checks.check_not_none(obss)
    # Broadcast scalar arguments so everything can be zipped together.
    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss):
        obss = [obss]
    if not checks.is_iterable_not_string(times):
        times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)
    obs_result = None
    cumulative_log_likelihood = 0.
    if return_df:
        time = []
        filter_name = []
        filter_type = []
        observable_name = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        cross_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        true_value = []
        log_likelihood = []
        gain = []
    last_time = None
    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(observable, obss, times, obs_covs, true_values):
        if a_time is None:
            # No explicit time: fall back to consecutive integer times.
            a_time = 0 if last_time is None else last_time + 1
        last_time = a_time
        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None:
            an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError('An observation covariance is provided while the observation is given by a distribution --- conflicting arguments')
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)
        if return_df and len(time) == 0:
            # Emit an initial row describing the filter's state before any
            # observation has been processed.
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(None)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            cross_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            true_value.append(None)
            log_likelihood.append(None)
            gain.append(None)
        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time, an_observable.filter.time)
        predicted_obs = an_observable.predict(time=a_time, true_value=a_true_value)
        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov
        obs_result = an_observable.observe(obs=an_obs, time=a_time, true_value=a_true_value, predicted_obs=predicted_obs)
        if obs_result.accepted:
            cumulative_log_likelihood += obs_result.log_likelihood
        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov
        if return_df:
            time.append(obs_result.obs.time)
            filter_name.append(an_observable.filter.name)
            filter_type.append(type(an_observable.filter))
            observable_name.append(an_observable.name)
            accepted.append(obs_result.accepted)
            obs_mean.append(npu.to_scalar(obs_result.obs.distr.mean, raise_value_error=False))
            obs_cov.append(npu.to_scalar(obs_result.obs.distr.cov, raise_value_error=False))
            predicted_obs_mean.append(npu.to_scalar(obs_result.predicted_obs.distr.mean, raise_value_error=False))
            predicted_obs_cov.append(npu.to_scalar(obs_result.predicted_obs.distr.cov, raise_value_error=False))
            cross_cov.append(npu.to_scalar(obs_result.predicted_obs.cross_cov, raise_value_error=False))
            innov_mean.append(npu.to_scalar(obs_result.innov_distr.mean, raise_value_error=False))
            innov_cov.append(npu.to_scalar(obs_result.innov_distr.cov, raise_value_error=False))
            prior_state_mean.append(npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            true_value.append(npu.to_scalar(a_true_value, raise_value_error=False))
            log_likelihood.append(npu.to_scalar(obs_result.log_likelihood, raise_value_error=False))
            gain.append(obs_result.gain if hasattr(obs_result, 'gain') else None)
    df = None
    if return_df:
        # Bug fix: the posterior_state_* columns were previously populated
        # from the prior-state lists, discarding the collected posteriors.
        df = pd.DataFrame(
            {'time': time,
             'filter_name': filter_name,
             'filter_type': filter_type,
             'observable_name': observable_name,
             'accepted': accepted,
             'obs_mean': obs_mean,
             'obs_cov': obs_cov,
             'predicted_obs_mean': predicted_obs_mean,
             'predicted_obs_cov': predicted_obs_cov,
             'cross_cov': cross_cov,
             'innov_mean': innov_mean,
             'innov_cov': innov_cov,
             'prior_state_mean': prior_state_mean,
             'prior_state_cov': prior_state_cov,
             'posterior_state_mean': posterior_state_mean,
             'posterior_state_cov': posterior_state_cov,
             'true_value': true_value,
             'log_likelihood': log_likelihood,
             'gain': gain},
            columns=('time', 'filter_name', 'filter_type', 'observable_name', 'accepted',
                     'obs_mean', 'obs_cov', 'predicted_obs_mean', 'predicted_obs_cov',
                     'cross_cov', 'innov_mean', 'innov_cov', 'prior_state_mean',
                     'prior_state_cov', 'posterior_state_mean', 'posterior_state_cov',
                     'true_value', 'log_likelihood', 'gain'))
    return FilterRunResult(obs_result, cumulative_log_likelihood, df)
def run(observable, obss=None, times=None, obs_covs=None, true_values=None, df=None, fun=None, return_df=False):
    """Run one or more observables over a sequence of observations.

    Parameters
    ----------
    observable : an observable (or callable returning one, or an iterable of
        them); scalars are broadcast across the observation sequence.
    obss : observations, or a column name resolved against `df`.
    times : observation times, or a column name resolved against `df`.
    obs_covs : optional observation covariances; incompatible with
        observations already given as distributions.
    true_values : optional ground-truth values for diagnostics.
    df : optional pandas DataFrame providing the named columns above.
    fun : optional transformation applied to each observation.
    return_df : when True, return a per-step diagnostics DataFrame instead
        of the last observation result.

    Returns
    -------
    A pandas DataFrame of per-step filter diagnostics when `return_df` is
    True (with an initial row for the filter's prior state), otherwise the
    last observation result.
    """
    if df is not None:
        # Resolve column names against the data frame.
        if obss is not None and checks.is_string(obss):
            obss = df[obss].values
        if times is not None and checks.is_string(times):
            times = df[times].values
        if obs_covs is not None and checks.is_string(obs_covs):
            obs_covs = df[obs_covs].values
        if true_values is not None and checks.is_string(true_values):
            true_values = df[true_values].values
    checks.check_not_none(obss)
    # Broadcast scalar arguments so everything can be zipped together.
    if not checks.is_iterable_not_string(observable):
        observable = utils.xconst(observable)
    if not checks.is_iterable_not_string(obss):
        obss = [obss]
    if not checks.is_iterable_not_string(times):
        times = utils.xconst(times)
    if not checks.is_iterable_not_string(obs_covs):
        obs_covs = utils.xconst(obs_covs)
    if not checks.is_iterable_not_string(true_values):
        true_values = utils.xconst(true_values)
    obs_result = None
    if return_df:
        time = []
        accepted = []
        obs_mean = []
        obs_cov = []
        predicted_obs_mean = []
        predicted_obs_cov = []
        innov_mean = []
        innov_cov = []
        prior_state_mean = []
        prior_state_cov = []
        posterior_state_mean = []
        posterior_state_cov = []
        log_likelihood = []
    for an_observable, an_obs, a_time, an_obs_cov, a_true_value in zip(observable, obss, times, obs_covs, true_values):
        if checks.is_callable(an_observable):
            an_observable = an_observable(an_obs)
        if fun is not None:
            an_obs = fun(an_obs)
        if an_obs_cov is not None:
            if isinstance(an_obs, (Obs, distrs.Distr)):
                raise ValueError('An observation covariance is provided while the observation is given by a distribution --- conflicting arguments')
            an_obs = distrs.NormalDistr(an_obs, an_obs_cov)
        if return_df and len(time) == 0:
            # Emit an initial row describing the filter's state before any
            # observation has been processed.
            an_initial_state_mean = an_observable.filter.state.state_distr.mean
            an_initial_state_cov = an_observable.filter.state.state_distr.cov
            time.append(an_observable.filter.time)
            accepted.append(None)
            obs_mean.append(None)
            obs_cov.append(None)
            predicted_obs_mean.append(None)
            predicted_obs_cov.append(None)
            innov_mean.append(None)
            innov_cov.append(None)
            prior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(an_initial_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(an_initial_state_cov, raise_value_error=False))
            log_likelihood.append(None)
        if isinstance(an_obs, Obs):
            a_time, _ = _time_and_obs_distr(an_obs, a_time, an_observable.filter.time)
        predicted_obs = an_observable.predict(time=a_time, true_value=a_true_value)
        a_prior_state_mean = an_observable.filter.state.state_distr.mean
        a_prior_state_cov = an_observable.filter.state.state_distr.cov
        obs_result = an_observable.observe(obs=an_obs, time=a_time, true_value=a_true_value, predicted_obs=predicted_obs)
        a_posterior_state_mean = an_observable.filter.state.state_distr.mean
        a_posterior_state_cov = an_observable.filter.state.state_distr.cov
        if return_df:
            time.append(obs_result.obs.time)
            accepted.append(obs_result.accepted)
            obs_mean.append(npu.to_scalar(obs_result.obs.distr.mean, raise_value_error=False))
            obs_cov.append(npu.to_scalar(obs_result.obs.distr.cov, raise_value_error=False))
            predicted_obs_mean.append(npu.to_scalar(obs_result.predicted_obs.distr.mean, raise_value_error=False))
            predicted_obs_cov.append(npu.to_scalar(obs_result.predicted_obs.distr.cov, raise_value_error=False))
            innov_mean.append(npu.to_scalar(obs_result.innov_distr.mean, raise_value_error=False))
            innov_cov.append(npu.to_scalar(obs_result.innov_distr.cov, raise_value_error=False))
            prior_state_mean.append(npu.to_scalar(a_prior_state_mean, raise_value_error=False))
            prior_state_cov.append(npu.to_scalar(a_prior_state_cov, raise_value_error=False))
            posterior_state_mean.append(npu.to_scalar(a_posterior_state_mean, raise_value_error=False))
            posterior_state_cov.append(npu.to_scalar(a_posterior_state_cov, raise_value_error=False))
            log_likelihood.append(npu.to_scalar(obs_result.log_likelihood, raise_value_error=False))
    if return_df:
        # Bug fix: the posterior_state_* columns were previously populated
        # from the prior-state lists, discarding the collected posteriors.
        return pd.DataFrame(
            {'time': time,
             'accepted': accepted,
             'obs_mean': obs_mean,
             'obs_cov': obs_cov,
             'predicted_obs_mean': predicted_obs_mean,
             'predicted_obs_cov': predicted_obs_cov,
             'innov_mean': innov_mean,
             'innov_cov': innov_cov,
             'prior_state_mean': prior_state_mean,
             'prior_state_cov': prior_state_cov,
             'posterior_state_mean': posterior_state_mean,
             'posterior_state_cov': posterior_state_cov,
             'log_likelihood': log_likelihood},
            columns=('time', 'accepted', 'obs_mean', 'obs_cov', 'predicted_obs_mean',
                     'predicted_obs_cov', 'innov_mean', 'innov_cov', 'prior_state_mean',
                     'prior_state_cov', 'posterior_state_mean', 'posterior_state_cov',
                     'log_likelihood'))
    return obs_result