Example #1
def numpy_datetime64_to_python_datetime(x, allow_none=False):
    import numpy as np
    if isinstance(x, np.datetime64):
        # For some reason the following doesn't always work: instead of a Python datetime, an int may be
        # returned, possibly due to a bug in NumPy. When that happens, this function falls back to an
        # alternative strategy and converts x to a Python datetime field by field.
        r = x.astype(dt.datetime)
        if isinstance(r, dt.datetime): return r
        year = x.astype('datetime64[Y]').astype(int) + 1970
        xm = x.astype('datetime64[M]')
        month = xm.astype(int) % 12 + 1
        days = (x - xm) / np.timedelta64(1, 'D')
        timeindays = days - int(days)
        day = int(days) + 1
        hour = int(timeindays * tc.HOURS_PER_DAY)
        timeindays -= hour / tc.HOURS_PER_DAY
        minute = int(timeindays * tc.MINUTES_PER_DAY)
        timeindays -= minute / tc.MINUTES_PER_DAY
        second = int(timeindays * tc.SECONDS_PER_DAY)
        timeindays -= second / tc.SECONDS_PER_DAY
        microsecond = int(timeindays * tc.MICROSECONDS_PER_DAY)
        r = dt.datetime(year, month, day, hour, minute, second, microsecond)
        if microsecond % 10 == 9: r += dt.timedelta(microseconds=1)
        return r
    elif checks.is_iterable(x):
        return [numpy_datetime64_to_python_datetime(e, allow_none) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python datetime' % str(x))
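
The snippet assumes module-level imports from the surrounding library (`dt` for the `datetime` module, `tc` for time constants such as `HOURS_PER_DAY`, and `checks` for validation helpers). The fallback branch decomposes the `numpy.datetime64` value arithmetically; a minimal standalone sketch of that decomposition in plain NumPy, with an illustrative sample value:

import numpy as np

x = np.datetime64('2020-03-15T18:30:00')
year = x.astype('datetime64[Y]').astype(int) + 1970    # 2020
xm = x.astype('datetime64[M]')
month = xm.astype(int) % 12 + 1                        # 3
days = (x - xm) / np.timedelta64(1, 'D')               # 14.770833...
day = int(days) + 1                                    # 15
print(year, month, day)
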
Example #2
    def split(self,
              purpose=('training', 'validation', 'test'),
              fraction=(.5, .25, .25)):
        logger = logging.getLogger()

        if not checks.is_iterable_not_string(purpose): purpose = [purpose]
        if not checks.is_iterable(fraction): fraction = [fraction]

        split_purposes = []
        split_starts_inclusive = []
        split_ends_exclusive = []

        count_remaining = len(self.input_working)
        fraction_done = 0.
        count_done = 0
        for p, f in zip(purpose, fraction):
            assert p in ('training', 'validation', 'test')
            split_purposes.append(p)
            next_count = int(count_remaining * f / (1. - fraction_done))
            split_starts_inclusive.append(count_done)
            count_done += next_count
            split_ends_exclusive.append(count_done)
            count_remaining -= next_count
            fraction_done += f

            logger.info('A %s set: [%d, %d)' %
                        (split_purposes[-1], split_starts_inclusive[-1],
                         split_ends_exclusive[-1]))

        self.__is_split = True
        self.__split_purposes = tuple(split_purposes)
        self.__split_starts_inclusive = tuple(split_starts_inclusive)
        self.__split_ends_exclusive = tuple(split_ends_exclusive)
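
The split sizes are derived from the remaining row count, so each fraction refers to the full data set rather than to what is left over. A hypothetical walk-through of that arithmetic with 100 rows and the default fractions (.5, .25, .25), which yields the ranges [0, 50), [50, 75), [75, 100):

count_remaining, fraction_done, count_done = 100, 0., 0
for f in (.5, .25, .25):
    next_count = int(count_remaining * f / (1. - fraction_done))
    print('[%d, %d)' % (count_done, count_done + next_count))
    count_done += next_count
    count_remaining -= next_count
    fraction_done += f
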
Example #3
def to_python_datetime(x,
                       allow_dates=True,
                       date_for_times=dt.date.today(),
                       allow_none=False,
                       *args,
                       **kwargs):
    import numpy as np
    import pandas as pd
    if isinstance(x, pd.Timestamp):
        return pandas_timestamp_to_python_datetime(x, *args, **kwargs)
    elif isinstance(x, np.datetime64):
        return numpy_datetime64_to_python_datetime(x, *args, **kwargs)
    elif isinstance(x, dt.datetime):
        return x
    elif date_for_times is not None and isinstance(x, dt.time):
        return dt.datetime.combine(date_for_times, x)
    elif allow_dates and isinstance(x, dt.date):
        return dt.datetime.combine(x, dt.time())
    elif checks.is_string(x):
        return str_to_datetime(x, *args, **kwargs)
    elif checks.is_iterable(x):
        return [
            to_python_datetime(e, allow_dates, date_for_times, *args, **kwargs)
            for e in x
        ]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python datetime' % str(x))
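
A brief usage sketch, assuming the converter above and its helpers are in scope; the inputs are illustrative and each call should come back as a plain `datetime.datetime`:

import datetime as dt
import numpy as np
import pandas as pd

to_python_datetime(pd.Timestamp('2021-06-01 12:00'))
to_python_datetime(np.datetime64('2021-06-01T12:00'))
to_python_datetime(dt.date(2021, 6, 1))                     # allow_dates=True by default
to_python_datetime([dt.time(12, 0), dt.date(2021, 6, 1)])   # iterables convert element-wise
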
Example #4
 def add_ln(self,
            column=None,
            prefix='ln(',
            suffix=')',
            exclude_column_re=None,
            include_column_re=None,
            exclude_columns_with_negative_values=True):
     logger = logging.getLogger()
     if column is None: column = self.__input_df.columns
     if not checks.is_iterable(column): column = [column]
     if exclude_column_re is not None:
         exclude_column_re = re.compile(exclude_column_re)
     if include_column_re is not None:
         include_column_re = re.compile(include_column_re)
     for c in column:
         if include_column_re is not None and not include_column_re.match(
                 c):
             logger.info('- Excluding column due to include_column_re: %s' %
                         c)
             continue
         if exclude_column_re is not None and exclude_column_re.match(c):
             logger.info('- Excluding column due to exclude_column_re: %s' %
                         c)
             continue
         if exclude_columns_with_negative_values and any(
                 self.__input_df[c] < 0.):
             logger.info(
                 '- Excluding column since it contains negative values: %s'
                 % c)
             continue
         new_column_name = prefix + c + suffix
         logger.info('- Adding new ln column: %s' % new_column_name)
         self.__input_df[new_column_name] = self.__input_df[c].apply(np.log)
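
A standalone sketch of the same transformation outside the class, using a plain DataFrame with illustrative column names; it mirrors the negative-value exclusion and the prefix/suffix naming:

import numpy as np
import pandas as pd

df = pd.DataFrame({'price': [1., 2., 4.], 'ret': [-.1, .2, .1]})
for c in list(df.columns):
    if (df[c] < 0.).any():          # exclude_columns_with_negative_values
        continue
    df['ln(' + c + ')'] = df[c].apply(np.log)
# df now contains ln(price); ret is skipped because it has a negative value
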
Example #5
def mean_or_last(x):
    if isinstance(x, pd.DataFrame): return x.apply(mean_or_last)
    else:
        try:
            return np.mean(x)
        except:
            return x[-1] if checks.is_iterable(x) else x
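
A usage sketch with illustrative inputs; when `np.mean` cannot handle the data (for example strings), the bare `except` falls back to the last element:

import numpy as np
import pandas as pd

mean_or_last([1., 2., 3.])                    # 2.0
mean_or_last(['a', 'b', 'c'])                 # 'c' -- np.mean fails on strings
mean_or_last(pd.DataFrame({'x': [1., 3.]}))   # applied column-wise: x -> 2.0
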
Example #6
 def add_diff(self,
              column=None,
              prefix='diff(',
              suffix=')',
              exclude_column_re=None,
              include_column_re=None):
     logger = logging.getLogger()
     if column is None: column = self.__input_df.columns
     if not checks.is_iterable(column): column = [column]
     if exclude_column_re is not None:
         exclude_column_re = re.compile(exclude_column_re)
     if include_column_re is not None:
         include_column_re = re.compile(include_column_re)
     for c in column:
         if include_column_re is not None and not include_column_re.match(
                 c):
             logger.info('- Excluding column due to include_column_re: %s' %
                         c)
             continue
         if exclude_column_re is not None and exclude_column_re.match(c):
             logger.info('- Excluding column due to exclude_column_re: %s' %
                         c)
             continue
         new_column_name = prefix + c + suffix
         logger.info('- Adding new diff column: %s' % new_column_name)
         self.__input_df[new_column_name] = self.__input_df[c].diff()
         try:
             self.__truncate_from_above = max(
                 self.__truncate_from_above,
                 list(self.__input_df[new_column_name].isnull().values).
                 index(False))
         except ValueError:
             self.__truncate_from_above = max(self.__truncate_from_above,
                                              len(self.__input_df))
Example #7
 def set_output(self,
                column,
                forecast_horizon=0,
                remove_from_input=None,
                difference_from_present=False):
     assert column is not None
     assert forecast_horizon is not None
     if not checks.is_iterable(forecast_horizon):
         forecast_horizon = [forecast_horizon]
     for fh in forecast_horizon:
         assert fh >= 0
     if remove_from_input is None:
         remove_from_input = not all(forecast_horizon)
     if difference_from_present:
         self.__output_df = pd.concat([
             self.__input_df[column].shift(-fh) - self.__input_df[column]
             for fh in forecast_horizon
         ],
                                      axis=1)
     else:
         self.__output_df = pd.concat([
             self.__input_df[column].shift(-fh) for fh in forecast_horizon
         ],
                                      axis=1)
     self.__output_df.columns = [
         'forecast(' + str(fh) + ',' + column + ')' if fh > 0 else column
         for fh in forecast_horizon
     ]
     self.__output_base_df = self.__input_df[column].to_frame()
     if remove_from_input:
         del self.__input_df[column]
     self.__truncate_from_below = max(forecast_horizon)
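
A standalone sketch of the shifting and naming performed above, for forecast_horizon=(1, 2) on an illustrative single column:

import pandas as pd

input_df = pd.DataFrame({'y': [1., 2., 4., 8.]})
horizons = (1, 2)
output_df = pd.concat([input_df['y'].shift(-fh) for fh in horizons], axis=1)
output_df.columns = ['forecast(' + str(fh) + ',y)' if fh > 0 else 'y' for fh in horizons]
# forecast(1,y): [2., 4., 8., NaN]; forecast(2,y): [4., 8., NaN, NaN]
# The last max(horizons) rows are later dropped via __truncate_from_below.
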
Example #8
def pandas_timestamp_to_python_datetime(x, allow_none=False):
    import pandas as pd
    if isinstance(x, pd.Timestamp): return x.to_pydatetime()
    elif checks.is_iterable(x):
        return [pandas_timestamp_to_python_datetime(e, allow_none) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python datetime' % str(x))
Example #9
def pandas_timedelta_to_python_timedelta(x, allow_none=False):
    import pandas as pd
    if isinstance(x, pd.Timedelta): return x.to_pytimedelta()
    elif checks.is_iterable(x):
        return [pandas_timedelta_to_python_timedelta(e, allow_none) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python timedelta' % str(x))
Example #10
 def __init__(self, obs_matrix):
     super().__init__()
     if not checks.is_numpy_array(obs_matrix) and not checks.is_iterable(obs_matrix):
         obs_matrix = (obs_matrix,)
     self._obs_matrix = npu.make_immutable(
             block_diag(
                     *[npu.to_ndim_2(om, ndim_1_to_col=False, copy=False) for om in obs_matrix]))
     self._to_string_helper_KalmanFilterObsModel = None
     self._str_KalmanFilterObsModel = None
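
The constructor stacks one observation matrix per observed process into a single block-diagonal observation matrix. A minimal illustration of that stacking step; the matrices are illustrative:

import numpy as np
from scipy.linalg import block_diag

obs1 = np.array([[1., 0.]])    # observes the first coordinate of a 2-dimensional process
obs2 = np.array([[1.]])        # observes a 1-dimensional process directly
print(block_diag(obs1, obs2))
# [[1. 0. 0.]
#  [0. 0. 1.]]
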
Example #11
def numpy_timedelta64_to_python_timedelta(x, allow_none=False):
    import numpy as np
    import pandas as pd
    if isinstance(x, np.timedelta64):
        # Note: box=True was accepted by older pandas versions; to_timedelta now
        # returns a Timedelta by default, so no box argument is needed.
        return pd.to_timedelta(x, errors='coerce').to_pytimedelta()
    elif checks.is_iterable(x):
        return [numpy_timedelta64_to_python_timedelta(e, allow_none) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python timedelta' % str(x))
Example #12
 def __init__(self, time, state_distr, process,
              weighting_func=None,
              particle_count=1000, observation_dim=1,
              random_state=None,
              predicted_observation_sampler=None, outlier_threshold=None,
              name=None, pype=None, pype_options=frozenset(filtering.FilterPypeOptions)):
     super().__init__(name)
     self._pype = pype
     self._pype_options = frozenset() if (pype_options is None or pype is None) else frozenset(pype_options)
     if not checks.is_iterable(process): process = (process,)
     process = checks.check_iterable_over_instances(process, proc.SolvedItoProcess)
     if weighting_func is None: weighting_func = KDEWeightingFunction()
     self._time = time
     self._observation_dim = observation_dim
     self._state_distr = state_distr
     self._processes = tuple(process)
     self._state_dim = sum([p.process_dim for p in self._processes])
     self._weighting_func = weighting_func
     self._particle_count = particle_count
     self._current_particle_idx = None
     self._random_state = rnd.random_state() if random_state is None else random_state
     self._predicted_observation_sampler = predicted_observation_sampler
     
     self._prior_particles = np.empty((self._particle_count, self._state_dim))
     self._resampled_particles = np.empty((self._particle_count, self._state_dim))
     self._unnormalised_weights = np.empty((self._particle_count,))
     self._weights = np.empty((self._particle_count,))
     self._resampled_particles_uptodate = False
     
     self._last_observation = None
     
     self._cached_prior_mean = None
     self._cached_prior_var = None
     self._cached_posterior_mean = None
     self._cached_posterior_var = None
     self._cached_resampled_mean = None
     self._cached_resampled_var = None
     
     self.log_likelihood = 0.0
     self.effective_sample_size = np.nan
     
     if self._predicted_observation_sampler is not None:
         self.predicted_observation_particles = None
         self.predicted_observation_kde = None
         self.predicted_observation = np.nan
         self.innovation = np.nan
         self.innovationvar = np.nan
         
     assert self._predicted_observation_sampler is not None or outlier_threshold is None 
     self._outlier_threshold = outlier_threshold
         
     self._context = OrderedDict()
     
     self._initialise()
Example #13
def to_python_float(x, allow_none=False, allow_ints=False, *args, **kwargs):
    if checks.is_some_float(x, allow_none): return float(x)
    elif allow_ints and checks.is_some_int(x, allow_none):
        return float(to_python_int(x))
    elif checks.is_string(x):
        return str_to_float(x, *args, **kwargs)
    elif checks.is_iterable(x):
        return [to_python_float(e, allow_none, allow_ints, *args, **kwargs) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python float' % str(x))
Example #14
 def add_ma(self,
            window,
            column=None,
            prefix='ma(${WINDOW},',
            suffix=')',
            exclude_column_re=None,
            include_column_re=None):
     logger = logging.getLogger()
     checks.check_not_none(window)
     if not checks.is_iterable(window): window = [window]
     if column is None: column = self.__input_df.columns
     if not checks.is_iterable(column): column = [column]
     if exclude_column_re is not None:
         exclude_column_re = re.compile(exclude_column_re)
     if include_column_re is not None:
         include_column_re = re.compile(include_column_re)
     for c in column:
         if include_column_re is not None and not include_column_re.match(
                 c):
             logger.info('- Excluding column due to include_column_re: %s' %
                         c)
             continue
         if exclude_column_re is not None and exclude_column_re.match(c):
             logger.info('- Excluding column due to exclude_column_re: %s' %
                         c)
             continue
         for w in window:
             c_prefix = prefix.replace('${WINDOW}', str(w))
             c_suffix = suffix.replace('${WINDOW}', str(w))
             new_column_name = c_prefix + c + c_suffix
             logger.info('- Adding new MA column: %s' % new_column_name)
             self.__input_df[new_column_name] = self.__input_df[c].rolling(
                 window=w, center=False).mean()
             try:
                 self.__truncate_from_above = max(
                     self.__truncate_from_above,
                     list(self.__input_df[new_column_name].isnull().values).
                     index(False))
             except ValueError:
                 self.__truncate_from_above = max(
                     self.__truncate_from_above, len(self.__input_df))
Example #15
 def add_lag(self,
             lag,
             column=None,
             prefix='lag(${LAG},',
             suffix=')',
             exclude_column_re=None,
             include_column_re=None):
     logger = logging.getLogger()
     checks.check_not_none(lag)
     if not checks.is_iterable(lag): lag = [lag]
     if column is None: column = self.__input_df.columns
     if not checks.is_iterable(column): column = [column]
     if exclude_column_re is not None:
         exclude_column_re = re.compile(exclude_column_re)
     if include_column_re is not None:
         include_column_re = re.compile(include_column_re)
     for c in column:
         if include_column_re is not None and not include_column_re.match(
                 c):
             logger.info('- Excluding column due to include_column_re: %s' %
                         c)
             continue
         if exclude_column_re is not None and exclude_column_re.match(c):
             logger.info('- Excluding column due to exclude_column_re: %s' %
                         c)
             continue
         for l in lag:
             c_prefix = prefix.replace('${LAG}', str(l))
             c_suffix = suffix.replace('${LAG}', str(l))
             new_column_name = c_prefix + c + c_suffix
             logger.info('- Adding new lag column: %s' % new_column_name)
             self.__input_df[new_column_name] = self.__input_df[c].shift(l)
             try:
                 self.__truncate_from_above = max(
                     self.__truncate_from_above,
                     list(self.__input_df[new_column_name].isnull().values).
                     index(False))
             except ValueError:
                 self.__truncate_from_above = max(
                     self.__truncate_from_above, len(self.__input_df))
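
Both the MA and lag transforms record how many leading rows become NaN so that they can be truncated later. A standalone sketch of that bookkeeping with illustrative data:

import pandas as pd

df = pd.DataFrame({'y': [1., 2., 3., 4., 5.]})
df['lag(2,y)'] = df['y'].shift(2)                         # first two rows are NaN
truncate_from_above = list(df['lag(2,y)'].isnull().values).index(False)
print(truncate_from_above)                                # 2 -- rows [0, 2) would be dropped
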
Example #16
 def __init__(self, time, state_distr, process, name=None, pype=None,
              pype_options=frozenset(filtering.FilterPypeOptions)):
     super().__init__(name)
     self._pype = pype
     self._pype_options = frozenset() if (pype_options is None or pype is None) else frozenset(pype_options)
     if not checks.is_iterable(process): process = (process,)
     checks.check_instance(state_distr, N)
     process = checks.check_iterable_over_instances(process, proc.MarkovProcess)
     self._time = time
     self._state_distr = state_distr
     self._is_posterior = False
     self._processes = tuple(process)
     self._to_string_helper_KalmanFilter = None
     self._str_KalmanFilter = None
     if filtering.FilterPypeOptions.PRIOR_STATE in self._pype_options: self._pype.send(self.state)
Example #17
def to_python_time(x, allow_datetimes=True, allow_none=False, *args, **kwargs):
    import numpy as np
    import pandas as pd
    if isinstance(x, dt.time): return x
    elif allow_datetimes and isinstance(x, dt.datetime): return x.time()
    elif allow_datetimes and isinstance(x, np.datetime64):
        return numpy_datetime64_to_python_datetime(x, *args, **kwargs).time()
    elif allow_datetimes and isinstance(x, pd.Timestamp):
        return pandas_timestamp_to_python_datetime(x, *args, **kwargs).time()
    elif isinstance(x, np.timedelta64):
        return numpy_timedelta64_to_python_time(x, allow_none)
    elif checks.is_string(x):
        return str_to_time(x, *args, **kwargs)
    elif checks.is_iterable(x):
        return [to_python_time(e, allow_datetimes, *args, **kwargs) for e in x]
    elif allow_none and x is None:
        return None
    raise ValueError('Unable to convert "%s" to Python time' % str(x))
Example #18
def to_python_timedelta(x, allow_none=False):
    import numpy as np
    import pandas as pd
    if isinstance(x, np.timedelta64):
        return numpy_timedelta64_to_python_timedelta(x, allow_none)
    elif isinstance(x, pd.Timedelta):
        return pandas_timedelta_to_python_timedelta(x, allow_none)
    elif isinstance(x, dt.timedelta):
        return x
    elif checks.is_iterable(x):
        return [to_python_timedelta(e, allow_none) for e in x]
    elif allow_none and x is None:
        return None
    else:
        try:
            return dt.timedelta(seconds=x)
        except:
            pass
    raise ValueError('Unable to convert "%s" to Python timedelta' % str(x))
Example #19
 def __init__(self, filter, name, obs_model, observed_processes, *args,
              **kwargs):
     super().__init__(filter, name)
     if not checks.is_iterable(observed_processes):
         observed_processes = [observed_processes]
     observed_processes = tuple(
         checks.check_iterable_over_instances(observed_processes,
                                              proc.MarkovProcess))
     if obs_model is None:
         obs_model = ParticleFilterObsModel.create(
             np.eye(sum([p.process_dim for p in observed_processes])))
     self._obs_model = obs_model
     self._state_mean_rects = []
     self._state_cov_diag_rects = []
     for op in observed_processes:
         matched = False
         row = 0
         for ap in self.filter._processes:
             process_dim = ap.process_dim
             if op is ap:
                 matched = True
                 self._state_mean_rects.append(np.s_[row:row +
                                                     process_dim, 0:1])
                 self._state_cov_diag_rects.append(
                     np.s_[row:row + process_dim,
                           row:row + process_dim])
             row += process_dim
         if not matched:
             raise ValueError(
                 'Each observed process must match a particle filter\'s process'
             )
     self._state_cov_rects = []
     for r in self._state_cov_diag_rects:
         startrow = r[0].start
         stoprow = r[0].stop
         rects = []
         for r1 in self._state_cov_diag_rects:
             startcol = r1[1].start
             stopcol = r1[1].stop
             rects.append(np.s_[startrow:stoprow, startcol:stopcol])
         self._state_cov_rects.append(rects)
Example #20
def last(x):
    if isinstance(x, pd.DataFrame): return x.apply(last)
    else: return x[-1] if checks.is_iterable(x) else x
Example #21
    def __init__(self,
                 fig,
                 ax,
                 auto_refresh,
                 title,
                 filter_name,
                 process_prior_filter_states,
                 process_posterior_filter_states,
                 process_true_values,
                 process_obs_results,
                 state_indices=None,
                 state_labels=None,
                 observable_names=None,
                 obs_indices=None,
                 obs_labels=None,
                 state_colours=_default_state_colours,
                 true_value_colours=_default_true_value_colours,
                 obs_colours=_default_obs_colours,
                 *args,
                 **kwargs):
        super().__init__(fig, ax, *args, **kwargs)

        self._process_prior_filter_states = process_prior_filter_states
        self._process_posterior_filter_states = process_posterior_filter_states
        self._process_true_values = process_true_values
        self._process_obs_results = process_obs_results

        if state_indices is not None:
            if not checks.is_iterable(state_indices):
                state_indices = (state_indices, )
            else:
                state_indices = tuple(state_indices)
        if state_labels is not None:
            if not checks.is_iterable(state_labels):
                state_labels = (state_labels, )
            else:
                state_labels = tuple(state_labels)
        checks.is_same_len_or_none(state_indices, state_labels)

        if observable_names is not None:
            checks.check_not_none(obs_indices)
            if not checks.is_iterable(observable_names):
                observable_names = (observable_names, )
            else:
                observable_names = tuple(observable_names)
        if obs_indices is not None:
            checks.check_not_none(observable_names)
            if not checks.is_iterable(obs_indices):
                obs_indices = (obs_indices, )
            else:
                obs_indices = tuple(obs_indices)
        if obs_labels is not None:
            if not checks.is_iterable(obs_labels): obs_labels = (obs_labels, )
            else: obs_labels = tuple(obs_labels)
        checks.is_same_len_or_none(observable_names, obs_indices, obs_labels)

        self._auto_refresh = auto_refresh

        self._title = title

        self._filter_name = filter_name

        self._state_indices = state_indices
        self._state_labels = state_labels
        self._observable_names = observable_names
        self._obs_indices = obs_indices
        self._obs_labels = obs_labels

        self._state_colours = state_colours
        self._true_value_colours = true_value_colours
        self._obs_colours = obs_colours

        self._state_and_true_value_plots_inited = False
        self._obs_plots_inited = False

        if self._state_indices is not None:
            self._init_state_and_true_value_plots()

        self._inited_obs_index_count = 0

        if self._observable_names is not None:
            if self._obs_labels is None:
                self._obs_labels = []
                for observable_name, obs_index in zip(self._observable_names,
                                                      self._obs_indices):
                    if self._observable_names.count(observable_name) == 1:
                        self._obs_labels.append(observable_name)
                    else:
                        self._obs_labels.append('%s %d' %
                                                (observable_name, obs_index))
                self._obs_labels = tuple(self._obs_labels)
            self._actual_observable_names = self._observable_names
            self._actual_obs_indices = self._obs_indices
            self._actual_obs_labels = self._obs_labels
            self._init_obs_plots()
            self._obs_plots_inited = True
        else:
            self._actual_observable_names = []
            self._actual_obs_indices = []
            self._actual_obs_labels = []
Example #22
def sparsen(df,
            aggregator=mean_or_last,
            date=None,
            time=None,
            datetime=None,
            bucket='date',
            new_bucket_column=None,
            fix_kind='last',
            fix_time=None,
            fix_points=10,
            min_fix_point_count=None,
            max_fix_point_count=None,
            min_min_fix_point_time=None,
            max_min_fix_point_time=None,
            min_max_fix_point_time=None,
            max_max_fix_point_time=None,
            already_sorted=False,
            aggregators_apply_to_df=False,
            exclude_original_temporal_columns=True,
            columns_to_exclude=None,
            return_extra_info=False):
    checks.is_at_least_one_not_none(datetime, date, time)

    if bucket == 'date':
        bucket = lambda x: conv.to_python_date(x, allow_datetimes=True)
    elif bucket == 'week':
        bucket = lambda x: tsatimes.first_day_of_week(x)

    columns_to_exclude = set() if columns_to_exclude is None else set(
        columns_to_exclude)

    if datetime is not None:
        checks.check_all_none(date, time)
        if isinstance(datetime, str):
            if exclude_original_temporal_columns:
                columns_to_exclude.add(datetime)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = datetime
            datetime = df[datetime].values
        temporals = datetime
    else:
        if isinstance(date, str):
            if exclude_original_temporal_columns: columns_to_exclude.add(date)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = date
            date = df[date].values
        if isinstance(time, str):
            if exclude_original_temporal_columns: columns_to_exclude.add(time)
            if new_bucket_column is None and exclude_original_temporal_columns:
                new_bucket_column = time
            time = df[time].values

        if date is not None and time is not None:
            temporals = [dt.datetime.combine(d, t) for d, t in zip(date, time)]
        elif date is not None:
            temporals = date
        else:  # time is not None
            temporals = time

    if new_bucket_column is None: new_bucket_column = 'bucket'

    if fix_kind in ('first', 'after'): comparison = 'ge'
    elif fix_kind == 'after_exclusive': comparison = 'gt'
    elif fix_kind in ('last', 'before'): comparison = 'le'
    elif fix_kind == 'before_exclusive': comparison = 'lt'
    else: raise ValueError('Unfamiliar fix_kind: "%s"' % str(fix_kind))

    if fix_kind in ('first', 'last'): checks.check_none(fix_time)
    else: checks.check_not_none(fix_time)

    numeric_fix_points = checks.is_some_number(fix_points)
    if not numeric_fix_points:
        fix_points = conv.to_python_timedelta(fix_points)

    grouping_df = pd.DataFrame({'temporals': temporals})

    grouped_df = grouping_df.groupby(bucket(temporals))

    columns = [new_bucket_column]
    data = {new_bucket_column: []}
    aggs = {}

    if checks.is_some_dict(aggregator): column_agg_pairs = aggregator.items()
    elif checks.is_iterable(aggregator): column_agg_pairs = aggregator
    else: column_agg_pairs = zip(df.columns, utils.xconst(aggregator))
    for column, agg in column_agg_pairs:
        if column not in columns_to_exclude:
            columns.append(column)
            data[column] = []
            aggs[column] = agg

    dates_with_no_points = []
    dates_with_fix_point_limits_breached = col.OrderedDict()
    fix_point_counts = col.OrderedDict()

    for bucket, group_df in grouped_df:
        if len(group_df) == 0: dates_with_no_points.append(bucket)
        if not already_sorted:
            group_df = group_df.copy()
            group_df.sort_values('temporals', inplace=True)
        if fix_kind == 'first': fix_time = group_df['temporals'].values[0]
        elif fix_kind == 'last': fix_time = group_df['temporals'].values[-1]

        if numeric_fix_points:
            if comparison == 'ge':
                fix_point_indices = group_df.index[tsatimes.temporal_ge(
                    group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[tsatimes.temporal_gt(
                    group_df['temporals'], fix_time)][0:fix_points]
            elif comparison == 'le':
                fix_point_indices = group_df.index[tsatimes.temporal_le(
                    group_df['temporals'], fix_time)][-fix_points:]
            else:  # comparison == 'lt'
                fix_point_indices = group_df.index[tsatimes.temporal_lt(
                    group_df['temporals'], fix_time)][-fix_points:]
        else:
            if comparison == 'ge':
                fix_point_indices = group_df.index[(tsatimes.temporal_ge(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'gt':
                fix_point_indices = group_df.index[(tsatimes.temporal_gt(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_le(group_df['temporals'], tsatimes.plus_timedelta(fix_time, fix_points)))]
            elif comparison == 'le':
                fix_point_indices = group_df.index[(tsatimes.temporal_le(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]
            else:  # comparison == 'lt':
                fix_point_indices = group_df.index[(tsatimes.temporal_lt(group_df['temporals'], fix_time)) & \
                                                   (tsatimes.temporal_ge(group_df['temporals'], tsatimes.plus_timedelta(fix_time, -fix_points)))]

        fix_point_limits_breached = set()

        if min_fix_point_count is not None and len(
                fix_point_indices) < min_fix_point_count:
            fix_point_limits_breached.add('min_fix_point_count')
        if max_fix_point_count is not None and len(
                fix_point_indices) > max_fix_point_count:
            fix_point_limits_breached.add('max_fix_point_count')
        if min_min_fix_point_time is not None:
            if checks.is_some_timedelta(min_min_fix_point_time):
                the_min_min_fix_point_time = fix_time + min_min_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - min_min_fix_point_time
            else:
                the_min_min_fix_point_time = min_min_fix_point_time
            if tsatimes.temporal_lt(
                    min(grouping_df['temporals'].values[fix_point_indices]),
                    the_min_min_fix_point_time):
                fix_point_limits_breached.add('min_min_fix_point_time')
        if max_min_fix_point_time is not None:
            if checks.is_some_timedelta(max_min_fix_point_time):
                the_max_min_fix_point_time = fix_time + max_min_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - max_min_fix_point_time
            else:
                the_max_min_fix_point_time = max_min_fix_point_time
            if tsatimes.temporal_gt(
                    min(grouping_df['temporals'].values[fix_point_indices]),
                    the_max_min_fix_point_time):
                fix_point_limits_breached.add('max_min_fix_point_time')
        if min_max_fix_point_time is not None:
            if checks.is_some_timedelta(min_max_fix_point_time):
                the_min_max_fix_point_time = fix_time + min_max_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - min_max_fix_point_time
            else:
                the_min_max_fix_point_time = min_max_fix_point_time
            if tsatimes.temporal_lt(
                    max(grouping_df['temporals'].values[fix_point_indices]),
                    the_min_max_fix_point_time):
                fix_point_limits_breached.add('min_max_fix_point_time')
        if max_max_fix_point_time is not None:
            if checks.is_some_timedelta(max_max_fix_point_time):
                the_max_max_fix_point_time = fix_time + max_max_fix_point_time if comparison in (
                    'ge', 'gt') else fix_time - max_max_fix_point_time
            else:
                the_max_max_fix_point_time = max_max_fix_point_time
            if tsatimes.temporal_gt(
                    max(grouping_df['temporals'].values[fix_point_indices]),
                    the_max_max_fix_point_time):
                fix_point_limits_breached.add('max_max_fix_point_time')

        if len(fix_point_limits_breached) > 0:
            dates_with_fix_point_limits_breached[
                bucket] = fix_point_limits_breached
        else:
            data[new_bucket_column].append(bucket)
            for column in columns[1:]:
                if column not in columns_to_exclude:
                    arg = df.iloc[
                        fix_point_indices] if aggregators_apply_to_df else df.iloc[
                            fix_point_indices][column].values
                    data[column].append(aggs[column](arg))
            fix_point_counts[bucket] = len(fix_point_indices)

    df = pd.DataFrame(data, columns=columns)

    if return_extra_info:
        return {
            'df': df,
            'dates_with_no_points': dates_with_no_points,
            'dates_with_fix_point_limits_breached':
            dates_with_fix_point_limits_breached,
            'fix_point_counts': fix_point_counts
        }
    else:
        return df
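
The function above is tightly coupled to the library's helpers (tsatimes, conv, utils), so the sketch below only illustrates its core idea under simplified assumptions: bucket rows by date, keep at most the last fix_points observations of each bucket, and aggregate them. It is not the library function itself, and the data is illustrative:

import pandas as pd

df = pd.DataFrame({
    'datetime': pd.to_datetime(['2021-01-04 10:00', '2021-01-04 15:55', '2021-01-04 16:00',
                                '2021-01-05 10:00', '2021-01-05 16:00']),
    'price': [10., 11., 12., 20., 21.],
})
fix_points = 2                      # keep at most the last two points per day
rows = []
for bucket, g in df.groupby(df['datetime'].dt.date):
    tail = g.sort_values('datetime').iloc[-fix_points:]
    rows.append({'bucket': bucket, 'price': tail['price'].mean()})
sparse_df = pd.DataFrame(rows, columns=['bucket', 'price'])
# bucket 2021-01-04 -> price 11.5; bucket 2021-01-05 -> price 20.5
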
Example #23
def numpy_timedelta64_to_python_time(x, allow_none=False):
    if checks.is_iterable(x):
        return [numpy_timedelta64_to_python_time(e, allow_none) for e in x]
    return (dt.datetime.min +
            numpy_timedelta64_to_python_timedelta(x, allow_none)).time()
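
A quick illustration of the idea behind the conversion above (add the timedelta to `datetime.min` and take the time component); the sample value is illustrative:

import datetime as dt
import numpy as np
import pandas as pd

x = np.timedelta64(9 * 3600 + 30 * 60, 's')          # 09:30:00 expressed as a timedelta
td = pd.to_timedelta(x).to_pytimedelta()
print((dt.datetime.min + td).time())                  # 09:30:00
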