def test_first_last_valid(self): ts = self.ts.copy() ts[:5] = np.NaN index = ts.first_valid_index() assert index == ts.index[5] ts[-5:] = np.NaN index = ts.last_valid_index() assert index == ts.index[-6] ts[:] = np.nan assert ts.last_valid_index() is None assert ts.first_valid_index() is None ser = Series([], index=[]) assert ser.last_valid_index() is None assert ser.first_valid_index() is None # GH12800 empty = Series() assert empty.last_valid_index() is None assert empty.first_valid_index() is None # GH20499: its preserves freq with holes ts.index = date_range("20110101", periods=len(ts), freq="B") ts.iloc[1] = 1 ts.iloc[-2] = 1 assert ts.first_valid_index() == ts.index[1] assert ts.last_valid_index() == ts.index[-2] assert ts.first_valid_index().freq == ts.index.freq assert ts.last_valid_index().freq == ts.index.freq
def calculate_enhanced_meta(serie: pd.Series, periodicity: str) -> dict:
    """Build the enhanced metadata dictionary for the given series.

    The series name MUST be the ID of the series in the database.

    :param serie: series to summarise; ``.date()`` is called on its first and
        last valid labels, so a datetime-like index is expected.
    :param periodicity: periodicity value stored in the metadata and passed to
        the freshness helpers.
    :return: dict keyed by ``meta_keys`` constants.

    When the last valid value has no predecessor, the second-to-last value
    and the last percentage change are reported as ``None`` instead of
    failing on ``last / None``.
    """
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days

    # Work with positions explicitly: plain ``serie[int]`` is ambiguous
    # between label and position depending on the index type.
    last_position = serie.index.get_loc(serie.last_valid_index())
    last = serie.iloc[last_position]
    if last_position > 0:
        second_to_last = serie.iloc[last_position - 1]
        last_pct_change = last / second_to_last - 1
    else:
        # No previous observation: a negative position would silently wrap
        # around to the end of the series.
        second_to_last = None
        last_pct_change = None

    # Computed values
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
        meta_keys.SIGNIFICANT_FIGURES: significant_figures(serie.values),
    }
    return meta
def _calculate_smoothed_daily_cases(new_cases: pd.Series, smooth: int = 7):
    """Return the rolling-average smoothing of ``new_cases``.

    A series without any valid value is returned untouched. Otherwise a copy
    is zero-filled up to the day before the first reported value and passed
    to ``series_utils.smooth_with_rolling_average`` with ``window=smooth``.
    """
    first_report = new_cases.first_valid_index()
    if first_report is None:
        return new_cases

    padded = new_cases.copy()
    # Front filling all cases with 0s. We're assuming all regions are
    # accurately reporting the first day a new case occurs. This affects the
    # first few cases in a timeseries, because the smoothing then runs over a
    # full period rather than just the first couple of days of reported data.
    day_before_first_report = first_report - timedelta(days=1)
    padded[:day_before_first_report] = 0

    return series_utils.smooth_with_rolling_average(padded, window=smooth)
def index(x: pd.Series, initial: int = 1) -> pd.Series:
    """
    Geometric series normalization

    :param x: time series
    :param initial: initial value
    :return: normalized time series; an empty float series when ``x`` has no
        valid (non-NaN) value

    **Usage**

    Divides every value in x by the initial value of x:

    :math:`Y_t = initial * X_t / X_0`

    where :math:`X_0` is the first valid (non-NaN) value in the series

    **Examples**

    Normalize series to 1:

    >>> series = generate_series(100)
    >>> returns = index(series)

    **See also**

    :func:`returns`
    """
    i = x.first_valid_index()
    if i is None:
        # Give the empty result an explicit dtype: a bare ``pd.Series()`` has
        # a version-dependent dtype and warns on some pandas releases.
        return pd.Series(dtype=float)
    return initial * x / x[i]
def update_enhanced_meta(serie: pd.Series, catalog_id: str, distribution_id: str):
    """Create or update the enhanced metadata of the given series.

    The series name MUST be the ID of the series in the database: it is used,
    together with ``catalog_id`` and ``distribution_id``, to look up the
    ``Field`` whose ``enhanced_meta`` entries are written.

    :param serie: series to summarise; ``.date()`` is called on its first and
        last valid labels, so a datetime-like index is expected.
    :param catalog_id: identifier of the catalog owning the distribution.
    :param distribution_id: identifier of the distribution owning the field.

    For a single-element series the second-to-last value and the last
    percentage change are stored as ``None`` instead of failing on
    ``last / None``.
    """
    field = Field.objects.get(distribution__dataset__catalog__identifier=catalog_id,
                              distribution__identifier=distribution_id,
                              identifier=serie.name)
    periodicity = meta_keys.get(field.distribution, meta_keys.PERIODICITY)
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days

    # ``.iloc`` states the positional intent explicitly; ``serie[-1]`` relies
    # on the integer fallback of ``[]`` for non-integer indexes.
    last = serie.iloc[-1]
    if serie.index.size > 1:
        second_to_last = serie.iloc[-2]
        last_pct_change = last / second_to_last - 1
    else:
        second_to_last = None
        last_pct_change = None

    # Computed values
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
    }

    for meta_key, value in meta.items():
        field.enhanced_meta.update_or_create(key=meta_key, defaults={'value': value})
def check_series(s: pd.Series, input_output="") -> bool:
    """
    Check if a given Pandas Series has the properties of a RepresentationSeries.

    Only the first non-NaN cell is inspected: it must be a list or tuple whose
    items can all be converted to ``float``, and the Series index must have a
    single level. ``input_output`` is accepted but not used by this check.

    Returns ``None`` when the check passes (no value is returned explicitly).

    Raises
    ------
    TypeError
        If the Series holds only NaNs, the first valid cell is not a list of
        numbers, or the index has more than one level.
    """

    error_string = (
        "There are non-representation cells (every cell should be a list of floats) in the given Series."
        " See help(hero.HeroSeries) for more information."
    )

    def is_numeric(x):
        # float() raises TypeError (not ValueError) for None, lists, ...
        try:
            float(x)
        except (TypeError, ValueError):
            return False
        return True

    def is_list_of_numbers(cell):
        # Test the container type first so non-iterable cells are rejected
        # instead of failing inside the generator.
        return isinstance(cell, (list, tuple)) and all(is_numeric(x) for x in cell)

    first_valid_label = s.first_valid_index()
    if first_valid_label is None:
        # Only NaNs in Series -> same warning applies
        raise TypeError(error_string)

    try:
        first_non_nan_value = s.loc[first_valid_label]
    except KeyError:
        raise TypeError(error_string) from None

    if not is_list_of_numbers(first_non_nan_value) or s.index.nlevels != 1:
        raise TypeError(error_string)
def index(x: pd.Series, initial: int = 1) -> pd.Series:
    """
    Geometric series normalization

    :param x: time series
    :param initial: initial value
    :return: normalized time series; an empty float series when ``x`` has no
        valid (non-NaN) value
    :raises MqValueError: if the first valid value of ``x`` is zero

    **Usage**

    Divides every value in x by the initial value of x:

    :math:`Y_t = initial * X_t / X_0`

    where :math:`X_0` is the first valid (non-NaN) value in the series

    **Examples**

    Normalize series to 1:

    >>> series = generate_series(100)
    >>> returns = index(series)

    **See also**

    :func:`returns`
    """
    i = x.first_valid_index()
    # Handle the "no valid value" case before touching ``x[i]``; looking up
    # ``x[None]`` fails before the empty result can be returned.
    if i is None:
        return pd.Series(dtype=float)

    if not x[i]:
        raise MqValueError(
            'Divide by zero error. Ensure that the first value of series passed to index(...) '
            'is non-zero')
    return initial * x / x[i]
def check_type(s: pd.Series, input_output="") -> Tuple[bool, str]:
    """
    Check if a given Pandas Series has the properties of a VectorSeries.

    Only the first non-NaN cell is inspected: it must be a list or tuple whose
    items can all be converted to ``float``. ``input_output`` is accepted but
    not used by this check.

    Returns
    -------
    Tuple[bool, str]
        ``(True, "")`` when the check passes, otherwise ``(False, message)``.
        A Series holding only NaNs fails the check.
    """

    error_string = (
        "should be VectorSeries: there are non-representation cells (every cell should be a list of floats) in the given Series."
        " See help(hero.HeroTypes) for more information.")

    def is_numeric(x):
        # float() raises TypeError (not ValueError) for None, lists, ...;
        # both mean "not a number" here.
        try:
            float(x)
        except (TypeError, ValueError):
            return False
        return True

    def is_list_of_numbers(cell):
        return isinstance(cell, (list, tuple)) and all(
            is_numeric(x) for x in cell)

    first_valid_label = s.first_valid_index()
    if first_valid_label is None:
        # Only NaNs in Series -> same warning applies
        return False, error_string

    try:
        first_non_nan_value = s.loc[first_valid_label]
    except KeyError:
        return False, error_string

    if not is_list_of_numbers(first_non_nan_value):
        return False, error_string
    return True, ""
def drift(x: Series, h: int) -> np.ndarray:
    """Forecast ``h`` future points with the drift method.

    Equation: Y_hat(t+h|t) = Y(t) + h * ((Y(t) - Y(1)) / (t - 1))

    :param x: time series data
    :param h: number of future predictions
    :return: the ``h`` forecast values wrapped in a ``Series`` (note that the
        annotation says ``np.ndarray``)
    """
    last_value = x.get(x.last_valid_index())
    first_value = x.get(x.first_valid_index())
    # Average change per step between the first and last valid observations,
    # measured over the full length of the series.
    slope = (last_value - first_value) / (len(x.values) - 1)

    forecast = [last_value + (step * slope) for step in range(1, h + 1)]
    return Series(np.array(forecast))
def number_of_na_in_ts(ts: pd.Series) -> int:
    """
    Count the NaNs of ``ts`` that come after its first valid value.

    All the NaNs at the beginning are dropped first (the first value is
    assumed never to be missing), then the remaining NaNs are counted.
    A series without any valid value is counted in full.
    """
    first_valid_label = ts.first_valid_index()
    # ``first_valid_index`` returns a *label*, so slice by label: plain
    # ``ts[label:]`` is positional for integer keys and miscounts when the
    # integer index does not start at 0.
    trimmed = ts.loc[first_valid_label:]
    return int(trimmed.isna().sum())
def parse(self, column_data: pd.Series):
    """Infer the element shape from the first valid cell of the column.

    After delegating to the parent ``parse``, the first non-missing value is
    converted to a NumPy array and its shape is taken. If ``self._shape`` is
    already set it must match the inferred shape (enforced with ``assert``);
    otherwise the inferred shape is stored in ``self._shape``.
    """
    super().parse(column_data)

    first_valid_value = column_data[column_data.first_valid_index()]
    inferred_shape = np.array(first_valid_value).shape

    if self._shape is None:
        self._shape = inferred_shape
        return

    mismatch_message = 'Shape mismatch!. Expected shape={}, shape in the dataset is {}'.format(
        self._shape, inferred_shape)
    assert tuple(self._shape) == tuple(inferred_shape), mismatch_message
def is_categorical_column(
        data: pd.Series,
        threshold: int = 100,
        ratio: float = 0.1,
        is_label_columns: bool = False,
        default_allow_missing: bool = True) -> Tuple[bool, bool]:
    """Check whether the column is a categorical column.

    The decision is driven by the type of the first valid element:

    - string: categorical unless the number of distinct samples exceeds
      min(#Total Sample * ratio, threshold);
    - integer (``INT_TYPES``): categorical only when the observed values span
      exactly ``0 .. n_unique - 1``;
    - boolean (``BOOL_TYPES``): always categorical;
    - anything else: not categorical.

    Parameters
    ----------
    data
        The column data
    threshold
        The threshold for detecting categorical column
    ratio
        The ratio for detecting categorical column
    is_label_columns
        Whether the column is a label column
    default_allow_missing
        Value reported as ``parsed_allow_missing`` for categorical string
        columns that are not label columns

    Returns
    -------
    is_categorical
        Whether the column is a categorical column
    parsed_allow_missing
        Whether missing values are allowed for the parsed column
    """
    max_unique = min(int(len(data) * ratio), threshold)
    first_value = data[data.first_valid_index()]

    if isinstance(first_value, str):
        seen = set()
        for sample in data:
            seen.add(sample)
            # Stop scanning as soon as the column is known to be too diverse.
            if len(seen) > max_unique:
                return False, False
        allow_missing = False if is_label_columns else default_allow_missing
        return True, allow_missing

    if isinstance(first_value, INT_TYPES):
        counts = data.value_counts()
        categories = counts.keys()
        if categories.min() == 0 and categories.max() == len(counts) - 1:
            return True, False
        return False, False

    if isinstance(first_value, BOOL_TYPES):
        return True, False

    return False, False
def test_first_last_valid(self): ts = self.ts.copy() ts[:5] = np.NaN index = ts.first_valid_index() assert index == ts.index[5] ts[-5:] = np.NaN index = ts.last_valid_index() assert index == ts.index[-6] ts[:] = np.nan assert ts.last_valid_index() is None assert ts.first_valid_index() is None ser = Series([], index=[]) assert ser.last_valid_index() is None assert ser.first_valid_index() is None # GH12800 empty = Series() assert empty.last_valid_index() is None assert empty.first_valid_index() is None
def test_first_last_valid(self): ts = self.ts.copy() ts[:5] = np.NaN index = ts.first_valid_index() self.assertEqual(index, ts.index[5]) ts[-5:] = np.NaN index = ts.last_valid_index() self.assertEqual(index, ts.index[-6]) ts[:] = np.nan self.assertIsNone(ts.last_valid_index()) self.assertIsNone(ts.first_valid_index()) ser = Series([], index=[]) self.assertIsNone(ser.last_valid_index()) self.assertIsNone(ser.first_valid_index()) # GH12800 empty = Series() self.assertIsNone(empty.last_valid_index()) self.assertIsNone(empty.first_valid_index())
def write_serie(self, serie: pd.Series, periodicity: str, fields: dict, writer: csv.writer):
    """Write one CSV row per observation of ``serie``.

    Leading and trailing NaNs are dropped, each remaining (index, value) pair
    is turned into a row by ``self.rows`` (which also receives
    ``self.fields_data``, the field id looked up in ``fields`` by the series
    name, and ``periodicity``), and every row is handed to
    ``writer.writerow``.
    """
    field_id = fields[serie.name]

    # Keep only the span between the first and the last valid values.
    first_valid = serie.first_valid_index()
    last_valid = serie.last_valid_index()
    trimmed = serie[first_valid:last_valid]

    row_args = (self.fields_data, field_id, periodicity)
    formatted = trimmed.reset_index().apply(self.rows, axis=1, args=row_args)
    rows_by_label = pd.Series(formatted.values, index=trimmed.index)

    for csv_row in rows_by_label:
        writer.writerow(csv_row)
def check_series(s: pd.Series) -> bool:
    """
    Check if a given Pandas Series has the properties of a TextSeries.

    Only the first non-NaN cell is inspected: it must be a string, and the
    Series index must have a single level. Nothing is returned on success;
    a ``TypeError`` is raised otherwise (including for an all-NaN Series).
    """

    error_string = (
        "The input Series should consist only of strings in every cell."
        " See help(hero.HeroSeries) for more information."
    )

    try:
        first_non_nan_value = s.loc[s.first_valid_index()]
    except KeyError:
        # Only NaNs in Series -> same warning applies
        raise TypeError(error_string)

    has_flat_index = s.index.nlevels == 1
    if not (isinstance(first_non_nan_value, str) and has_flat_index):
        raise TypeError(error_string)
def check_type(s: pd.Series) -> Tuple[bool, str]:
    """
    Check if a given Pandas Series has the properties of a TextSeries.

    Only the first non-NaN cell is inspected. Returns ``(True, "")`` when it
    is a string, otherwise ``(False, message)``; an all-NaN Series fails the
    check as well.
    """

    error_string = (
        "should be TextSeries: the input Series should consist only of strings in every cell."
        " See help(hero.HeroTypes) for more information.")

    try:
        first_non_nan_value = s.loc[s.first_valid_index()]
    except KeyError:
        # Only NaNs in Series -> same warning applies
        return False, error_string

    if isinstance(first_non_nan_value, str):
        return True, ""
    return False, error_string
def interpolate_stalled_and_missing_values(series: pd.Series) -> pd.Series:
    """Interpolates periods where values have stopped increasing or have gaps.

    Works on a copy of ``series``. Between the first and last valid values,
    entries equal to their predecessor are blanked, every blank is filled by
    time-weighted interpolation and the result is floored. Leading and
    trailing missing values are left as they are.

    Args:
        series: Series with a datetime index
    """
    result = series.copy()
    first_valid = result.first_valid_index()
    last_valid = result.last_valid_index()

    window = result.loc[first_valid:last_valid]
    # A zero difference means the value did not move since the previous
    # entry: treat it as missing so that it gets interpolated too.
    stalled = window.diff() == 0
    window[stalled] = None

    # Use the index to determine breaks between data (so
    # missing data is not improperly interpolated)
    filled = window.interpolate(method="time")
    result.loc[first_valid:last_valid] = filled.apply(np.floor)
    return result
def _get_range(x: pd.Series): """Get a range of values so that there are no NaNs in the sequence.""" first_idx = x.first_valid_index() last_idx = x.last_valid_index() subset = x.loc[first_idx:last_idx] while subset.isnull().values.any() and \ (first_idx is not None or last_idx is not None): idx = subset.isna().idxmax() first_idx = subset.loc[idx:last_idx].first_valid_index() subset = x.loc[first_idx:last_idx] if first_idx is None or last_idx is None: return None, None return first_idx, last_idx
def _fit_core(self, s: pd.Series) -> None: if not (s.index.is_monotonic_increasing or s.index.is_monotonic_decreasing): raise ValueError("Time series must have a monotonic time index. ") # remove starting and ending nans s = s.loc[s.first_valid_index():s[::-1].first_valid_index()].copy() if pd.isna(s).any(): raise ValueError( "Found NaN in time series among valid values. " "NaNs starting or ending a time series are allowed, " "but those among valid values are not.") # get datum time self._datumTimestamp = s.index[0] # get series_freq if s.index.freq is not None: self._series_freq = s.index.freqstr else: self._series_freq = s.index.inferred_freq if self._series_freq is None: raise RuntimeError( "Series does not follow any known frequency " "(e.g. second, minute, hour, day, week, month, year, etc.") # get average dT self._dT = pd.Series(s.index).diff().mean() # get seasonal freq if self.freq is None: identified_freq = _identify_seasonal_period(s) if identified_freq is None: raise Exception("Could not find significant seasonality.") else: self.freq_ = identified_freq else: self.freq_ = self.freq # get seasonal pattern if self.trend: seasonal_decompose_results = ( seasonal_decompose(s, period=self.freq_) if parse(statsmodels.__version__) >= parse("0.11") else seasonal_decompose(s, freq=self.freq_)) self.seasonal_ = getattr(seasonal_decompose_results, "seasonal")[:self.freq_] else: self.seasonal_ = s.iloc[:self.freq_].copy() for i in range(len(self.seasonal_)): self.seasonal_.iloc[i] = s.iloc[i::len(self.seasonal_)].mean()
def test_first_last_valid(self): ts = self.ts.copy() ts[:5] = np.NaN index = ts.first_valid_index() self.assertEqual(index, ts.index[5]) ts[-5:] = np.NaN index = ts.last_valid_index() self.assertEqual(index, ts.index[-6]) ts[:] = np.nan self.assert_(ts.last_valid_index() is None) self.assert_(ts.first_valid_index() is None) ser = Series([], index=[]) self.assert_(ser.last_valid_index() is None) self.assert_(ser.first_valid_index() is None)
def check_type(s: pd.Series) -> Tuple[bool, str]:
    """
    Check if a given Pandas Series has the properties of a TokenSeries.

    Only the first non-NaN cell is inspected: it must be a list or tuple
    whose items are all strings.

    Returns
    -------
    Tuple[bool, str]
        ``(True, "")`` when the check passes, otherwise ``(False, message)``.
        A Series holding only NaNs fails the check.
    """

    error_string = (
        "should be TokenSeries: there are non-token cells (every cell should be a list of words/tokens) in the given Series."
        " See help(hero.HeroTypes) for more information.")

    def is_list_of_strings(cell):
        # Test the container type first: iterating a non-iterable cell
        # (e.g. a float) would raise instead of reporting a failed check.
        return isinstance(cell, (list, tuple)) and all(
            isinstance(x, str) for x in cell)

    first_valid_label = s.first_valid_index()
    if first_valid_label is None:
        # Only NaNs in Series -> same warning applies
        return False, error_string

    try:
        first_non_nan_value = s.loc[first_valid_label]
    except KeyError:
        return False, error_string

    if not is_list_of_strings(first_non_nan_value):
        return False, error_string
    return True, ""
def check_series(s: pd.Series) -> bool:
    """
    Check if a given Pandas Series has the properties of a TokenSeries.

    Only the first non-NaN cell is inspected: it must be a list or tuple
    whose items are all strings, and the Series index must have a single
    level. Returns ``None`` when the check passes (no value is returned
    explicitly).

    Raises
    ------
    TypeError
        If the Series holds only NaNs, the first valid cell is not a list of
        strings, or the index has more than one level.
    """

    error_string = (
        "There are non-token cells (every cell should be a list of words/tokens) in the given Series."
        " See help(hero.HeroSeries) for more information."
    )

    def is_list_of_strings(cell):
        # Test the container type first so that a non-iterable cell is
        # reported with ``error_string`` rather than a generic iteration error.
        return isinstance(cell, (list, tuple)) and all(
            isinstance(x, str) for x in cell
        )

    first_valid_label = s.first_valid_index()
    if first_valid_label is None:
        # Only NaNs in Series -> same warning applies
        raise TypeError(error_string)

    try:
        first_non_nan_value = s.loc[first_valid_label]
    except KeyError:
        raise TypeError(error_string) from None

    if not is_list_of_strings(first_non_nan_value) or s.index.nlevels != 1:
        raise TypeError(error_string)
def _assert_single_contiguous_dense_sequence( _series: pd.Series) -> None: """ Assert that the input series has no Null values after removing leading and trailing Nulls. An motivating example for this requirement is a ForecastCheck, which might have a main value series that ends with trailing Nulls, and a forecast series that begins with leading nulls, but the actual and forecast periods should have no nulls. This is a strong assertion, and I'm not 100% sure it's the right one, but I'm putting it in because I'd rather start out with more constraints. However, we can revisit this design choice. """ assert is_numeric_dtype(_series), 'The "Single Contiguous Dense Sequence" constraint should only be ' \ 'applied to numeric Series' assert (not _series.loc[_series.first_valid_index( ):_series.last_valid_index()].isnull().values.any()), ( 'Numeric series may have leading or trailing null values to represent missing or non-applicable ' 'data points. However, values for the series should otherwise be non-Null.' )
def fill_gaps(time_series: pd.Series) -> pd.Series:
    """
    Fill gaps in a time series (i.e. value equals to NaN) inside the
    time-series (leading and ending missing values are untouched).

    Each interior NaN is replaced by the value observed at the same weekday
    and hour on the closest other day (ties go to the earliest sample).
    A gap for which no such sample exists is left as NaN.

    Note that ``time_series`` is modified in place and also returned.

    Parameters
    ----------
    time_series: pd.Series
        Time-series of load (can be NaNs) indexed with datetime indexes.

    Returns
    -------
    time_series: pd.Series
        Corrected time series (the same object that was passed in)
    """
    # First remove starting and ending nans. Work on a copy so the repairs
    # below never write through a slice of the caller's series.
    first_valid = time_series.first_valid_index()
    last_valid = time_series.last_valid_index()
    trimmed = time_series.loc[first_valid:last_valid].copy()

    gap_labels = trimmed.index[trimmed.isna().values]
    if len(gap_labels) == 0:
        return time_series

    # Only originally valid samples are candidate donors; their weekday and
    # hour are computed once instead of once per gap.
    valid = trimmed.dropna()
    valid_weekdays = np.asarray(valid.index.weekday)
    valid_hours = np.asarray(valid.index.hour)

    for gap in gap_labels:
        same_slot = (valid_weekdays == gap.weekday()) & (valid_hours == gap.hour)
        candidates = valid[same_slot]
        if candidates.empty:
            # No sample shares this weekday/hour: nothing sensible to copy.
            continue
        # Find closest valid hour
        day_distance = np.abs(np.asarray((candidates.index - gap).days))
        trimmed.loc[gap] = candidates.iloc[int(np.argmin(day_distance))]

    time_series.loc[trimmed.index] = trimmed.values
    return time_series
def generate_field_summary(series: pd.Series) -> pd.Series:
    """Summarise ``series`` as a Series of descriptive statistics.

    The result always carries the same keys. When ``series`` has no non-null
    value, ``has_value`` is False, ``num_observations`` is 0 and every other
    entry is ``None``. ``largest_delta_date`` stays ``None`` when there are no
    two consecutive valid values to compare.
    """
    summary = {
        "has_value": not series.isnull().all(),
        "min_date": None,
        "max_date": None,
        "max_value": None,
        "min_value": None,
        "latest_value": None,
        "num_observations": 0,
        "largest_delta": None,
        "largest_delta_date": None,
    }

    if summary["has_value"]:
        observed = series[series.notnull()]
        # Absolute change between consecutive entries.
        abs_deltas = series.diff().abs()

        summary["min_date"] = series.first_valid_index()
        summary["max_date"] = series.last_valid_index()
        summary["latest_value"] = observed.iloc[-1]
        summary["max_value"] = series.max()
        summary["min_value"] = series.min()
        summary["num_observations"] = len(observed)
        summary["largest_delta"] = abs_deltas.max()
        # idxmax is only meaningful when at least one delta exists.
        if len(abs_deltas.dropna()):
            summary["largest_delta_date"] = abs_deltas.idxmax()

    return pd.Series(summary)
def _get_index_size(serie: pd.Series): # Filtro los NaN antes y despuรฉs de la serie return len(serie[serie.first_valid_index():serie.last_valid_index()])
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Behaviour visible in this function:

    - if no y value is valid, a float64 array of NaN shaped like ``xvalues``
      is returned; if every y value is valid, ``yvalues`` is returned as is;
    - ``method='time'`` requires ``xvalues.is_all_dates`` to be truthy and is
      then handled as ``'values'``;
    - ``limit`` must be ``None`` (no limit) or an integer >= 1;
    - ``limit_direction`` is lower-cased and must be one of ``'forward'``,
      ``'backward'`` or ``'both'``;
    - ``fill_value``, ``bounds_error``, ``order`` and ``**kwargs`` are only
      forwarded to ``_interpolate_scipy_wrapper``.

    Raises ``ValueError`` for an unsupported ``limit_direction``, an invalid
    ``limit``, or ``method='time'`` on a non-date index.
    """
    # Treat the original, non-scipy methods first.

    invalid = isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which cant be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    # ys gets a default integer index, so the first/last valid index values
    # below are positions into yvalues.
    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # violate_limit is a list of the indexes in the series whose yvalue is
    # currently NaN, and should still be NaN after the interpolation.
    # Specifically:
    #
    # If limit_direction='forward' or None then the list will contain NaNs at
    # the beginning of the series, and NaNs that are more than 'limit' away
    # from the prior non-NaN.
    #
    # If limit_direction='backward' then the list will contain NaNs at
    # the end of the series, and NaNs that are more than 'limit' away
    # from the subsequent non-NaN.
    #
    # If limit_direction='both' then the list will contain NaNs that
    # are more than 'limit' away from any non-NaN.
    #
    # If limit=None, then use default behavior of filling an unlimited number
    # of NaNs in the direction specified by limit_direction

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    # each possible limit_direction
    # TODO: do we need sorted?
    # NOTE(review): _interp_limit is defined elsewhere in the module; it is
    # presumably the helper yielding the positions that exceed the given
    # forward/backward limits - confirm against its definition.
    if limit_direction == 'forward' and limit is not None:
        violate_limit = sorted(start_nans |
                               set(_interp_limit(invalid, limit, 0)))
    elif limit_direction == 'forward':
        violate_limit = sorted(start_nans)
    elif limit_direction == 'backward' and limit is not None:
        violate_limit = sorted(end_nans |
                               set(_interp_limit(invalid, 0, limit)))
    elif limit_direction == 'backward':
        violate_limit = sorted(end_nans)
    elif limit_direction == 'both' and limit is not None:
        violate_limit = sorted(_interp_limit(invalid, limit, limit))
    else:
        # 'both' without a limit: nothing is forced back to NaN.
        violate_limit = []

    # Unwrap pandas objects to their underlying arrays where possible.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        # Fill every NaN position from the valid points, then blank out the
        # positions that must stay NaN.
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[violate_limit] = np.nan
        return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Behaviour visible in this function:

    - if no y value is valid, a float64 array of NaN shaped like ``xvalues``
      is returned; if every y value is valid, ``yvalues`` is returned as is;
    - ``method='time'`` requires ``xvalues.is_all_dates`` to be truthy and is
      then handled as ``'values'``;
    - ``limit`` is only taken into account when it is truthy;
    - ``limit_direction`` is lower-cased and must be one of ``'forward'``,
      ``'backward'`` or ``'both'``;
    - ``fill_value``, ``bounds_error``, ``order`` and ``**kwargs`` are only
      forwarded to ``_interpolate_scipy_wrapper``.

    Raises ``ValueError`` for an unsupported ``limit_direction`` or for
    ``method='time'`` on a non-date index.
    """
    # Treat the original, non-scipy methods first.

    invalid = com.isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which cant be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    def _interp_limit(invalid, fw_limit, bw_limit):
        "Get idx of values that won't be filled b/c they exceed the limits."
        # A NaN position is yielded when the whole window reaching fw_limit
        # positions back and bw_limit positions ahead of it is NaN too.
        for x in np.where(invalid)[0]:
            if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                yield x

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    # ys gets a default integer index, so the first/last valid index values
    # below are positions into yvalues.
    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # This is a list of the indexes in the series whose yvalue is currently
    # NaN, but whose interpolated yvalue will be overwritten with NaN after
    # computing the interpolation. For each index in this list, one of these
    # conditions is true of the corresponding NaN in the yvalues:
    #
    # a) It is one of a chain of NaNs at the beginning of the series, and
    #    either limit is not specified or limit_direction is 'forward'.
    # b) It is one of a chain of NaNs at the end of the series, and limit is
    #    specified and limit_direction is 'backward' (for 'both' only the
    #    positions yielded by _interp_limit are used).
    # c) Limit is nonzero and it is further than limit from the nearest non-NaN
    #    value (with respect to the limit_direction setting).
    #
    # The default behavior is to fill forward with no limit, ignoring NaNs at
    # the beginning (see issues #9218 and #10420)
    violate_limit = sorted(start_nans)

    if limit:
        if limit_direction == 'forward':
            violate_limit = sorted(start_nans |
                                   set(_interp_limit(invalid, limit, 0)))
        if limit_direction == 'backward':
            violate_limit = sorted(end_nans |
                                   set(_interp_limit(invalid, 0, limit)))
        if limit_direction == 'both':
            violate_limit = sorted(_interp_limit(invalid, limit, limit))

    # Unwrap pandas objects to their underlying arrays where possible.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if issubclass(inds.dtype.type, np.datetime64):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        # Fill every NaN position from the valid points, then blank out the
        # positions that must stay NaN.
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[violate_limit] = np.nan
        return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Behaviour visible in this function:

    - if no y value is valid, a float64 array of NaN shaped like ``xvalues``
      is returned; if every y value is valid, ``yvalues`` is returned as is;
    - ``method='time'`` requires ``xvalues.is_all_dates`` to be truthy and is
      then handled as ``'values'``;
    - ``limit`` must be ``None`` (no limit) or an integer >= 1;
    - ``limit_direction`` is lower-cased and must be one of ``'forward'``,
      ``'backward'`` or ``'both'``;
    - ``fill_value``, ``bounds_error``, ``order`` and ``**kwargs`` are only
      forwarded to ``_interpolate_scipy_wrapper``.

    Raises ``ValueError`` for an unsupported ``limit_direction``, an invalid
    ``limit``, or ``method='time'`` on a non-date index.
    """
    # Treat the original, non-scipy methods first.

    invalid = isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which cant be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    def _interp_limit(invalid, fw_limit, bw_limit):
        "Get idx of values that won't be filled b/c they exceed the limits."
        # A NaN position is yielded when the whole window reaching fw_limit
        # positions back and bw_limit positions ahead of it is NaN too.
        for x in np.where(invalid)[0]:
            if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                yield x

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    # ys gets a default integer index, so the first/last valid index values
    # below are positions into yvalues.
    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # This is a list of the indexes in the series whose yvalue is currently
    # NaN, but whose interpolated yvalue will be overwritten with NaN after
    # computing the interpolation. For each index in this list, one of these
    # conditions is true of the corresponding NaN in the yvalues:
    #
    # a) It is one of a chain of NaNs at the beginning of the series, and
    #    either limit is not specified or limit_direction is 'forward'.
    # b) It is one of a chain of NaNs at the end of the series, and limit is
    #    specified and limit_direction is 'backward' (for 'both' only the
    #    positions yielded by _interp_limit are used).
    # c) Limit is nonzero and it is further than limit from the nearest non-NaN
    #    value (with respect to the limit_direction setting).
    #
    # The default behavior is to fill forward with no limit, ignoring NaNs at
    # the beginning (see issues #9218 and #10420)
    violate_limit = sorted(start_nans)

    if limit is not None:
        # Validate the limit before using it as a window size.
        if not is_integer(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')
        if limit_direction == 'forward':
            violate_limit = sorted(start_nans |
                                   set(_interp_limit(invalid, limit, 0)))
        if limit_direction == 'backward':
            violate_limit = sorted(end_nans |
                                   set(_interp_limit(invalid, 0, limit)))
        if limit_direction == 'both':
            violate_limit = sorted(_interp_limit(invalid, limit, limit))

    # Unwrap pandas objects to their underlying arrays where possible.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        # Fill every NaN position from the valid points, then blank out the
        # positions that must stay NaN.
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[violate_limit] = np.nan
        return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    xvalues : 1-d array-like or Index
        Coordinates at which ``yvalues`` are defined.
    yvalues : 1-d array-like
        Values to interpolate; entries flagged by ``isnull`` get filled.
    method : str, default 'linear'
        'linear', 'time', 'index' and 'values' are evaluated with
        ``np.interp``; the names listed in ``sp_methods`` are delegated to
        ``_interpolate_scipy_wrapper``.  'time' requires
        ``xvalues.is_all_dates`` to be truthy and is then handled as
        'values'.
    limit : int, optional
        When given it must be an integer >= 1.  ``None`` means unlimited.
    limit_direction : {'forward', 'backward', 'both'}, default 'forward'
        Matched case-insensitively.
    fill_value, bounds_error, order, **kwargs
        Forwarded to ``_interpolate_scipy_wrapper``; unused by the
        ``np.interp`` based methods.

    Returns
    -------
    numpy.ndarray
        An all-NaN float64 array shaped like ``xvalues`` when no value is
        valid; ``yvalues`` itself (not a copy) when every value is valid;
        otherwise a copy of the values with NaNs filled, except for the
        positions collected in ``violate_limit`` which are reset to NaN.
        NOTE(review): if ``method`` is in neither method group the function
        falls through and implicitly returns ``None``.

    Raises
    ------
    ValueError
        For 'time' without all-date ``xvalues``, an unknown
        ``limit_direction``, or a ``limit`` that is not an integer >= 1.
    """
    # Treat the original, non-scipy methods first.

    invalid = isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    # Work with *positions* taken from the boolean mask.  Going through
    # Series.first_valid_index()/last_valid_index() yields index labels,
    # which are only positions when yvalues carries a default integer index.
    # At least one valid entry exists here (the all-NaN case returned above).
    valid_positions = np.flatnonzero(np.asarray(valid))
    first_valid = valid_positions[0]
    last_valid = valid_positions[-1]
    start_nans = set(range(first_valid))
    end_nans = set(range(1 + last_valid, len(valid)))

    # violate_limit is a list of the positions whose yvalue is currently NaN
    # and should still be NaN after the interpolation.  Specifically:
    #
    # limit_direction='forward': NaNs at the beginning of the series, plus
    #   NaNs that are more than 'limit' away from the prior non-NaN.
    # limit_direction='backward': NaNs at the end of the series, plus NaNs
    #   that are more than 'limit' away from the subsequent non-NaN.
    # limit_direction='both': NaNs that are more than 'limit' away from any
    #   non-NaN.
    #
    # If limit=None, an unlimited number of NaNs is filled in the direction
    # given by limit_direction (GH #16282).
    if limit is not None:
        if not is_integer(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')

    # each possible limit_direction
    # TODO: do we need sorted?
    if limit_direction == 'forward' and limit is not None:
        violate_limit = sorted(start_nans |
                               set(_interp_limit(invalid, limit, 0)))
    elif limit_direction == 'forward':
        violate_limit = sorted(start_nans)
    elif limit_direction == 'backward' and limit is not None:
        violate_limit = sorted(end_nans |
                               set(_interp_limit(invalid, 0, limit)))
    elif limit_direction == 'backward':
        violate_limit = sorted(end_nans)
    elif limit_direction == 'both' and limit is not None:
        violate_limit = sorted(_interp_limit(invalid, limit, limit))
    else:
        violate_limit = []

    # Unwrap pandas containers so the indexing below operates on arrays.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid],
                                    yvalues[valid])
        # Undo the fill wherever the limit rules say NaN must be kept.
        result[violate_limit] = np.nan
        return result

    sp_methods = [
        'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric',
        'krogh', 'spline', 'polynomial', 'from_derivatives',
        'piecewise_polynomial', 'pchip', 'akima'
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid], yvalues[valid], inds[invalid], method=method,
            fill_value=fill_value, bounds_error=bounds_error, order=order,
            **kwargs)
        result[violate_limit] = np.nan
        return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', limit_area=None,
                   fill_value=None, bounds_error=False, order=None,
                   **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    xvalues : 1-d array-like or Index
        Coordinates at which ``yvalues`` are defined.
    yvalues : 1-d array-like
        Values to interpolate; entries flagged by ``isna`` get filled.
    method : str, default 'linear'
        'linear', 'time', 'index' and 'values' are evaluated with
        ``np.interp``; the names listed in ``sp_methods`` are delegated to
        ``_interpolate_scipy_wrapper``.  'time' requires
        ``xvalues.is_all_dates`` to be truthy and is then handled as
        'values'.
    limit : int, optional
        When given it must be an integer >= 1.  ``None`` means unlimited.
    limit_direction : {'forward', 'backward', 'both'}, default 'forward'
        Matched case-insensitively.
    limit_area : {None, 'inside', 'outside'}, default None
        Matched case-insensitively.  'inside' additionally keeps leading
        and trailing NaNs; 'outside' additionally keeps NaNs located between
        the first and last valid value.
    fill_value, bounds_error, order, **kwargs
        Forwarded to ``_interpolate_scipy_wrapper``; unused by the
        ``np.interp`` based methods.

    Returns
    -------
    numpy.ndarray
        An all-NaN float64 array shaped like ``xvalues`` when no value is
        valid; ``yvalues`` itself (not a copy) when every value is valid;
        otherwise a copy of the values with NaNs filled, except for the
        positions collected in ``preserve_nans`` which are reset to NaN.
        NOTE(review): if ``method`` is in neither method group the function
        falls through and implicitly returns ``None``.

    Raises
    ------
    ValueError
        For 'time' without all-date ``xvalues``, an unknown
        ``limit_direction`` or ``limit_area``, or a ``limit`` that is not an
        integer >= 1.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(
            msg.format(valid=valid_limit_directions, invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ['inside', 'outside']
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError('Invalid limit_area: expecting one of {}, got '
                             '{}.'.format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    if limit is not None:
        if not is_integer(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')

    # Work with *positions* taken from the boolean mask.  Going through
    # Series.first_valid_index()/last_valid_index() yields index labels,
    # which are only positions when yvalues carries a default integer index.
    # At least one valid entry exists here (the all-NaN case returned above).
    valid_positions = np.flatnonzero(np.asarray(valid))
    first_valid = valid_positions[0]
    last_valid = valid_positions[-1]

    # These are sets of positions of invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(first_valid))
    end_nans = set(range(1 + last_valid, len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains positions of invalid
    # values, but in this case it is the final set of positions that need to
    # be preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain positions of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == 'forward':
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == 'backward':
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside positions
    # to preserve_nans GH #16284
    if limit_area == 'inside':
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == 'outside':
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    # Unwrap pandas containers so the indexing below operates on arrays.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid],
                                    yvalues[valid])
        # Undo the fill wherever the limit rules say NaN must be kept.
        result[preserve_nans] = np.nan
        return result

    sp_methods = [
        'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric',
        'krogh', 'spline', 'polynomial', 'from_derivatives',
        'piecewise_polynomial', 'pchip', 'akima'
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid], yvalues[valid], inds[invalid], method=method,
            fill_value=fill_value, bounds_error=bounds_error, order=order,
            **kwargs)
        result[preserve_nans] = np.nan
        return result
def _first_valid_value(serie: Series) -> Any: first_valid_index = serie.first_valid_index() return serie[first_valid_index] if first_valid_index is not None else None
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', limit_area=None,
                   fill_value=None, bounds_error=False, order=None,
                   **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    xvalues : 1-d array-like or Index
        Coordinates at which ``yvalues`` are defined.
    yvalues : 1-d array-like
        Values to interpolate; entries flagged by ``isna`` get filled.
    method : str, default 'linear'
        'linear', 'time', 'index' and 'values' are evaluated with
        ``np.interp``; the names listed in ``sp_methods`` are delegated to
        ``_interpolate_scipy_wrapper``.  'time' requires
        ``xvalues.is_all_dates`` to be truthy and is then handled as
        'values'.
    limit : int, optional
        When given it must be an integer >= 1.  ``None`` means unlimited.
    limit_direction : {'forward', 'backward', 'both'}, default 'forward'
        Matched case-insensitively.
    limit_area : {None, 'inside', 'outside'}, default None
        Matched case-insensitively.  'inside' additionally keeps leading
        and trailing NaNs; 'outside' additionally keeps NaNs located between
        the first and last valid value.
    fill_value, bounds_error, order, **kwargs
        Forwarded to ``_interpolate_scipy_wrapper``; unused by the
        ``np.interp`` based methods.

    Returns
    -------
    numpy.ndarray
        An all-NaN float64 array shaped like ``xvalues`` when no value is
        valid; ``yvalues`` itself (not a copy) when every value is valid;
        otherwise a copy of the values with NaNs filled, except for the
        positions collected in ``preserve_nans`` which are reset to NaN.
        NOTE(review): if ``method`` is in neither method group the function
        falls through and implicitly returns ``None``.

    Raises
    ------
    ValueError
        For 'time' without all-date ``xvalues``, an unknown
        ``limit_direction`` or ``limit_area``, or a ``limit`` that is not an
        integer >= 1.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(msg.format(valid=valid_limit_directions,
                                    invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ['inside', 'outside']
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError('Invalid limit_area: expecting one of {}, got '
                             '{}.'.format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    if limit is not None:
        if not is_integer(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')

    # Positions of the first/last valid entry come straight from the mask.
    # Series.first_valid_index()/last_valid_index() would hand back index
    # labels instead, which break range() below for any yvalues that carries
    # a non-default index.  The mask has at least one True entry here because
    # the all-NaN case already returned.
    valid_positions = np.flatnonzero(np.asarray(valid))
    first_valid = valid_positions[0]
    last_valid = valid_positions[-1]

    # These are sets of positions of invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(first_valid))
    end_nans = set(range(1 + last_valid, len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains positions of invalid
    # values, but in this case it is the final set of positions that need to
    # be preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain positions of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == 'forward':
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == 'backward':
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside positions
    # to preserve_nans GH #16284
    if limit_area == 'inside':
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == 'outside':
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    # Unwrap pandas containers so the indexing below operates on arrays.
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid],
                                    yvalues[valid])
        # Undo the fill wherever the limit rules say NaN must be kept.
        result[preserve_nans] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip',
                  'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order,
                                                     **kwargs)
        result[preserve_nans] = np.nan
        return result