def compute(self, df, datetime_column, groupby_columns=None):
    if groupby_columns is None:
        groupby_columns = []
    generic_check_compute_arguments(datetime_column, groupby_columns)
    # drop all rows where the timestamp is null
    df_copy = df.dropna(subset=[datetime_column]).copy()
    if nothing_to_do(df_copy, min_len=2):
        logger.warning('The time series has less than 2 rows with values, cannot apply window.')
        return df_copy
    df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
    raw_columns = df_copy.select_dtypes(include=['float', 'int']).columns.tolist()
    if groupby_columns:
        grouped = df_copy.groupby(groupby_columns)
        computed_groups = []
        identifiers_number = len(groupby_columns)
        for group_id, group in grouped:
            logger.info("Computing for group {}".format(group_id))
            try:
                if self.params.causal_window:
                    computed_df = self._compute_causal_stats(group, datetime_column, raw_columns, df_id=group_id)
                else:
                    computed_df = self._compute_bilateral_stats(group, datetime_column, raw_columns, df_id=group_id)
            except Exception as e:
                from future.utils import raise_
                # issues with left border, cf https://github.com/pandas-dev/pandas/issues/26005
                if str(e) == 'skiplist_init failed':
                    raise_(Exception, "Window width is too small", sys.exc_info()[2])
                else:
                    raise_(Exception, "Compute stats failed. Check the full error log for more info: {}".format(str(e)), sys.exc_info()[2])
            if not nothing_to_do(group, min_len=2):
                group_id = format_group_id(group_id, identifiers_number)
                computed_df[groupby_columns] = pd.DataFrame([group_id], index=computed_df.index)
                computed_groups.append(computed_df)
        final_df = pd.concat(computed_groups, sort=True)
    else:
        try:
            if self.params.causal_window:
                final_df = self._compute_causal_stats(df_copy, datetime_column, raw_columns)
            else:
                final_df = self._compute_bilateral_stats(df_copy, datetime_column, raw_columns)
        except Exception as e:
            from future.utils import raise_
            if str(e) == 'skiplist_init failed':
                raise_(Exception, "Window width is too small", sys.exc_info()[2])
            else:
                raise_(Exception, "Compute stats failed. Check the full error log for more info: {}".format(str(e)), sys.exc_info()[2])
    return final_df.reset_index(drop=True)
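# Illustrative usage sketch (not part of the original module): assuming an
# aggregator instance `window_aggregator` whose `params` carry `causal_window`,
# `window_description`, `window_type`, `closed_option` and `aggregation_types`,
# a call could look like this. Rows with a null timestamp are dropped first and
# each group is windowed independently:
#
#   import pandas as pd
#   df = pd.DataFrame({
#       "date": pd.date_range("2021-01-01", periods=6, freq="D"),
#       "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
#       "country": ["FR", "FR", "FR", "US", "US", "US"],
#   })
#   result = window_aggregator.compute(df, datetime_column="date", groupby_columns=["country"])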
def _compute_bilateral_stats(self, df, datetime_column, raw_columns, df_id=''):
    if nothing_to_do(df, min_len=2):
        logger.info('The time series {} has less than 2 rows with values, cannot apply window.'.format(df_id))
        return df
    if has_duplicates(df, datetime_column):
        logger.error('The time series {} contains duplicate timestamps.'.format(df_id))
        raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))
    reference_df = df.set_index(datetime_column).sort_index().copy()
    new_df = pd.DataFrame(index=reference_df.index)
    frequency = infer_frequency(reference_df)
    if frequency:
        window_description_in_row = convert_time_freq_to_row_freq(frequency, self.params.window_description)
    else:
        logger.error('The input time series is not equispaced. Cannot compute bilateral window.')  # pandas limitation
        raise ValueError('The input time series is not equispaced. Cannot compute bilateral window.')  # pandas limitation
    # compute all stats except mean and sum; these stats don't need a win_type
    roller_without_win_type = reference_df.rolling(window=window_description_in_row, center=True)
    new_df = self._compute_stats_without_win_type(roller_without_win_type, raw_columns, new_df, reference_df)
    # compute mean and sum, the only operations on which win_type has an effect
    roller_with_win_type = reference_df.rolling(window=window_description_in_row, win_type=self.params.window_type, center=True)
    new_df = self._compute_stats_with_win_type(roller_with_win_type, raw_columns, new_df)
    return new_df.rename_axis(datetime_column).reset_index()
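# Note on the centered window above: pandas only supports `center=True` with a
# row-count window, hence the conversion from a time frequency to a number of
# rows. A minimal standalone illustration of the underlying pandas call, on
# hypothetical data:
#
#   import pandas as pd
#   s = pd.Series([1, 2, 3, 4, 5],
#                 index=pd.date_range("2021-01-01", periods=5, freq="D"))
#   s.rolling(window=3, center=True).mean()  # row 2021-01-02 averages 01-01..01-03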
def _detect_segment(self, df, datetime_column, filter_column, filter_function, df_id=''):
    if has_duplicates(df, datetime_column):
        raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))
    if nothing_to_do(df, min_len=0):
        logger.warning('The time series {} is empty, cannot compute.'.format(df_id))
        return pd.DataFrame(columns=df.columns)
    df_copy = df.copy()
    df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
    df_copy = df_copy.set_index(datetime_column).sort_index()
    segment_indexes = self._detect_time_segment(df_copy, filter_column, filter_function)
    mask_dict = {}
    if len(segment_indexes) > 0:
        for segment_index, (start, end) in enumerate(segment_indexes):
            mask = (df_copy.index >= start) & (df_copy.index <= end)
            mask_dict[segment_index] = mask
        df_labeled = df_copy.copy()
        df_labeled['interval_id'] = np.nan
        for k, v in mask_dict.items():
            df_labeled.loc[v, 'interval_id'] = str(int(k))
        segment_df = df_labeled.loc[np.logical_or.reduce(list(mask_dict.values()))].sort_index()
    else:
        segment_df = pd.DataFrame(columns=df_copy.columns)
    return segment_df.rename_axis(datetime_column).reset_index()
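# How the mask combination above behaves, in isolation: each detected (start, end)
# segment yields a boolean mask over the rows, masks are labeled with an
# `interval_id`, and rows outside every segment are dropped. The reduce step,
# on hypothetical masks:
#
#   import numpy as np
#   masks = [np.array([True, False, False]), np.array([False, False, True])]
#   np.logical_or.reduce(masks)  # -> array([ True, False,  True])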
def transform(self, df, datetime_column, groupby_columns=None):
    if groupby_columns is None:
        groupby_columns = []
    generic_check_compute_arguments(datetime_column, groupby_columns)
    df_copy = df.copy()
    # drop all rows where the timestamp is null
    df_copy = df_copy.dropna(subset=[datetime_column])
    if nothing_to_do(df_copy, min_len=2):
        logger.warning('The time series has less than 2 rows with values, cannot resample.')
        return df_copy
    df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
    # when there are multiple time series, their time ranges are not necessarily the same,
    # so we compute a unified time index for all partitions
    reference_time_index = self._compute_full_time_index(df_copy, datetime_column)
    columns_to_resample = [col for col in df_copy.select_dtypes([int, float]).columns.tolist()
                           if col != datetime_column and col not in groupby_columns]
    category_columns = [col for col in df.select_dtypes([object, bool]).columns.tolist()
                        if col != datetime_column and col not in columns_to_resample and col not in groupby_columns]
    if groupby_columns:
        grouped = df_copy.groupby(groupby_columns)
        resampled_groups = []
        identifiers_number = len(groupby_columns)
        for group_id, group in grouped:
            logger.info("Computing for group: {}".format(group_id))
            group_resampled = self._resample(group.drop(groupby_columns, axis=1), datetime_column,
                                             columns_to_resample, category_columns, reference_time_index,
                                             df_id=group_id)
            group_id = format_group_id(group_id, identifiers_number)
            group_resampled[groupby_columns] = pd.DataFrame([group_id], index=group_resampled.index)
            resampled_groups.append(group_resampled)
        df_resampled = pd.concat(resampled_groups, sort=True)
    else:
        df_resampled = self._resample(df_copy, datetime_column, columns_to_resample, category_columns, reference_time_index)
    df_resampled = df_resampled[df.columns].reset_index(drop=True)
    return df_resampled
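# Illustrative usage sketch (hypothetical `resampler` instance): numerical
# columns are interpolated onto the unified time index, while object/bool
# columns are treated as category columns for optional imputation:
#
#   import pandas as pd
#   df = pd.DataFrame({
#       "date": pd.to_datetime(["2021-01-01", "2021-01-03"]),
#       "value": [1.0, 3.0],
#       "label": ["a", "b"],
#   })
#   resampled = resampler.transform(df, datetime_column="date")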
def compute(self, df, datetime_column, threshold_dict, groupby_columns=None):
    if groupby_columns is None:
        groupby_columns = []
    generic_check_compute_arguments(datetime_column, groupby_columns)
    df_copy = df.copy()
    # drop all rows where the timestamp is null
    df_copy = df_copy.dropna(subset=[datetime_column])
    if nothing_to_do(df_copy, min_len=0):
        logger.warning('The time series is empty, cannot compute.')
        return pd.DataFrame(columns=df_copy.columns)
    lower_threshold, upper_threshold, filter_column = None, None, None
    for column, threshold_tuple in threshold_dict.items():
        filter_column = column
        lower_threshold, upper_threshold = threshold_tuple
    filter_function = self._between_min_max_mask(lower_threshold, upper_threshold)
    if groupby_columns:
        grouped = df.groupby(groupby_columns)
        filtered_groups = []
        for group_id, group in grouped:
            logger.info("Computing for group {}".format(group_id))
            filtered_df = self._detect_segment(group, datetime_column, filter_column, filter_function)
            filtered_groups.append(filtered_df)
        return pd.concat(filtered_groups, sort=True).reset_index(drop=True)
    else:
        return self._detect_segment(df, datetime_column, filter_column, filter_function)
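# Behavior note: the loop above retains only the last (column, (min, max)) entry
# of `threshold_dict`, so the filter effectively targets a single column. A call
# sketch, assuming a hypothetical `detector` instance:
#
#   intervals = detector.compute(df, datetime_column="date",
#                                threshold_dict={"value": (0, 10)})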
def _compute_causal_stats(self, df, datetime_column, raw_columns, df_id=''):
    if nothing_to_do(df, min_len=2):
        logger.info('The time series {} has less than 2 rows with values, cannot apply window.'.format(df_id))
        return df
    if has_duplicates(df, datetime_column):
        logger.error('The time series {} contains duplicate timestamps.'.format(df_id))
        raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))
    reference_df = df.set_index(datetime_column).sort_index().copy()
    new_df = pd.DataFrame(index=reference_df.index)
    # compute all stats except mean and sum; the syntax does not change whether or not we have a window type
    roller_without_window_type = reference_df.rolling(window=self.params.window_description, closed=self.params.closed_option)
    new_df = self._compute_stats_without_win_type(roller_without_window_type, raw_columns, new_df, reference_df)
    # compute mean and sum, the only operations that might need a win_type
    # when using win_type, the window must be defined in terms of rows, not time units (pandas limitation)
    compute_sum_and_mean = len(set(self.params.aggregation_types).intersection({'average', 'sum'})) > 0
    if compute_sum_and_mean and self.params.window_type:
        # row-based rolling is always bounded on both sides of the window, so we shift 1 row down when closed is 'left'
        if self.params.closed_option == 'left':
            shifted_df = reference_df.shift(1)
        else:
            shifted_df = reference_df
        frequency = infer_frequency(reference_df)
        if frequency:
            window_description_in_row = convert_time_freq_to_row_freq(frequency, self.params.window_description)
        else:
            raise ValueError('The input time series is not equispaced. Cannot apply window with time unit.')  # pandas limitation
        roller_with_window = shifted_df.rolling(window=window_description_in_row, win_type=self.params.window_type, closed=self.params.closed_option)
        new_df = self._compute_stats_with_win_type(roller_with_window, raw_columns, new_df)
    return new_df.rename_axis(datetime_column).reset_index()
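# Why the shift above: a row-count rolling window always includes the current
# row, so to emulate `closed='left'` (exclude the current observation) the
# values are shifted down one row first. A standalone pandas illustration on
# hypothetical data:
#
#   import pandas as pd
#   s = pd.Series([1.0, 2.0, 3.0, 4.0])
#   s.shift(1).rolling(window=2).sum()  # at row i, sums rows i-2 and i-1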
def compute(self, df, datetime_column, extrema_column, groupby_columns=None):
    if groupby_columns is None:
        groupby_columns = []
    generic_check_compute_arguments(datetime_column, groupby_columns)
    df_copy = df.copy()
    # drop all rows where the timestamp is null
    df_copy = df_copy.dropna(subset=[datetime_column])
    if nothing_to_do(df_copy, min_len=2):
        logger.warning('The time series has less than 2 rows with values, cannot find extrema.')
        return df_copy
    numerical_columns = df_copy.select_dtypes(include=['float', 'int']).columns.tolist()
    if extrema_column not in numerical_columns:
        raise ValueError("The chosen extrema column, {}, is not of type float or int.".format(extrema_column))
    df_copy.loc[:, datetime_column] = pd.to_datetime(df_copy[datetime_column])
    extrema_df_list = []
    identifiers_number = len(groupby_columns)
    if groupby_columns:
        grouped = df_copy.groupby(groupby_columns)
        for group_id, group in grouped:
            logger.info("Computing for group: {}".format(group_id))
            extrema_neighbor_df_list, extrema_value = self._find_extrema_neighbor_zone(group, datetime_column, extrema_column, df_id=group_id)
            group_id = format_group_id(group_id, identifiers_number)
            if len(extrema_neighbor_df_list) == 0:
                extrema_df = pd.DataFrame([group_id], columns=groupby_columns)
                extrema_df_list.append(extrema_df)
            else:
                for extrema_neighbor_df in extrema_neighbor_df_list:
                    rolling_df = self.params.window_aggregator.compute(extrema_neighbor_df, datetime_column)
                    extrema_df = rolling_df.loc[rolling_df[extrema_column] == extrema_value].copy()  # avoid .loc warning
                    extrema_df[groupby_columns] = pd.DataFrame([group_id], index=extrema_df.index)
                    extrema_df_list.append(extrema_df)
        final_df = pd.concat(extrema_df_list, sort=True)
        final_df = final_df.reset_index(drop=True)
    else:
        extrema_neighbor_df_list, extrema_value = self._find_extrema_neighbor_zone(df_copy, datetime_column, extrema_column)
        for extrema_neighbor_df in extrema_neighbor_df_list:
            rolling_df = self.params.window_aggregator.compute(extrema_neighbor_df, datetime_column)
            extrema_df = rolling_df.loc[rolling_df[extrema_column] == extrema_value].reset_index(drop=True)
            extrema_df_list.append(extrema_df)
        if len(extrema_df_list) > 0:
            final_df = pd.concat(extrema_df_list)
            final_df = final_df.reset_index(drop=True)
        else:
            final_df = pd.DataFrame(None)
    return final_df
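# Illustrative usage sketch (hypothetical `extrema_extractor` instance): find
# the extremum of `value` per group, then apply the configured window
# aggregations around the rows neighboring it:
#
#   peaks = extrema_extractor.compute(df, datetime_column="date",
#                                     extrema_column="value",
#                                     groupby_columns=["country"])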
def _resample(self, df, datetime_column, columns_to_resample, category_columns, reference_time_index, df_id=''):
    """
    1. Move the datetime column to the index.
    2. Merge the original datetime index with the full time index.
    3. Create a numerical index of the df and save the corresponding index.
    """
    if has_duplicates(df, datetime_column):
        raise ValueError('The time series {} contains duplicate timestamps.'.format(df_id))
    if nothing_to_do(df, min_len=2):
        logger.warning('The time series {} has less than 2 rows with values, cannot resample.'.format(df_id))
        return df
    # `scipy.interpolate.interp1d` does not like empty columns, so we need to filter these out first
    filtered_columns_to_resample = filter_empty_columns(df, columns_to_resample)
    if len(filtered_columns_to_resample) == 0:
        logger.warning('All numerical columns are empty for the time series {}.'.format(df_id))
        return pd.DataFrame({datetime_column: reference_time_index}, columns=[datetime_column] + columns_to_resample)
    df_resample = df.set_index(datetime_column).sort_index().copy()
    # merge the reference time index with the original one, which has data
    # cf: https://stackoverflow.com/questions/47148446/pandas-resample-interpolate-is-producing-nans
    df_resample = df_resample.reindex(df_resample.index.union(reference_time_index))
    # `scipy.interpolate.interp1d` only works with a numerical index, so we create one
    df_resample['numerical_index'] = range(len(df_resample))
    reference_index = df_resample.loc[reference_time_index, 'numerical_index']
    category_imputation_index = pd.Index([])
    df_resample = df_resample.rename_axis(datetime_column).reset_index()
    for filtered_column in filtered_columns_to_resample:
        df_without_nan = df.dropna(subset=[filtered_column], how='all')
        interpolation_index_mask = (df_resample[datetime_column] >= df_without_nan[datetime_column].min()) & \
                                   (df_resample[datetime_column] <= df_without_nan[datetime_column].max())
        interpolation_index = df_resample.index[interpolation_index_mask]
        extrapolation_index_mask = (df_resample[datetime_column] < df_without_nan[datetime_column].min()) | \
                                   (df_resample[datetime_column] > df_without_nan[datetime_column].max())
        extrapolation_index = df_resample.index[extrapolation_index_mask]
        index_with_data = df_resample.loc[interpolation_index, filtered_column].dropna(how='all').index
        if self.params.interpolation_method not in ['constant', 'none']:
            interpolation_function = interpolate.interp1d(index_with_data,
                                                          df_resample.loc[index_with_data, filtered_column],
                                                          kind=self.params.interpolation_method,
                                                          axis=0,
                                                          fill_value='extrapolate')
            df_resample.loc[interpolation_index, filtered_column] = interpolation_function(df_resample.loc[interpolation_index].index)
            if self.params.extrapolation_method == "interpolation":
                df_resample.loc[extrapolation_index, filtered_column] = interpolation_function(df_resample.loc[extrapolation_index].index)
        elif self.params.interpolation_method == 'constant':
            if self.params.extrapolation_method == 'interpolation':
                df_resample.loc[:, filtered_column] = df_resample.loc[:, filtered_column].fillna(self.params.constant_value)
            else:
                df_resample.loc[interpolation_index, filtered_column] = df_resample.loc[interpolation_index, filtered_column].fillna(self.params.constant_value)
        if self.params.extrapolation_method == "clip":
            temp_df = df_resample.copy().ffill().bfill()
            df_resample.loc[extrapolation_index, filtered_column] = temp_df.loc[extrapolation_index, filtered_column]
        elif self.params.extrapolation_method == "no_extrapolation":
            reference_index = reference_index[~reference_index.isin(extrapolation_index.values)]
        category_imputation_index = category_imputation_index.union(extrapolation_index).union(interpolation_index)
    if len(category_columns) > 0 and len(category_imputation_index) > 0 and self.params.category_imputation_method != "empty":
        df_processed = df_resample.loc[category_imputation_index]
        df_resample.loc[category_imputation_index] = self._fill_in_category_values(df_processed, category_columns)
    df_resampled = df_resample.loc[reference_index].drop('numerical_index', axis=1)
    return df_resampled
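# The reindex-union trick above, in isolation: merging the reference index into
# the original one keeps the real observations so interpolation has anchor
# points (cf the linked Stack Overflow answer). A standalone sketch on
# hypothetical data:
#
#   import pandas as pd
#   s = pd.Series([1.0, 4.0], index=pd.to_datetime(["2021-01-01", "2021-01-04"]))
#   full_index = pd.date_range("2021-01-01", "2021-01-04", freq="D")
#   s.reindex(s.index.union(full_index)).interpolate(method="linear")  # -> 1, 2, 3, 4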