def rolling_mean_by_date_by_group(data: dd = None, groupby_columns: List[str] = None, metric_columns: List[str] = None, date_column: str = None, window: int = None) -> dd:
    """
    Split the input dataframe into groups and compute a rolling mean of the
    metric columns within each group.

    The per-group work is delegated to ``rolling_mean_by_date``; this function
    only handles indexing, grouping and the ``meta`` description of the result.

    :param data: input dask dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate the rolling mean on; one
        ``<column>_rolling_mean`` output column is declared for each of them
    :param date_column: name of the date column; it is used as the index
        (``sorted=True`` is passed, so the data is presumably expected to
        already be ordered by this column)
    :param window: window size forwarded to ``rolling_mean_by_date``
    :return: dask dataframe with the index reset back into a column
    """
    indexed = data.set_index(date_column, sorted=True)

    # Describe the expected output for dask: every existing column keeps its
    # dtype and each metric gains a float32 "<metric>_rolling_mean" column.
    meta_by_name = dict(indexed.dtypes)
    meta_by_name.update(
        {f'{name}_rolling_mean': 'float32' for name in metric_columns}
    )
    meta = list(meta_by_name.items())

    def _rolling_mean_for_group(group_frame):
        return rolling_mean_by_date(
            data=group_frame,
            metric_columns=metric_columns,
            window=window,
        )

    grouped = indexed.groupby(by=groupby_columns)
    result = grouped.apply(_rolling_mean_for_group, meta=meta)

    # Move the index back into a regular column; if it surfaces under the
    # generic name 'index', give it the date column's name again.
    result = result.reset_index()
    return result.rename(columns={'index': date_column})
def fill_missing_dates(data: dd = None, date_column: str = None, fill_method: str = None, columns=None, date_range: Tuple[str] = None, fill_value=None, groupby_columns=None) -> dd:
    """
    Perform a date fill on a single group.

    The frame is reindexed onto every date between ``date_range[0]`` and
    ``date_range[1]`` (as produced by ``pd.date_range``). Group-by columns are
    filled from the nearest existing row, while the remaining columns are
    filled according to ``fill_method`` / ``fill_value``.

    :param data: dataframe for one group; it must support ``reindex`` and is
        presumably indexed by date (TODO confirm against the caller)
    :param date_column: name given to the date column in the output
    :param fill_method: ``method`` argument passed to ``reindex`` for the
        non-group columns
    :param columns: columns (and their order) to select for the returned frame
    :param date_range: pair of (start, end) bounds for the date range
    :param fill_value: ``fill_value`` argument passed to ``reindex`` for the
        non-group columns
    :param groupby_columns: columns identifying the group; these are filled
        with ``method='nearest'``
    :return: dataframe restricted to ``columns`` with the dates as a column
    """
    start, end = date_range[0], date_range[1]
    full_index = pd.date_range(start, end)

    # Everything that is not a group key is treated as a metric column.
    metric_names = [name for name in data.columns if name not in groupby_columns]
    metric_part = data[metric_names]

    # Group keys are constant within a group, so the nearest row is used.
    group_part = data[groupby_columns].reindex(full_index, method='nearest')
    metric_part = metric_part.reindex(
        full_index, method=fill_method, fill_value=fill_value
    )

    merged = dd.merge(group_part, metric_part, left_index=True, right_index=True)

    # The reindexed index is unnamed, so reset_index() yields an 'index'
    # column which is renamed to the requested date column.
    merged = merged.reset_index()
    merged = merged.rename(columns={'index': date_column})
    return merged[columns]
def yoy_percent_change_by_group(data: dd = None, groupby_columns: List[str] = None, metric_columns: List[str] = None, date_column: str = None) -> dd:
    """
    Split the dataframe into groups and calculate the year-over-year percent
    change of the metric columns within each group.

    The per-group work is delegated to ``yoy_percent_change``; this function
    only handles indexing, grouping and the ``meta`` description of the result.

    :param data: input dask dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate the year-over-year change on;
        one ``<column>_yoy_pct_change`` output column is declared for each
    :param date_column: name of the date column; it is used as the index
        (``sorted=True`` is passed, so the data is presumably expected to
        already be ordered by this column)
    :return: dask dataframe with the index reset back into a column
    """
    indexed = data.set_index(date_column, sorted=True)

    # Describe the expected output for dask: every existing column keeps its
    # dtype and each metric gains a float32 "<metric>_yoy_pct_change" column.
    meta_by_name = dict(indexed.dtypes)
    meta_by_name.update(
        {f'{name}_yoy_pct_change': 'float32' for name in metric_columns}
    )
    meta = list(meta_by_name.items())

    def _yoy_change_for_group(group_frame):
        return yoy_percent_change(data=group_frame, metric_columns=metric_columns)

    grouped = indexed.groupby(by=groupby_columns)
    result = grouped.apply(_yoy_change_for_group, meta=meta)

    # Move the index back into a regular column; if it surfaces under the
    # generic name 'index', give it the date column's name again.
    result = result.reset_index()
    return result.rename(columns={'index': date_column})