def fill_missing_dates_by_group(data: dd = None, groupby_columns: List[str] = None,
                                fill_method: str = None, date_range: Tuple[str] = None,
                                date_column: str = None, fill_value=None) -> dd:
    """
    Split the input dataframe into groups according to the groupby columns and reindex
    each group to a continuous set of dates over the specified date range. Fill missing
    values according to the fill method.

    :param data: dataframe
    :param groupby_columns: list of columns to group by
    :param fill_method: method used to fill missing data
    :param date_range: date range to reindex to
    :param date_column: name of date column
    :param fill_value: value used to fill missing data
    :return: modified dataframe
    """
    output_schema = dict(data.dtypes)
    output_schema = list(output_schema.items())
    columns = data.columns
    data = data.set_index(date_column, sorted=True)
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: fill_missing_dates(data=df_g,
                                        date_column=date_column,
                                        fill_method=fill_method,
                                        columns=columns,
                                        date_range=date_range,
                                        fill_value=fill_value,
                                        groupby_columns=groupby_columns),
        meta=output_schema).reset_index(drop=True)
    return data
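# The per-group helper fill_missing_dates is not shown in this section; the sketch below
# is a hypothetical implementation inferred from the call site, assuming each group
# arrives as a pandas DataFrame indexed by the date column and that dates are daily.
# The body and the daily frequency are assumptions, not the original implementation.
import pandas as pd


def fill_missing_dates(data: pd.DataFrame = None, date_column: str = None,
                       fill_method: str = None, columns=None,
                       date_range: Tuple[str] = None, fill_value=None,
                       groupby_columns: List[str] = None) -> pd.DataFrame:
    # reindex the group to a continuous daily index over the requested range
    full_index = pd.date_range(start=date_range[0], end=date_range[1], freq='D')
    data = data.reindex(full_index)
    # the group key columns are constant within a group, so propagate them into new rows
    data[groupby_columns] = data[groupby_columns].ffill().bfill()
    if fill_method is not None:
        data = data.fillna(method=fill_method)
    if fill_value is not None:
        data = data.fillna(fill_value)
    # restore the date column and the original column order expected by the meta schema
    data[date_column] = data.index
    return data[columns]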
def rolling_mean_by_date_by_group(data: dd = None, groupby_columns: List[str] = None,
                                  metric_columns: List[str] = None, date_column: str = None,
                                  window: int = None) -> dd:
    """
    Split the input dataframe into groups and perform a rolling average on the metric
    columns for each group.

    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :param window: window size to be used on rolling average
    :return: modified dask dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_rolling_mean'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: rolling_mean_by_date(data=df_g,
                                          metric_columns=metric_columns,
                                          window=window),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
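# rolling_mean_by_date is the per-group helper referenced above and is not part of this
# section; this is a minimal sketch of what it could look like, assuming each group is a
# pandas DataFrame indexed by date. The min_periods=1 choice is illustrative only.
def rolling_mean_by_date(data: pd.DataFrame = None, metric_columns: List[str] = None,
                         window: int = None) -> pd.DataFrame:
    data = data.sort_index()
    for metric_column in metric_columns:
        # rolling window over the ordered dates; cast to float32 to match the meta schema
        data[f'{metric_column}_rolling_mean'] = (
            data[metric_column].rolling(window=window, min_periods=1).mean()
            .astype('float32'))
    return data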
def aggr_by_year_journal(df: dask.dataframe) -> dask.dataframe:
    """Aggregate issue count by year and newspaper.

    :param dask.dataframe df: Dataframe comprising all issues.
    :return: Dataframe grouped by journal and year.
    :rtype: dask.dataframe
    """
    return df.groupby(['journal', 'year']).count()
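# Hypothetical usage of aggr_by_year_journal: 'journal' and 'year' are the column names
# the function expects; the sample values below are illustrative assumptions only.
# issues = dask.dataframe.from_pandas(
#     pd.DataFrame({'journal': ['GDL', 'GDL', 'JDG'],
#                   'year': [1900, 1900, 1901],
#                   'issue_id': ['a', 'b', 'c']}),
#     npartitions=1)
# counts_by_journal_year = aggr_by_year_journal(issues).compute()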
def date_continuity_check_by_group(data: dd = None, groupby_columns: List[str] = None,
                                   date_column: str = None) -> bool:
    """
    Split the data into groups and check whether each group contains a continuous set of
    dates in its date column. Return True if any group contains a discontinuity,
    otherwise False.

    :param data: dask dataframe
    :param groupby_columns: column names to group by
    :param date_column: date column name
    :return: boolean
    """
    output_schema = [(date_column, data[date_column].dtype)]
    output_schema.append(('date_continuity_bool', 'bool'))
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_continuity_check(data=df_g, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_continuity_bool'].compute().any()
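# date_continuity_check is the per-group helper assumed above; the sketch below infers
# its shape from the meta schema (one row per group with the date column and a
# 'date_continuity_bool' flag). A one-day expected spacing between rows is an assumption.
def date_continuity_check(data: pd.DataFrame = None, date_column: str = None) -> pd.DataFrame:
    dates = data[date_column].sort_values()
    # flag the group if any gap between consecutive dates exceeds one day
    has_gap = bool((dates.diff().dropna() > pd.Timedelta(days=1)).any())
    return pd.DataFrame({date_column: [dates.max()], 'date_continuity_bool': [has_gap]})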
def date_range_check_by_group(data: dd = None, groupby_columns: List[str] = None,
                              date_range: Tuple[str] = None, date_column: str = None) -> bool:
    """
    Split the input dataframe by group and check whether the min and max date of each
    group fall outside of the specified date range.

    :param data: dask dataframe
    :param groupby_columns: list of column names to group by
    :param date_range: tuple defining required date range
    :param date_column: name of date column
    :return: bool
    """
    output_schema = ('date_range_bool', 'bool')
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_range_check(data=df_g,
                                      date_range=date_range,
                                      date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_range_bool'].compute().any()
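# date_range_check is likewise not shown; a minimal sketch matching the call site and the
# (name, dtype) meta, reading the docstring literally: flag the group when its min or max
# date falls outside the required range. The boundary semantics are an assumption.
def date_range_check(data: pd.DataFrame = None, date_range: Tuple[str] = None,
                     date_column: str = None) -> bool:
    start, end = pd.Timestamp(date_range[0]), pd.Timestamp(date_range[1])
    return bool((data[date_column].min() < start) or (data[date_column].max() > end))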
def agg_insert_by_group(data: dd = None, groupby_columns: List[str] = None,
                        agg_dict: dict = None, insert_dict: dict = None) -> dd:
    """
    Split the input dataframe into groups, apply aggregations to each group according to
    the aggregation dict, and insert the aggregated results back into the original
    dataframe with the column values specified in the insert dict.

    :param data: input dask dataframe
    :param groupby_columns: list of column names to group by
    :param agg_dict: dictionary of the format {column name: aggregation to perform on column}
    :param insert_dict: dictionary of the format {column name: value of column to be set prior to insertion}
    :return: modified dataframe
    """
    agg_data = data.groupby(groupby_columns).agg(agg_dict).reset_index()
    agg_data.columns = agg_data.columns.droplevel(1)
    for column, value in insert_dict.items():
        agg_data[column] = value
    data = data.append(agg_data)
    return data
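# Hypothetical usage of agg_insert_by_group: append a combined roll-up row per date
# across all stores. The column names ('store', 'date', 'revenue'), the 'COMBINED'
# label, and the sales dataframe are illustrative assumptions.
# combined = agg_insert_by_group(data=sales,
#                                groupby_columns=['date'],
#                                agg_dict={'revenue': 'sum'},
#                                insert_dict={'store': 'COMBINED'})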
def yoy_percent_change_by_group(data: dd = None, groupby_columns: List[str] = None,
                                metric_columns: List[str] = None, date_column: str = None) -> dd:
    """
    Split the dataframe into groups and calculate the year over year percent change for
    the metric columns in each group.

    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate year over year percent change on
    :param date_column: name of date column
    :return: modified dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_yoy_pct_change'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: yoy_percent_change(data=df_g, metric_columns=metric_columns),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
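# yoy_percent_change is the per-group helper referenced above; this sketch assumes the
# group index holds one row per day, so the value one year back sits 365 rows earlier.
# That spacing (and the float32 cast to match the meta schema) are assumptions.
def yoy_percent_change(data: pd.DataFrame = None,
                       metric_columns: List[str] = None) -> pd.DataFrame:
    data = data.sort_index()
    for metric_column in metric_columns:
        data[f'{metric_column}_yoy_pct_change'] = (
            data[metric_column].pct_change(periods=365).astype('float32'))
    return data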