from typing import List, Tuple

import pandas as pd
import dask.dataframe as dd


def fill_missing_dates_by_group(data: dd.DataFrame = None,
                                groupby_columns: List[str] = None,
                                fill_method: str = None,
                                date_range: Tuple[str, str] = None,
                                date_column: str = None,
                                fill_value=None) -> dd.DataFrame:
    """
    split input dataframe into groups according to groupby columns and reindex with continuous dates with specified 
    date range. Fill missing values according to fill method
    :param data: dataframe
    :param groupby_columns: list of columns to groupby 
    :param fill_method: method used to fill missing data
    :param date_range: date range to reidex to
    :param date_column: name of date column
    :return: modified dataframe
    """
    output_schema = list(data.dtypes.items())
    columns = data.columns
    data = data.set_index(date_column, sorted=True)
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: fill_missing_dates(data=df_g,
                                        date_column=date_column,
                                        fill_method=fill_method,
                                        columns=columns,
                                        date_range=date_range,
                                        fill_value=fill_value,
                                        groupby_columns=groupby_columns),
        meta=output_schema).reset_index(drop=True)
    return data
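

# fill_missing_dates (used above) is not shown here; the sketch below is a
# hypothetical illustration of the per-group reindex-and-fill idea it
# presumably implements, on a plain pandas group. Column names are invented.
group = pd.DataFrame({'store': ['A', 'A'], 'sales': [10.0, 12.0]},
                     index=pd.to_datetime(['2021-01-01', '2021-01-04']))
full_index = pd.date_range('2021-01-01', '2021-01-05', freq='D')
group = group.reindex(full_index)
group['store'] = group['store'].ffill()    # carry the group key forward
group['sales'] = group['sales'].fillna(0)  # or .ffill(), per fill_method

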
def rolling_mean_by_date_by_group(data: dd.DataFrame = None,
                                  groupby_columns: List[str] = None,
                                  metric_columns: List[str] = None,
                                  date_column: str = None,
                                  window: int = None) -> dd.DataFrame:
    """
    Split input dateframe into groups and preform a rolling average on the metric columns for each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :param window: window size to be used on rolling average
    :return: modified dask dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_rolling_mean'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: rolling_mean_by_date(
            data=df_g, metric_columns=metric_columns, window=window),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
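

# rolling_mean_by_date (used above) is not shown; this hypothetical sketch
# illustrates the per-group computation it presumably performs, on a plain
# pandas group with an invented 'sales' column.
df_g = pd.DataFrame({'sales': [1.0, 2.0, 3.0, 4.0]},
                    index=pd.date_range('2021-01-01', periods=4, freq='D'))
df_g['sales_rolling_mean'] = (df_g['sales']
                              .rolling(window=2).mean().astype('float32'))

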
def aggr_by_year_journal(df: dd.DataFrame) -> dd.DataFrame:
    """Aggregate issue count by year and newspaper.

    :param dask.dataframe df: Dataframe comprising all issues.
    :return: Dataframe grouped by year and source .
    :rtype: dask.dataframe

    """

    return df.groupby(['journal', 'year']).count()
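

# Minimal usage sketch for aggr_by_year_journal; the toy data and the 'id'
# column are invented, only 'journal' and 'year' come from the function.
issues = dd.from_pandas(
    pd.DataFrame({'journal': ['GDL', 'GDL', 'JDG'],
                  'year': [1900, 1900, 1901],
                  'id': ['a', 'b', 'c']}),
    npartitions=1)
print(aggr_by_year_journal(issues).compute())  # counts per (journal, year)

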
def date_continuity_check_by_group(data: dd.DataFrame = None,
                                   groupby_columns: List[str] = None,
                                   date_column: str = None) -> bool:
    """
    Split data into groups and evaluate each group checking if it contains a set of continuous dates in its date column.
    If any group contains a discontinuity return true else return false
    :param data: dask dataframe
    :param groupby_columns: column names to groupby
    :param date_column: date column name
    :return: boolean
    """
    output_schema = [(date_column, data[date_column].dtype),
                     ('date_continuity_bool', 'bool')]
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_continuity_check(data=df_g, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_continuity_bool'].compute().any()
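

# date_continuity_check (used above) is not shown; this hypothetical sketch
# illustrates the per-group test it presumably performs: do the group's
# dates form an unbroken daily sequence?
def _has_date_gap(dates: pd.Series) -> bool:
    dates = pd.to_datetime(dates).sort_values()
    expected = pd.date_range(dates.min(), dates.max(), freq='D')
    return len(dates) != len(expected)

print(_has_date_gap(pd.Series(['2021-01-01', '2021-01-02', '2021-01-04'])))  # True

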
def date_range_check_by_group(data: dd.DataFrame = None,
                              groupby_columns: List[str] = None,
                              date_range: Tuple[str, str] = None,
                              date_column: str = None) -> bool:
    """
    Split input dataframe by group and check if the min and max date of each group falls outside of specified date range
    :param data: dask dataframe
    :param groupby_columns: list of column names to group by
    :param date_range: tuple defining required date range
    :param date_column: name of date column
    :return: bool
    """
    output_schema = ('date_range_bool', 'bool')
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_range_check(data=df_g, date_range=date_range, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_range_bool'].compute().any()
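

# date_range_check (used above) is not shown; this hypothetical sketch
# illustrates the per-group test it presumably performs: does the group's
# min or max date fall outside the required range?
def _outside_range(dates: pd.Series, date_range: Tuple[str, str]) -> bool:
    dates = pd.to_datetime(dates)
    start, end = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1])
    return bool(dates.min() < start or dates.max() > end)

print(_outside_range(pd.Series(['2021-01-05', '2021-02-01']),
                     ('2021-01-01', '2021-01-31')))  # True

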
def agg_insert_by_group(data: dd.DataFrame = None,
                        groupby_columns: List[str] = None,
                        agg_dict: dict = None,
                        insert_dict: dict = None) -> dd.DataFrame:
    """
    Split input dataframe into groups, apply aggregations on each group according to the aggregation dict, 
    insert aggregated results back into the original dataframe with column values specified in insert dict
    :param data: input dask dataframe
    :param groupby_columns: list of column names to group by
    :param agg_dict: dictionary of the format {column name: aggregation to preform to column name}
    :param insert_dict: dictionary of the format {column name: value of column to be set prior to insertion}
    :return: modified datafraeme
    """
    agg_data = data.groupby(groupby_columns).agg(agg_dict).reset_index()
    # Flatten the MultiIndex columns produced by the aggregation
    agg_data.columns = agg_data.columns.droplevel(1)
    for column, value in insert_dict.items():
        agg_data[column] = value
    data = dd.concat([data, agg_data])
    return data
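

# Minimal usage sketch for agg_insert_by_group: roll sales up across stores
# and append the totals as a synthetic 'COMBINED' store. Column names are
# invented; agg_dict values are lists so that the droplevel(1) call applies.
sales = dd.from_pandas(
    pd.DataFrame({'store': ['A', 'A', 'B'],
                  'date': ['2021-01-01', '2021-01-02', '2021-01-01'],
                  'sales': [10.0, 12.0, 7.0]}),
    npartitions=1)
combined = agg_insert_by_group(data=sales,
                               groupby_columns=['date'],
                               agg_dict={'sales': ['sum']},
                               insert_dict={'store': 'COMBINED'})
print(combined.compute())

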
def yoy_percent_change_by_group(data: dd.DataFrame = None,
                                groupby_columns: List[str] = None,
                                metric_columns: List[str] = None,
                                date_column: str = None) -> dd.DataFrame:
    """
    Split dataframe into groups and calculate year over year percent change for the etric columns in each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :return: modified dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_yoy_pct_change'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: yoy_percent_change(data=df_g,
                                        metric_columns=metric_columns),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
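

# yoy_percent_change (used above) is not shown; this hypothetical sketch
# illustrates the per-group computation it presumably performs, using a
# 365-day offset on a daily index (an assumption) and an invented column.
df_g = pd.DataFrame({'sales': range(730)},
                    index=pd.date_range('2020-01-01', periods=730, freq='D'))
df_g['sales_yoy_pct_change'] = (df_g['sales']
                                .pct_change(periods=365).astype('float32'))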