Esempio n. 1
0
def var_change_by_groups(df: pd.DataFrame,
                         var: Union[str, List[str]],
                         byvars: Union[str, List[str]],
                         datevar: str = "Date",
                         numlags: int = 1):
    """
    Add ``<name>_change`` columns holding the change of each variable versus
    its value ``numlags`` rows earlier within the same by-group.

    :Notes:

    The data is not sorted here. Lags are taken by row position within each
    group, so sort the data into the desired order before calling.

    Rows with a missing value in any of ``byvars`` are left out of the lag
    calculation; after the final left merge they carry missing changes.

    :param df: dataframe containing bygroups, a date variable, and variables of interest
    :param var: column name(s) of the variables to get changes for
    :param byvars: column name(s) identifying the by groups
    :param datevar: column name identifying periods
    :param numlags: number of rows to go back within a group to get the change
    :return: ``df`` left-merged (on ``datevar`` and ``byvars``) with one
        ``<name>_change`` column per entry of ``var``
    """
    # Work with lists of column names throughout
    var = _to_list_if_str(var)
    byvars = _to_list_if_str(byvars)
    datevar = _to_list_if_str(datevar)
    assert isinstance(var, list)
    assert isinstance(byvars, list)
    assert isinstance(datevar, list)

    # Only rows where every by-variable is present take part in the calculation
    groups_present = pd.notnull(df[byvars]).all(axis=1)
    changes = df.loc[groups_present, var + byvars + datevar]
    changes = changes.drop_duplicates()

    for name in var:
        lag_name = name + "_lag"
        changes[lag_name] = changes.groupby(byvars)[name].shift(numlags)
        changes[name + "_change"] = changes[name] - changes[lag_name]

    # Keep only the merge keys and the *_change columns
    helper_cols = list(var) + [name + "_lag" for name in var]
    changes = changes.drop(helper_cols, axis=1)

    return pd.merge(df, changes, how="left", on=datevar + byvars)
Esempio n. 2
0
def portfolio_returns(
        df: pd.DataFrame,
        retvar: str = 'RET',
        datevar: str = 'Date',
        wtvar: str = 'Market Equity',
        byvars: ListOrStr = None,
        portvar: str = 'Market Equity Portfolio/B/M Portfolio'
) -> pd.DataFrame:
    """
    Average ``retvar`` per date and portfolio (and per by-group, if given).

    :param df: data containing the return, weight, date and portfolio columns
    :param retvar: name of the return column passed to ``pd_utils.averages``
    :param datevar: name of the date column, always used as a grouping key
    :param wtvar: name of the column passed as ``wtvar`` to ``pd_utils.averages``
    :param byvars: optional additional grouping column name(s)
    :param portvar: name of the portfolio column, always used as a grouping key
    :return: the frame produced by ``pd_utils.averages`` after it has been
        passed through ``_get_weighted_averages_from_averages``
    """
    byvars = _to_list_if_str(byvars)

    # Date and portfolio always identify a cell; by-groups go in front of them
    group_cols = [datevar, portvar]
    if byvars is not None:
        group_cols = byvars + group_cols

    averaged = pd_utils.averages(
        df,
        retvar,
        byvars=group_cols,
        wtvar=wtvar,
        count=False,
    )

    # Return value is not used; presumably adjusts ``averaged`` in place
    # -- TODO confirm against the helper
    _get_weighted_averages_from_averages(averaged)

    return averaged
Esempio n. 3
0
def fillna_by_groups_and_keep_one_per_group(df,
                                            byvars,
                                            exclude_cols=None,
                                            str_vars="first",
                                            num_vars="mean"):
    """
    Run ``fillna_by_groups`` and then de-duplicate the result on ``byvars``.

    WARNING: do not use if index is important, it will be dropped

    :param df: dataframe to fill
    :param byvars: column name(s) identifying the groups
    :param exclude_cols: optional column name(s) forwarded to ``fillna_by_groups``
    :param str_vars: forwarded to ``fillna_by_groups``
    :param num_vars: forwarded to ``fillna_by_groups``
    :return: the filled dataframe after ``_drop_duplicates`` was applied to it
    """
    byvars = _to_list_if_str(byvars)
    # Only normalise exclude_cols when something was actually supplied
    exclude_cols = _to_list_if_str(exclude_cols) if exclude_cols else exclude_cols

    filled = fillna_by_groups(
        df,
        byvars,
        exclude_cols=exclude_cols,
        str_vars=str_vars,
        num_vars=num_vars,
    )

    # Return value is ignored; presumably drops duplicates in place
    # -- TODO confirm against the helper
    _drop_duplicates(filled, byvars)

    return filled
Esempio n. 4
0
def construct_minus_variables(
        df: pd.DataFrame,
        labels: DictofStrsandStrLists,
        pairing: TwoStrTuple,
        factor_model: StrOrInt = 3,
        byvars: ListOrStr = None,
        datevar='Date',
        size_var: str = None,
        value_var: str = None,
        profitability_var: str = None,
        investment_var: str = None,
        custom_low_minus_high_dict: StrBoolDict = None) -> pd.DataFrame:
    """
    Build one "minus" variable per portfolio variable in ``pairing`` and
    return them next to the date and by-group columns.

    Note that ``df`` is modified: the first minus variable is renamed in place
    on the passed frame.

    :param df: data passed to ``_construct_minus_variable`` for each element of ``pairing``
    :param labels: portfolio labels, forwarded to ``_construct_minus_variable``
    :param pairing: pair of portfolio variable names
    :param factor_model: forwarded to ``_get_low_minus_high_dict``
    :param byvars: optional grouping column name(s)
    :param datevar: name of the date column
    :param size_var: forwarded to ``_get_low_minus_high_dict``
    :param value_var: forwarded to ``_get_low_minus_high_dict``
    :param profitability_var: forwarded to ``_get_low_minus_high_dict``
    :param investment_var: forwarded to ``_get_low_minus_high_dict``
    :param custom_low_minus_high_dict: forwarded to ``_get_low_minus_high_dict``
    :return: ``df`` restricted to by-group columns, date column and minus variables
    """
    byvars = _to_list_if_str(byvars)

    direction_by_portvar = _get_low_minus_high_dict(
        factor_model=factor_model,
        size_var=size_var,
        value_var=value_var,
        profitability_var=profitability_var,
        investment_var=investment_var,
        custom_low_minus_high_dict=custom_low_minus_high_dict)

    # One minus variable per side of the pairing, in pairing order
    minus_vars = [
        _construct_minus_variable(
            df,
            labels=labels,
            portvar_index=position,
            portvar=portvar,
            byvars=byvars,
            datevar=datevar,
            low_minus_high=direction_by_portvar[portvar])
        for position, portvar in enumerate(pairing)
    ]

    # Rename first factor to show it was calculated with second factor. E.g. SMB -> SMB_HML
    renamed_first = _calculated_with_varname(minus_vars[0], minus_vars[1])
    df.rename(columns={minus_vars[0]: renamed_first}, inplace=True)
    minus_vars[0] = renamed_first

    key_cols = [datevar] if byvars is None else byvars + [datevar]

    return df[key_cols + minus_vars]
Esempio n. 5
0
def fill_excluded_rows(df, byvars, fillvars=None, **fillna_kwargs):
    """
    Expand a dataframe so that it has a row for every combination of the
    (non-missing) unique values of ``byvars``, optionally filling the gaps.

    The result is built from the cartesian product of the unique byvars values
    and left-merged with ``df``, so newly created rows hold missing values in
    all non-key columns. If ``fillna_kwargs`` are passed, ``fillna`` is called
    with them on ``fillvars``.

    :param df: dataframe which does not contain all combinations of byvars as rows
    :param byvars: variables on which dataset should be expanded to product. Can pass a str, list of
            strs, or a list of pd.Series.
    :param fillvars: optional variables to apply fillna to. When omitted and
            fillna_kwargs are given, every column other than byvars is filled.
    :param fillna_kwargs: See pandas.DataFrame.fillna for kwargs, value=0 is common.
            ``inplace`` is always forced to False.
    :return: the expanded (and optionally filled) dataframe

    :Example:

    An example::

        df:
                     date     id  var
            0  2003-06-09 42223C    1
            1  2003-06-10 09255G    2

        with fill_excluded_rows(df, byvars=['date','id'], fillvars='var', value=0) becomes
        (rows follow the cartesian product order of the unique values):

                      date     id  var
            0  2003-06-09 42223C  1.0
            1  2003-06-09 09255G  0.0
            2  2003-06-10 42223C  0.0
            3  2003-06-10 09255G  2.0
    """
    byvars, fillvars = [_to_list_if_str(v)
                        for v in [byvars, fillvars]]  # convert to lists

    # Unique non-missing values per key; entries may be column names or Series
    multiindex = [_to_series_if_str(df, i).dropna().unique() for i in byvars]
    byvars = [_to_name_if_series(i) for i in byvars]  # get name of any series

    # Frame holding every combination of key values, one per row
    all_df = pd.DataFrame(
        index=pd.MultiIndex.from_product(multiindex)).reset_index()
    all_df.columns = byvars
    merged = all_df.merge(df, how="left", on=byvars)

    if fillna_kwargs:
        if fillvars is None:
            # No explicit selection: fill everything that is not a key column
            fillvars = [col for col in merged.columns if col not in byvars]
        if fillvars:
            # The filled values are assigned back, so fillna must return them
            fillna_kwargs["inplace"] = False
            merged[fillvars] = merged[fillvars].fillna(**fillna_kwargs)
    return merged
Esempio n. 6
0
def _create_df_for_port_sort_byvars_result_byvars_port_datevar(df: pd.DataFrame, groupvar: str='Market Equity',
                                                               byvars: ListOrStr='Year', portvar: str='portfolio',
                                                               id_var: str='Ticker',
                                                               date_var: str='Date') -> DfListListStrTuple:
    """
    Prepare the per-entity frame used for portfolio sorting, together with the
    column-name lists derived along the way.

    :param df: source data containing ``groupvar``, ``byvars``, ``id_var`` and ``date_var``
    :param groupvar: column whose first value per group is kept
    :param byvars: grouping column name(s)
    :param portvar: portfolio name, used to build the formation-date column name
    :param id_var: entity identifier column
    :param date_var: date column; renamed to ``'<portvar> Formation Date'`` in the result
    :return: tuple of (frame with one row per byvars/id_var group, byvars as a
        list, byvars plus id_var, name of the formation-date column)
    """
    byvars = _to_list_if_str(byvars)
    result_byvars = byvars + [id_var]

    # Beginning-of-period values for each company: order by id and date, then
    # take the first entry of each column within every byvars/id group
    ordered = df.sort_values([id_var, date_var])
    per_group = ordered.groupby(result_byvars, as_index=False)
    first_values = per_group[[groupvar, date_var]].first()

    # The date of that first observation becomes the formation date
    port_datevar = portvar + ' Formation Date'
    assert port_datevar not in df.columns  # don't overwrite existing
    df_for_port_sort = first_values.rename(columns={date_var: port_datevar})

    return df_for_port_sort, byvars, result_byvars, port_datevar
Esempio n. 7
0
def long_averages_to_wide_averages(
    df: pd.DataFrame,
    datevar: str = 'Date',
    byvars: ListOrStr = None,
    retvar: str = 'RET',
    dual_portvar: str = 'Market Equity Portfolio/B/M Portfolio'
) -> pd.DataFrame:
    """
    Reshape long-format averages to wide format via ``pd_utils.long_to_wide``.

    :param df: long-format data
    :param datevar: name of the date column, always part of the group variables
    :param byvars: optional additional grouping column name(s)
    :param retvar: column passed as ``values``
    :param dual_portvar: column passed as ``colindex``
    :return: whatever ``pd_utils.long_to_wide`` returns for these arguments
    """
    byvars = _to_list_if_str(byvars)

    # By-groups (when given) precede the date in the grouping key
    group_cols = [datevar] if byvars is None else byvars + [datevar]

    wide = pd_utils.long_to_wide(
        df,
        groupvars=group_cols,
        values=retvar,
        colindex=dual_portvar,
        colindex_only=True,
    )
    return wide
Esempio n. 8
0
def fillna_by_groups(df,
                     byvars,
                     exclude_cols=None,
                     str_vars="first",
                     num_vars="mean"):
    """
    Fills missing values by group, with different handling for string variables versus numeric

    WARNING: do not use if index is important, it will be dropped

    Columns named in ``exclude_cols`` are neither filled nor included in the
    returned dataframe; only ``byvars`` and the filled columns are returned.

    :param df: dataframe to fill
    :param byvars: column name(s) identifying the groups
    :param exclude_cols: optional column name(s) to leave out; a single name
        may be passed as a str
    :param str_vars: forwarded to ``_fill_data_for_series``
    :param num_vars: forwarded to ``_fill_data_for_series``
    :return: dataframe of byvars and filled columns with a fresh integer index
    """
    byvars = _to_list_if_str(byvars)
    # Normalise like byvars: with a bare string, ``col not in exclude_cols``
    # would be a substring test instead of a column-name comparison
    excluded = _to_list_if_str(exclude_cols) if exclude_cols else []

    cols_to_fill = [
        col for col in df.columns
        if (col not in excluded) and (col not in byvars)
    ]

    _fill_data = partial(_fill_data_for_series,
                         str_vars=str_vars,
                         num_vars=num_vars)

    # Fill column by column within each group, then stack the groups again
    out_dfs = [
        group_df.apply(_fill_data, axis=0)
        for _, group_df in df[byvars + cols_to_fill].groupby(byvars,
                                                             as_index=False)
    ]

    filled = pd.concat(out_dfs, axis=0).reset_index(drop=True)

    filled = _restore_nans_after_fill(
        filled
    )  # _fill_data places -999.999 in place of nans, now convert back

    return filled
Esempio n. 9
0
def market_returns(df: pd.DataFrame,
                   retvar: str = 'RET',
                   datevar: str = 'Date',
                   wtvar: str = 'Market Equity',
                   byvars: ListOrStr = None) -> pd.DataFrame:
    """
    Average ``retvar`` per date (and per by-group, if given) and expose the
    result as a column named ``'MKT'``.

    :param df: data containing the return, weight, date and by-group columns
    :param retvar: name of the return column; renamed to ``'MKT'`` in the output
    :param datevar: name of the date column, always used as a grouping key
    :param wtvar: name of the column passed as ``wtvar`` to ``pd_utils.averages``
    :param byvars: optional additional grouping column name(s)
    :return: the frame produced by ``pd_utils.averages`` with ``retvar`` renamed
    """
    byvars = _to_list_if_str(byvars)

    group_cols = [datevar] if byvars is None else byvars + [datevar]

    # Only the grouping, return and weight columns are handed to averages
    needed = df[group_cols + [retvar, wtvar]]
    averaged = pd_utils.averages(
        needed,
        retvar,
        byvars=group_cols,
        wtvar=wtvar,
        count=False,
    )

    # Return value is not used; presumably adjusts ``averaged`` in place
    # -- TODO confirm against the helper
    _get_weighted_averages_from_averages(averaged)
    averaged.rename(columns={retvar: 'MKT'}, inplace=True)

    return averaged
Esempio n. 10
0
def create_ff_factors(
        df: pd.DataFrame,
        factor_model: StrOrInt,
        id_var: str = 'PERMNO',
        datevar='Date',
        byvars: ListOrStr = None,
        retvar: str = 'RET',
        wtvar: str = 'Market Equity',
        size_var: str = None,
        value_var: str = None,
        profitability_var: str = None,
        investment_var: str = None,
        custom_labels: DictofStrsandStrLists = None,
        custom_groupvar_ngroups_dict: GroupvarNgroupsDict = None,
        custom_pairings: TwoStrTupleList = None,
        custom_low_minus_high_dict: StrBoolDict = None) -> pd.DataFrame:
    """
    Create factor returns for the requested factor model.

    The steps are: validate and standardise the arguments, assign portfolios,
    label them, create the dual-sort pairings, compute the minus variables for
    every pairing, combine the main portfolios, and finally merge in the
    market return from ``market_returns``.

    :param df: input data, passed to ``create_ff_portfolios``
    :param factor_model: factor model identifier, validated by ``_validate_model``
    :param id_var: entity identifier column
    :param datevar: date column
    :param byvars: optional grouping column name(s)
    :param retvar: return column
    :param wtvar: weight column
    :param size_var: optional size variable name
    :param value_var: optional value variable name
    :param profitability_var: optional profitability variable name
    :param investment_var: optional investment variable name
    :param custom_labels: optional custom portfolio labels
    :param custom_groupvar_ngroups_dict: optional custom group settings,
        forwarded to ``create_ff_portfolios``
    :param custom_pairings: optional custom dual-sort pairings
    :param custom_low_minus_high_dict: optional custom direction settings for
        the minus variables
    :return: one row per unique date/by-group combination with the factor
        columns and the market return merged on
    """
    # ---- Argument preparation ----

    _validate_model(factor_model,
                    custom_labels=custom_labels,
                    custom_groupvar_ngroups_dict=custom_groupvar_ngroups_dict,
                    custom_pairings=custom_pairings,
                    custom_low_minus_high_dict=custom_low_minus_high_dict)

    (custom_labels,
     custom_groupvar_ngroups_dict,
     custom_pairings,
     custom_low_minus_high_dict) = _standardize_custom_args(
        custom_labels=custom_labels,
        custom_groupvar_ngroups_dict=custom_groupvar_ngroups_dict,
        custom_pairings=custom_pairings,
        custom_low_minus_high_dict=custom_low_minus_high_dict)

    byvars = _to_list_if_str(byvars)

    groupvar_names = {
        'size_var': size_var,
        'value_var': value_var,
        'profitability_var': profitability_var,
        'investment_var': investment_var,
    }
    # Convert to portfolio names; unset (falsy) entries are passed through as-is
    portfolio_names = {}
    for key, value in groupvar_names.items():
        portfolio_names[key] = _other_groupvar_portname(value) if value else value

    # ---- Main logic ----

    # Assigns portfolios, leaves data in the original shape
    portfolios_df = create_ff_portfolios(
        df,
        factor_model=factor_model,
        byvars=byvars,
        id_var=id_var,
        date_var=datevar,
        custom_groupvar_ngroups_dict=custom_groupvar_ngroups_dict,
        **groupvar_names)

    # Replace numbered portfolios with labeled portfolios
    labels = get_and_set_labels(portfolios_df,
                                factor_model=factor_model,
                                custom_labels=custom_labels,
                                **portfolio_names)

    # Get necessary dual sort pairings, create dual sort portfolio variables
    pairings = create_dual_sort_variables_get_pairings(
        portfolios_df,
        factor_model=factor_model,
        custom_pairings=custom_pairings,
        **portfolio_names)

    base_vars = [datevar] if byvars is None else byvars + [datevar]

    # Fama-French portfolio difference procedure. Reduces down to size of time/byvars.
    # With the 3 factor model, this is the last calculation step, as the SMB factor
    # calculated using value portfolios is the entire SMB factor. Just need to rename after.
    factors_df = portfolios_df[base_vars].drop_duplicates()
    for pairing in pairings:
        pairing_df = construct_averges_and_minus_variables_for_pairing(
            df=portfolios_df,
            pairing=pairing,
            labels=labels,
            factor_model=factor_model,
            datevar=datevar,
            byvars=byvars,
            retvar=retvar,
            wtvar=wtvar,
            custom_low_minus_high_dict=custom_low_minus_high_dict,
            **portfolio_names)
        factors_df = factors_df.merge(pairing_df, how='left', on=base_vars)

    # 3 factor model: just rename SMB_HML to SMB.
    # 5 factor model: SMB is calculated separately with value ports, profitability
    # ports, and investment ports, then the three are averaged to get the final SMB factor.
    combine_main_portfolios(
        df=factors_df,
        labels=labels,
        pairings=pairings,
        factor_model=factor_model,
        custom_low_minus_high_dict=custom_low_minus_high_dict,
        **portfolio_names)

    # Add market returns
    market_df: pd.DataFrame = market_returns(df=portfolios_df,
                                             retvar=retvar,
                                             datevar=datevar,
                                             wtvar=wtvar,
                                             byvars=byvars)
    factors_df = factors_df.merge(market_df, how='left', on=base_vars)

    return factors_df