def var_change_by_groups(
    df: pd.DataFrame,
    var: Union[str, List[str]],
    byvars: Union[str, List[str]],
    datevar: str = "Date",
    numlags: int = 1,
):
    """
    Add ``<var>_change`` columns holding each variable's change versus its
    value ``numlags`` rows earlier within the same by-group.

    :Notes:
        The data is not sorted here. Lags are taken in the existing row order,
        so sort ``df`` the way the lags should be built before calling.
        Rows with a missing value in any of ``byvars`` are left out of the
        change calculation and therefore get missing ``_change`` values after
        the final left merge.

    :param df: dataframe containing by-groups, a date variable, and the variables of interest
    :param var: column name(s) of the variables to compute changes for
    :param byvars: column name(s) identifying the by-groups
    :param datevar: column name(s) identifying periods
    :param numlags: number of rows to shift back within each group
    :return: ``df`` left-merged (on ``datevar + byvars``) with one
        ``<var>_change`` column per entry of ``var``
    """
    # Normalise every column argument to a list of names.
    var, byvars, datevar = (_to_list_if_str(item) for item in (var, byvars, datevar))
    assert isinstance(var, list)
    assert isinstance(byvars, list)
    assert isinstance(datevar, list)

    # Work on a de-duplicated slice that only keeps rows with complete group keys.
    has_all_group_keys = ~pd.isnull(df[byvars]).any(axis=1)
    changes = df.loc[has_all_group_keys, var + byvars + datevar].drop_duplicates()

    for name in var:
        lag_col = name + "_lag"
        changes[lag_col] = changes.groupby(byvars)[name].shift(numlags)
        changes[name + "_change"] = changes[name] - changes[lag_col]

    # Only the *_change columns (plus merge keys) should travel back to df.
    helper_cols = list(var) + [name + "_lag" for name in var]
    changes = changes.drop(helper_cols, axis=1)

    return df.merge(changes, on=datevar + byvars, how="left")
def portfolio_returns(
        df: pd.DataFrame,
        retvar: str = 'RET',
        datevar: str = 'Date',
        wtvar: str = 'Market Equity',
        byvars: ListOrStr = None,
        portvar: str = 'Market Equity Portfolio/B/M Portfolio'
) -> pd.DataFrame:
    """
    Average ``retvar`` within each date/portfolio cell (and optional by-groups).

    :param df: dataframe holding the return, weight, date and portfolio columns
    :param retvar: name of the return column being averaged
    :param datevar: name of the date column
    :param wtvar: name of the weight column handed to ``pd_utils.averages``
    :param byvars: optional extra grouping column name(s)
    :param portvar: name of the portfolio column
    :return: the frame produced by ``pd_utils.averages`` after it has been
        passed through ``_get_weighted_averages_from_averages``
    """
    byvars = _to_list_if_str(byvars)

    cell_keys = [datevar, portvar]
    group_keys = cell_keys if byvars is None else byvars + cell_keys

    averaged = pd_utils.averages(df, retvar, byvars=group_keys, wtvar=wtvar, count=False)
    # Return value is discarded, so this helper presumably edits the frame in place.
    _get_weighted_averages_from_averages(averaged)

    return averaged
def fillna_by_groups_and_keep_one_per_group(df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"):
    """
    Fill missing values within each by-group (strings and numerics are
    handled differently, see ``fillna_by_groups``), then reduce the result
    to one observation per group via ``_drop_duplicates``.

    WARNING: do not use if index is important, it will be dropped

    :param df: dataframe to fill
    :param byvars: column name(s) identifying the groups
    :param exclude_cols: optional column name(s) that should not be filled
    :param str_vars: fill rule for string columns, forwarded to ``fillna_by_groups``
    :param num_vars: fill rule for numeric columns, forwarded to ``fillna_by_groups``
    :return: the filled dataframe after de-duplication on ``byvars``
    """
    byvars = _to_list_if_str(byvars)
    exclude_cols = _to_list_if_str(exclude_cols) if exclude_cols else exclude_cols

    fill_options = dict(exclude_cols=exclude_cols, str_vars=str_vars, num_vars=num_vars)
    filled = fillna_by_groups(df, byvars, **fill_options)

    # Return value is ignored: _drop_duplicates is presumably an in-place operation.
    _drop_duplicates(filled, byvars)

    return filled
def construct_minus_variables(
        df: pd.DataFrame,
        labels: DictofStrsandStrLists,
        pairing: TwoStrTuple,
        factor_model: StrOrInt = 3,
        byvars: ListOrStr = None,
        datevar='Date',
        size_var: str = None,
        value_var: str = None,
        profitability_var: str = None,
        investment_var: str = None,
        custom_low_minus_high_dict: StrBoolDict = None) -> pd.DataFrame:
    """
    Build the "minus" variable for each portfolio variable in ``pairing`` and
    return them next to the date (and optional by-group) columns.

    NOTE: ``df`` is modified: the first minus variable is renamed in place so
    that its name records the second variable it was calculated with
    (e.g. SMB -> SMB_HML).

    :param df: dataframe the minus variables are constructed on
    :param labels: portfolio labels, forwarded to ``_construct_minus_variable``
    :param pairing: the two portfolio variables being processed together
    :param factor_model: factor model identifier used to pick the low-minus-high settings
    :param byvars: optional grouping column name(s)
    :param datevar: name of the date column
    :param size_var: forwarded to ``_get_low_minus_high_dict``
    :param value_var: forwarded to ``_get_low_minus_high_dict``
    :param profitability_var: forwarded to ``_get_low_minus_high_dict``
    :param investment_var: forwarded to ``_get_low_minus_high_dict``
    :param custom_low_minus_high_dict: forwarded to ``_get_low_minus_high_dict``
    :return: ``df`` restricted to by-group columns (if any), the date column
        and the minus variables
    """
    byvars = _to_list_if_str(byvars)

    direction_by_portvar = _get_low_minus_high_dict(
        factor_model=factor_model,
        size_var=size_var,
        value_var=value_var,
        profitability_var=profitability_var,
        investment_var=investment_var,
        custom_low_minus_high_dict=custom_low_minus_high_dict,
    )

    # One minus variable per member of the pairing, in pairing order.
    minus_vars = [
        _construct_minus_variable(
            df,
            labels=labels,
            portvar_index=position,
            portvar=name,
            byvars=byvars,
            datevar=datevar,
            low_minus_high=direction_by_portvar[name],
        )
        for position, name in enumerate(pairing)
    ]

    # Rename first factor to show it was calculated with second factor. E.g. SMB -> SMB_HML
    first_var, second_var = minus_vars[0], minus_vars[1]
    renamed_first = _calculated_with_varname(first_var, second_var)
    df.rename(columns={first_var: renamed_first}, inplace=True)
    minus_vars = [renamed_first] + minus_vars[1:]

    key_cols = [datevar] if byvars is None else byvars + [datevar]
    return df[key_cols + minus_vars]
def fill_excluded_rows(df, byvars, fillvars=None, **fillna_kwargs):
    """
    Expand a dataframe so that every combination of the ``byvars`` values
    appears as a row.

    Takes a dataframe which does not contain all possible combinations of
    byvars as rows and creates those rows. If fillna_kwargs are passed, calls
    fillna using fillna_kwargs on fillvars (or, when fillvars is not given,
    on every column that is not one of the byvars).

    :param df: dataframe to expand
    :param byvars: variables on which dataset should be expanded to product.
        Can pass a str, list of strs, or a list of pd.Series.
    :param fillvars: optional variables to apply fillna to. When omitted and
        fillna_kwargs are passed, all non-byvars columns are filled.
    :param fillna_kwargs: See pandas.DataFrame.fillna for kwargs, value=0 is
        common. ``inplace`` is always forced to False.
    :return: the expanded (and optionally filled) dataframe

    :Example:

        An example::

            df:
                     date     id  var
            0  2003-06-09 42223C    1
            1  2003-06-10 09255G    2

            with fill_excluded_rows(df, byvars=['date','id'], fillvars='var', value=0)

            becomes:
                     date     id  var
            0  2003-06-09 42223C    1
            1  2003-06-09 09255G    0
            2  2003-06-10 42223C    0
            3  2003-06-10 09255G    2
    """
    # Remember whether the caller named fill columns before any normalisation.
    fill_all_columns = fillvars is None
    byvars, fillvars = [_to_list_if_str(v) for v in [byvars, fillvars]]  # convert to lists

    # Unique non-missing values of every by-variable; their product is the full grid.
    multiindex = [_to_series_if_str(df, i).dropna().unique() for i in byvars]
    byvars = [_to_name_if_series(i) for i in byvars]  # get name of any series

    all_df = pd.DataFrame(index=pd.MultiIndex.from_product(multiindex)).reset_index()
    all_df.columns = byvars

    merged = all_df.merge(df, how="left", on=byvars)

    if fillna_kwargs:
        # The result is assigned back below, so fillna must return a new object.
        fillna_kwargs.update({"inplace": False})
        if fill_all_columns:
            # Previously this path indexed the frame with None and raised KeyError.
            fillvars = [col for col in merged.columns if col not in byvars]
        if fillvars:
            merged[fillvars] = merged[fillvars].fillna(**fillna_kwargs)

    return merged
def _create_df_for_port_sort_byvars_result_byvars_port_datevar(df: pd.DataFrame, groupvar: str='Market Equity',
                                                               byvars: ListOrStr='Year', portvar: str='portfolio',
                                                               id_var: str='Ticker',
                                                               date_var: str='Date') -> DfListListStrTuple:
    """
    Prepare the inputs for a portfolio sort: one row per ``byvars``/``id_var``
    combination carrying the first ``groupvar`` value and its date.

    :param df: source dataframe
    :param groupvar: column whose first value per group is kept
    :param byvars: grouping column name(s) (e.g. the period)
    :param portvar: portfolio name, used to build the formation date column name
    :param id_var: column identifying the entity
    :param date_var: date column; renamed to ``portvar + ' Formation Date'`` in the result
    :return: tuple of (sort dataframe, byvars as list, byvars plus id_var,
        name of the formation date column)
    """
    byvars = _to_list_if_str(byvars)
    result_byvars = byvars + [id_var]

    # create dataframe which has only the beginning of period values for each company
    chronological = df.sort_values([id_var, date_var])
    per_entity = chronological.groupby(result_byvars, as_index=False)
    df_for_port_sort = per_entity[[groupvar, date_var]].first()

    # The date column becomes the portfolio formation date.
    port_datevar = portvar + ' Formation Date'
    assert port_datevar not in df.columns  # don't overwrite existing
    df_for_port_sort.rename(columns={date_var: port_datevar}, inplace=True)

    return df_for_port_sort, byvars, result_byvars, port_datevar
def long_averages_to_wide_averages(
        df: pd.DataFrame,
        datevar: str = 'Date',
        byvars: ListOrStr = None,
        retvar: str = 'RET',
        dual_portvar: str = 'Market Equity Portfolio/B/M Portfolio'
) -> pd.DataFrame:
    """
    Reshape long-format averages to wide format with ``pd_utils.long_to_wide``.

    :param df: long-format dataframe of averages
    :param datevar: name of the date column
    :param byvars: optional extra grouping column name(s)
    :param retvar: column holding the values to spread
    :param dual_portvar: column passed as ``colindex`` for the reshape
    :return: result of ``pd_utils.long_to_wide`` grouped by the by-group
        columns (if any) plus the date column
    """
    byvars = _to_list_if_str(byvars)
    row_keys = [datevar] if byvars is None else byvars + [datevar]

    return pd_utils.long_to_wide(
        df,
        groupvars=row_keys,
        values=retvar,
        colindex=dual_portvar,
        colindex_only=True,
    )
def fillna_by_groups(df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"):
    """
    Fills missing values by group, with different handling for string
    variables versus numeric.

    WARNING: do not use if index is important, it will be dropped

    NOTE: the result only contains ``byvars`` and the filled columns; columns
    named in ``exclude_cols`` are not part of the output.

    :param df: dataframe to fill
    :param byvars: column name(s) identifying the groups
    :param exclude_cols: optional column name(s) to leave out of the fill.
        A single name may be given as a str.
    :param str_vars: fill rule for string columns, forwarded to ``_fill_data_for_series``
    :param num_vars: fill rule for numeric columns, forwarded to ``_fill_data_for_series``
    :return: dataframe with a fresh integer index holding ``byvars`` and the
        filled columns
    """
    byvars = _to_list_if_str(byvars)
    # Normalise so a bare string is matched as one column name instead of
    # being substring-tested against every column.
    excluded = _to_list_if_str(exclude_cols) if exclude_cols else []

    cols_to_fill = [
        col for col in df.columns
        if col not in excluded and col not in byvars
    ]

    _fill_data = partial(_fill_data_for_series, str_vars=str_vars, num_vars=num_vars)

    selected = df[byvars + cols_to_fill]
    out_dfs = [
        group_df.apply(_fill_data, axis=0)
        for _, group_df in selected.groupby(byvars, as_index=False)
    ]

    if not out_dfs:
        # No groups (e.g. empty input): pd.concat would raise on an empty list.
        return selected.iloc[:0].reset_index(drop=True)

    filled = pd.concat(out_dfs, axis=0).reset_index(drop=True)

    # _fill_data places -999.999 in place of nans, now convert back
    filled = _restore_nans_after_fill(filled)

    return filled
def market_returns(df: pd.DataFrame, retvar: str = 'RET', datevar: str = 'Date', wtvar: str = 'Market Equity',
                   byvars: ListOrStr = None) -> pd.DataFrame:
    """
    Average ``retvar`` per date (and optional by-groups) and expose it as ``MKT``.

    :param df: dataframe holding the return, weight and date columns
    :param retvar: name of the return column being averaged
    :param datevar: name of the date column
    :param wtvar: name of the weight column handed to ``pd_utils.averages``
    :param byvars: optional extra grouping column name(s)
    :return: the averages frame with the ``retvar`` column renamed to ``'MKT'``
    """
    byvars = _to_list_if_str(byvars)
    group_keys = [datevar] if byvars is None else byvars + [datevar]

    # Only the grouping keys, the return and the weight are needed.
    needed = df[group_keys + [retvar, wtvar]]
    avgs = pd_utils.averages(needed, retvar, byvars=group_keys, wtvar=wtvar, count=False)

    # Return value is discarded, so this helper presumably edits the frame in place.
    _get_weighted_averages_from_averages(avgs)

    avgs.rename(columns={retvar: 'MKT'}, inplace=True)
    return avgs
def create_ff_factors(
        df: pd.DataFrame,
        factor_model: StrOrInt,
        id_var: str = 'PERMNO',
        datevar='Date',
        byvars: ListOrStr = None,
        retvar: str = 'RET',
        wtvar: str = 'Market Equity',
        size_var: str = None,
        value_var: str = None,
        profitability_var: str = None,
        investment_var: str = None,
        custom_labels: DictofStrsandStrLists = None,
        custom_groupvar_ngroups_dict: GroupvarNgroupsDict = None,
        custom_pairings: TwoStrTupleList = None,
        custom_low_minus_high_dict: StrBoolDict = None) -> pd.DataFrame:
    """
    Build a dataframe of factor returns, one row per date (and by-group).

    Steps, in order: validate and standardise the custom arguments, assign
    portfolios, label them, create the dual-sort variables, compute the minus
    variables for every pairing, combine the main portfolios, and finally
    merge in the market return (``MKT``).

    :param df: source dataframe
    :param factor_model: factor model identifier
    :param id_var: column identifying the entity
    :param datevar: name of the date column
    :param byvars: optional grouping column name(s)
    :param retvar: name of the return column
    :param wtvar: name of the weight column
    :param size_var: optional size variable name
    :param value_var: optional value variable name
    :param profitability_var: optional profitability variable name
    :param investment_var: optional investment variable name
    :param custom_labels: optional custom portfolio labels
    :param custom_groupvar_ngroups_dict: optional custom number of groups per group variable
    :param custom_pairings: optional custom dual-sort pairings
    :param custom_low_minus_high_dict: optional custom low-minus-high settings
    :return: dataframe keyed by the by-group columns (if any) and the date
        column, with the merged factor columns and ``MKT``
    """
    # ---------- Argument preparation ----------
    custom_kwargs = dict(
        custom_labels=custom_labels,
        custom_groupvar_ngroups_dict=custom_groupvar_ngroups_dict,
        custom_pairings=custom_pairings,
        custom_low_minus_high_dict=custom_low_minus_high_dict,
    )
    _validate_model(factor_model, **custom_kwargs)
    (
        custom_labels,
        custom_groupvar_ngroups_dict,
        custom_pairings,
        custom_low_minus_high_dict,
    ) = _standardize_custom_args(**custom_kwargs)

    byvars = _to_list_if_str(byvars)

    sort_varnames = {
        'size_var': size_var,
        'value_var': value_var,
        'profitability_var': profitability_var,
        'investment_var': investment_var,
    }

    # Convert to portfolio names; unset (falsy) entries are passed through untouched.
    port_varnames = {}
    for arg_name, varname in sort_varnames.items():
        port_varnames[arg_name] = _other_groupvar_portname(varname) if varname else varname

    # ---------- Main logic ----------
    # Assigns portfolios, leaves data in the original shape
    ff_portfolios = create_ff_portfolios(
        df,
        factor_model=factor_model,
        byvars=byvars,
        id_var=id_var,
        date_var=datevar,
        custom_groupvar_ngroups_dict=custom_groupvar_ngroups_dict,
        **sort_varnames
    )

    # Replace numbered portfolios with labeled portfolios
    labels = get_and_set_labels(
        ff_portfolios,
        factor_model=factor_model,
        custom_labels=custom_labels,
        **port_varnames
    )

    # Get necessary dual sort pairings, create dual sort portfolio variables
    pairings = create_dual_sort_variables_get_pairings(
        ff_portfolios,
        factor_model=factor_model,
        custom_pairings=custom_pairings,
        **port_varnames
    )

    key_cols = [datevar] if byvars is None else byvars + [datevar]

    # Fama-French portfolio difference procedure. Reduces down to size of time/byvars.
    # With 3 factor model, this is the last calculation step, as the SMB factor
    # calculated using value portfolios is the entire SMB factor. Just need to rename after.
    factors = ff_portfolios.loc[:, key_cols].drop_duplicates()
    for pairing in pairings:
        pairing_factors = construct_averges_and_minus_variables_for_pairing(
            df=ff_portfolios,
            pairing=pairing,
            labels=labels,
            factor_model=factor_model,
            datevar=datevar,
            byvars=byvars,
            retvar=retvar,
            wtvar=wtvar,
            custom_low_minus_high_dict=custom_low_minus_high_dict,
            **port_varnames
        )
        factors = factors.merge(pairing_factors, how='left', on=key_cols)

    # 3 factor model, just rename SMB_HML to SMB.
    # For 5 factor model, SMB is calculated separately with value ports, profitability
    # ports, and investment ports, then the three are averaged to get the final SMB factor.
    # The return value is not used, so the frame is presumably updated in place.
    combine_main_portfolios(
        df=factors,
        labels=labels,
        pairings=pairings,
        factor_model=factor_model,
        custom_low_minus_high_dict=custom_low_minus_high_dict,
        **port_varnames
    )

    # Add market returns
    market = market_returns(df=ff_portfolios, retvar=retvar, datevar=datevar, wtvar=wtvar, byvars=byvars)
    factors = factors.merge(market, how='left', on=key_cols)

    return factors