def load_compustat(freq,
                   get=None,
                   debug=False,
                   comp_dir=r'C:\Users\derobertisna.UFAD\Desktop\Data\Compustat'):
    """
    Load a Compustat SAS dataset at the given frequency and return it as a
    trimmed pandas DataFrame with dates converted.

    Required inputs:
    freq: str, data frequency; validated/normalized by check_freq

    Optional inputs:
    get: str list, Compustat variables to keep (default ['sale'])
    debug: bool, if True load the debug version of the dataset
    comp_dir: str, directory containing the Compustat SAS files

    Returns:
    pandas DataFrame with gvkey loaded as str and dates converted by
    convert_date_compustat.
    """
    # Avoid the mutable-default-argument pitfall: a shared list default could
    # be mutated across calls by downstream code. Use None as the sentinel.
    if get is None:
        get = ['sale']
    freq = check_freq(freq)
    name = freq_to_name(freq, debug)
    path = os.path.join(comp_dir, name)
    # gvkey must stay a string to preserve leading zeros
    comp = load_sas(path, dtype={'gvkey': str})
    comp = keep_relevant_data_compustat(comp, get=get, freq=freq)
    convert_date_compustat(comp)
    return comp
def _load_data_by_extension_and_convert_date(filepath, freq='m'): filename, file_extension = os.path.splitext(filepath) extension = file_extension.lower() if extension == '.sas7bdat': df = load_sas(filepath) if freq != 'm': df['date'] = convert_sas_date_to_pandas_date( df['date']) #convert to date object return df elif extension == '.csv': if freq != 'm': return pd.read_csv(filepath, parse_dates=['date']) else: return pd.read_csv(filepath) else: raise ValueError( f'Please pass a sas7bdat or csv for FF factors, got {extension}')
def __load_crsp(self):
    """
    Load the CRSP stock file matching self.freq into self.crsp_dfs and
    convert its DATE column from SAS to pandas format.

    Raises ValueError if self.freq is not 'm' or 'd'.
    """
    freq = self.freq.lower()

    #Check frequency
    try:
        filename = {'m': 'msf', 'd': 'dsf'}[freq]
    except KeyError:
        raise ValueError('use m or d for frequency')

    if self.debug:
        #debug datasets only have permnos 10516, 10517
        filename += '_test'

    #Load in CRSP file
    self._log('Loading CRSP dataframe...')
    filepath = os.path.join(self.crsp_dir, filename + '.sas7bdat')
    self.crsp_dfs[freq] = load_sas(filepath)
    self._log('Loaded.')

    #Change date to datetime format
    self._log('Converting SAS date to Pandas format.')
    self.crsp_dfs[freq]['DATE'] = convert_sas_date_to_pandas_date(
        self.crsp_dfs[freq]['DATE'])
    self._log('Converted.')
def merge_dsenames(df,
                   on='TICKER',
                   get='PERMNO',
                   date='Date',
                   other_byvars=None,
                   crsp_dir=None):
    '''
    Merges with dsenames file on on variable (TICKER, PERMNO, PERMCO, NCUSIP,
    CUSIP6), to get get variable (same list). Must have a Date variable in df.
    Default is to match on TICKER and pull PERMNO.

    Required inputs:
    df: pandas dataframe containing any of (TICKER, PERMNO, PERMCO, NCUSIP, CUSIP6)

    Optional inputs:
    on: str, column name to merge on, one of (TICKER, PERMNO, PERMCO, NCUSIP, CUSIP6)
    get: str or list, column or columns to get from dsenames, any of
         (TICKER, PERMNO, PERMCO, NCUSIP, CUSIP6) that aren't already in on
    date: str, column name of date variable
    other_byvars: any other variables signifying groups in the data, prevents
                  from collapsing those groups
    crsp_dir: str, directory containing dsenames.sas7bdat (required at runtime)
    '''
    #Make get a list
    if isinstance(get, str):
        get = [get]
    assert isinstance(get, list)

    #Make other byvars a list
    if not other_byvars:
        other_byvars = []
    if isinstance(other_byvars, str):
        other_byvars = [other_byvars]
    assert isinstance(other_byvars, list)

    assert on not in get  #can't get what we already have

    #Pull from CRSP dsenames file
    file = 'dsenames'
    fullpath = os.path.join(crsp_dir, file + '.sas7bdat')
    names_df = load_sas(fullpath)

    #Convert NCUSIP to CUSIP6
    if on == 'CUSIP6' or 'CUSIP6' in get:
        names_df['CUSIP6'] = names_df['NCUSIP'].apply(
            lambda x: x if pd.isnull(x) else x[:6])

    names_df['start'] = convert_sas_date_to_pandas_date(names_df['NAMEDT'])
    names_df['end'] = convert_sas_date_to_pandas_date(names_df['NAMEENDT'])
    #A missing end date means the name record is still active
    names_df['end'] = names_df['end'].fillna(datetime.date.today())

    #Now perform merge
    merged = df[[on, date] + other_byvars].merge(
        names_df[['start', 'end', on] + get], how='left', on=on)
    #Drop out observations not in name date range
    valid = (merged[date] >= merged['start']) & (merged[date] <= merged['end'])
    #However if there is not a match, doing merged[valid] would drop the
    #observation instead of leaving nan.
    #Therefore, take merged[valid] and merge back again to original
    new_merged = df.merge(merged[valid].drop(['start', 'end'], axis=1),
                          how='left',
                          on=[on, date] + other_byvars)
    new_merged = new_merged.reset_index(drop=True)

    if 'PERMNO' in get:
        #Dsenames has no record of which permno is the primary link when a firm
        #has multiple share classes. To get this information, we must merge
        #ccmxpf_linktable. We want to keep only primary links.
        dups = new_merged[[date, on] + other_byvars].duplicated(
            keep=False)  #series of True or False of whether duplicated row
        if dups.any():  #got more than one permno for a single period/firm/byvars
            duplicated = new_merged[dups].reset_index(
            )  #puts index in a column for later use
            not_duplicated = new_merged[~dups]
            #Take duplicated, merge to ccmxpf_linktable to get gvkey; the rows
            #which do not have gvkeys are the non-primary links.
            #BUGFIX: forward other_byvars and crsp_dir — previously they were
            #dropped, so crsp_dir=None crashed and byvar groups were ignored.
            with_gvkey = get_gvkey_or_permno(duplicated,
                                             date,
                                             other_byvars=other_byvars,
                                             crsp_dir=crsp_dir)
            removed_duplicates = with_gvkey[~pd.isnull(with_gvkey['GVKEY'])
                                            ].drop('GVKEY', axis=1)
            #Set index back
            removed_duplicates.set_index('index', inplace=True)
            #Now append back together and sort.
            #BUGFIX: DataFrame.append was removed in pandas 2.0; use concat.
            full = pd.concat([not_duplicated, removed_duplicates])
            new_merged = full.sort_index()

    return new_merged.reset_index(drop=True)
def get_gvkey_or_permno(df, datevar, get='GVKEY', other_byvars=None,
                        crsp_dir=None):
    """
    Takes a dataframe containing either GVKEY or PERMNO and merges to the CRSP
    linktable to get the other one.
    """
    if get == 'GVKEY':
        rename_get, l_on, r_on = 'gvkey', 'PERMNO', 'lpermno'
    elif get == 'PERMNO':
        rename_get, l_on, r_on = 'lpermno', 'GVKEY', 'gvkey'
    else:
        raise ValueError('Need get="GVKEY" or "PERMNO"')

    #Normalize other_byvars to a list
    if not other_byvars:
        other_byvars = []
    elif isinstance(other_byvars, str):
        other_byvars = [other_byvars]
    assert isinstance(other_byvars, list)

    #Load the CCM link table and convert its SAS dates
    link_path = os.path.join(crsp_dir, 'ccmxpf_linktable.sas7bdat')
    link = load_sas(link_path)
    for datecol in ('linkdt', 'linkenddt'):
        link[datecol] = convert_sas_date_to_pandas_date(link[datecol])
    #If end date is missing, that means link is still active. Make end date today.
    link['linkenddt'] = link['linkenddt'].fillna(datetime.date.today())
    #Remove links with no permno so that they don't match to nans in the input df
    link = link.dropna(subset=['lpermno'])

    merged = df.merge(
        link[['lpermno', 'gvkey', 'linkdt', 'linkenddt', 'linkprim']],
        how='left',
        left_on=l_on,
        right_on=r_on)

    #Keep only primary links whose date range covers the observation date
    in_range = (merged[datevar] >= merged['linkdt']) & \
               (merged[datevar] <= merged['linkenddt'])
    primary = merged['linkprim'] == 'P'
    valid_links = merged.loc[in_range & primary].drop(
        ['linkdt', 'linkenddt', 'linkprim', r_on], axis=1).drop_duplicates()
    valid_links = valid_links.rename(columns={rename_get: get})

    #Now merge back to the original again to ensure that rows are not deleted
    return df.merge(valid_links[['PERMNO', 'GVKEY', datevar] + other_byvars],
                    how='left',
                    on=[l_on, datevar] + other_byvars)