Example #1
def _load_data_by_extension_and_convert_date(filepath, freq='m'):
    _, file_extension = os.path.splitext(filepath)
    extension = file_extension.lower()
    if extension == '.sas7bdat':
        df = load_sas(filepath)
        if freq != 'm':
            # convert SAS numeric dates to pandas datetimes
            df['date'] = convert_sas_date_to_pandas_date(df['date'])
        return df
    elif extension == '.csv':
        if freq != 'm':
            return pd.read_csv(filepath, parse_dates=['date'])
        else:
            return pd.read_csv(filepath)
    else:
        raise ValueError(
            f'Please pass a sas7bdat or csv for FF factors, got {extension}')
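A minimal usage sketch for the loader above; the file names here are hypothetical, and only the function and its freq convention come from the example:

# Hypothetical file paths for illustration
monthly_ff = _load_data_by_extension_and_convert_date('ff_factors.csv')  # monthly CSV, dates left as-is
daily_ff = _load_data_by_extension_and_convert_date('ff_factors.sas7bdat', freq='d')  # daily SAS, dates converted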
Example #2
    def test_convert(self):

        expect_df = pd.DataFrame(
            data=[
                (numpy.datetime64("2004-02-13T00:00:00.000000000"),),
                (numpy.datetime64("2005-02-15T00:00:00.000000000"),),
                (numpy.datetime64("2007-01-12T00:00:00.000000000"),),
                (numpy.datetime64("2007-01-31T00:00:00.000000000"),),
                (numpy.datetime64("2007-02-15T00:00:00.000000000"),),
            ],
            columns=[0],
        )

        converted = pd.DataFrame(
            pd_utils.convert_sas_date_to_pandas_date(self.df_sasdate["datadate"])
        )

        assert_frame_equal(expect_df, converted)
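For context on what the test pins down: SAS stores dates as day counts from a 1960-01-01 epoch, while pandas works with datetime64 values. A plausible one-line implementation of convert_sas_date_to_pandas_date, sketched here as an assumption rather than the actual pd_utils source:

import pandas as pd

def convert_sas_date_to_pandas_date_sketch(sas_dates):
    # SAS dates count days from 1960-01-01; pd.to_datetime accepts a custom origin
    return pd.to_datetime(sas_dates, unit='D', origin=pd.Timestamp('1960-01-01'))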
Example #3
    def __load_crsp(self):
        # Check frequency
        freq = self.freq.lower()
        if freq == 'm':
            filename = 'msf'
        elif freq == 'd':
            filename = 'dsf'
        else:
            raise ValueError('use m or d for frequency')
        if self.debug:
            filename += '_test'  # debug datasets only have permnos 10516, 10517

        # Load in CRSP file
        self._log('Loading CRSP dataframe...')
        filepath = os.path.join(self.crsp_dir, filename + '.sas7bdat')
        self.crsp_dfs[freq] = load_sas(filepath)
        self._log('Loaded.')

        # Change date to datetime format
        self._log('Converting SAS date to Pandas format.')
        self.crsp_dfs[freq]['DATE'] = convert_sas_date_to_pandas_date(
            self.crsp_dfs[freq]['DATE'])
        self._log('Converted.')
Example #4
def convert_date_compustat(df):
    # Convert the Compustat datadate column from SAS numeric dates in place
    df['datadate'] = convert_sas_date_to_pandas_date(df['datadate'])
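Note that convert_date_compustat mutates the frame in place and returns None, so it is called for its side effect. A quick illustration with a hypothetical frame:

comp = pd.DataFrame({'datadate': [16115.0, 16484.0]})  # hypothetical SAS day counts
convert_date_compustat(comp)  # comp['datadate'] is now datetime64; nothing is returned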
Example #5
def merge_dsenames(df,
                   on='TICKER',
                   get='PERMNO',
                   date='Date',
                   other_byvars=None,
                   crsp_dir=None):
    '''
    Merges df with the CRSP dsenames file on the `on` variable (one of TICKER,
    PERMNO, PERMCO, NCUSIP, CUSIP6) to pull in the `get` variable(s) (same list).
    df must contain a date variable.

    Default is to match on TICKER and pull PERMNO.

    Required inputs:
    df: pandas dataframe containing any of (TICKER, PERMNO, PERMCO, NCUSIP, CUSIP6)

    Optional inputs:
    on: str, column name to merge on, one of (TICKER, PERMNO, PERMCO, NCUSIP, CUSIP6)
    get: str or list, column or columns to get from dsenames, any of (TICKER, PERMNO,
         PERMCO, NCUSIP, CUSIP6) that aren't already in on
    date: str, column name of date variable
    other_byvars: str or list, any other variables signifying groups in the data;
                  prevents collapsing those groups
    '''

    # Set default CRSP dir
    # if crsp_dir is None:
    #     crsp_dir = data_path('CRSP')

    # Make get a list
    if isinstance(get, str):
        get = [get]
    assert isinstance(get, list)

    # Make other_byvars a list
    if not other_byvars:
        other_byvars = []
    if isinstance(other_byvars, str):
        other_byvars = [other_byvars]
    assert isinstance(other_byvars, list)

    assert on not in get  # can't get what we already have

    # Pull from CRSP dsenames file
    file = 'dsenames'
    fullpath = os.path.join(crsp_dir, file + '.sas7bdat')
    names_df = load_sas(fullpath)

    # Convert NCUSIP to CUSIP6
    if on == 'CUSIP6' or 'CUSIP6' in get:
        names_df['CUSIP6'] = names_df['NCUSIP'].apply(
            lambda x: x if pd.isnull(x) else x[:6])

    names_df['start'] = convert_sas_date_to_pandas_date(names_df['NAMEDT'])
    names_df['end'] = convert_sas_date_to_pandas_date(names_df['NAMEENDT'])
    # A missing NAMEENDT means the name record is still active, so use today as the end
    names_df['end'] = names_df['end'].fillna(datetime.date.today())

    # Now perform the merge
    merged = df[[on, date] + other_byvars].merge(
        names_df[['start', 'end', on] + get], how='left', on=on)
    # Keep only observations within the name date range
    valid = (merged[date] >= merged['start']) & (merged[date] <= merged['end'])
    # If there is no match, taking merged[valid] would drop the observation entirely
    # instead of leaving NaN, so merge the valid rows back onto the original df
    new_merged = df.merge(merged[valid].drop(['start', 'end'], axis=1),
                          how='left',
                          on=[on, date] + other_byvars)
    new_merged = new_merged.reset_index(drop=True)

    if 'PERMNO' in get:
        # dsenames has no record of which permno is the primary link when a firm
        # has multiple share classes. To get this information, we must merge
        # ccmxpf_linktable and keep only the primary links.
        # Boolean Series marking rows duplicated on period/firm/byvars
        dups = new_merged[[date, on] + other_byvars].duplicated(keep=False)
        if dups.any():  # we got more than one permno for a single period/firm/byvars
            duplicated = new_merged[dups].reset_index()  # puts index in a column for later use
            not_duplicated = new_merged[~dups]

            # Take duplicated and merge to ccmxpf_linktable to get gvkey; the rows
            # which do not get a gvkey are the non-primary links
            with_gvkey = get_gvkey_or_permno(duplicated, date)  # default is to get gvkey with permno
            removed_duplicates = with_gvkey[~pd.isnull(with_gvkey['GVKEY'])].drop(
                'GVKEY', axis=1)

            # Set index back
            removed_duplicates.set_index('index', inplace=True)

            # Now put back together and sort (DataFrame.append was removed in
            # pandas 2.0, so use pd.concat)
            full = pd.concat([not_duplicated, removed_duplicates])
            new_merged = full.sort_index()

    return new_merged.reset_index(drop=True)
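A usage sketch for merge_dsenames, assuming a CRSP directory containing dsenames.sas7bdat; the frame, tickers, and path are hypothetical:

events = pd.DataFrame({'TICKER': ['AAPL', 'MSFT'],
                       'Date': pd.to_datetime(['2007-01-12', '2007-02-15'])})
# Default behavior: match on TICKER, pull PERMNO
with_permno = merge_dsenames(events, crsp_dir='/path/to/CRSP')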
Example #6
0
def get_gvkey_or_permno(df,
                        datevar,
                        get='GVKEY',
                        other_byvars=None,
                        crsp_dir=None):
    """
    Takes a dataframe containing either GVKEY or PERMNO and merges to the CRSP linktable to get the other one.
    """

    # Set default CRSP dir
    # if crsp_dir is None:
    #     crsp_dir = data_path('CRSP')

    if get == 'GVKEY':
        rename_get = 'gvkey'
        l_on = 'PERMNO'
        r_on = 'lpermno'
    elif get == 'PERMNO':
        rename_get = 'lpermno'
        l_on = 'GVKEY'
        r_on = 'gvkey'
    else:
        raise ValueError('Need get="GVKEY" or "PERMNO"')

    # Make other_byvars a list
    if not other_byvars:
        other_byvars = []
    if isinstance(other_byvars, str):
        other_byvars = [other_byvars]
    assert isinstance(other_byvars, list)

    link_name = 'ccmxpf_linktable.sas7bdat'
    link_path = os.path.join(crsp_dir, link_name)

    link = load_sas(link_path)
    link['linkdt'] = convert_sas_date_to_pandas_date(link['linkdt'])
    link['linkenddt'] = convert_sas_date_to_pandas_date(link['linkenddt'])
    # If end date is missing, the link is still active; make the end date today
    link['linkenddt'] = link['linkenddt'].fillna(datetime.date.today())

    # Remove links with no permno so that they don't match to NaNs in the input df
    link.dropna(subset=['lpermno'], inplace=True)

    merged = df.merge(
        link[['lpermno', 'gvkey', 'linkdt', 'linkenddt', 'linkprim']],
        how='left',
        left_on=l_on,
        right_on=r_on)

    valid = (
        (merged[datevar] >= merged.linkdt)
        & (merged[datevar] <= merged.linkenddt)
        & (merged.linkprim == 'P')
    )

    merged = merged[valid].drop(
        ['linkdt', 'linkenddt', 'linkprim', r_on], axis=1).drop_duplicates()
    merged.rename(columns={rename_get: get}, inplace=True)

    # Now merge back to the original again to ensure that rows are not deleted
    new_merged = df.merge(merged[['PERMNO', 'GVKEY', datevar] + other_byvars],
                          how='left',
                          on=[l_on, datevar] + other_byvars)

    return new_merged
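A usage sketch for get_gvkey_or_permno under the same hypothetical CRSP directory assumption; the permnos are the ones the debug datasets mention in Example #3:

prices = pd.DataFrame({'PERMNO': [10516, 10517],
                       'Date': pd.to_datetime(['2004-02-13', '2005-02-15'])})
# Default: pull GVKEY for each PERMNO, keeping only primary links
with_gvkey = get_gvkey_or_permno(prices, 'Date', crsp_dir='/path/to/CRSP')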