Esempio n. 1
0
def build_crsp_m(permnos):

    crsp_m = conn.raw_sql(f"""
                          select a.permno, a.permco, a.date, b.ticker, b.ncusip, b.shrcd, b.exchcd, b.siccd,
                          a.prc, a.ret, a.retx, a.shrout, a.vol
                          from crsp.msf as a
                          left join crsp.msenames as b
                          on a.permno=b.permno
                          and b.namedt<=a.date
                          and a.date<=b.nameendt
                          where a.permno in {permnos}
                          and b.exchcd between 1 and 3
                          and b.shrcd between 10 and 11
                          """)

    crsp_m['me'] = crsp_m['prc'].abs()*crsp_m['shrout']
    crsp_m['prca'] = crsp_m['prc'].abs()
    crsp_m['lprc'] = crsp_m.groupby(['permno','permco'])['prca'].shift(1)
    crsp_m['lme'] = crsp_m.groupby(['permno','permco'])['me'].shift(1)
    crsp_m[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_m[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)

    # Line up date to be end of month
    crsp_m['date'] = pd.to_datetime(crsp_m['date'])
    crsp_m['jdate'] = crsp_m['date']+MonthEnd(0)

    return crsp_m
Esempio n. 2
0
def build_compq(permnos):
    gvkeys = permnos_to_gvkeys(permnos)
    compq = conn.raw_sql(f"""
                         select 
                         fyearq, fqtr, apdedateq, datadate, pdateq, fdateq, f.gvkey, REVTQ, REQ, EPSPIQ, ACTQ, 
                         INVTQ, LCTQ, CHQ, CSHOQ, PRCCQ, NIQ, ATQ, LTQ, GDWLQ
                         from comp.fundq as f
                         where f.gvkey in {gvkeys}
                         and REVTQ != 'NaN'
                         """)

    compq.fillna(value=np.nan, inplace=True)
    compq.dropna(axis='rows',
                 how='any',
                 subset=['fyearq', 'fqtr'],
                 inplace=True)
    compq['fyearq'].astype(int)
    compq['fqtr'].astype(int)
    compq['datadate'] = pd.to_datetime(compq['datadate'])
    compq['permno'] = compq['gvkey'].apply(gvkey_to_permno)

    compq['quickq'] = (compq['actq'] - compq['invtq']) / compq['lctq']
    compq['curratq'] = compq['actq'] / compq['lctq']
    compq['cashrratq'] = compq['chq'] / compq['lctq']
    compq['peq'] = (compq['cshoq'] * compq['prccq']) / compq['niq']
    compq['roeq'] = compq['niq'] / (compq['prccq'] * compq['prccq'])
    compq['roaq'] = compq['niq'] / (compq['atq'] - compq['ltq'])

    return compq
Esempio n. 3
0
def build_ccm_data(permnos, comp, crsp_jun):

    ccm = conn.raw_sql(f"""
                       select gvkey, lpermno as permno, linktype, linkprim, linkdt, linkenddt
                       from crsp.ccmxpf_linktable
                       where lpermno in {permnos}
                       and substr(linktype,1,1)='L'
                       and linkprim in ('P', 'C')
                       """)

    ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
    # if linkenddt is missing then set to next month's june (Changed)
    ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')+YearEnd(0)+MonthEnd(6))
    ccm['linkenddt'] = ccm['linkenddt'].dt.date
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])

    ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])
    ccm1['yearend'] = ccm1['datadate']+YearEnd(0)
    ccm1['jdate'] = ccm1['yearend']+MonthEnd(6)

    ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= (ccm1['linkenddt']))]
    ccm2 = ccm2.drop(columns=['datadate_a','linktype','linkdt','linkenddt'])

    ccm_data = pd.merge(ccm2, crsp_jun, how='left', on=['permno', 'jdate'])

    ccm_data = ccm_data[ccm_data.dec_me != 0]
    ccm_data['beme'] = ccm_data['be']*1000/ccm_data['dec_me']

    # drop duplicates
    ccm_data = ccm_data.sort_values(by=['permno', 'date']).drop_duplicates()
    ccm_data = ccm_data.sort_values(by=['gvkey', 'date']).drop_duplicates()

    # Note: Different from SAS, Python count start from zero, will see if I need to add 1 to better serve the need
    ccm_data['count'] = ccm_data.groupby(['gvkey']).cumcount()

    # Parallel to the cleaning step for 'dr'
    ccm_data['dr'] = np.where(ccm_data.drc.notna() & ccm_data.drlt.notna(), ccm_data.drc+ccm_data.drlt, None)
    ccm_data['dr'] = np.where(ccm_data.drc.notna() & ccm_data.drlt.isna(), ccm_data.drc, ccm_data['dr'])
    ccm_data['dr'] = np.where(ccm_data.drc.isna() & ccm_data.drlt.notna(), ccm_data.drlt, ccm_data['dr'])
    # Parallel to the cleaning step for 'dc'
    ccm_data.loc[(ccm_data['dcvt'].isna()) & (ccm_data['dcpstk'].notna()) & (ccm_data['pstk'].notna()) & (ccm_data['dcpstk'] > ccm_data['pstk']), 'dc'] = ccm_data['dcpstk'] - ccm_data['pstk']
    ccm_data.loc[(ccm_data['dcvt'].isna()) & (ccm_data['dcpstk'].notna()) & (ccm_data['pstk'].isna()), 'dc'] = ccm_data['dcpstk']
    ccm_data.loc[(ccm_data['dc'].isna()), 'dc'] = ccm_data['dcvt']

    ccm_data['xint'] = ccm_data['xint'].fillna(0)
    ccm_data['xsga'] = ccm_data['xsga'].fillna(0)

    ccm_data = ccm_data.sort_values(by=['permno', 'date']).drop_duplicates()
    ccm_data.fillna(value=pd.np.nan, inplace=True)

    return ccm_data
Esempio n. 4
0
def build_dlret(permnos):

    dlret = conn.raw_sql(f"""
                         select permno, dlret, dlstdt
                         from crsp.msedelist
                         where permno in {permnos}
                         """)
    dlret.permno = dlret.permno.astype(int)

    # Line up date to be end of month
    dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt'])
    dlret['jdate'] = dlret['dlstdt']+MonthEnd(0)

    return dlret
Esempio n. 5
0
def build_comp(permnos):
    gvkeys = permnos_to_gvkeys(permnos)

    comp = conn.raw_sql(f"""
                        select
                        fyear, apdedate, datadate, pdate, fdate, c.gvkey, f.cusip as cnum, datadate as datadate_a, 
                        c.cik, sic as sic2, sic, naics, sale, revt, cogs, xsga, xrd, xad, ib, ebitda, ebit, nopi, 
                        spi, pi, txp, ni, txfed, txfo, txt, xint, capx, oancf, dvt, ob, gdwlia, gdwlip, gwo, rect, act, 
                        che, ppegt, invt, at, aco, intan, ao, ppent, gdwl, fatb, fatl, lct, dlc, dltt, lt, dm, dcvt, 
                        cshrc, dcpstk, pstk, ap, lco, lo, drc, drlt, txdi, ceq, scstkc, emp, csho, /*addition*/
                        pstkrv, pstkl, txditc, datadate as year, /*market*/
                        abs(prcc_f) as prcc_f, csho*prcc_f as mve_f, /*HXZ*/
                        am, ajex, txdb, seq, dvc, dvp, dp, dvpsx_f, mib, ivao, ivst, sstk, prstkc,
                        dv, dltis, dltr, dlcch, oibdp, dvpa, tstkp, oiadp, xpp, xacc, re, ppenb,
                        ppenls, capxv, fopt, wcap
                        from comp.names as c, comp.funda as f
                        where c.gvkey in {gvkeys}
                        and f.gvkey=c.gvkey
                        /*get consolidated, standardized, industrial format statements*/
                        and f.indfmt='INDL'
                        and f.datafmt='STD'
                        and f.popsrc='D'
                        and f.consol='C'
                        """)

    comp.cnum = comp.cnum.replace(' ', '').str.slice(0, 6)
    comp.sic2 = comp.sic2 + '12'
    comp.apdedate = pd.to_datetime(comp.apdedate)
    comp.datadate = pd.to_datetime(comp.datadate)
    comp.pdate = pd.to_datetime(comp.pdate)
    comp.fdate = pd.to_datetime(comp.fdate)
    comp.year = comp['datadate'].dt.year
    comp = comp.dropna(subset=['at', 'prcc_f', 'ni'])

    comp['ps_beme'] = np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv'])
    comp['ps_beme'] = np.where(comp['ps_beme'].isnull(), comp['pstk'], comp['ps_beme'])
    comp['ps_beme'] = np.where(comp['ps_beme'].isnull(), 0, comp['ps_beme'])
    comp['txditc'] = comp['txditc'].fillna(0)
    comp['be'] = comp['ceq']+comp['txditc']-comp['ps_beme']
    comp['be'] = np.where(comp['be'] > 0, comp['be'], None)

    comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates(['gvkey', 'datadate'])
    comp['count'] = comp.groupby(['gvkey']).cumcount()

    return comp
Esempio n. 6
0
def construct_daily(business_day, permno):
    assert isinstance(permno, (list, str)), 'invalid permno data type'
    if isinstance(permno, list):
        assert len(permno) != 0, 'zero permno list length'
    if isinstance(permno, str):
        permno = list([permno])
    permno = tuple(permno)

    daily_df = conn.raw_sql(f"""
                            select a.date, a.permno, b.ticker, b.shrcd, b.siccd, a.ret, 
                            abs(a.prc) as prc, a.shrout, a.cfacpr, a.cfacshr
                            from crsp.dsf as a
                            left join crsp.msenames as b
                            on a.permno = b.permno
                            and b.namedt <= a.date
                            and a.date <= b.nameendt
                            and a.date = '{business_day}'
                            where b.permno in {permno if len(permno) > 1 else '(' + permno[0] + ')'}
                            """)
    return daily_df
Esempio n. 7
0
def build_compa(permnos):
    gvkeys = permnos_to_gvkeys(permnos)
    compa = conn.raw_sql(f"""
                         select
                         fyear, apdedate, datadate, pdate, fdate, f.gvkey, REVT, EBIT, EBITDA, RE, EPSPI, GP, OPINCAR, 
                         ACT, INVT, LCT, CH, OANCF, DVP,  DVC, PRSTKC, NI, CSHO, PRCC_F, mkvalt, BKVLPS, AT, LT, 
                         DVT, ICAPT, XINT, DLCCH, DLTT, GDWL, GWO, CAPX, DLC, SEQ
                         from comp.funda as f
                         where f.gvkey in {gvkeys}
                         and REVT != 'NaN' 
                         and f.indfmt='INDL'
                         and f.datafmt='STD'
                         and f.popsrc='D'
                         and f.consol='C'
                         """)

    compa.fillna(value=np.nan, inplace=True)
    compa['fyear'].astype(int)
    compa['fqtr'] = 4
    compa['datadate'] = pd.to_datetime(compa['datadate'])
    compa['permno'] = compa['gvkey'].apply(gvkey_to_permno)

    compa['gma'] = compa['gp'] / compa['revt']
    compa['operprof'] = compa['opincar'] / compa['revt']
    compa['quick'] = (compa['act'] - compa['invt']) / compa['lct']
    compa['currat'] = compa['act'] / compa['lct']
    compa['cashrrat'] = compa['ch'] / compa['lct']
    compa['cftrr'] = compa['oancf'] / compa['revt']
    compa['dpr'] = (compa['dvp'] + compa['dvc'] +
                    compa['prstkc']) / compa['ni']
    compa['pe'] = (compa['csho'] * compa['prcc_f']) / compa['ni']
    compa['pb'] = (compa['mkvalt']) / (compa['bkvlps'])
    compa['roe'] = compa['ni'] / (compa['csho'] * compa['prcc_f'])
    compa['roa'] = compa['ni'] / (compa['at'] - compa['lt'])
    compa['roic'] = (compa['ni'] - compa['dvt']) / compa['icapt']
    compa['cod'] = compa['xint'] / (compa['dlcch'] + compa['dltt'])
    compa['capint'] = compa['capx'] / compa['at']
    compa['lev'] = (compa['dltt'] + compa['dlc']) / compa['seq']

    return compa
Esempio n. 8
0
def build_ccm_jun(ccm_data):
    ccm_jun = ccm_data.copy()
    ccm_jun['mve6b'] = ccm_jun['dec_me']
    ccm_jun['ep'] = ccm_jun.ib/ccm_jun.mve_f
    ccm_jun['cashpr'] = (ccm_jun.mve_f+ccm_jun.dltt-ccm_jun['at'])/ccm_jun.che
    ccm_jun['dy'] = ccm_jun.dvt/ccm_jun.mve_f
    ccm_jun['lev'] = ccm_jun['lt']/ccm_jun.mve_f
    ccm_jun['sp'] = ccm_jun.sale/ccm_jun.mve_f
    ccm_jun['roic'] = (ccm_jun.ebit-ccm_jun.nopi)/(ccm_jun.ceq+ccm_jun['lt']-ccm_jun.che)
    ccm_jun['rd_sale'] = ccm_jun.xrd/ccm_jun.sale
    ccm_jun['rd_mve'] = ccm_jun['xrd']/ccm_jun['mve_f']
    ccm_jun['sp'] = ccm_jun.sale/ccm_jun.mve_f #duplicate?

    # treatment for lagged terms
    ccm_jun['lagat']=ccm_jun.groupby(['permno'])['at'].shift(1)
    ccm_jun['lat'] = ccm_jun['lagat']
    ccm_jun['lagcsho']=ccm_jun.groupby(['permno'])['csho'].shift(1)
    ccm_jun['laglt']=ccm_jun.groupby(['permno'])['lt'].shift(1)
    ccm_jun['lagact']=ccm_jun.groupby(['permno'])['act'].shift(1)
    ccm_jun['lagche']=ccm_jun.groupby(['permno'])['che'].shift(1)
    ccm_jun['lagdlc']=ccm_jun.groupby(['permno'])['dlc'].shift(1)
    ccm_jun['lagtxp']=ccm_jun.groupby(['permno'])['txp'].shift(1)
    ccm_jun['laglct']=ccm_jun.groupby(['permno'])['lct'].shift(1)
    ccm_jun['laginvt']=ccm_jun.groupby(['permno'])['invt'].shift(1)
    ccm_jun['lagemp']=ccm_jun.groupby(['permno'])['emp'].shift(1)
    ccm_jun['lagsale']=ccm_jun.groupby(['permno'])['sale'].shift(1)
    ccm_jun['lagib']=ccm_jun.groupby(['permno'])['ib'].shift(1)
    ccm_jun['lag2at']=ccm_jun.groupby(['permno'])['at'].shift(2)
    ccm_jun['lagrect']=ccm_jun.groupby(['permno'])['rect'].shift(1)
    ccm_jun['lagcogs']=ccm_jun.groupby(['permno'])['cogs'].shift(1)
    ccm_jun['lagxsga']=ccm_jun.groupby(['permno'])['xsga'].shift(1)
    ccm_jun['lagppent']=ccm_jun.groupby(['permno'])['ppent'].shift(1)
    ccm_jun['lagdp']=ccm_jun.groupby(['permno'])['dp'].shift(1)
    ccm_jun['lagxad']=ccm_jun.groupby(['permno'])['xad'].shift(1)
    ccm_jun['lagppegt']=ccm_jun.groupby(['permno'])['ppegt'].shift(1)
    ccm_jun['lagceq']=ccm_jun.groupby(['permno'])['ceq'].shift(1)
    ccm_jun['lagcapx']=ccm_jun.groupby(['permno'])['capx'].shift(1)
    ccm_jun['lag2capx']=ccm_jun.groupby(['permno'])['capx'].shift(2)
    ccm_jun['laggdwl']=ccm_jun.groupby(['permno'])['gdwl'].shift(1)
    ccm_jun['lagdvt']=ccm_jun.groupby(['permno'])['dvt'].shift(1)
    ccm_jun['lagob']=ccm_jun.groupby(['permno'])['ob'].shift(1)
    ccm_jun['lagaco']=ccm_jun.groupby(['permno'])['aco'].shift(1)
    ccm_jun['lagintan']=ccm_jun.groupby(['permno'])['intan'].shift(1)
    ccm_jun['lagao']=ccm_jun.groupby(['permno'])['ao'].shift(1)
    ccm_jun['lagap']=ccm_jun.groupby(['permno'])['ap'].shift(1)
    ccm_jun['laglco']=ccm_jun.groupby(['permno'])['lco'].shift(1)
    ccm_jun['laglo']=ccm_jun.groupby(['permno'])['lo'].shift(1)
    ccm_jun['lagdr']=ccm_jun.groupby(['permno'])['dr'].shift(1)
    ccm_jun['lagxrd']=ccm_jun.groupby(['permno'])['xrd'].shift(1)
    ccm_jun['lagni']=ccm_jun.groupby(['permno'])['ni'].shift(1)
    ccm_jun['lagdltt']=ccm_jun.groupby(['permno'])['dltt'].shift(1)

    ccm_jun['agr']=np.where(ccm_jun['at'].isna() | ccm_jun.lagat.isna(), np.NaN, (ccm_jun.lagat-ccm_jun['at'])/ccm_jun.lagat)
    ccm_jun['gma']=(ccm_jun['revt']-ccm_jun['cogs'])/ccm_jun['lagat']
    ccm_jun['chcsho']=ccm_jun.csho/ccm_jun.lagcsho -1
    ccm_jun['lgr']=ccm_jun['lt']/ccm_jun.laglt -1
    ccm_jun['acc']=(ccm_jun.ib-ccm_jun.oancf)/(ccm_jun['at']+ccm_jun.lagat) * 2
    ccm_jun.loc[ccm_jun['oancf'].isna(), 'acc'] = (((ccm_jun['act'] - lag(ccm_jun, 'act')) - (ccm_jun['che'] - lag(ccm_jun, 'che'))) \
        - (((ccm_jun['lct'] - lag(ccm_jun,'lct')) - (ccm_jun['dlc'] - lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun, 'txp'))) - ccm_jun['dp'])) \
        / ((ccm_jun['at']+lag(ccm_jun,'at'))/2)
    ccm_jun.loc[ccm_jun['ib'] != 0, 'pctacc'] = (ccm_jun['ib'] - ccm_jun['oancf'])/np.abs(ccm_jun['ib'])
    ccm_jun.loc[ccm_jun['ib'] == 0, 'pctacc'] = (ccm_jun['ib'] - ccm_jun['oancf'])/(0.01)
    ccm_jun.loc[(ccm_jun['oancf'].isna()) & (ccm_jun['ib'] != 0), 'pctacc'] = (((ccm_jun['act'] - lag(ccm_jun,'act')) - (ccm_jun['che'] - lag(ccm_jun,'che'))) \
        - ((ccm_jun['lct'] - lag(ccm_jun,'lct'))-(ccm_jun['dlc']-lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun,'txp')) - ccm_jun['dp'] ))/np.abs(ccm_jun['ib'])
    ccm_jun.loc[(ccm_jun['oancf'].isna()) & (ccm_jun['ib'] == 0), 'pctacc'] = (((ccm_jun['act'] - lag(ccm_jun,'act')) - (ccm_jun['che'] - lag(ccm_jun,'che'))) \
        - ((ccm_jun['lct'] - lag(ccm_jun,'lct'))-(ccm_jun['dlc']-lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun,'txp')) - ccm_jun['dp'] ))/0.01
    ccm_jun['cfp']= (ccm_jun['ib']-(ccm_jun['act']-ccm_jun['lagact']-(ccm_jun['che']-ccm_jun['lagche'])))\
                    -(ccm_jun['lct']-ccm_jun['laglct']-(ccm_jun['dlc']-ccm_jun['lagdlc'])\
                      -(ccm_jun['txp']-ccm_jun['lagtxp'])-ccm_jun['dp'])/ccm_jun['mve_f']
    ccm_jun['cfp']=np.where(ccm_jun['oancf'].notna(),ccm_jun['oancf']/ccm_jun['mve_f'], ccm_jun['cfp'])
    ccm_jun['absacc']=ccm_jun['acc'].abs()
    ccm_jun['chinv']=2*(ccm_jun['invt']-ccm_jun['laginvt'])/(ccm_jun['at']+ccm_jun['lagat'])
    ccm_jun['spii']=np.where((ccm_jun['spi']!=0)&ccm_jun['spi'].notna(), 1, 0)
    ccm_jun['spi']=2*ccm_jun['spi']/(ccm_jun['at']+ccm_jun['lagat'])
    ccm_jun['cf']=2*ccm_jun['oancf']/(ccm_jun['at']+ccm_jun['lagat'])
    ccm_jun['cf']=np.where(ccm_jun['oancf'].isna(), (ccm_jun['ib']-(ccm_jun['act']-ccm_jun['lagact']-(ccm_jun['che']-ccm_jun['lagche'])))\
                    -(ccm_jun['lct']-ccm_jun['laglct']-(ccm_jun['dlc']-ccm_jun['lagdlc'])\
                      -(ccm_jun['txp']-ccm_jun['lagtxp'])-ccm_jun['dp'])/((ccm_jun['at']+ccm_jun['lagat'])/2),ccm_jun['cf'])
    ccm_jun['hire']=(ccm_jun['emp']-ccm_jun['lagemp'])/ccm_jun['lagemp']
    ccm_jun['hire']=np.where(ccm_jun['emp'].isna() | ccm_jun['lagemp'].isna(), 0, ccm_jun['hire'])
    ccm_jun['sgr']=ccm_jun['sale']/ccm_jun['lagsale'] -1
    ccm_jun['chpm']=ccm_jun['ib']/ccm_jun['sale']-ccm_jun['lagib']/ccm_jun['lagsale']
    ccm_jun['chato']=(ccm_jun['sale']/((ccm_jun['at']+ccm_jun['lagat'])/2)) - (ccm_jun['lagsale']/((ccm_jun['lagat']+ccm_jun['lag2at'])/2))
    ccm_jun['pchsale_pchinvt']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']))-((ccm_jun['invt']-(ccm_jun['laginvt']))/(ccm_jun['laginvt']))
    ccm_jun['pchsale_pchrect']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']))-((ccm_jun['rect']-(ccm_jun['lagrect']))/(ccm_jun['lagrect']))
    ccm_jun['pchgm_pchsale']=(((ccm_jun['sale']-ccm_jun['cogs'])-((ccm_jun['lagsale'])-(ccm_jun['lagcogs'])))/((ccm_jun['lagsale'])-(ccm_jun['lagcogs'])))-((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']))
    ccm_jun['pchsale_pchxsga']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']) )-((ccm_jun['xsga']\
            -(ccm_jun['lagxsga'])) /(ccm_jun['lagxsga']) )
    ccm_jun['depr']=ccm_jun['dp']/ccm_jun['ppent']
    ccm_jun['pchdepr']=((ccm_jun['dp']/ccm_jun['ppent'])-((ccm_jun['lagdp'])/(ccm_jun['lagppent'])))/((ccm_jun['lagdp'])/(ccm_jun['lagppent']))
    ccm_jun['chadv']=np.log(1+ccm_jun['xad'])-np.log((1+(ccm_jun['lagxad'])))
    ccm_jun['invest']=((ccm_jun['ppegt']-(ccm_jun['lagppegt'])) +  (ccm_jun['invt']-(ccm_jun['laginvt'])) ) / (ccm_jun['lagat'])
    ccm_jun['invest']=np.where(ccm_jun['ppegt'].isna(), ((ccm_jun['ppent']-(ccm_jun['lagppent'])) +  (ccm_jun['invt']-(ccm_jun['laginvt'])) ) / (ccm_jun['lagat']), ccm_jun['invest'])
    ccm_jun['egr']=((ccm_jun['ceq']-(ccm_jun['lagceq']))/(ccm_jun['lagceq']))
    ccm_jun['capx']=np.where(ccm_jun['capx'].isna() & ccm_jun['count']>=1,ccm_jun['ppent']-(ccm_jun['lagppent']), ccm_jun['capx'])
    ccm_jun['pchcapx']=(ccm_jun['capx']-ccm_jun['lagcapx'])/ccm_jun['lagcapx']
    ccm_jun['grcapx']=(ccm_jun['capx']-ccm_jun['lag2capx'])/ccm_jun['lag2capx']
    ccm_jun['grGW'] = np.nan
    ccm_jun['grGW']=(ccm_jun['gdwl']-lag(ccm_jun, 'gdwl'))/lag(ccm_jun, 'gdwl')
    ccm_jun['grGW']=np.where((ccm_jun['gdwl'].isna()) | (ccm_jun['gdwl']==0), 0, ccm_jun['grGW'])
    ccm_jun['grGW']=np.where((ccm_jun['gdwl'].notna()) & (ccm_jun['gdwl']!=0) & (ccm_jun['grGW'].isna()), 1, ccm_jun['grGW'])
    ccm_jun['woGW']=np.where((ccm_jun['gdwlia'].notna()&ccm_jun['gdwlia']!=0)|(ccm_jun['gdwlip'].notna()&(ccm_jun['gdwlip']!=0))|\
                                                                              (ccm_jun['gwo'].notna()&ccm_jun['gwo']!=0) , 1, 0)
    ccm_jun['tang']=(ccm_jun['che']+ccm_jun['rect']*0.715+ccm_jun['invt']*0.547+ccm_jun['ppent']*0.535)/ccm_jun['at']
    ccm_jun['sic']=ccm_jun['sic'].astype(int)
    ccm_jun['sin']=np.where(ccm_jun['sic'].between(2100,2199) | ccm_jun['sic'].between(2080,2085) | (ccm_jun['naics'].isin(['7132', '71312', \
                                                                                 '713210', '71329', '713290', '72112', '721120'])), 1, 0)
    ccm_jun['act']=np.where(ccm_jun['act'].isna(), ccm_jun['che']+ccm_jun['rect']+ccm_jun['invt'],ccm_jun['act'])
    ccm_jun['lct']=np.where(ccm_jun['lct'].isna(), ccm_jun['ap'], ccm_jun['lct'])
    ccm_jun['currat']=ccm_jun['act']/ccm_jun['lct']
    ccm_jun['pchcurrat']= ((ccm_jun['act']/ccm_jun['lct']) - (lag(ccm_jun, 'act')/lag(ccm_jun, 'lct'))) / (lag(ccm_jun,'act')/lag(ccm_jun,'lct'))
    ccm_jun['quick']=(ccm_jun['act']-ccm_jun['invt'])/ccm_jun['lct']
    ccm_jun['pchquick']=((ccm_jun['act']-ccm_jun['invt'])/ccm_jun['lct'] - ((ccm_jun['lagact'])-(ccm_jun['laginvt']))/(ccm_jun['laglct']) )/ (((ccm_jun['lagact'])-(ccm_jun['laginvt']))/(ccm_jun['laglct']))
    ccm_jun['salecash']=ccm_jun['sale']/ccm_jun['che']
    ccm_jun['salerec']=ccm_jun['sale']/ccm_jun['rect']
    ccm_jun['saleinv']=ccm_jun['sale']/ccm_jun['invt']
    ccm_jun['pchsaleinv']=((ccm_jun['sale']/ccm_jun['invt'])-((ccm_jun['lagsale'])/(ccm_jun['laginvt'])) ) / ((ccm_jun['lagsale'])/(ccm_jun['laginvt']))
    ccm_jun['cashdebt']=(ccm_jun['ib']+ccm_jun['dp'])/((ccm_jun['lt']+(ccm_jun['laglt']))/2)
    ccm_jun['realestate']=(ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppegt']
    ccm_jun['realestate']=np.where(ccm_jun['ppegt'].isna(), (ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppent'], ccm_jun['realestate'])
    ccm_jun['divi']=np.where((ccm_jun['dvt'].notna() & ccm_jun['dvt']>0) & ((ccm_jun['lagdvt'])==0 | (ccm_jun['lagdvt'].isna())),1,0)
    ccm_jun['divo']=np.where((ccm_jun['dvt'].isna() | ccm_jun['dvt']==0) & ((ccm_jun['lagdvt'])>0 & (ccm_jun['lagdvt'].notna())),1,0)
    ccm_jun['obklg']=ccm_jun['ob']/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['chobklg']=(ccm_jun['ob']-(ccm_jun['lagob']))/((ccm_jun['at']+(ccm_jun['lagat']))/2)


    ccm_jun['securedind']=np.where(ccm_jun['dm'].notna() &ccm_jun['dm']!=0, 1, 0)
    ccm_jun['secured']=ccm_jun['dm']/ccm_jun['dltt']
    ccm_jun['convind']=np.where((ccm_jun['dc'].notna() & ccm_jun['dc']!=0) | (ccm_jun['cshrc'].notna() & ccm_jun['cshrc']!=0) , 1, 0)
    ccm_jun['dc']=ccm_jun['dc'].astype(float)
    ## There will be inf in the result
    ccm_jun['conv']=ccm_jun['dc']/ccm_jun['dltt']


    ccm_jun['grltnoa']=((ccm_jun['rect']+ccm_jun['invt']+ccm_jun['ppent']+ccm_jun['aco']+ccm_jun['intan']+ccm_jun['ao']-ccm_jun['ap']-ccm_jun['lco']-ccm_jun['lo'])\
                        -((ccm_jun['lagrect'])+(ccm_jun['laginvt'])+(ccm_jun['lagppent'])+(ccm_jun['lagaco'])+(ccm_jun['lagintan'])+(ccm_jun['lagao'])-(ccm_jun['lagap'])\
                          -(ccm_jun['laglco'])-(ccm_jun['laglo'])) -(ccm_jun['rect']-(ccm_jun['lagrect'])+ccm_jun['invt']-(ccm_jun['laginvt'])+ccm_jun['aco']-(ccm_jun['lagaco'])\
                            -(ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['lco']-(ccm_jun['laglco'])) -ccm_jun['dp']))/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['chdrc']=(ccm_jun['dr']-(ccm_jun['lagdr']))/((ccm_jun['at']+(ccm_jun['lagat']))/2)


    ccm_jun['xrd/lagat']=ccm_jun['xrd']/(ccm_jun['lagat'])
    ccm_jun['lag(xrd/lagat)']=ccm_jun.groupby(['permno'])['xrd/lagat'].shift(1)
    ccm_jun['rd']=np.where(((ccm_jun['xrd']/ccm_jun['at'])-ccm_jun['lag(xrd/lagat)'])/ccm_jun['lag(xrd/lagat)']>0.05, 1, 0)
    ccm_jun['rdbias']=(ccm_jun['xrd']/(ccm_jun['lagxrd']))-1-ccm_jun['ib']/(ccm_jun['lagceq'])
    ccm_jun['roe']=ccm_jun['ib']/(ccm_jun['lagceq'])


    ccm_jun['ps_beme']=np.where(ccm_jun['pstkrv'].isnull(), ccm_jun['pstkl'], ccm_jun['pstkrv'])
    ccm_jun['ps_beme']=np.where(ccm_jun['ps_beme'].isnull(),ccm_jun['pstk'], ccm_jun['ps_beme'])
    ccm_jun['ps_beme']=np.where(ccm_jun['ps_beme'].isnull(),0,ccm_jun['ps_beme'])
    ccm_jun['txditc']=ccm_jun['txditc'].fillna(0)
    ccm_jun['be']=ccm_jun['ceq']+ccm_jun['txditc']-ccm_jun['ps_beme']
    ccm_jun['be']=np.where(ccm_jun['be']>0,ccm_jun['be'],np.NaN)
    ccm_jun['operprof']=np.where(ccm_jun['be'].notna() & ccm_jun['revt'].notna() & (ccm_jun['cogs'].notna() | ccm_jun['xsga'].notna() | ccm_jun['xint'].notna()),\
                              (ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/ccm_jun['be'], np.nan)

    ccm_jun['ps']=(ccm_jun['ni']>0).astype(int) +(ccm_jun['oancf']>0)+(ccm_jun['ni']/ccm_jun['at'] > (ccm_jun['lagni'])/(ccm_jun['lagat']))+(ccm_jun['oancf']>ccm_jun['ni'])+(ccm_jun['dltt']/ccm_jun['at'] < (ccm_jun['lagdltt'])/(ccm_jun['lagat']))\
                    +(ccm_jun['act']/ccm_jun['lct'] > (ccm_jun['lagact'])/(ccm_jun['laglct'])) +((ccm_jun['sale']-ccm_jun['cogs'])/ccm_jun['sale'] > ((ccm_jun['lagsale'])-(ccm_jun['lagcogs']))/(ccm_jun['lagsale']))\
                    + (ccm_jun['sale']/ccm_jun['at'] > (ccm_jun['lagsale'])/(ccm_jun['lagat']))+ (ccm_jun['scstkc']==0)


    def tr_fyear(row):
        if row['fyear']<=1978:
            value = 0.48
        elif row['fyear']<=1986:
            value = 0.46
        elif row['fyear']==1987:
            value = 0.4
        elif row['fyear']>=1988 and row['fyear']<=1992:
            value = 0.34
        elif row['fyear']>=1993:
            value = 0.35
        else:
            value=''
        return value

    ccm_jun['tr']=ccm_jun.apply(tr_fyear, axis=1)
    ccm_jun['tb_1']=((ccm_jun['txfo']+ccm_jun['txfed'])/ccm_jun['tr'])/ccm_jun['ib']
    ccm_jun['tb_1']=np.where(ccm_jun['txfo'].isna() | ccm_jun['txfed'].isna(),((ccm_jun['txt']+ccm_jun['txdi'])/ccm_jun['tr'])/ccm_jun['ib'], ccm_jun['tb_1'])

    # 	if (txfo+txfed>0 or txt>txdi) and ib<=0 then
    # 		tb_1=1;
    #!!! Caution that for condition, when using | and &, one must apply parenthesis
    ccm_jun['tb_1']=np.where(((ccm_jun['txfo']+ccm_jun['txfed'])>0 | (ccm_jun['txt']>ccm_jun['txdi'])) & (ccm_jun['ib']<=0), 1, ccm_jun['tb_1'])

    # 	*variables that will be used in subsequent steps to get to final RPS;
    # 	*--prep for for Mohanram (2005) score;
    # 	roa=ni/((at+lag(at))/2);
    # 	cfroa=oancf/((at+lag(at))/2);
    ccm_jun['roa']=ccm_jun['ni']/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['cfroa']=ccm_jun['oancf']/((ccm_jun['at']+(ccm_jun['lagat']))/2)

    # 	if missing(oancf) then
    # 		cfroa=(ib+dp)/((at+lag(at))/2);
    # 	xrdint=xrd/((at+lag(at))/2);
    # 	capxint=capx/((at+lag(at))/2);
    # 	xadint=xad/((at+lag(at))/2);
    ccm_jun['cfroa']=np.where(ccm_jun['oancf'].isna(),ccm_jun['ib']+ccm_jun['dp'] /((ccm_jun['at']+(ccm_jun['lagat']))/2), ccm_jun['cfroa'])
    ccm_jun['xrdint']=ccm_jun['xrd']/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['capxint']=ccm_jun['capx']/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['xadint']=ccm_jun['xad']/((ccm_jun['at']+(ccm_jun['lagat']))/2)

    # 	/*HXZ*/
    # 	adm=xad/mve6b;
    # 	gad=(xad-lag(xad))/lag(xad);
    # 	rdm=xrd/mve6b;
    # 	rds=xrd/sale;
    # 	ol=(cogs+xsga)/at;
    # 	rc_1=xrd+0.8*lag(xrd)+0.6*lag2(xrd)+0.4*lag3(xrd)+0.2*lag4(xrd);

    #New lag terms for this section
    ccm_jun['lag2xrd']=ccm_jun.groupby(['permno'])['lagxrd'].shift(1)
    ccm_jun['lag3xrd']=ccm_jun.groupby(['permno'])['lag2xrd'].shift(1)
    ccm_jun['lag4xrd']=ccm_jun.groupby(['permno'])['lag3xrd'].shift(1)

    # Here I follow previous naming of mve6b as dec_me
    ccm_jun['adm']=ccm_jun['xad']/ccm_jun['dec_me']
    ccm_jun['gad']=(ccm_jun['xad']-(ccm_jun['lagxad']))/(ccm_jun['lagxad'])
    ccm_jun['rdm']=ccm_jun['xrd']/ccm_jun['dec_me']
    ccm_jun['rds']=ccm_jun['xrd']/ccm_jun['sale']
    ccm_jun['ol']=(ccm_jun['cogs']+ccm_jun['xsga'])/ccm_jun['at']
    ccm_jun['rc_1']=ccm_jun['xrd']+0.8*(ccm_jun['lagxrd'])+0.6*(ccm_jun['lag2xrd'])+0.4*(ccm_jun['lag3xrd'])+0.2*(ccm_jun['lag4xrd'])
    ccm_jun['rca'] = ccm_jun['rc_1']/ccm_jun['at']
    ccm_jun['eps_1'] = ccm_jun['ajex']/ccm_jun['prcc_f']
    ccm_jun['x_1']=ccm_jun['ppent']+ccm_jun['intan']+ccm_jun['ao']-ccm_jun['lo']+ccm_jun['dp']
    ccm_jun['etr'] = (ccm_jun['x_1'] - (lag(ccm_jun, 'x_1') + lag(ccm_jun,'x_1',2)+lag(ccm_jun,'x_1',3))/3) * (ccm_jun['eps_1'] - lag(ccm_jun, 'eps_1'))
    ccm_jun['x_2'] = ccm_jun['sale']/ccm_jun['emp']
    ccm_jun['lfe'] = (ccm_jun['x_2'] - lag(ccm_jun, 'x_2'))/lag(ccm_jun, 'x_2')
    ccm_jun['kz'] = -1.002*(ccm_jun['ib'] + ccm_jun['dp'])/lag(ccm_jun,'ppent') + 0.283*(ccm_jun['at']+ccm_jun['mve6b']-ccm_jun['ceq']-ccm_jun['txdb'])/ccm_jun['at'] + \
        3.139*(ccm_jun['dlc'] + ccm_jun['dltt'])/(ccm_jun['dlc'] + ccm_jun['dltt'] + ccm_jun['seq']) - 39.368*(ccm_jun['dvc']+ccm_jun['dvp'])/lag(ccm_jun,'ppent') -\
        1.315*(ccm_jun['che'])/lag(ccm_jun,'ppent')


    ccm_jun['cdd']=ccm_jun['dcvt']/(ccm_jun['dlc']+ccm_jun['dltt'])
    ccm_jun['roaq_a']=ccm_jun['ib']/(ccm_jun['lagat'])

    # must reindex in order to set back the value
    ccm_jun['roavol_1']=ccm_jun.groupby(['permno'])['roaq_a'].rolling(10).std(skipna=True).reset_index()['roaq_a']

    ccm_jun['cs_1']=(ccm_jun['ib']-(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['lct']-(ccm_jun['laglct']))-(ccm_jun['che']-(ccm_jun['lagche']))+ccm_jun['dlc']-(ccm_jun['lagdlc'])))/(ccm_jun['lagat'])
    ccm_jun['roavol_2']=ccm_jun.groupby(['permno'])['cs_1'].rolling(10).std().reset_index()['cs_1']
    ccm_jun['roavol_a']=ccm_jun['roavol_1']/ccm_jun['roavol_2']

    ccm_jun['gdwl']=ccm_jun['gdwl'].fillna(0)
    ccm_jun['intan']=ccm_jun['intan'].fillna(0)
    ccm_jun['ala']=ccm_jun['che']+0.75*(ccm_jun['act']-ccm_jun['che'])-0.5*(ccm_jun['at']-ccm_jun['act']-ccm_jun['gdwl']-ccm_jun['intan'])
    ccm_jun['alm']=ccm_jun['ala']/(ccm_jun['at']+ccm_jun['prcc_f']*ccm_jun['csho']-ccm_jun['ceq'])
    ccm_jun['ob_a']=ccm_jun['ob']/(0.5*ccm_jun['at']+0.5*(ccm_jun['lagat']))
    ccm_jun['x_3']=ccm_jun['capx']/ccm_jun['sale']
    ccm_jun['lagx_3']=ccm_jun.groupby(['permno'])['x_3'].shift(1)
    ccm_jun['lag2x_3']=ccm_jun.groupby(['permno'])['lagx_3'].shift(1)
    ccm_jun['lag3x_3']=ccm_jun.groupby(['permno'])['lag2x_3'].shift(1)
    ccm_jun['cinvest_a']=ccm_jun['x_3']/(((ccm_jun['lagx_3'])+(ccm_jun['lag2x_3'])+(ccm_jun['lag3x_3']))/3)-1
    ccm_jun['dpia'] = (ccm_jun['ppegt']-lag(ccm_jun,'ppegt') + ccm_jun['invt'] - lag(ccm_jun, 'invt'))/lag(ccm_jun, 'at')

    ccm_jun['dlc']=ccm_jun['dlc'].fillna(0)
    ccm_jun['dltt']=ccm_jun['dltt'].fillna(0)
    ccm_jun['mib']=ccm_jun['mib'].fillna(0)
    ccm_jun['pstk']=ccm_jun['pstk'].fillna(0)
    ccm_jun['ceq']=ccm_jun['ceq'].fillna(0)


    ccm_jun['noa']=((ccm_jun['at']-ccm_jun['che'])-(ccm_jun['at']-ccm_jun['dlc']-ccm_jun['dltt']-ccm_jun['mib']-ccm_jun['pstk']-ccm_jun['ceq']))/(ccm_jun['lagat'])
    ccm_jun['lagnoa']=ccm_jun.groupby(['permno'])['noa'].shift(1)
    ccm_jun['dnoa']=ccm_jun['noa']-(ccm_jun['lagnoa'])
    ccm_jun['lag3capx']=ccm_jun.groupby(['permno'])['capx'].shift(3)
    ccm_jun['pchcapx3']=ccm_jun['capx']/(ccm_jun['lag3capx'])-1
    ccm_jun['x_4']=ccm_jun['dlc']+ccm_jun['dltt']
    ccm_jun['lag5x_4']=ccm_jun.groupby(['permno'])['capx'].shift(5)
    ccm_jun['cdi']=np.log(ccm_jun['x_4']/(ccm_jun['lag5x_4']))

    ccm_jun['ivg']=ccm_jun['invt']/(ccm_jun['laginvt'])-1
    ccm_jun['dcoa']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche'])))/(ccm_jun['lagat'])
    ccm_jun['dcol']=(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc'])))/(ccm_jun['lagat'])
    ccm_jun['dwc']=(ccm_jun['dcoa']-ccm_jun['dcol'])/(ccm_jun['lagat'])
    ccm_jun['lagivao']=ccm_jun.groupby(['permno'])['ivao'].shift(1)
    ccm_jun['dnca']=(ccm_jun['at']-ccm_jun['act']-ccm_jun['ivao']-((ccm_jun['lagat'])-(ccm_jun['lagact'])-(ccm_jun['lagivao'])))/(ccm_jun['lagat'])
    ccm_jun['dncl']=(ccm_jun['lt']-ccm_jun['lct']-ccm_jun['dltt']-((ccm_jun['laglt'])-(ccm_jun['laglct'])-(ccm_jun['lagdltt'])))/(ccm_jun['lagat'])
    ccm_jun['dnco']=(ccm_jun['dnca']-ccm_jun['dncl'])/(ccm_jun['lagat'])
    ccm_jun['lagivst']=ccm_jun.groupby(['permno'])['ivst'].shift(1)
    ccm_jun['lagpstk']=ccm_jun.groupby(['permno'])['pstk'].shift(1)
    ccm_jun['dfin']=(ccm_jun['ivst']+ccm_jun['ivao']-ccm_jun['dltt']-ccm_jun['dlc']-ccm_jun['pstk']-((ccm_jun['lagivst'])+(ccm_jun['lagivao'])-(ccm_jun['lagdltt'])-(ccm_jun['lagdlc'])-(ccm_jun['lagpstk'])))/(ccm_jun['lagat'])
    ccm_jun['ta']=(ccm_jun['dwc']+ccm_jun['dnco']+ccm_jun['dfin'])/(ccm_jun['lagat'])
    ccm_jun['dsti']=(ccm_jun['ivst']-(ccm_jun['lagivst']))/(ccm_jun['lagat'])
    ccm_jun['dfnl']=(ccm_jun['dltt']+ccm_jun['dlc']+ccm_jun['pstk']-((ccm_jun['lagdltt'])+(ccm_jun['lagdlc'])+(ccm_jun['lagpstk'])))/(ccm_jun['lagat'])
    ccm_jun['egr_hxz']=(ccm_jun['ceq']-(ccm_jun['lagceq']))/(ccm_jun['lagat'])

    ccm_jun['lagpstkrv']=ccm_jun.groupby(['permno'])['pstkrv'].shift(1)

    ccm_jun['txp']=ccm_jun['txp'].fillna(0)
    ccm_jun['poa']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche']))-(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc']))-(ccm_jun['txp']-(ccm_jun['lagtxp'])))-ccm_jun['dp'])/(ccm_jun['ni'].abs())
    ccm_jun['nef']=(ccm_jun['sstk']-ccm_jun['prstkc']-ccm_jun['dv'])/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['dlcch']=ccm_jun['dlcch'].fillna(0)
    ccm_jun['ndf']=(ccm_jun['dltis']-ccm_jun['dltr']+ccm_jun['dlcch'])/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['nxf'] = ccm_jun['ndf'] + ccm_jun['nef']
    ccm_jun['atm']=ccm_jun['at']/ccm_jun['dec_me']
    ccm_jun['cp']=(ccm_jun['ib']+ccm_jun['dp'])/ccm_jun['dec_me']
    ccm_jun['op']=(ccm_jun['dvc']+ccm_jun['prstkc']-(ccm_jun['pstkrv']-(ccm_jun['lagpstkrv'])))/ccm_jun['dec_me']
    ccm_jun['nop']=(ccm_jun['dvc']+ccm_jun['prstkc']-(ccm_jun['pstkrv']-(ccm_jun['lagpstkrv']))-ccm_jun['sstk']+ccm_jun['pstkrv']-(ccm_jun['lagpstkrv']))/ccm_jun['dec_me']
    ccm_jun['em'] = (ccm_jun['mve6b'] + ccm_jun['dlc'] + ccm_jun['dltt'] + ccm_jun['pstkrv'] - ccm_jun['che'])/ccm_jun['oibdp']


    ccm_jun['dvpa']=ccm_jun['dvpa'].fillna(0)
    ccm_jun['tstkp']=ccm_jun['tstkp'].fillna(0)
    ccm_jun['ndp']=ccm_jun['dltt']+ccm_jun['dlc']+ccm_jun['pstk']+ccm_jun['dvpa']-ccm_jun['tstkp']-ccm_jun['che']
    ccm_jun['ebp']=(ccm_jun['ndp']+ccm_jun['ceq']+ccm_jun['tstkp']-ccm_jun['dvpa'])/(ccm_jun['ndp']+ccm_jun['dec_me'])
    ccm_jun['rna']=ccm_jun['oiadp']/(ccm_jun['lagnoa'])
    ccm_jun['pm']=ccm_jun['rna']/ccm_jun['sale']
    ccm_jun['ato']=ccm_jun['sale']/(ccm_jun['lagnoa'])
    ccm_jun['cto']=ccm_jun['sale']/(ccm_jun['lagat'])
    ccm_jun['gpa']=(ccm_jun['revt']-ccm_jun['cogs'])/ccm_jun['at']
    ccm_jun['rmw']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/(ccm_jun['ceq']+ccm_jun['pstk'])
    ccm_jun['ole']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/((ccm_jun['lagceq'])+(ccm_jun['lagpstk']))
    ccm_jun['opa']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd'])/ccm_jun['at']
    ccm_jun['ola']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd'])/(ccm_jun['lagat'])

    ccm_jun['lagxpp']=ccm_jun.groupby(['permno'])['xpp'].shift(1)
    ccm_jun['lagdrc']=ccm_jun.groupby(['permno'])['drc'].shift(1)
    ccm_jun['lagdrlt']=ccm_jun.groupby(['permno'])['drlt'].shift(1)
    ccm_jun['lagxacc']=ccm_jun.groupby(['permno'])['xacc'].shift(1)

    ccm_jun['cop']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd']-(ccm_jun['rect']-(ccm_jun['lagrect']))-(ccm_jun['invt']-(ccm_jun['laginvt']))-\
             (ccm_jun['xpp']-(ccm_jun['lagxpp']))+ccm_jun['drc']-(ccm_jun['lagdrc'])+ccm_jun['drlt']-(ccm_jun['lagdrlt'])+ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['xacc']-(ccm_jun['lagxacc']))/ccm_jun['at']
    ccm_jun['cla']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd']-(ccm_jun['rect']-(ccm_jun['lagrect']))-(ccm_jun['invt']-(ccm_jun['laginvt']))-\
             (ccm_jun['xpp']-(ccm_jun['lagxpp']))+ccm_jun['drc']-(ccm_jun['lagdrc'])+ccm_jun['drlt']-(ccm_jun['lagdrlt'])+ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['xacc']-(ccm_jun['lagxacc']))/ccm_jun['lagat']


    ccm_jun['i_1']=np.where(ccm_jun['lt']>ccm_jun['at'],1,0)
    ccm_jun['i_2']=np.where((ccm_jun['ni']<0) & (ccm_jun['lagni']<0),1,0)
    ccm_jun['os']=-1.32-0.407*np.log(ccm_jun['at'])+6.03*(ccm_jun['dlc']+ccm_jun['dltt'])/ccm_jun['at']-1.43*(ccm_jun['act']-ccm_jun['lct'])/ccm_jun['at']+0.076*(ccm_jun['lct']/ccm_jun['act'])-1.72*ccm_jun['i_1']-2.37*ccm_jun['ni']/ccm_jun['at']-1.83*(ccm_jun['pi']+ccm_jun['dp'])/ccm_jun['lt']+0.285*ccm_jun['i_2']-0.521*(ccm_jun['ni']+(ccm_jun['lagni']))/((ccm_jun['ni'].abs())+ccm_jun['lagni'].abs())
    ccm_jun['zs']=1.2*(ccm_jun['act']-ccm_jun['lct'])/ccm_jun['at']+1.4*ccm_jun['re']/ccm_jun['at']+3.3*ccm_jun['oiadp']/ccm_jun['at']+0.6*ccm_jun['dec_me']/ccm_jun['lt']+ccm_jun['sale']/ccm_jun['at']
    ccm_jun['bi']=np.where(ccm_jun['be']>0,ccm_jun['at']/ccm_jun['be'],np.NaN)


    ccm_jun['pchsale_pchinvt_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['invt']-(ccm_jun['laginvt']))/(0.5*ccm_jun['invt']+0.5*(ccm_jun['laginvt']))
    ccm_jun['pchsale_pchrect_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['rect']-(ccm_jun['lagrect']))/(0.5*ccm_jun['rect']+0.5*(ccm_jun['lagrect']))
    ccm_jun['gm_1']=ccm_jun['sale']-ccm_jun['cogs']
    ccm_jun['laggm_1']=ccm_jun.groupby(['permno'])['gm_1'].shift(1)
    ccm_jun['pchgm_pchsale_hxz']=(ccm_jun['gm_1']-(ccm_jun['laggm_1']))/(0.5*(ccm_jun['gm_1']+(ccm_jun['laggm_1'])))-(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))
    ccm_jun['pchsale_pchxsga_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['xsga']-(ccm_jun['lagxsga']))/(0.5*ccm_jun['xsga']+0.5*(ccm_jun['lagxsga']))
    ccm_jun['realestate_hxz']=(ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppegt']


    ccm_jun['secured_hxz']=ccm_jun['dm']/(ccm_jun['dltt']+ccm_jun['dlc'])
    ccm_jun['agr_hxz']=ccm_jun['at']/(ccm_jun['lagat'])-1
    ccm_jun['lagx_1']=ccm_jun.groupby(['permno'])['x_1'].shift(1)
    ccm_jun['grltnoa_hxz']=(ccm_jun['x_1']-(ccm_jun['lagx_1']))/((ccm_jun['at']+(ccm_jun['lagat']))/2)
    ccm_jun['lagajex']=ccm_jun.groupby(['permno'])['ajex'].shift(1)
    ccm_jun['chcsho_hxz']=np.log(ccm_jun['csho']*ccm_jun['ajex'])-np.log((ccm_jun['lagcsho'])*(ccm_jun['lagajex']))
    ccm_jun['lagcapxv']=ccm_jun.groupby(['permno'])['capxv'].shift(1)
    ccm_jun['lag2capxv']=ccm_jun.groupby(['permno'])['capxv'].shift(2)
    ccm_jun['pchcapx_hxz']=(ccm_jun['capxv']-0.5*(ccm_jun['lagcapxv'])-0.5*(ccm_jun['lag2capxv']))/(0.5*(ccm_jun['lagcapxv'])+0.5*(ccm_jun['lag2capxv']))


    ccm_jun['txp']=ccm_jun['txp'].fillna(0)
    ccm_jun['acc_hxz']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche']))-(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc']))-(ccm_jun['txp']-(ccm_jun['lagtxp'])))-ccm_jun['dp'])/(ccm_jun['lagat'])
    ccm_jun['pctacc_hxz']=(ccm_jun['dwc']+ccm_jun['dnco']+ccm_jun['dfin'])/(ccm_jun['ni'].abs())
    ccm_jun['lev_hxz']=(ccm_jun['dlc']+ccm_jun['dltt'])/ccm_jun['dec_me']
    ccm_jun['ep_hxz']=ccm_jun['ib']/ccm_jun['dec_me']
    ccm_jun['lagwcap']=ccm_jun.groupby(['permno'])['wcap'].shift(1)
    ccm_jun['cfp_hxz']=(ccm_jun['fopt']-(ccm_jun['wcap']-(ccm_jun['lagwcap'])))/ccm_jun['dec_me']

    ccm_jun['cfp_hxz']=np.where(ccm_jun['oancf'].notna(),(ccm_jun['fopt']-ccm_jun['oancf'])/ccm_jun['dec_me'],ccm_jun['cfp_hxz'])
    ccm_jun['tb_hxz']=ccm_jun['pi']/ccm_jun['ni']

    ccm_jun = ccm_jun.sort_values(['sic2', 'fyear'])
    a=ccm_jun.groupby(['sic2','fyear'])['chpm'].mean()
    a=a.rename('meanchpm')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['chpmia']=ccm_jun['chpm']-ccm_jun['meanchpm']

    a=ccm_jun.groupby(['sic2','fyear'])['chato'].mean()
    a=a.rename('meanchato')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['chatoia']=ccm_jun['chato']-ccm_jun['meanchato']

    a=ccm_jun.groupby(['sic2','fyear'])['sale'].sum()
    a=a.rename('indsale')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    a=ccm_jun.groupby(['sic2','fyear'])['hire'].mean()
    a=a.rename('meanhire')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['chempia']=ccm_jun['hire']-ccm_jun['meanhire']

    ccm_jun['beme']=ccm_jun['beme'].astype(float)
    a=ccm_jun.groupby(['sic2','fyear'])['beme'].mean()
    a=a.rename('meanbeme')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['beme_ia']=ccm_jun['beme']-ccm_jun['meanbeme']
    ccm_jun['bm_ia'] = ccm_jun['beme_ia']

    a=ccm_jun.groupby(['sic2','fyear'])['pchcapx'].mean()
    a=a.rename('meanpchcapx')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['pchcapx_ia']=ccm_jun['pchcapx']-ccm_jun['meanpchcapx']

    a=ccm_jun.groupby(['sic2','fyear'])['tb_1'].mean()
    a=a.rename('meantb_1')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['tb']=ccm_jun['tb_1']-ccm_jun['meantb_1']

    a=ccm_jun.groupby(['sic2','fyear'])['cfp'].mean()
    a=a.rename('meancfp')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['cfp_ia']=ccm_jun['cfp']-ccm_jun['meancfp']

    a=ccm_jun.groupby(['sic2','fyear'])['mve_f'].mean()
    a=a.rename('meanmve_f')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['mve_ia']=ccm_jun['mve_f']-ccm_jun['meanmve_f']

    a=ccm_jun.groupby(['sic2','fyear'])['at'].sum()
    a=a.rename('indat')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    a=ccm_jun.groupby(['sic2','fyear'])['be'].sum()
    a=a.rename('indbe')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    a=ccm_jun.groupby(['sic2','fyear'])['pchcapx_hxz'].mean()
    a=a.rename('meanpchcapx_hxz')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])
    ccm_jun['pchcapx_ia_hxz']=ccm_jun['pchcapx_hxz']-ccm_jun['meanpchcapx_hxz']
    ccm_jun['herfraw']=(ccm_jun['sale']/ccm_jun['indsale'])*(ccm_jun['sale']/ccm_jun['indsale'])
    a=ccm_jun.groupby(['sic2','fyear'])['herfraw'].sum()
    a=a.rename('herf')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    ccm_jun['haraw']=(ccm_jun['at']/ccm_jun['indat'])*(ccm_jun['at']/ccm_jun['indat'])
    a=ccm_jun.groupby(['sic2','fyear'])['haraw'].sum()
    a=a.rename('ha')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    ccm_jun['heraw']=(ccm_jun['be']/ccm_jun['indbe'])*(ccm_jun['be']/ccm_jun['indbe'])
    a=ccm_jun.groupby(['sic2','fyear'])['heraw'].sum()
    a=a.rename('he')
    ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear'])

    ccm_jun = ccm_jun.sort_values(['sic2', 'fyear'])

    indmd=pd.DataFrame()
    indmd = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['roa'].median()
    indmd = indmd.rename(columns={'roa': 'md_roa'})
    indmd['md_cfroa'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['cfroa'].median()['cfroa']
    indmd['md_xrdint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['xrdint'].median()['xrdint']
    indmd['md_capxint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['capxint'].median()['capxint']
    indmd['md_xadint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['xadint'].median()['xadint']
    ccm_jun=pd.merge(ccm_jun, indmd, how='left', on=['sic2','fyear'])

    ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate'])
    ccm_jun['m1']=np.where(ccm_jun['roa']>ccm_jun['md_roa'], 1, 0)
    ccm_jun['m2']=np.where(ccm_jun['cfroa']>ccm_jun['md_cfroa'], 1, 0)
    ccm_jun['m3']=np.where(ccm_jun['oancf']>ccm_jun['ni'], 1, 0)
    ccm_jun['m4']=np.where(ccm_jun['xrdint']>ccm_jun['md_xrdint'], 1, 0)
    ccm_jun['m5']=np.where(ccm_jun['capxint']>ccm_jun['md_capxint'], 1, 0)
    ccm_jun['m6']=np.where(ccm_jun['xadint']>ccm_jun['md_xadint'], 1, 0)

    compr=conn.raw_sql("""
                      select splticrm, gvkey, datadate
                      from comp.adsprate
                      """)
    compr.datadate=pd.to_datetime(compr.datadate)

    ccm_jun['year']=ccm_jun['datadate'].dt.year
    compr['year']=compr['datadate'].dt.year
    ccm_jun=pd.merge(ccm_jun, compr[['splticrm','gvkey','year']], how='left', on=['gvkey','year'])

    ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate'])

    cpi=pd.DataFrame()
    cpi['fyear'] = list(reversed(range(1924, 2017+1)))
    cpi['cpi'] = [246.19,242.23,236.53,229.91,229.17,229.594,
                  224.939,218.056,214.537,215.303,207.342,201.6,195.3,188.9,
                  183.96,179.88,177.1,172.2,166.6,163,160.5,156.9,
                  152.4,148.2,144.5,140.3,136.2,130.7,124,118.3,
                  113.6,109.6,107.6,103.9,99.6,96.5,90.9,82.4,
                  72.6,65.2,60.6,56.9,53.8,49.3,44.2,41.8,
                  40.6,38.9,36.7,34.8,33.4,32.5,31.6,31,
                  30.7,30.2,29.9,29.6,29.2,28.9,28.2,27.3,
                  26.8,26.9,26.8,26.7,25.9,24,23.7,24.4,
                  22.2,19.7,18.1,17.6,17.3,16.3,14.7,14,13.8,14.1,
                  14.4,13.9,13.6,13.3,13.1,13.6,15.1,16.6,17.2,17.1,17.2,17.5,17.7,17]
    ccm_jun = pd.merge(ccm_jun, cpi, how='left', on='fyear')
    ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate'])
    ccm_jun = ccm_jun.drop_duplicates(['gvkey','datadate'])

    credit_mapping = {'D': 1, 'C': 2, 'CC': 3, 'CCC-':4, 'CCC':5, 'CCC+':6, 'B-': 7, 'B': 8, 'B+':9, 'BB-':10, 'BB':11, 'BB+': 12, 'BBB-':13,
                      'BBB':14, 'BBB+': 15, 'A-':16, 'A':17, 'A+':18, 'AA-':19,'AA':20,'AA+':21,'AAA':22}
    ccm_jun['credrat'] = ccm_jun['splticrm'].map(credit_mapping)
    ccm_jun['credrat'] = ccm_jun['credrat']
    ccm_jun['credrat'] = 0
    ccm_jun.loc[ccm_jun['credrat'] < lag(ccm_jun, 'credrat'), 'credrat_dwn'] = 1
    ccm_jun.loc[ccm_jun['credrat'] >= lag(ccm_jun, 'credrat'), 'credrat_dwn'] = 0


    ccm_jun = ccm_jun.sort_values(['gvkey','datadate'])
    ccm_jun['avgat'] = (ccm_jun['at']+ccm_jun['lagat'])/2
    ccm_jun.loc[ccm_jun['count']==0, 'orgcap_1'] = (ccm_jun['xsga']/ccm_jun['cpi'])/(.1+.15)
    orgcap_1 = ccm_jun[['orgcap_1','xsga','cpi']]
    prev_row = None
    for i, row in orgcap_1.iterrows():
        if (np.isnan(row['orgcap_1'])) and prev_row is not None:
            row['orgcap_1'] = prev_row['orgcap_1']*(1-0.15)+row['xsga']/row['cpi']
        prev_row = row
    ccm_jun['orgcap_1'] = orgcap_1['orgcap_1']

    ccm_jun['orgcap'] = ccm_jun['orgcap_1']/ccm_jun['avgat']
    ccm_jun.loc[ccm_jun['count'] == 0, 'orgcap'] = np.nan

    ccm_jun.loc[ccm_jun['count']==0, 'oc_1'] = ccm_jun['xsga']/(.1+.15)
    oc_1 = ccm_jun[['oc_1','xsga','cpi']]
    prev_row = None
    for i, row in oc_1.iterrows():
        if(np.isnan(row['oc_1'])) and prev_row is not None:
            row['oc_1'] = (1-0.15)*prev_row['oc_1'] + row['xsga']/row['cpi']
        prev_row = row
    ccm_jun['oc_1'] = oc_1['oc_1']

    ccm_jun['oca'] = ccm_jun['oc_1']/ccm_jun['at']

    mean_orgcap = ccm_jun.rename(columns={'orgcap':'orgcap_mean'}).groupby(['sic2','fyear'])['orgcap_mean'].mean()
    std_orgcap = ccm_jun.rename(columns={'orgcap':'orgcap_std'}).groupby(['sic2','fyear'])['orgcap_std'].std()
    ccm_jun = pd.merge(ccm_jun, mean_orgcap, on=['sic2','fyear'], how='left')
    ccm_jun = pd.merge(ccm_jun, std_orgcap, on=['sic2','fyear'], how='left')
    ccm_jun['orgcap_ia'] = (ccm_jun['orgcap']-ccm_jun['orgcap_mean'])/ccm_jun['orgcap_std']

    mean_oca = ccm_jun.rename(columns={'oca':'oca_mean'}).groupby(['sic2','fyear'])['oca_mean'].mean()
    std_oca = ccm_jun.rename(columns={'oca':'oca_std'}).groupby(['sic2','fyear'])['oca_std'].std()
    ccm_jun = pd.merge(ccm_jun, mean_oca, on=['sic2','fyear'], how='left')
    ccm_jun = pd.merge(ccm_jun, std_oca, on=['sic2','fyear'], how='left')
    ccm_jun['oca_ia'] = (ccm_jun['oca']-ccm_jun['oca_mean'])/ccm_jun['oca_std']

    ccm_jun.loc[ccm_jun['dvpsx_f']>0, 'dvpsx_1'] = 1
    ccm_jun.loc[ccm_jun['dvpsx_f']<=0, 'dvpsx_1'] = 0
    ccm_jun['ww'] = -0.091*(ccm_jun['ib']+ccm_jun['dp'])/ccm_jun['at'] - 0.062*ccm_jun['dvpsx_1'] + 0.021*ccm_jun['dltt']/ccm_jun['at'] -0.044*np.log(ccm_jun['at']) + 0.102*(ccm_jun['indsale']/lag(ccm_jun,'indsale')-1)-0.035*(ccm_jun['sale']/lag(ccm_jun,'sale')-1)

    return ccm_jun
Esempio n. 9
0
def build_compq6(permnos, ccm_jun):
    gvkeys = permnos_to_gvkeys(permnos)

    def lag(df, col, n=1, on='gvkey'):
        return df.groupby(on)[col].shift(n)

    lnk = conn.raw_sql(f"""
                         select * from crsp.ccmxpf_linktable
                         where lpermno in {permnos}
                         """)
    lnk = lnk[lnk['linktype'].isin(
        ['LU', 'LC', 'LD', 'LF', 'LN', 'LO', 'LS', 'LX'])]
    lnk = lnk[(2018 >= lnk['linkdt'].astype(str).str[0:4].astype(int)) |
              (lnk['linkdt'] == '.B')]
    lnk = lnk[(lnk['linkenddt'].isna()) |
              ("1940" <= lnk['linkenddt'].astype(str).str[0:4])]
    lnk = lnk.sort_values(['gvkey', 'linkdt'])
    lnk['linkdt'] = pd.to_datetime(lnk['linkdt'])
    lnk['linkenddt'] = pd.to_datetime(lnk['linkenddt'])

    ccm_jun2 = pd.merge(lnk[['gvkey', 'linkdt', 'linkenddt', 'lpermno']],
                        ccm_jun,
                        on='gvkey',
                        how='inner')
    ccm_jun2['datadate'] = pd.to_datetime(ccm_jun2['datadate'])
    ccm_jun2 = ccm_jun2[(ccm_jun2['linkdt'] <= ccm_jun2['datadate']) |
                        (ccm_jun2['linkdt'] == '.B')]
    ccm_jun2 = ccm_jun2[(ccm_jun2['datadate'] <= ccm_jun2['linkenddt']) |
                        (ccm_jun2['linkenddt'].isna())]
    ccm_jun2 = ccm_jun2[(ccm_jun2['lpermno'] != '.')
                        & ccm_jun2['gvkey'].notna()]

    temp = ccm_jun2[[
        'gvkey', 'permno', 'datadate', 'fyear', 'sic2', 'cfp', 'ep', 'cashpr',
        'dy', 'lev', 'sp', 'roic', 'rd_sale', 'chadv', 'agr', 'invest', 'gma',
        'chcsho', 'lgr', 'egr', 'chpm', 'chato', 'chinv', 'hire', 'cf', 'acc',
        'pctacc', 'absacc', 'spii', 'spi', 'sgr', 'pchsale_pchinvt',
        'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', 'pchcapx', 'ps',
        'divi', 'divo', 'obklg', 'chobklg', 'securedind', 'secured', 'convind',
        'conv', 'grltnoa', 'chdrc', 'rd', 'rdbias', 'chpmia', 'chatoia',
        'chempia', 'pchcapx_ia', 'tb', 'cfp_ia', 'mve_ia', 'herf', 'credrat',
        'credrat_dwn', 'orgcap', 'grcapx', 'depr', 'pchdepr', 'grGW', 'tang',
        'woGW', 'sin', 'currat', 'pchcurrat', 'quick', 'pchquick', 'orgcap_ia',
        'adm', 'gad', 'rdm', 'rds', 'ol', 'ww', 'cdd', 'roavol_a', 'ala',
        'alm', 'ob_a', 'cinvest_a', 'noa', 'dnoa', 'pchcapx3', 'cdi', 'ivg',
        'dcoa', 'dcol', 'dwc', 'dnca', 'dncl', 'dnco', 'dfin', 'ta', 'dsti',
        'dfnl', 'poa', 'nef', 'ndf', 'atm', 'cp', 'op', 'nop', 'ndp', 'ebp',
        'rna', 'pm', 'ato', 'cto', 'gpa', 'rmw', 'ole', 'opa', 'ola', 'cop',
        'cla', 'os', 'zs', 'bi', 'oca', 'oca_ia', 'ha', 'he',
        'pchsale_pchinvt_hxz', 'pchsale_pchrect_hxz', 'pchgm_pchsale_hxz',
        'pchsale_pchxsga_hxz', 'realestate_hxz', 'secured_hxz', 'agr_hxz',
        'grltnoa_hxz', 'chcsho_hxz', 'pchcapx_ia_hxz', 'acc_hxz', 'egr_hxz',
        'pctacc_hxz', 'lev_hxz', 'ep_hxz', 'cfp_hxz', 'tb_hxz', 'salecash',
        'salerec', 'pchsaleinv', 'cashdebt', 'realestate', 'roe', 'operprof',
        'mve_f', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6'
    ]]

    crsp_msf = conn.raw_sql(f"""
                          select ret, retx, prc, shrout, vol, date, permno from crsp.msf
                          where permno in {permnos}
                          """)
    crsp_msf = crsp_msf[crsp_msf['permno'].isin(temp['permno'])]
    crsp_msf['date'] = pd.to_datetime(crsp_msf['date'])
    crsp_msf = crsp_msf.sort_values('date')

    z = temp[['datadate', 'permno']]
    z['date_l'] = temp['datadate'] + pd.TimedeltaIndex(
        [7] * len(z), 'M') + pd.TimedeltaIndex([-5] * len(z), 'd')
    z['date_u'] = temp['datadate'] + pd.TimedeltaIndex([20] * len(z), 'M')
    z = pd.merge(z, crsp_msf, on='permno', how='left')
    z['date'] = pd.to_datetime(z['date'])
    z = z[(z['date'] >= z['date_l']) & (z['date'] < z['date_u'])]

    temp2 = pd.merge(z, temp, on=['permno', 'datadate'], how='left')
    crsp_mseall = conn.raw_sql(f"""
                              select date, permno, exchcd, shrcd, siccd from crsp.mseall 
                              where permno in {permnos}
                              and exchcd in (1, 2, 3) and shrcd in (10, 11)
                              """)
    crsp_mseall = crsp_mseall.sort_values(['permno', 'exchcd', 'date'])
    mseall_min = crsp_mseall.groupby([
        'permno', 'exchcd'
    ])['date'].min().reset_index().rename(columns={'date': 'exchstdt'})
    mseall_max = crsp_mseall.groupby([
        'permno', 'exchcd'
    ])['date'].max().reset_index().rename(columns={'date': 'exchedt'})

    crsp_mseall = pd.merge(crsp_mseall, mseall_min, on=['permno', 'exchcd'])
    crsp_mseall = pd.merge(crsp_mseall, mseall_max, on=['permno', 'exchcd'])
    crsp_mseall = crsp_mseall.rename(columns={'date': 'time_1'})
    crsp_mseall = crsp_mseall.sort_values(['permno', 'exchcd'])
    crsp_mseall = crsp_mseall.drop_duplicates(['permno', 'exchcd'])
    crsp_mseall['exchstdt'] = pd.to_datetime(crsp_mseall['exchstdt'])
    crsp_mseall['exchedt'] = pd.to_datetime(crsp_mseall['exchedt'])

    temp2 = pd.merge(temp2, crsp_mseall, on='permno', how='left')
    temp2 = temp2[((temp2['date'] >= temp2['exchstdt']) &
                   (temp2['date'] <= temp2['exchedt']))]

    crsp_mseall_dl = conn.raw_sql(f"""
                                  select dlret, dlstcd, exchcd, date, permno from crsp.mseall
                                  where permno in {permnos}
                                  """)
    crsp_mseall_dl['date'] = pd.to_datetime(crsp_mseall_dl['date'])
    temp2 = pd.merge(temp2, crsp_mseall_dl, on=['date', 'permno'])

    temp2['exchcd'] = temp2['exchcd_x']

    temp2.loc[(temp2['dlret'].isna()) & ((temp2['dlstcd'] == 500) |
                                         ((temp2['dlstcd'] >= 520) &
                                          (temp2['dlstcd'] <= 584))) &
              (temp2['exchcd'].isin([1, 2])), 'dlret'] = -0.35
    temp2.loc[(temp2['dlret'].isna()) & ((temp2['dlstcd'] == 500) |
                                         ((temp2['dlstcd'] >= 520) &
                                          (temp2['dlstcd'] <= 584))) &
              (temp2['exchcd'].isin([3])), 'dlret'] = -0.55
    temp2.loc[(temp2['dlret'].notna()) & (temp2['dlret'] < -1), 'dlret'] = -1
    temp2.loc[(
        temp2['dlret'].isna()
    ), 'dlret'] = 0  #TODO: wtf? this should not be 0... i think this should be not missing...
    temp2['ret'] = temp2['ret'] + temp2['dlret']

    temp2 = temp2.sort_values(['permno', 'date', 'datadate'],
                              ascending=[True, True, False])
    temp2 = temp2.drop_duplicates(['permno', 'date'])

    temp2 = temp2.rename(columns={'datadate': 'time_2'})
    temp2['mve0'] = np.abs(temp2['prc']) * temp2['shrout']
    temp2['mvel1'] = lag(temp2, 'mve0')
    temp2['pps'] = lag(temp2, 'prc')

    comp_qtr = conn.raw_sql(f"""
                            select fyearq, fqtr, apdedateq, datadate, pdateq, fdateq, c.gvkey, f.cusip as cnum, 
                            datadate as datadate_q, rdq, sic as sic2,
                            ibq, saleq, txtq, revtq, cogsq, xsgaq,
                            atq, actq, cheq, lctq, dlcq, ppentq,
                            xrdq, rectq, invtq, ppegtq, txdbq, dlttq, dvpsxq, gdwlq, intanq, txditcq,
                            dpq, oibdpq, cshprq, ajexq, oiadpq, ivaoq, mibq, xintq, drcq, drltq, apq,
                            abs(prccq) as prccq, abs(prccq)*cshoq as mveq, ceqq, seqq, pstkq, atq, ltq, pstkrq
                            from comp.names as c, comp.fundq as f
                            where c.gvkey in {gvkeys}
                            and f.gvkey = c.gvkey
                            and f.indfmt='INDL'
                            and f.datafmt='STD'
                            and f.popsrc='D'
                            and f.consol='C'
                            """)

    comp_qtr.apdedateq = pd.to_datetime(comp_qtr.apdedateq)
    comp_qtr.datadate = pd.to_datetime(comp_qtr.datadate)
    comp_qtr.pdateq = pd.to_datetime(comp_qtr.pdateq)
    comp_qtr.fdateq = pd.to_datetime(comp_qtr.fdateq)

    comp_qtr = comp_qtr.loc[:, ~comp_qtr.columns.duplicated()]
    comp_qtr['cshoq'] = comp_qtr['mveq'] / abs(comp_qtr['prccq'])
    comp_qtr = comp_qtr.sort_values(['gvkey', 'datadate_q'])
    comp_qtr = comp_qtr.drop_duplicates(['gvkey', 'datadate_q'])

    def lag(df, col, n=1, on='gvkey'):
        z = df.groupby(on)[col].shift(n)
        z = z.reset_index()
        z = z.sort_values('index')
        z = z.set_index('index')
        return z[col]

    compq3 = comp_qtr
    compq3.loc[compq3['pstkrq'].notna(), 'pstk'] = compq3['pstkrq']
    compq3.loc[compq3['pstkrq'].isna(), 'ptsk'] = compq3['pstkq']
    compq3['scal'] = compq3['seqq']
    compq3.loc[compq3['seqq'].isna(), 'scal'] = compq3['ceqq'] + compq3['pstk']
    compq3.loc[(compq3['seqq'].isna()) & ((compq3['ceqq'].isna()) |
                                          (compq3['pstk'].isna())),
               'scal'] = compq3['atq'] - compq3['ltq']

    compq3['chtx'] = (compq3['txtq'] - lag(compq3, 'txtq', 4)) / lag(
        compq3, 'atq', 4)
    compq3['roaq'] = compq3['ibq'] / lag(compq3, 'atq')
    compq3['roeq'] = compq3['ibq'] / lag(compq3, 'scal')
    compq3['rsup'] = (compq3['saleq'] -
                      lag(compq3, 'saleq', 4)) / compq3['mveq']
    compq3['sacc'] = (
        ((compq3['actq'] - lag(compq3, 'actq')) -
         (compq3['cheq'] - lag(compq3, 'cheq'))) -
        ((compq3['lctq'] - lag(compq3, 'lctq')) -
         (compq3['dlcq'] - lag(compq3, 'dlcq')))) / compq3['saleq']
    compq3.loc[compq3['saleq'] <= 0,
               'sacc'] = ((compq3['actq'] - lag(compq3, 'actq')) -
                          (compq3['cheq'] - lag(compq3, 'cheq')))

    def trailing_std(df, col, n=15, on='gvkey'):
        z = df.groupby(on)[col].rolling(n).std()
        z = z.reset_index()
        z = z.sort_values('level_1')
        z = z.set_index('level_1')
        return z[col]

    compq3['stdacc'] = trailing_std(compq3, 'sacc', 16)
    compq3['sgrvol'] = trailing_std(compq3, 'rsup', 15)
    compq3['roavol'] = trailing_std(compq3, 'roaq', 15)
    compq3['scf'] = compq3['ibq'] / compq3['saleq'] - compq3['sacc']
    compq3.loc[compq3['saleq'] <= 0,
               'scf'] = compq3['ibq'] / 0.01 - compq3['sacc']
    compq3['stdcf'] = trailing_std(compq3, 'scf', 16)
    compq3['cash'] = compq3['cheq'] / compq3['atq']
    compq3['cinvest'] = (compq3['ppentq'] - lag(
        compq3, 'ppentq')) / compq3['saleq'] - (1 / 3) * (
            (lag(compq3, 'ppentq') - lag(compq3, 'ppentq', 2)) /
            lag(compq3, 'saleq')) - (1 / 3) * (
                (lag(compq3, 'ppentq', 2) - lag(compq3, 'ppentq', 3)) /
                lag(compq3, 'saleq', 2)) - (1 / 3) * (
                    (lag(compq3, 'ppentq', 3) - lag(compq3, 'ppentq', 4)) /
                    lag(compq3, 'saleq', 3))
    compq3.loc[compq3['saleq'] <= 0, 'cinvest'] = (
        compq3['ppentq'] - lag(compq3, 'ppentq')) / 0.01 - (1 / 3) * (
            (lag(compq3, 'ppentq') - lag(compq3, 'ppentq', 2)) /
            0.01) - (1 / 3) * (
                (lag(compq3, 'ppentq', 2) - lag(compq3, 'ppentq', 3)) / 0.01
            ) - (1 / 3) * (
                (lag(compq3, 'ppentq', 3) - lag(compq3, 'ppentq', 4)) / 0.01)
    compq3['che'] = compq3['ibq'] = lag(compq3, 'ibq', 4)
    #compq3['nincr']
    #TODO: nincr

    compq3['rdmq'] = compq3['xrdq'] / compq3['mveq']
    compq3['rdsq'] = compq3['xrdq'] / compq3['saleq']
    compq3['olq'] = (compq3['cogsq'] + compq3['xsgaq']) / compq3['atq']
    compq3['tanq'] = (compq3['cheq'] + 0.715 * compq3['rectq'] +
                      0.54 * compq3['invtq'] +
                      0.535 * compq3['ppegtq']) / compq3['atq']
    compq3['kzq'] = -1.002*((compq3['ibq'] + lag(compq3, 'ibq', 1) + lag(compq3, 'ibq', 2) + lag(compq3, 'ibq', 3) + compq3['dpq']) / lag(compq3,'ppentq')) \
        + 0.283*(compq3['atq'] + compq3['mveq'] - compq3['ceqq'] - compq3['txdbq'])/compq3['atq'] \
        - 3.139 * (compq3['dlcq'] + compq3['dlttq'])/(compq3['dlcq'] + compq3['dlttq'] + compq3['seqq']) \
        + 39.368*(compq3['dvpsxq'] * compq3['cshoq'] + lag(compq3, 'dvpsxq')*lag(compq3, 'cshoq') + lag(compq3, 'dvpsxq',2)*lag(compq3, 'cshoq',2) + lag(compq3, 'dvpsxq',3)*lag(compq3, 'cshoq', 3))/lag(compq3, 'ppentq')\
        - 1.315*compq3['cheq']/lag(compq3, 'ppentq')
    compq3.loc[compq3['gdwlq'].isna(), 'gdwlq'] = 0
    compq3.loc[compq3['intanq'].isna(), 'intanq'] = 0
    compq3['alaq'] = compq3['cheq'] + 0.75 * (
        compq3['actq'] -
        compq3['cheq']) + 0.5 * (compq3['atq'] - compq3['actq'] -
                                 compq3['gdwlq'] - compq3['intanq'])
    compq3['almq'] = compq3['alaq'] / (compq3['atq'] + compq3['mveq'] -
                                       compq3['ceqq'])
    compq3['laq'] = compq3['atq'] / lag(compq3, 'atq') - 1

    compq3.loc[compq3['seqq'].isna(),
               'seqq'] = compq3['ceqq'] + compq3['pstkq'] - compq3['ltq']
    compq3.loc[compq3['txditcq'].notna(),
               'bmq'] = (compq3['seqq'] + compq3['txditcq'] -
                         compq3['pstkq']) / compq3['mveq']
    compq3['dmq'] = (compq3['dlcq'] + compq3['dlttq']) / compq3['mveq']
    compq3['amq'] = compq3['atq'] / compq3['mveq']
    compq3['epq'] = compq3['ibq'] / compq3['mveq']
    compq3['cpq'] = (compq3['ibq'] + compq3['dpq']) / compq3['mveq']
    compq3['emq'] = (compq3['mveq'] + compq3['dlcq'] + compq3['dlttq'] +
                     compq3['pstkq'] - compq3['cheq']) / compq3['oibdpq']
    compq3['spq'] = compq3['saleq'] / compq3['mveq']
    compq3['ndpq'] = compq3['dlttq'] + compq3['dlcq'] + compq3[
        'pstkq'] - compq3['cheq']
    compq3['ebpq'] = (compq3['ndpq'] + compq3['ceqq']) / (compq3['ndpq'] +
                                                          compq3['mveq'])
    compq3['x_1'] = compq3['saleq'] / (compq3['cshprq'] * compq3['ajexq'])
    compq3['rs'] = (compq3['x_1'] - lag(compq3, 'x_1', 4)) / trailing_std(
        compq3, 'x_1', 6)
    compq3['droeq'] = compq3['roeq'] - lag(compq3, 'roeq', 4)
    compq3['droaq'] = compq3['roaq'] - lag(compq3, 'roaq', 4)

    compq3.loc[compq3['dlcq'].isna(), 'dlcq'] = 0
    compq3.loc[compq3['ivaoq'].isna(), 'ivaoq'] = 0
    compq3.loc[compq3['mibq'].isna(), 'mibq'] = 0
    compq3.loc[compq3['pstkq'].isna(), 'pstkq'] = 0
    compq3['noaq'] = (compq3['atq'] - compq3['cheq'] - compq3['ivaoq']) - (
        compq3['dlcq'] - compq3['dlttq'] - compq3['mibq'] - compq3['pstkq'] -
        compq3['ceqq']) / lag(compq3, 'atq')
    compq3['rnaq'] = compq3['oiadpq'] / lag(compq3, 'noaq')
    compq3['pmq'] = compq3['oiadpq'] / compq3['saleq']
    compq3['atoq'] = compq3['saleq'] / lag(compq3, 'noaq')
    compq3['ctoq'] = compq3['saleq'] / lag(compq3, 'atq')
    compq3['glaq'] = (compq3['revtq'] - compq3['cogsq']) / lag(compq3, 'atq')
    compq3['oleq'] = (compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] -
                      compq3['xintq']) / lag(compq3, 'bmq')
    compq3['olaq'] = (compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] +
                      compq3['xrdq']) / lag(compq3, 'atq')
    compq3['claq'] = ((compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] + compq3['xrdq'] - (compq3['rectq']-lag(compq3, 'rectq')) - (compq3['invtq']-lag(compq3, 'invtq')) \
          + compq3['drcq'] - lag(compq3, 'drcq') + compq3['drltq'] - lag(compq3, 'drltq') + compq3['apq'] - lag(compq3, 'apq'))) / lag(compq3, 'atq')
    compq3['blq'] = compq3['atq'] / compq3['bmq']
    compq3['sgq'] = compq3['saleq'] / lag(compq3, 'saleq', 4)

    compq3.loc[compq3['dvpsxq'] > 0, 'dvpsxq_1'] = 1
    compq3.loc[compq3['dvpsxq'] <= 0, 'dvpsxq_1'] = 0
    temp_indsaleq = compq3.groupby(['sic2',
                                    'fyearq'])['saleq'].sum().reset_index()
    temp_indsaleq = temp_indsaleq.rename(columns={'saleq': 'indsaleq'})
    compq3 = pd.merge(compq3, temp_indsaleq, on=['fyearq', 'sic2'])
    compq3['wwq'] = -0.091*(compq3['ibq'] + compq3['dpq'])/compq3['atq'] - 0.062*compq3['dvpsxq_1'] + 0.021*compq3['dlttq']/compq3['atq'] \
        - 0.044 * np.log(compq3['atq']) + 0.102*(compq3['indsaleq']/lag(compq3, 'indsaleq') - 1) - 0.035*(compq3['saleq']/lag(compq3, 'saleq') - 1)

    temp_md_roavol = compq3.groupby(['fyearq', 'fqtr',
                                     'sic2'])['roavol'].median().reset_index()
    temp_md_roavol = temp_md_roavol.rename(columns={'roavol': 'md_roavol'})
    compq3 = pd.merge(compq3, temp_md_roavol, on=['fyearq', 'fqtr', 'sic2'])
    temp_md_sgrvol = compq3.groupby(['fyearq', 'fqtr',
                                     'sic2'])['sgrvol'].median().reset_index()
    temp_md_sgrvol = temp_md_sgrvol.rename(columns={'sgrvol': 'md_sgrvol'})
    compq3 = pd.merge(compq3, temp_md_sgrvol, on=['fyearq', 'fqtr', 'sic2'])
    compq3.loc[compq3['roavol'] < compq3['md_roavol'], 'm7'] = 1
    compq3.loc[compq3['roavol'] >= compq3['md_roavol'], 'm7'] = 0
    compq3.loc[compq3['sgrvol'] < compq3['md_sgrvol'], 'm8'] = 1
    compq3.loc[compq3['sgrvol'] >= compq3['md_sgrvol'], 'm8'] = 0

    # ibessum = conn.raw_sql(f"""
    #                         select ticker, cusip, fpedats, statpers, ANNDATS_ACT,
    #                         numest, ANNTIMS_ACT, medest, actual, stdev
    #                         from ibes.statsum_epsus
    #                         where ticker in {tics}
    #                         and fpi='6'
    #                         and statpers<ANNDATS_ACT
    #                         and measure='EPS'
    #                         and (fpedats-statpers)>=0
    #                         """)
    # ibessum = ibessum[(ibessum['medest'].notna()) & (ibessum['fpedats'].notna())]
    # ibessum = ibessum.sort_values(by=['cusip','fpedats','statpers'], ascending=[True,True,False])
    # ibessum = ibessum.drop_duplicates(['cusip', 'fpedats'])
    #
    # crsp_msenames = conn.raw_sql("""select * from crsp.msenames""")
    # crsp_msenames = crsp_msenames[crsp_msenames['ncusip'].notna()]
    # crsp_msenames = crsp_msenames.sort_values(['permno','ncusip'])
    # crsp_msenames = crsp_msenames.drop_duplicates(['permno','ncusip'])
    # names = crsp_msenames.rename(columns={'cusip':'cusip6'})
    #
    # ibessum2 = pd.merge(ibessum, names[['ncusip','cusip6']], left_on='cusip', right_on=['ncusip'], how='left')
    # ibessum2['cusip6'] = ibessum2['cusip6'].astype(str).str[0:6]

    compq3['cnum'] = compq3['cnum'].astype(str).str[0:6]
    # compq4 = pd.merge(compq3, ibessum2[['medest','actual','cusip6','fpedats']], left_on=['cnum','datadate_q'], right_on=['cusip6','fpedats'], how='left')
    compq4 = compq3
    compq4 = compq4.sort_values(['gvkey', 'datadate_q'])
    compq4 = compq4.drop_duplicates(['gvkey', 'datadate_q'])

    # compq4.loc[(compq4['medest'].isna()) | (compq4['actual']).isna(), 'sue'] = compq4['che']/compq4['mveq']
    # compq4.loc[(compq4['medest'].notna()) & (compq4['actual']).notna(), 'sue'] = (compq4['actual'] - compq4['medest'])/abs(compq4['prccq'])

    lnk = conn.raw_sql(f"""
                        select * from crsp.ccmxpf_linktable
                        where lpermno in {permnos}
                        """)
    lnk = lnk[lnk['linktype'].isin(
        ['LU', 'LC', 'LD', 'LF', 'LN', 'LO', 'LS', 'LX'])]
    lnk = lnk[(2018 >= lnk['linkdt'].astype(str).str[0:4].astype(int)) |
              (lnk['linkdt'] == '.B')]
    lnk = lnk[(lnk['linkenddt'].isna()) |
              ("1940" <= lnk['linkenddt'].astype(str).str[0:4])]
    lnk = lnk.sort_values(['gvkey', 'linkdt'])

    compq5 = pd.merge(compq4,
                      lnk[['gvkey', 'linkdt', 'linkenddt', 'lpermno']],
                      on='gvkey',
                      how='inner')
    compq5 = compq5[(compq5['linkdt'] <= compq5['datadate_q']) |
                    (compq5['linkdt'] == '.B')]
    compq5 = compq5[(compq5['datadate_q'] <= compq5['linkenddt']) |
                    (compq5['linkenddt'].isna())]
    compq5 = compq5[(compq5['lpermno'] != '.') & compq5['gvkey'].notna()]
    compq5 = compq5[(compq5['lpermno'].notna()) & (compq5['rdq'].notna())]

    crsp_dsf = conn.raw_sql(f"""
                          select vol, ret, permno, date from crsp.dsf as d
                          where d.permno in {permnos}
                          """)
    crsp_dsf = crsp_dsf[crsp_dsf['permno'].isin(compq5['lpermno'])]
    crsp_dsf['date'] = pd.to_datetime(crsp_dsf['date'])
    crsp_dsf = crsp_dsf.sort_values('date')

    compq5['temp_rdq'] = np.busday_offset(
        compq5['rdq'].values.astype('datetime64[D]'), -10, roll='forward')
    crsp_dsf['avgvol'] = trailing_std(crsp_dsf, 'vol', n=21, on='permno')
    compq5 = pd.merge(compq5,
                      crsp_dsf[['date', 'permno', 'avgvol']],
                      how='left',
                      left_on=['temp_rdq', 'lpermno'],
                      right_on=['date', 'permno'])

    compq5['temp_rdq'] = np.busday_offset(
        compq5['rdq'].values.astype('datetime64[D]'), 1, roll='forward')
    crsp_dsf['aeavol'] = trailing_std(crsp_dsf, 'vol', n=3, on='permno')
    crsp_dsf['ear'] = lag(crsp_dsf, 'ret', 0, on='permno') + lag(
        crsp_dsf, 'ret', 1, on='permno') + lag(crsp_dsf, 'ret', 2, on='permno')
    compq6 = pd.merge(compq5,
                      crsp_dsf[['date', 'permno', 'aeavol', 'avgvol', 'ear']],
                      how='left',
                      left_on=['temp_rdq', 'lpermno'],
                      right_on=['date', 'permno'])
    compq6['avgvol'] = compq6['avgvol_x']
    compq6['aeavol'] = (compq6['aeavol'] - compq6['avgvol']) / compq6['avgvol']

    compq6 = compq6[[
        'fyearq', 'fqtr', 'apdedateq', 'datadate', 'pdateq', 'fdateq', 'gvkey',
        'lpermno', 'datadate_q', 'rdq', 'chtx', 'roaq', 'rsup', 'stdacc',
        'stdcf', 'sgrvol', 'rdmq', 'rdsq', 'olq', 'tanq', 'kzq', 'alaq',
        'almq', 'laq', 'bmq', 'dmq', 'amq', 'epq', 'cpq', 'emq', 'spq', 'ndpq',
        'ebpq', 'wwq', 'rs', 'droeq', 'droaq', 'noaq', 'rnaq', 'pmq', 'atoq',
        'ctoq', 'glaq', 'oleq', 'olaq', 'claq', 'blq', 'sgq', 'roavol', 'cash',
        'cinvest', 'm7', 'm8', 'prccq', 'roeq', 'aeavol', 'ear'
    ]]

    compq6 = compq6.drop_duplicates()

    return compq6, temp2