shareclass = []
daily = []

bins = {k:{} for k in ['effective', 'realized', 'impact', 'quoted', 'volume',
                       'offersize', 'bidsize', 'ret', 'retq', 'counts']}
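# accumulators keyed by liquidity measure (spreads and price impact, volume,
# quoted depth, returns, trade counts), presumably filled per stock and per
# bin interval in the symbol loop below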
tic = time.time()
intervals = [(v,'s') for v in [1,2,5,15,30]] + [(v,'m') for v in [1,2,5]]
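# intraday bin widths: 1, 2, 5, 15, 30 seconds and 1, 2, 5 minutes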
dates = [20191007, 20191008, 20180305, 20180306]
for d, date in enumerate(dates):
    master, trades, quotes = opentaq(date, taqdir)

    # screen on CRSP universe, and drop duplicate share classes (same permco)
    univ = crsp.get_universe(date)\
               .join(crsp.get_section(dataset='names',
                                      fields=['ncusip', 'permco', 'exchcd'],
                                      date_field='date',
                                      date=date,
                                      start=0), how='inner')\
               .sort_values(['permco', 'ncusip'])
    dups = master['CUSIP'].str.slice(0, 8).isin(
        univ.loc[univ.duplicated(['permco'], keep=False), 'ncusip'])
    shareclass.extend(master[dups].to_dict(orient='index').values())
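    # keep only the largest-cap share class within each duplicated permco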
    univ = univ.sort_values(['permco','cap'], na_position='first')\
               .drop_duplicates(['permco'], keep='last')\
               .reset_index().set_index('ncusip', drop=False)

    # Iterate by symbol over Daily TAQ trades, NBBO and master files
    for ct, cq, m in itertaq(trades, quotes, master, cusips=univ['ncusip'],
                             open_t=_open, close_t=None):
        h = {'date':date}
        h.update(univ.loc[m['CUSIP'][:8], ['permno','decile','exchcd','siccd']])
Example n. 2
node_color.update({k: 'cyan' for k in top_color})
pos = igraph_draw(
    g, num=1, center_name=center_name,
    node_color=node_color, node_size=node_size, edge_color='r', k=2,
    pos=None, font_size=10, figsize=(11,12),
    labels={k:v for k,v in zip(g.vs['name'], g.vs['bea'])},
    title=f"Production Flows {list(total['year'].unique())}")
plt.show()
   
# Construct monthly BEA industry returns for the same period of years
codes = Sectoring(sql, f"bea{vintage}", fillna='')
naics = pstat.build_lookup('lpermno', 'naics', fillna=0)
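# naics() looks up each permno's NAICS code as of a date, and codes[] maps
# NAICS codes to BEA industries for the given vintage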
caps, counts, rets = [], [], []
for year in years:
    date = bd.endyr(year - 1)
    univ = crsp.get_universe(date)
    univ['bea'] = codes[naics(univ.index, date)]
    univ = univ[univ['bea'].ne('')]
    grouped = univ.groupby('bea')
    caps.append(grouped['cap'].sum().rename(year))
    counts.append(grouped['cap'].count().rename(year))
        
    months = bd.date_range(date, bd.endyr(year), 'endmo')
    for rebaldate, end in zip(months[:-1], months[1:]):
        r = pd.concat([crsp.get_ret(bd.begmo(end), end),
                       crsp.get_cap(rebaldate, use_permco=False),
                       univ['bea']], axis=1, join='inner').dropna()
        grp = r.groupby('bea')   # industry ret is sum of weighted rets
        r['wtdret'] = r['ret'].mul(r['cap'].div(grp['cap'].transform('sum')))
        rets.append(grp['wtdret'].sum(min_count=1).rename(end))
        print(end, len(r), r['wtdret'].sum() / len(grp))
Example n. 3
# Construct weekly reversal
rebalbeg = 19730629  # CRSP stock coverage increased from around this date
rebalend = 20210101  # a Friday, so the last week of 2020 can be included
wd = Weekly(sql, 'Fri', rebalbeg, rebalend)  # generate Friday-ending weekly calendar

# Retrieve weekly returns, standardize scores, and compute returns and information coefficients
june_universe = 0  # tracks the last June date at which the universe was refreshed
year = 0  # tracks the current year, so prices are retrieved in annual batches for screening
res = DataFrame()
tic = time.time()
for rebaldate in wd.date_range(rebalbeg, rebalend)[:-1]:
    d = bd.june_universe(rebaldate)
    if d != june_universe:
        june_universe = d  # update universe every June
        univ = crsp.get_universe(june_universe)  # usual CRSP universe screen
        univ = univ[univ['decile'] < 10]  # drop smallest-decile stocks
    start = wd.begwk(rebaldate)  # starting date of rebalance week
    beg = bd.offset(rebaldate, 1)  # beginning date of holding week
    end = wd.endwk(beg)  # ending date of holding week

    prcdate = bd.offset(start, -1)  # require price available at start of week
    prcyear = (prcdate // 10000) * 10000
    if prcyear != year:  # retrieve new batch of prices each new year
        year = prcyear
        prc = crsp.get_range('daily',
                             'prc',
                             'date',
                             year + 101,
                             year + 1231,
                             use_cache=True)
    ax.set_title(f"Top Bigrams: {ticker} {year} 10-K Business Description")
    #plt.savefig(os.path.join(imgdir, f"{ticker}{year}.jpg"))
plt.show()
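The weekly reversal example above is cut off before the scoring step its comment mentions ("standardize scores"); a minimal sketch of that step, assuming only a pandas Series of prior-week returns indexed by permno, might look like:

def reversal_score(prior_week_ret):
    """Hypothetical helper: cross-sectionally standardized negative prior-week return."""
    score = -prior_week_ret                        # reversal bets against last week's winners
    return (score - score.mean()) / score.std()    # standardize across the cross-section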


# Community Detection with Business Descriptions
# Load spaCy language model and vocab
lang = 'en_core_web_lg'
nlp = spacy.load(lang)
n_vocab, vocab_dim = nlp.vocab.vectors.shape
print('Language:', lang, '   vocab:', n_vocab, '   dim:', vocab_dim)
stopwords = {'company', 'companys', 'companies', 'product', 'products',
             'service', 'services', 'business', 'description', 'year', 'years'}

# Load stock universes
univs = {y: crsp.get_universe(bd.endmo(int(f"{y-1}1231"))).assign(year=y)
         for y in range(1993, 2021)}


## Extract lemmatized nouns and named entities from bus10K documents
A = ed.open(form=form, item=item)  # open bus10K archive
A['year'] = [d.year - (d.month < 4) for d in int2date(A['date'])]  # fiscal year: Jan-Mar filings assigned to the prior year

tic = time.time()
for year in [2020, 2019, 2018, 2017]:
    docs = dict()
    ners = dict()
    for i, permno in tqdm(enumerate(sorted(univs[year].index))):
        doc = A[A['permno'].eq(permno) & A['year'].eq(year)].sort_values('date')
        if len(doc):
            sent = ed[doc.iloc[0]['pathname']].encode('ascii', 'ignore').lower()
        f.write(' <br>')
        f.write('<p>\n')
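The nouns-and-entities extraction named in the heading above is cut off in this excerpt; a minimal sketch, assuming the filing text has been decoded to a string and using only the spaCy pipeline and stopwords loaded earlier, could be:

def extract_nouns_ents(text, nlp, stopwords=set()):
    """Hypothetical helper: lemmatized nouns and named-entity strings from one document."""
    doc = nlp(text)
    nouns = [t.lemma_.lower() for t in doc
             if t.pos_ in ('NOUN', 'PROPN')           # keep nouns and proper nouns
             and t.is_alpha and not t.is_stop
             and t.lemma_.lower() not in stopwords]
    ents = [e.text.lower() for e in doc.ents]         # named entities as plain strings
    return nouns, ents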

# Momentum and divyld from CRSP monthly
if 'monthly' in testable:
    if regenerate:
        beg, end = 19251231, LAST_DATE
        intervals = {'mom12m': (2,12), 'mom36m': (13,36),
                     'mom6m': (2,6), 'mom1m': (1,1)}
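        # (first, last) months of the lookback window before the holding month:
        # e.g. mom12m = (2, 12) cumulates returns over months t-12 to t-2,
        # skipping the most recent month, and mom1m = (1, 1) is just the last month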
        for label, past in intervals.items():
            out = DataFrame()
            for rebaldate in bd.date_range(bd.endmo(beg, past[1]), end, 'endmo'):
                start = bd.endmo(rebaldate, -past[1])
                beg1 = bd.offset(start, 1)
                end1 = bd.endmo(rebaldate, 1-past[0])
                df = crsp.get_universe(end1)
                df['start'] = crsp.get_section(dataset='monthly', fields=['ret'],
                                               date_field='date', date=start)\
                                  .reindex(df.index)
                df[label] = crsp.get_ret(beg1, end1).reindex(df.index)
                df['permno'] = df.index
                df['rebaldate'] = rebaldate
                df = df.dropna(subset=['start'])
                out = pd.concat([out, df[['rebaldate', 'permno', label]]],
                                ignore_index=True)    # append rows
            n = signals.write(out, label, overwrite=True)

        beg, end = 19270101, LAST_DATE
        columns = ['chmom', 'divyld', 'indmom']
        out = DataFrame()
        for rebaldate in bd.date_range(beg, end, 'endmo'):
                            LAST_DATE,
                            window=12,
                            months=[6],
                            rebals=rebals)['holdings']

# Compute MOM momentum factor
label = 'mom'
past = (2, 12)
df = []  # collect each month's momentum signal values
rebalend = bd.endmo(LAST_DATE, -1)
for rebaldate in bd.date_range(rebalbeg, rebalend, 'endmo'):
    beg = bd.endmo(rebaldate, -past[1])  # require price at this date
    start = bd.offset(beg, 1)  # start date, inclusive, of signal
    end = bd.endmo(rebaldate, 1 - past[0])  # end date of signal
    p = [
        crsp.get_universe(rebaldate),  # retrieve prices and construct signal
        crsp.get_ret(start, end)['ret'].rename(label),
        crsp.get_section('monthly', ['prc'], 'date', beg)['prc'].rename('beg'),
        crsp.get_section('monthly', ['prc'], 'date', end)['prc'].rename('end')
    ]
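    # the beg/end prices are included only so that dropna() below requires a
    # valid price at both endpoints of the signal window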
    q = pd.concat(p, axis=1, join='inner').reset_index().dropna()
    q['rebaldate'] = rebaldate
    df.append(q[['permno', 'rebaldate', label]])
    print(rebaldate, len(df), len(q))
df = pd.concat(df)
signals = chunk_signal(df)
holdings[label] = famafrench_sorts(crsp,
                                   label,
                                   signals,
                                   rebalbeg,
                                   rebalend,
Example n. 7
wordlists = Unstructured(mongodb, 'WordLists')
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}
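For reference (not part of the original snippet), a common way to turn these Loughran-McDonald word lists into a per-document score, given a list of lowercase tokens, is:

def lm_sentiment(tokens, sentiments):
    """Hypothetical helper: net LM sentiment = (#positive - #negative) / #tokens."""
    pos = set(sentiments['positive'])
    neg = set(sentiments['negative'])
    npos = sum(tok in pos for tok in tokens)
    nneg = sum(tok in neg for tok in tokens)
    return (npos - nneg) / max(len(tokens), 1)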

# Pre-process with sklearn methods
tf_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    strip_accents='unicode',
    lowercase=True,
    #stop_words=stop_words,
    # tokenizer=CustomTokenizer(),
    token_pattern=r"\b[^\d\W][^\d\W][^\d\W]+\b")
tokenize = tf_vectorizer.build_tokenizer()
analyze = tf_vectorizer.build_analyzer()
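As an illustration (the sample sentence is made up), the tokenizer keeps only words of three or more non-digit characters, while the analyzer additionally applies the vectorizer's lowercasing and accent stripping:

sample = "Revenue from cloud services grew 12% during fiscal 2019."
print(tokenize(sample))  # ['Revenue', 'from', 'cloud', 'services', 'grew', 'during', 'fiscal']
print(analyze(sample))   # same tokens, lowercased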

# Construct sentiment features for all years over the usual universe
univs = {
    yr + 1: crsp.get_universe(bd.endmo(yr * 10000 + 1231)).assign(year=yr + 1)
    for yr in range(1992, 2020)
}
results = []
files = ed.open(form=form, item=item)  # open mda10K archive
tic = time.time()
permnos = files['permno'].unique().astype(int)
for i, permno in tqdm(enumerate(permnos)):  # Loop over all permnos

    # retrieve all valid MD&A filings for this permno, keyed by year
    mdas = {}
    dates = {}
    for _, f in files[files['permno'].eq(permno)].iterrows():
        year = int(f['date']) // 10000
        if ((f['date'] // 100) % 100) <= 3:  # if filing date <= Mar
            year = year - 1  # then assign to previous year