import os
from pandas import DataFrame
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.edgar import EdgarClone, Edgar
import config   # assumed local module providing datapath and credentials


def parse_items(years=years):   # default binds a module-level years iterable
    ed = EdgarClone(config.datapath['10X'], zipped=False)
    sql = SQL(**config.credentials['sql'])
    bday = BusDay(sql)
    pstat = PSTAT(sql, bday)
    to_permno = pstat.build_lookup(target='lpermno', source='cik', fillna=0)

    items = {'10-K': ['bus10K', 'mda10K']}   # optionally also '10-Q': ['mda10Q']
    logger = []
    for year in years:   # EDGAR full-text filings available from 1998 onwards
        rows = ed.open(date=year)
        for i, row in rows.iterrows():
            permno = to_permno(int(row['cik']))
            if row['form'] in items and permno:
                filing = ed[row['pathname']]
                for item in items[row['form']]:
                    extract = Edgar.extract_item(filing, item)
                    s = ed.to_path(form=row['form'], permno=permno, item=item,
                                   basename=os.path.basename(row['pathname']))
                    with open(s, 'wt') as g:
                        g.write(extract)
                    r = {'year': year, 'permno': permno, 'item': item,
                         'text_c': len(filing),
                         'item_c': len(extract),
                         'text_w': len(filing.split()),
                         'item_w': len(extract.split())}
                    logger.append(r)
                    print(", ".join([f"{k}: {v}" for k,v in r.items()]))
    return DataFrame.from_records(logger)   # summary of extracted item lengths
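
# A minimal driver sketch (the year range is hypothetical; assumes the same
# local config module as above is available):
if __name__ == "__main__":
    df = parse_items(years=range(2019, 2021))
    print(df.groupby('item')[['item_w', 'item_c']].mean())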
Example #2
import os
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import igraph  # pip3 install cairocffi
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from finds.pyR import PyR    
from finds.busday import BusDay
from finds.database import SQL, Redis
from finds.structured import CRSP, PSTAT
from finds.sectors import Sectoring, BEA
from finds.graph import igraph_draw
from settings import settings
ECHO = True
sql = SQL(**settings['sql'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)
bea = BEA(rdb, **settings['bea'], echo=ECHO)
logdir = None # os.path.join(settings['images'], 'bea')
years = np.arange(1947, 2020) 
vintages = [1997, 1963, 1947]   # when sectoring schemes were revised

# Read IOUse tables from BEA website
ioUses = dict()
for vintage in vintages:
    for year in [y for y in years if y >= vintage]:
        df = bea.read_ioUse(year, vintage=vintage)
        ioUses[(vintage, year)] = df
    print(f"{len(ioUses)} tables through sectoring vintage year {vintage}")
Example #3
import os
import io
import zipfile
import requests
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import igraph  # pip3 install cairocffi
from igraph import Graph
from itertools import chain
from finds.graph import igraph_info, igraph_community
from finds.database import SQL
from finds.busday import BusDay
from finds.structured import PSTAT
from finds.sectors import Sectoring
from settings import settings

sql = SQL(**settings['sql'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
logdir = os.path.join(settings['images'], 'tnic')  # None
tnic_scheme = 'tnic2'

# Retrieve TNIC scheme from Hoberg and Phillips website
# https://hobergphillips.tuck.dartmouth.edu/industryclass.htm
root = 'https://hobergphillips.tuck.dartmouth.edu/idata/'
source = os.path.join(root, tnic_scheme + '_data.zip')
if source.startswith('http'):
    response = requests.get(source)
    source = io.BytesIO(response.content)
with zipfile.ZipFile(source).open(tnic_scheme + "_data.txt") as f:
    tnic_data = pd.read_csv(f, sep=r'\s+')
tnic_data.head()
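
# Sketch: build an undirected similarity graph for the latest year, assuming
# the usual Hoberg-Phillips columns ['year', 'gvkey1', 'gvkey2', 'score']:
edges = tnic_data[tnic_data['year'] == tnic_data['year'].max()]
g = Graph.TupleList(edges[['gvkey1', 'gvkey2', 'score']]
                    .itertuples(index=False), directed=False, weights=True)
print(g.vcount(), g.ecount())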
Example #4
import os
import time
import pandas as pd
from pandas import DataFrame, Series
from matplotlib import colors
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from finds.database import SQL, Redis
from finds.structured import CRSP
from finds.busday import BusDay
from finds.taq import opentaq, itertaq, open_t, close_t, bin_trades, bin_quotes
from finds.display import plot_time, row_formatted
from finds.solve import weighted_average
from settings import settings
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bday = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bday, rdb=rdb)
logdir = os.path.join(settings['images'], 'micro')  # None
taqdir = os.path.join(settings['remote'], 'TAQ')
_open = pd.to_datetime('1900-01-01T9:30')    # times <= 9:30 are excluded
_close = pd.to_datetime('1900-01-01T16:00')  # times > 16:00 are excluded

# Loop through the sample TAQ data dates available from NYSE and collect info
shareclass = []
daily = []

bins = {k:{} for k in ['effective', 'realized', 'impact', 'quoted', 'volume',
                       'offersize', 'bidsize', 'ret', 'retq', 'counts']}
tic = time.time()
intervals = [(v,'s') for v in [1,2,5,15,30]] + [(v,'m') for v in [1,2,5]]
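
# Sketch: each (value, unit) pair above is a bin width; expressed in seconds
# ('s' = seconds, 'm' = minutes) the grid is [1, 2, 5, 15, 30, 60, 120, 300]:
seconds = [v * (60 if unit == 'm' else 1) for v, unit in intervals]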
Example #5
"""License: MIT"""
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from finds.database import SQL, Redis
from finds.structured import CRSP
from finds.busday import BusDay, Weekly
from settings import settings
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
logdir = os.path.join(settings['images'], 'weekrev')

# Construct weekly reversal
rebalbeg = 19730629  # CRSP stock coverage increased around this date
rebalend = 20210101  # a Friday, so the last week of 2020 can be included
wd = Weekly(sql, 'Fri', rebalbeg, rebalend)  # generate Friday-ending weekly calendar

# Retrieve weekly returns, standardize scores, and compute returns and i.c.
june_universe = 0  # tracks when a June month-end is reached, to update the universe
year = 0           # tracks the year, to retrieve prices in batch for screening
res = DataFrame()
tic = time.time()
for rebaldate in wd.date_range(rebalbeg, rebalend)[:-1]:
    d = bd.june_universe(rebaldate)
import os
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from tqdm import tqdm
from collections import Counter
from finds.database import SQL, MongoDB, Redis
from finds.structured import CRSP, PSTAT
from finds.busday import BusDay, int2date
from finds.unstructured import Unstructured
from finds.edgar import EdgarClone, Edgar
from finds.graph import igraph_info, igraph_community
from finds.sectors import Sectoring
from settings import settings
ECHO = True
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
mongodb = MongoDB(**settings['mongodb'])
wordlists = Unstructured(mongodb, 'WordLists')
ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO)
imgdir = os.path.join(settings['images'], 'edgar')
item, form = 'bus10K', '10-K'

# Retrieve business description (10-K item 1) for 'aapl' from Edgar
from nltk.tokenize import RegexpTokenizer
ticker = 'AAPL'
cik = Edgar.fetch_tickers()[ticker.lower()]                 # lookup aapl's cik
stop_words = [w for c in ['genericlong', 'DatesandNumbers']  # LM stop word lists
              for w in wordlists['lm', c.lower()]]
top_words = {}
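
# Sketch: fill top_words by tokenizing a filing's text and dropping stop
# words; `filing` here is a hypothetical string holding the item 1 text:
tokenizer = RegexpTokenizer(r"[a-z]{3,}")
stops = set(stop_words)
counts = Counter(w for w in tokenizer.tokenize(filing.lower()) if w not in stops)
top_words[ticker] = counts.most_common(20)

# (below: methods excerpted from the Weekly trading-calendar class in finds.busday)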
    def endwk(self, date, weeks=0):
        """Return ending business week date/s"""
        return _to_values(self.weeks['end'].iloc[self.numwk(date) + weeks])

    def ismonthend(self, date):
        """If date/s in last complete week in any month"""
        return _to_values(self.weeks['ismonthend'].iloc[self.numwk(date)])


if False:  # create custom busday trading dates
    from settings import settings
    from finds.database import SQL
    from finds.busday import BusDay
    sql = SQL(**settings['sql'], echo=True)
    busday = BusDay(sql, create=True)  # set create flag as True

if False:  # some unit tests
    from settings import settings
    from finds.database import SQL
    from finds.busday import Weekly
    sql = SQL(**settings['sql'], echo=True)
    wd = Weekly(sql, day=3, end=20201231)  # derive weekly trading calendar

    print(wd.numwk(20201230))
    print(wd.numwk(20210130))
    print(wd.numwk(20201231))
    print(wd.endwk([20201209, 20201219]))
    print(wd.endwk(20201209))
    print(wd.endmo([20201209, 20201219]))
    print(wd.endmo(20201209))
from pandas import DataFrame
import matplotlib.pyplot as plt
import os, time
from datetime import datetime
from settings import settings
from finds.database import SQL, Redis
from finds.structured import PSTAT, CRSP, IBES, Benchmarks, Signals, as_signal
from finds.busday import BusDay, Weekly
from finds.backtesting import BackTest
from finds.solve import fractiles
LAST_DATE = settings['crsp_date']

sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
ibes = IBES(sql, bd)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
outdir = os.path.join(settings['images'], 'factors')

# signals to flip signs when forming spread portfolios
flips = {'mom1m':-1, 'mom36m':-1, 'pricedelay':-1, 'absacc':-1, 'acc':-1,
         'agr':-1, 'chcsho':-1, 'egr':-1, 'mve_ia':-1, 'pctacc':-1, 'aeavol':-1,
         'disp':-1, 'stdacc':-1, 'stdcf':-1, 'secured':-1, 'maxret':-1, 'ill':-1,
         'zerotrade':-1, 'cashpr':-1, 'chinv':-1, 'invest':-1, 'cinvest':-1}
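
# Sketch: apply the sign convention when scoring a characteristic, so that a
# higher score always points to the long side of the spread portfolio:
def signed(values, name):
    """Flip sign for signals listed in flips; leave others unchanged"""
    return values * flips.get(name, 1)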

## Helpers to lag characteristics and roll returns
print("\n".join(f"[{i}] {d}" for i, d in enumerate(datasets)))
for name, item, suffix in datasets:
    df = fetch_FamaFrench(name=name, item=item, suffix=suffix,
                          index_formatter=bd.offset)
    for col in df.columns:
        bench.load_series(df[col], name=name, item=item)
print(DataFrame(**sql.run('select * from ' + bench['ident'].key)))
"""
"""Weekly: price update and clear affected redis store
run yahoo
./redis-cli --scan --pattern '*CRSP_2021*' | xargs ./redis-cli del
"""

# Estimate daily factors
LAST_DATE = 20210618  # last date in daily prices table
bd = BusDay(sql)
bench = Benchmarks(sql, bd)
crsp = CRSP(sql, bd, rdb)
pstat = PSTAT(sql, bd)

## Rebalance and return dates, and initialize classes for calculations
rebalbeg = bd.offset(20190630)
rebals = [bd.offset(d) for d in [20200630]]
stocks = chunk_stocks(crsp, rebalbeg, LAST_DATE)
perf = DailyPerformance(stocks)

# Compute HML factor
label = 'hml'
lag = 6  # number of months to lag fundamental data
df = pstat.get_linked(  # retrieve required fields from compustat
    dataset='annual',
        df = pd.read_csv(os.path.join(pathname, 'prices.csv.gz'), sep='|')
        new = set(np.unique(df['ticker'])).difference(
            set(np.unique(prices['ticker'])))
        df = df[df['ticker'].isin(new)]
        prices = pd.concat([prices, df], sort=False)  # .append removed in pandas 2.x
        print(pathname, 'added prices', new)

        df = pd.read_csv(os.path.join(pathname, 'dividends.csv.gz'), sep='|')
        new = set(np.unique(df['ticker'])).difference(
            set(np.unique(dividends['ticker'])))
        df = df[df['ticker'].isin(new)]
        dividends = pd.concat([dividends, df], sort=False)
        print(pathname, 'added dividends', new)

    sql = SQL(**config.credentials['sql'], echo=config.ECHO)
    bd = BusDay(sql)
    crsp = CRSP(sql, bd, rdb=None)
    date = bd.offset(crsp_date)

    # get price and shrout as of last date
    price = crsp.get_section('daily', ['prc', 'shrout'],
                             'date',
                             date,
                             start=None)

    # get tickers to lookup permno
    tickers = crsp.get_section('names', ['tsymbol', 'date'],
                               'date',
                               date,
                               start=0).reindex(price.index)
    tickers = tickers.sort_values(['tsymbol', 'date'])\
import os
import numpy as np
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import pandas_datareader as pdr
from pandas_datareader.data import DataReader
from pandas_datareader.famafrench import FamaFrenchReader
from finds.database import SQL, Redis
from finds.structured import CRSP, Signals, Benchmarks
from finds.busday import BusDay
from finds.backtesting import RiskPremium
from finds.solve import winsorized
from settings import settings
LAST_DATE = settings['crsp_date']
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
crsp = CRSP(sql, bd, rdb)
bench = Benchmarks(sql, bd)
signals = Signals(user)
logdir = os.path.join(settings['images'], 'fm')


def least_squares(data=None, y=['y'], x=['x'], stdres=False):
    """Helper to compute least square coefs, supports groupby().apply"""
    X = data[x].to_numpy()
    Y = data[y].to_numpy()
    X = np.hstack([np.ones((X.shape[0], 1)), X])
    x = ['Intercept'] + x
    b = np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, Y)).T
    if stdres:
        b = np.hstack([b, np.std(Y - (X @ b.T), axis=0).reshape(-1, 1)])
        x = x + ['stdres']
    return (DataFrame(b, columns=x, index=y) if len(y) > 1
            else Series(b[0], index=x))   # coefficients labeled by regressor
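
# Sketch: Fama-MacBeth style usage via groupby().apply with synthetic data;
# the column names 'date', 'ret', and 'beta' here are hypothetical:
demo = DataFrame({'date': [1, 1, 1, 2, 2, 2],
                  'beta': [0.5, 1.0, 1.5, 0.5, 1.0, 1.5],
                  'ret': [0.01, 0.02, 0.03, 0.00, 0.01, 0.02]})
coefs = demo.groupby('date').apply(least_squares, y=['ret'], x=['beta'])
print(coefs.mean())   # average intercept and slope across cross-sections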
Example #12
import os
import sklearn.feature_extraction
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
from finds.database import SQL, MongoDB, Redis
from finds.structured import CRSP, Signals
from finds.busday import BusDay
from finds.unstructured import Unstructured
from finds.readers import fetch_lm
from finds.solve import weighted_average, fractiles
from finds.edgar import EdgarClone
from settings import settings
ECHO = False
sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
bd = BusDay(sql)
rdb = Redis(**settings['redis'])
crsp = CRSP(sql, bd, rdb)
ed = EdgarClone(settings['10X'], zipped=True, echo=ECHO)
signals = Signals(user)
mongodb = MongoDB(**settings['mongodb'])
wordlists = Unstructured(mongodb, 'WordLists')
imgdir = os.path.join(settings['images'], 'edgar')
item, form = 'mda10K', '10-K'

# Load Loughran and MacDonald sentiment words and stopwords
sentiments = {k: wordlists['lm', k] for k in ['positive', 'negative']}

# Pre-process with sklearn methods
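
# Sketch: one sklearn pre-processing route is to count only the LM sentiment
# vocabulary per document; `docs` is a hypothetical list of raw filing texts:
from sklearn.feature_extraction.text import CountVectorizer
vocab = sorted({w.lower() for k in sentiments for w in sentiments[k]})
vectorizer = CountVectorizer(vocabulary=vocab)
counts = vectorizer.fit_transform(docs)   # sparse documents x words matrix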
Example #13
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import time
from settings import settings
import os
from finds.database import SQL, Redis
from finds.busday import BusDay
from finds.structured import PSTAT, CRSP, Benchmarks, Signals
from finds.backtesting import BackTest
from finds.solve import fractiles
LAST_DATE = settings['crsp_date']

sql = SQL(**settings['sql'])
user = SQL(**settings['user'])
rdb = Redis(**settings['redis'])
bd = BusDay(sql)
pstat = PSTAT(sql, bd)
crsp = CRSP(sql, bd, rdb)
bench = Benchmarks(sql, bd)
signals = Signals(user)
backtest = BackTest(user, bench, 'RF', LAST_DATE)
logdir = os.path.join(settings['images'], 'ff')  # None

# Load items from Compustat Annual
# Construct HML as shareholders' equity plus investment tax credits, less
#   preferred stock, all divided by December market cap.
# Require a 6-month reporting lag and at least two years of history in Compustat
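
# Sketch of that value signal with hypothetical Compustat/CRSP field names
# (seq, txditc, pstk as pandas Series; dec_me = December market cap):
def hml_signal(seq, txditc, pstk, dec_me):
    """Equity + investment tax credits - preferred, over December market cap"""
    return (seq + txditc.fillna(0) - pstk.fillna(0)) / dec_me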

label = 'hml'
lag = 6  # number of months to lag fundamental data
df = pstat.get_linked(  # retrieve required fields from compustat