def _get_stats_info(cls, self):
    """Build the per-symbol company stats table from server data.

    Takes the rows of 'stats_combined' for its most recent date, joins them
    to the symbol reference on company name, and adds a 'days_until_ER'
    column for every symbol that has a next earnings date.

    Returns:
        DataFrame indexed by symbol. Rows with a known 'nextEarningsDate'
        come first and carry 'days_until_ER'; the remaining rows follow.
    """
    keep_cols = [
        'symbol', 'marketcap', 'beta', 'peRatio', 'nextEarningsDate',
        'day30ChangePercent', 'month6ChangePercent', 'year1ChangePercent'
    ]

    # Only the most recent stats snapshot is used
    stats_raw = serverAPI('stats_combined').df
    is_latest = stats_raw['date'] == stats_raw['date'].max()
    latest_stats = stats_raw[is_latest].copy()

    # The stats are joined to symbols on company name
    sym_ref = serverAPI('all_symbols').df
    sym_names = sym_ref[['symbol', 'name']].rename(
        columns={'name': 'companyName'})
    merged = pd.merge(sym_names, latest_stats, on='companyName')

    stats_by_sym = merged[keep_cols].set_index('symbol')
    stats_by_sym['nextEarningsDate'] = pd.to_datetime(
        stats_by_sym['nextEarningsDate'])
    # Temporary reference-date column, dropped again before returning
    stats_by_sym['dt'] = pd.to_datetime(getDate.query('iex_eod'))

    # Symbols with a next earnings date (ER) get a business-day countdown
    with_er = stats_by_sym.dropna(subset=['nextEarningsDate']).copy()
    with_er['days_until_ER'] = getDate.get_bus_day_diff(
        with_er, 'dt', 'nextEarningsDate')

    # Symbols whose index label never appears among the ER rows
    missing_idx = stats_by_sym.index.difference(with_er.index)
    without_er = stats_by_sym[stats_by_sym.index.isin(missing_idx)]

    return pd.concat([with_er, without_er]).drop(columns='dt')
def company_10qs_ref():
    """Write the symbol <-> 10-Q filing reference table to parquet.

    Filters the SEC master index to 10-Q related forms, joins it to the
    symbol reference on 'cik', keeps only exact '10-Q' rows (one per
    date/cik) and writes the result to ref_data/symbol_10q_ref.parquet.
    Returns nothing.
    """
    master_idx = serverAPI('sec_master_all').df.copy()
    master_idx['date'] = pd.to_datetime(master_idx['date'], unit='ms')

    # Every distinct form type whose label mentions 10-Q
    form_types = master_idx['Form Type'].dropna().unique()
    tenq_forms = [form for form in form_types if '10-Q' in str(form)]
    filings = master_idx[master_idx['Form Type'].isin(tenq_forms)].copy()
    # Keep the SEC's company name apart from the symbol table's 'name'
    filings = filings.rename(columns={'name': 'sec_name'})

    all_syms = serverAPI('all_symbols').df
    all_syms.drop(columns=['date'], inplace=True)
    if all_syms['cik'].str.len().min() < 10:
        print('Not all CIKs are 10 digits long')

    joined = pd.merge(filings, all_syms, on=['cik'])
    only_10q = joined[joined['Form Type'] == '10-Q'].copy()
    only_10q = only_10q.drop_duplicates(subset=['date', 'cik'])
    only_10q = only_10q[['cik', 'symbol', 'date', 'name', 'Form Type']]

    out_path = Path(baseDir().path, 'ref_data', 'symbol_10q_ref.parquet')
    write_to_parquet(only_10q, out_path)
def get_all_symbol_ref():
    """Get all common and OTC symbols.

    In production (environment variable ``env == "production"``) the two
    symbol tables are read from local parquet files; otherwise they are
    fetched through ``serverAPI``.

    Returns:
        A single DataFrame with the common and OTC symbols stacked and
        re-indexed from 0, or None when the server API module cannot be
        imported in the non-production branch.
    """
    load_dotenv()
    env = os.environ.get("env")
    df_all = None

    if env == "production":
        bpath = Path(baseDir().path, 'tickers', 'symbol_list')
        com_df = pd.read_parquet(bpath.joinpath('all_symbols.parquet'))
        otc_df = pd.read_parquet(bpath.joinpath('otc_syms.parquet'))

        otc_df.dropna(subset=['cik'], inplace=True)
        # Normalise CIKs to zero-padded 10-character strings. The converted
        # Series must keep otc_df's index: resetting it before assignment
        # would misalign values with the rows that survived dropna.
        otc_df['cik'] = (otc_df['cik'].astype('int64').astype('str')
                         .str.zfill(10).astype('category'))

        df_all = pd.concat([com_df, otc_df]).reset_index(drop=True)
    else:
        try:
            from api import serverAPI
            com_syms = serverAPI('all_symbols').df
            otc_syms = serverAPI('otc_syms').df
            df_all = pd.concat([com_syms, otc_syms]).reset_index(drop=True)
        except ModuleNotFoundError:
            help_print_arg('Tried import server api in get_all_symbols func')

    return df_all
def _get_all_symbols(cls, self):
    """Return the 'type' of every symbol, indexed by symbol."""
    sym_ref = serverAPI('all_symbols').df
    return sym_ref[['symbol', 'type']].set_index('symbol')
def remove_funds_spacs(query='symbol not in @exclude_list'):
    """Remove funds and other non-common stock symbols.

    Builds a list of symbols to exclude from the 'all_symbols' reference
    table, based on substrings of the lower-cased company name and on the
    symbol 'type', then evaluates ``query`` against the table.

    Args:
        query: ``DataFrame.query`` expression. It may reference
            ``@exclude_list``, a Series of the excluded symbols (a symbol
            can appear more than once).

    Returns:
        The rows selected by ``query``, re-indexed from 0.
    """
    all_syms = serverAPI('all_symbols').df
    lower_names = all_syms['name'].str.lower()

    def name_has(term):
        # na=False: a missing name counts as "no match". Without it the
        # mask contains NaN and boolean indexing raises.
        return lower_names.str.contains(term, na=False)

    # Exclude dict
    edict = {}
    edict['spacs'] = all_syms[name_has('acquisition')]
    edict['funds'] = all_syms[name_has('fund')]
    edict['etfs'] = all_syms[all_syms['type'] == 'et']
    other_list = ['ps', 'rt', 'struct', 'ut', 'wt']
    edict['others'] = all_syms[all_syms['type'].isin(other_list)]
    edict['inv_corp'] = all_syms[name_has('investment corp')]
    edict['capital'] = all_syms[name_has('capital')]
    edict['trusts'] = all_syms[name_has('trust')]

    # Exclude certain words (space-padded entries only match whole words
    # that are surrounded by spaces inside the name)
    words_to_exclude = ([
        'reit', 'municipal', 'income', 'merger', 'investors', ' spac ', ' i ',
        ' ii ', ' iv ', ' v ', ' vi '
    ])
    for word in words_to_exclude:
        edict[word] = all_syms[name_has(word)]

    # Looks unused, but DataFrame.query resolves '@exclude_list' from this
    # function's local variables, so the name must not change.
    exclude_list = pd.concat([val['symbol'] for val in edict.values()])  # noqa: F841

    result = all_syms.query(query).reset_index(drop=True)
    return result
def _call_stock_meta_info(cls, self):
    """Return sector/industry per symbol from the server's company meta.

    Only the first row of each symbol is kept; the result is indexed by
    symbol.
    """
    company_meta = serverAPI('company_meta').df
    sector_info = company_meta[['symbol', 'sector', 'industry']]
    one_per_symbol = sector_info.drop_duplicates(subset=['symbol'])
    return one_per_symbol.reset_index(drop=True).set_index('symbol')
def _get_df_from_api(cls, self):
    """Return a trimmed copy of the server's analyst recommendations.

    Only the columns listed below are kept and the index is reset.
    """
    wanted_cols = [
        'ticker', 'date', 'time', 'action_company', 'action_pt', 'analyst',
        'analyst_name', 'pt_prior', 'pt_current', 'rating_current',
        'rating_prior', 'updated'
    ]
    all_recs = serverAPI('analyst_recs_all').df
    trimmed = all_recs[wanted_cols]
    return trimmed.reset_index(drop=True).copy()
def get_cik(sym):
    """Look up the SEC CIK number for a symbol.

    Args:
        sym: Symbol to look up in the 'all_symbols' reference table.

    Returns:
        The CIK of the first matching row, as an unsigned 32-bit integer.

    Raises:
        IndexError: If no row with a non-null CIK matches ``sym``.
    """
    sym_ref = serverAPI('all_symbols').df
    # Rows without a CIK cannot be cast to an integer type
    sym_ref.dropna(axis=0, subset=['cik'], inplace=True)
    sym_ref['cik'] = sym_ref['cik'].astype(np.uint32)

    matches = sym_ref.loc[sym_ref['symbol'] == sym, 'cik']
    return matches.head(1).astype('uint32').iloc[0]
def fpath_not_converted():
    """List the server file paths still marked as not converted.

    Returns:
        The entries of the server's 'fpath_list' whose path contains
        '_all_combined_not_converted.gz', in their original order.
    """
    marker = '_all_combined_not_converted.gz'
    listing = serverAPI('fpath_list').df
    listing.rename(columns={0: 'fpath'}, inplace=True)
    return [fpath for fpath in tqdm(listing['fpath']) if marker in fpath]
def get_symbol_stats():
    """Return the latest stats for symbols that are not funds or SPACs.

    The most recent 'stats_combined' snapshot is joined to the symbol
    reference by company name, then to company meta rows (industry, sector,
    tags) whose company name contains neither 'Acquisition' nor 'Fund'.
    """
    try:
        from scripts.dev.api import serverAPI
    except ModuleNotFoundError:
        from api import serverAPI

    # Latest stats snapshot, ignoring rows whose date is the string 'nan'
    raw_stats = serverAPI('stats_combined').df
    dated_stats = raw_stats[raw_stats['date'] != 'nan']
    newest = dated_stats['date'].max()
    latest_stats = (dated_stats[dated_stats['date'] == newest]
                    .reset_index(drop=True).copy())

    # All non-otc symbols reference data
    sym_ref = serverAPI('all_symbols').df
    sym_ref.drop(columns=['date'], inplace=True)
    stats_with_syms = pd.merge(latest_stats, sym_ref,
                               left_on=['companyName'], right_on=['name'],
                               how='inner')

    # Company meta information, flagged for SPAC / fund names
    meta_df = serverAPI('company_meta').df
    company_names = meta_df['companyName']
    meta_df['spac'] = np.where(company_names.str.contains('Acquisition'), 1, 0)
    meta_df['fund'] = np.where(company_names.str.contains('Fund'), 1, 0)

    keep_mask = (meta_df['spac'] != 1) & (meta_df['fund'] != 1)
    operating_cos = meta_df[keep_mask].drop_duplicates(subset=['symbol'])
    operating_cos = operating_cos[['symbol', 'industry', 'sector', 'tags']]

    return pd.merge(operating_cos, stats_with_syms, on=['symbol'])
def get_company_meta_data():
    """Fetch company meta data per symbol and save each to parquet.

    Covers every symbol of type 'cs' or 'ad'. Each result is written to
    company_stats/meta/<first letter, lower-cased>/_<symbol>.parquet.
    A failure for one symbol is printed and the loop moves on.
    """
    sym_ref = serverAPI('all_symbols').df
    wanted_types = sym_ref['type'].isin(['cs', 'ad'])
    symbols = sym_ref[wanted_types]['symbol'].unique().tolist()

    meta_dir = Path(baseDir().path, 'company_stats', 'meta')

    for sym in tqdm(symbols):
        try:
            company = urlData(f"/stock/{sym}/company")
            target = meta_dir.joinpath(f"{sym.lower()[0]}/_{sym}.parquet")
            write_to_parquet(company.df, target)
        except Exception as e:
            # Best effort: report and continue with the next symbol
            print(f"Company meta stats error: {type(e)} {str(e)}")
def retrieve_df(cls, self, latest):
    """Retrieve the latest SEC RSS dataframe and store it on ``self``.

    Args:
        latest: When truthy, keep only rows published within the last
            15 minutes of wall-clock time.

    Side effects:
        Sets ``self.sec_df`` to a copy of the (optionally filtered) frame.
        Returns nothing.
    """
    sec_df = serverAPI('sec_rss_latest').df
    # Rename columns, drop duplicates, and reset index
    sec_df = (sec_df.rename(columns={'CIK': 'cik', 'description': 'form'})
              .drop_duplicates(subset=['cik', 'pubDate'])
              .reset_index(drop=True))
    sec_df['dt'] = pd.to_datetime(sec_df['pubDate'])

    if latest:
        # Read the clock once so "today" and the cutoff cannot disagree
        now = datetime.now()
        cutoff = now - timedelta(minutes=15)
        today = now.date()

        pub_day = sec_df['dt'].dt.date
        pub_time = sec_df['dt'].dt.time

        if cutoff.date() == today:
            recent = (pub_day == today) & (pub_time > cutoff.time())
        else:
            # The window straddles midnight: a time-of-day comparison alone
            # would wrap to ~23:45 and reject everything published today.
            recent = ((pub_day == today)
                      | ((pub_day == cutoff.date())
                         & (pub_time > cutoff.time())))

        sec_df = sec_df[recent].copy()

    # Store under class variable
    self.sec_df = sec_df.copy()
def get_collect_prep_sec_data(df=False):
    """Count 10-Q / 10-K / 8-K related filings per symbol and date.

    Uses SEC master index rows dated after 2021-01-01, flags each row by
    form family, maps CIKs to tickers with the local SEC symbol list
    (created via ``sec_sym_list`` if the file is missing) and sums the
    flags per (symbol, date).

    Args:
        df: Optional DataFrame with 'symbol' and 'date' columns. When
            given, the counts are left-joined onto it.

    Returns:
        The per-symbol/date counts, or ``df`` with those counts merged in.
    """
    form_groups = {
        'tenq': ['10-Q', 'NT 10-Q', '10-Q/A'],
        'tenk': ['10-K', '10-K/A', 'NT 10-K'],
        'eightk': ['8-K', '8-K/A'],
    }

    master = serverAPI('sec_master_all').df.copy()
    master['date'] = pd.to_datetime(master['date'], unit='ms')
    master['cik'] = master['cik'].astype('category')
    start = date(2021, 1, 1)
    master = master[master['date'].dt.date > start].copy()

    # One 0/1 indicator column per form family
    for flag_col, forms in form_groups.items():
        master[flag_col] = np.where(master['Form Type'].isin(forms), 1, 0)

    # Get symbol reference data from sec
    syms_path = Path(baseDir().path, 'tickers', 'symbol_list',
                     'sec_syms.parquet')
    if not syms_path.exists():
        sec_sym_list()
        time.sleep(2)
    sec_syms = pd.read_parquet(syms_path)
    sec_syms.rename(columns={'cik_str': 'cik'}, inplace=True)

    master.drop(columns=['name', 'File Name'], inplace=True, errors='ignore')
    flag_cols = list(form_groups)
    has_any_flag = (master[flag_cols] == 1).any(axis=1)
    flagged = master[has_any_flag].copy()

    counts = pd.merge(sec_syms, flagged, on='cik', how='inner')
    counts = counts.drop(columns=['cik', 'title', 'Form Type'])
    counts = counts.rename(columns={'ticker': 'symbol'})
    counts = counts.groupby(by=['symbol', 'date'], as_index=False).sum().copy()

    if isinstance(df, pd.DataFrame):
        counts = pd.merge(df, counts, on=['symbol', 'date'], how='left')

    return counts
def last_bus_day_syms():
    """Return the most recent symbol table available.

    Lookup order: the most recent file under tickers/new_symbols, then
    tickers/symbol_list/all_symbols.parquet, and finally the server's
    'all_symbols' (which is then written to that second location).
    """
    new_syms_dir = Path(baseDir().path, 'tickers', 'new_symbols')
    latest_path = get_most_recent_fpath(new_syms_dir, f_pre='_')
    if latest_path.exists():
        return pd.read_parquet(latest_path)

    fallback_path = new_syms_dir.parent.joinpath('symbol_list',
                                                 'all_symbols.parquet')
    if fallback_path.exists():
        return pd.read_parquet(fallback_path)

    # Nothing on disk: fetch from the server and cache it locally
    fetched = serverAPI('all_symbols').df
    write_to_parquet(fetched, fallback_path)
    return fetched
def clean_yfinance_options(df_temp=False, refresh=False):
    """Align with cboe ref data. Clean. Convert columns.

    Args:
        df_temp: Raw options DataFrame. When not a DataFrame, it is loaded
            with ``return_yoptions_temp_all()``.
        refresh: When False and the daily dump for the current 'cboe' date
            already exists on disk, that file is returned as-is.

    Returns:
        The cleaned options DataFrame (also written to the daily dump path
        when it had to be rebuilt).
    """
    path_suf = f"_{getDate.query('cboe')}.parquet"
    path = Path(baseDir().path, 'derivatives/end_of_day/daily_dump', path_suf)

    if path.is_file() and not refresh:
        return pd.read_parquet(path)

    if not isinstance(df_temp, pd.DataFrame):
        df_temp = return_yoptions_temp_all()

    cboe_ref = serverAPI('cboe_symref').df
    # The join key is the OSI symbol with all spaces removed
    cboe_ref['contractSymbol'] = cboe_ref['OSI Symbol'].str.replace(' ', '')

    df_comb = pd.merge(df_temp, cboe_ref, on=['contractSymbol']).copy()
    df_comb['date'] = pd.to_datetime(df_comb['date'], unit='ms')
    df_comb['lastTradeDate'] = pd.to_datetime(df_comb['lastTradeDate'],
                                              unit='ms')
    df_comb['lastTradeDay'] = df_comb['lastTradeDate'].dt.date
    df_comb.drop_duplicates(subset=['contractSymbol', 'lastTradeDay'],
                            inplace=True)

    # Add column for puts and calls: the character 9 from the end
    df_comb['side'] = df_comb['OSI Symbol'].str[-9]
    # Add expiration dates: exactly the six yymmdd characters that precede
    # the side character. Starting one character earlier only worked when
    # that extra character happened to be a space.
    df_comb['expDate'] = df_comb['OSI Symbol'].str[-15:-9]
    df_comb['expDate'] = pd.to_datetime(df_comb['expDate'], format='%y%m%d')

    # Replace zero open interest with 1 so the vol/oi ratio stays finite
    df_comb['openInterest'] = df_comb['openInterest'].where(
        df_comb['openInterest'] != 0, 1)
    df_comb['vol/oi'] = df_comb['volume'].div(df_comb['openInterest']).round(0)
    df_comb['mid'] = (df_comb['ask'].add(df_comb['bid'])).div(2).round(3)
    df_comb['bid'] = df_comb['bid'].round(3)
    df_comb['premium'] = (df_comb['mid'].mul(df_comb['volume']) * 100).round(0)

    df_comb.rename(columns={'Symbol': 'symbol'}, inplace=True)
    # The merge suffixes 'strike' when both inputs carry it; in that case
    # the left-hand (df_temp) value is exposed as 'strike'
    if 'strike_x' in df_comb.columns:
        df_comb['strike'] = df_comb['strike_x']

    write_to_parquet(df_comb, path)
    return df_comb
def get_missing_sec_master_idx(sma_df=False):
    """Get missing sec reference data files.

    Finds this year's business days (up to the 'iex_eod' date) that have no
    row in the SEC master index and instantiates ``secMasterIdx`` for each.

    Args:
        sma_df: The master index of all dates, with a 'date' column that is
            either datetime-typed or epoch milliseconds. When not a
            DataFrame, it is fetched from the server.

    Failures for individual dates are reported through ``help_print_arg``
    and do not stop the loop. Returns nothing.
    """
    if not isinstance(sma_df, pd.DataFrame):
        sma_df = serverAPI('sec_master_all').df

    # Work on a local Series so a caller-supplied frame is not modified.
    # Convert only when needed: server data arrives as epoch milliseconds.
    sma_dates = sma_df['date']
    if not pd.api.types.is_datetime64_any_dtype(sma_dates):
        sma_dates = pd.to_datetime(sma_dates, unit='ms')

    bus_days = getDate.get_bus_days(this_year=True)
    cutoff = getDate.query('iex_eod')
    bus_days = bus_days[bus_days['date'].dt.date <= cutoff].copy()

    # Pass the datetime Series itself to isin: converting it through
    # .unique().tolist() can produce plain integers, which do not match
    # datetime values.
    dts_missing = bus_days[~bus_days['date'].isin(sma_dates)].copy()
    dts_missing['dt_format'] = dts_missing['date'].dt.strftime('%Y%m%d')

    for hist_date in tqdm(dts_missing['dt_format']):
        try:
            # Instantiated for its effect; the object itself is not used
            secMasterIdx(hist_date=hist_date)
            sleep(.5)
        except Exception as e:
            msg = f"get_missing_sec_master_idx: {str(e)}"
            help_print_arg(msg)
def get_last_30_intradays():
    """Queue 'iex_intraday' tasks for recent days lacking 1-minute data.

    Looks at business days within the 30 calendar days up to the 'iex_eod'
    date and dispatches one task per day whose date is absent from the
    server's 'iex_intraday_m1' data. Does nothing when ``app.tasks`` cannot
    be imported.
    """
    all_bus_days = getDate.get_bus_days()
    window_end = getDate.query('iex_eod')
    window_start = window_end - timedelta(days=30)

    in_window = all_bus_days['date'].dt.date.between(window_start, window_end)
    recent_days = all_bus_days[in_window]

    have_dates = serverAPI('iex_intraday_m1').df['date'].unique()
    days_to_get = recent_days[~recent_days['date'].isin(have_dates)].copy()

    try:
        from app.tasks import execute_func
        for day in days_to_get['date']:
            execute_func.delay('iex_intraday', dt=day)
    except ModuleNotFoundError:
        pass
def return_yoptions_temp_all():
    """Return dataframe of all yoptions temp (today's data).

    When the base directory path contains 'Algo' (treated as the local
    environment) the data comes from the server API; otherwise every
    parquet file under derivatives/end_of_day/temp/<year> is read and
    concatenated.

    Returns:
        The combined DataFrame, or None when the server API module cannot
        be imported in the local branch.

    Raises:
        ValueError: From ``pd.concat`` when the production directory holds
            no parquet files.
    """
    # If local environment
    if 'Algo' in baseDir().path:
        try:
            from api import serverAPI
            return serverAPI('yoptions_temp').df
        except ModuleNotFoundError as me:
            help_print_arg(str(me))
            return None

    # Assume production environment
    dt = getDate.query('iex_eod')
    fpath = Path(baseDir().path, 'derivatives/end_of_day/temp', str(dt.year))
    frames = [pd.read_parquet(path) for path in fpath.glob('**/*.parquet')]
    return pd.concat(frames)
def _get_syms(cls, self):
    """Store the list of unique symbols to process on ``self.syms``."""
    sym_ref = serverAPI('all_symbols').df
    self.syms = sym_ref['symbol'].unique().tolist()
def read_clean_combined_all(local=False, dt=None, filter_syms=True):
    """Read, clean, and add columns to StockEOD combined all.

    Args:
        local: When truthy, read the most recent local
            'StockEOD/combined_all' parquet file; otherwise fetch
            'stock_close_cb_all' from the server and stack it on top of the
            local historical file.
        dt: Reference date; only its ``year`` is used here. Defaults to
            ``getDate.query('iex_eod')`` when falsy.
        filter_syms: Server branch only. When truthy, keep just the symbols
            returned by ``remove_funds_spacs()``.

    Returns:
        DataFrame sorted by symbol and date with the derived columns added
        below, passed through ``dataTypes(..., parquet=True)``.

    NOTE(review): the branch boundaries in this function were reconstructed
    from a source whose indentation was lost; confirm where the ``else``
    block ends before relying on this layout.
    """
    df_all = None

    if local:
        bpath = Path(baseDir().path, 'StockEOD/combined_all')
        fpath = get_most_recent_fpath(bpath)
        cols_to_read = [
            'date', 'symbol', 'fOpen', 'fHigh', 'fLow', 'fClose', 'fVolume'
        ]
        df_all = pd.read_parquet(fpath, columns=cols_to_read)
        # Dates stored as strings are parsed; other dtypes are left alone
        if df_all['date'].dtype == 'object':
            df_all['date'] = pd.to_datetime(df_all['date'])
        df_all.drop_duplicates(subset=['symbol', 'date'], inplace=True)
    else:
        cols_to_read = [
            'date', 'symbol', 'fOpen', 'fHigh', 'fLow', 'fClose', 'fVolume'
        ]
        df_all = serverAPI('stock_close_cb_all').df
        df_all = df_all[cols_to_read]

        if filter_syms:
            all_cs_syms = remove_funds_spacs()
            df_all = df_all[df_all['symbol'].isin(
                all_cs_syms['symbol'])].copy()

        df_all['date'] = pd.to_datetime(df_all['date'])

        # Define base bpath for 2015-2020 stock data
        bpath = Path(baseDir().path, 'historical/each_sym_all')
        path = get_most_recent_fpath(
            bpath.joinpath('each_sym_all', 'combined_all'))
        df_hist = pd.read_parquet(path)
        # Combine 2015-2020 stock data with ytd
        df_all = pd.concat([df_hist, df_all]).copy()

        df_all.drop_duplicates(subset=['symbol', 'date'], inplace=True)
        df_all.reset_index(drop=True, inplace=True)

    if not dt:
        dt = getDate.query('iex_eod')
    # Exclude all dates from before this year (the date column is compared
    # against the year as a string) and rows without volume
    df_all = (df_all[df_all['date'] >= str(dt.year)].dropna(
        subset=['fVolume']).copy())

    # Get rid of all symbols that only have 1 day of data
    df_vc = df_all['symbol'].value_counts()
    df_vc_1 = df_vc[df_vc == 1].index.tolist()
    df_all = (df_all[~df_all['symbol'].isin(df_vc_1)].reset_index(
        drop=True).copy())
    # Sort by symbol, date ascending
    df_all = df_all.sort_values(by=['symbol', 'date'], ascending=True)

    df_all['fRange'] = (df_all['fHigh'] - df_all['fLow'])
    df_all['vol/mil'] = (df_all['fVolume'].div(1000000))
    # Previous row's close and symbol. The shift is over the whole frame,
    # so 'prev_symbol' is what marks the first row of each symbol below.
    df_all['prev_close'] = df_all['fClose'].shift(periods=1, axis=0)
    df_all['prev_symbol'] = df_all['symbol'].shift(periods=1, axis=0)

    # Add fChangeP col
    print('Fib funcs: adding fChangeP column')
    df_all = add_fChangeP_col(df_all)

    # Merge with df_all and resume

    # Add gap column
    print('Fib funcs: adding gap column')
    df_all = add_gap_col(df_all)

    # Add range of gap: measured from the previous close when the whole
    # day traded above it, otherwise the plain high-low range
    df_all['gRange'] = (np.where(df_all['prev_close'] < df_all['fLow'],
                                 df_all['fHigh'] - df_all['prev_close'],
                                 df_all['fHigh'] - df_all['fLow']))

    # NOTE(review): cumsum() and rolling() below run over the whole frame;
    # the np.where only blanks the first row of each symbol, so values can
    # include rows of the preceding symbol - confirm this is intended.
    df_all['cumPerc'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                                 df_all['fChangeP'].cumsum(),
                                 np.NaN)

    # Difference between cumPerc five rows ahead and the current row
    df_all['perc5'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                               df_all['cumPerc'].shift(-5) - df_all['cumPerc'],
                               np.NaN)

    # Mean volume over a 60-row window
    df_all['vol_avg_2m'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                                    df_all['fVolume'].rolling(60).mean(),
                                    np.NaN)

    # Add cumulative sum of last 5 fChangeP rows
    df_all['fCP5'] = (np.where(
        df_all['symbol'] == df_all['prev_symbol'],
        df_all['fChangeP'].rolling(min_periods=1, window=5).sum(),
        0))

    df_all = df_all.copy()
    # Calc RSI and moving averages
    print('Fib Funcs: calc_rsi')
    df_all = calc_rsi(df_all)
    print('Fib Funcs: making_moving_averages')
    df_all = make_moving_averages(df_all)

    # fHighMax funcs
    print('Fib funcs: fHighMax')
    df_all = add_fHighMax_col(df_all).copy()

    df_all = df_all.sort_values(by=['symbol', 'date'], ascending=True)

    # Widen float32 columns to float64, rounded to 3 decimals
    float_32s = df_all.dtypes[df_all.dtypes == np.float32].index
    for col in float_32s:
        df_all[col] = df_all[col].astype(np.float64).round(3)

    # Final dtype pass via the project's dataTypes helper
    df_all = dataTypes(df_all, parquet=True).df.copy()

    return df_all
dt = getDate.query('iex_close') bpath = Path(baseDir().path, 'derivatives/cboe_intraday/2021') fpath = get_most_recent_fpath(bpath, f_suf='_eod', dt=dt) # %% codecell from workbooks.fib_funcs import read_clean_combined_all df_all = read_clean_combined_all() # Eagles golf club # path_eod won't have the timestamps # %% codecell # %% codecell sapi_eod = serverAPI('cboe_intraday_eod') df_cboe = sapi_eod.df cols_to_rename = ({ 'Symbol': 'symbol', 'Call/Put': 'side', 'Expiration': 'expirationDate', 'Strike Price': 'strike' }) df_cboe.rename(columns=cols_to_rename, inplace=True) df_cboe['symbol'] = df_cboe['symbol'].astype('category') # %% codecell sapi = serverAPI('yoptions_daily') df = sapi.df # %% codecell df_last = (df[df['lastTradeDate'].dt.date ==
from data_collect.iex_class import urlData from api import serverAPI from importlib import reload import sys reload(sys.modules['multiuse.help_class']) # %% codecell pd.set_option('display.max_columns', 65) pd.set_option('display.max_rows', 150) # %% codecell dt = getDate.query('iex_eod') bpath = Path(baseDir().path, 'intraday', 'minute_1', str(dt.year)) all_syms = serverAPI('all_symbols').df syms = all_syms['symbol'].unique() # %% codecell minute = 'minute_1' def combine_all_intraday_data(minute='minute_1'): """Combine all intraday data, write to file.""" dt = getDate.query('iex_eod') path = Path(baseDir().path, 'intraday', minute, str(dt.year)) fpaths = list(path.glob('**/*.parquet')) df_list = []
# Import from the 'scripts.dev' package layout first; fall back to the
# flat layout when that package cannot be found.
try:
    from scripts.dev.data_collect.sec_routines import secMasterIdx
    from scripts.dev.missing_data.missing_sec_masteridx import get_missing_sec_master_idx
    from scripts.dev.missing_data.missing_dates import get_missing_dates
    from scripts.dev.multiuse.help_class import getDate, baseDir, write_to_parquet
    from scripts.dev.api import serverAPI
except ModuleNotFoundError:
    from data_collect.sec_routines import secMasterIdx
    from missing_data.missing_sec_masteridx import get_missing_sec_master_idx
    from missing_data.missing_dates import get_missing_dates
    from multiuse.help_class import getDate, baseDir, write_to_parquet
    from api import serverAPI

# %% codecell
# NOTE(review): presumably asks the server to re-run the named routines -
# confirm against serverAPI's handling of 'redo'.
serverAPI('redo', val='get_missing_sec_master_idx')
serverAPI('redo', val='combine_all_sec_masters')
# %% codecell

# %% codecell
# Load the SEC master index and parse its epoch-millisecond dates
sma_api = serverAPI('sec_master_all')
sma_df = sma_api.df.copy()
sma_df['date'] = pd.to_datetime(sma_df['date'], unit='ms')
# Bare expressions: only displayed when run as an interactive cell
sma_df.shape
sma_df.head()
# %% codecell