def _write_syms_to_years(cls, self, df_hist):
    """Take combined df and write to correct year, for each sym."""
    years = df_hist['date'].dt.year.unique()
    syms = df_hist['symbol'].unique()

    df_hist_idx = df_hist.copy()
    df_hist_idx['year'] = df_hist['date'].dt.year
    df_hist_idx = df_hist_idx.set_index(['symbol', 'year'])

    bpath = Path(baseDir().path, 'StockEOD')

    for sym in tqdm(syms):
        try:
            for yr in years:
                df_mod = (df_hist_idx.loc[sym, yr]
                          .reset_index(level='symbol')
                          .reset_index(drop=True)
                          .copy())
                # Files are partitioned by year, then first letter:
                # StockEOD/2021/a/_AAPL.parquet
                yr_path = bpath.joinpath(str(yr), sym.lower()[0],
                                         f"_{sym}.parquet")
                if yr_path.exists():
                    # Merge with what's already on disk, dedupe on date
                    df_old = pd.read_parquet(yr_path)
                    df_all = pd.concat([df_old, df_mod])
                    df_all = (df_all.drop_duplicates(subset=['date'])
                                    .reset_index(drop=True))
                    write_to_parquet(df_all, yr_path)
                else:
                    write_to_parquet(df_mod.reset_index(drop=True), yr_path)
        except Exception as e:
            # A missing (sym, year) pair raises here and skips the rest
            # of that symbol's years
            print(sym)
            print(str(e))
            print()
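
# The read-concat-dedupe-write pattern above recurs throughout these
# scripts. A minimal standalone sketch of the idea, assuming only pandas
# ('upsert_parquet' is a hypothetical name; the real code routes writes
# through write_to_parquet):
def upsert_parquet(df_new, fpath, subset):
    """Merge df_new into the parquet file at fpath, deduping on subset.

    Keeps the first occurrence, so rows already on disk win.
    Usage: upsert_parquet(df_mod, yr_path, subset=['date'])
    """
    fpath = Path(fpath)
    if fpath.exists():
        df_old = pd.read_parquet(fpath)
        df_new = pd.concat([df_old, df_new])
    df_new = (df_new.drop_duplicates(subset=subset)
                    .reset_index(drop=True))
    fpath.parent.mkdir(parents=True, exist_ok=True)
    df_new.to_parquet(fpath)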
def _combine_all(cls, self):
    """Combine all local files into a combined df."""
    bpath = Path(baseDir().path, 'historical/each_sym_all')
    sym_path_list = list(bpath.glob('**/*.parquet'))
    # Exclude the info file and any previously combined output
    sym_path_list = ([f for f in sym_path_list
                      if 'info' not in str(f)
                      and 'combined_all' not in str(f)])

    sym_list = []
    for fpath in tqdm(sym_path_list):
        try:
            sym_list.append(pd.read_parquet(fpath))
        except Exception as e:
            print(str(e))

    f_suf = f"_{getDate.query('iex_eod')}.parquet"
    # bpath already ends in each_sym_all, so only append the
    # combined_all subdirectory here
    path_to_write = bpath.joinpath('combined_all', f_suf)

    df_all = pd.concat(sym_list)
    df_all.columns = [col.lower() for col in df_all.columns]
    (df_all.rename(columns={'open': 'fOpen', 'high': 'fHigh',
                            'low': 'fLow', 'close': 'fClose',
                            'volume': 'fVolume'},
                   inplace=True))

    self.df_all = df_all
    write_to_parquet(df_all, path_to_write)
def add_perc_change_columns(df_prices=False, df_cleaned=False, refresh=False):
    """Use historical 2021 data from iex in prep to merge."""
    perc_path = Path(baseDir().path, 'StockEOD/combined',
                     "_2021_yprices_percs.parquet")
    # If perc path exists, return that file instead of running whole analysis
    if perc_path.exists() and not refresh:
        df_y = pd.read_parquet(perc_path)
        max_date = df_y.index.get_level_values('date').max()
        print(f"Most recent date for historical data is: {max_date}")
        return df_y

    df_yprices = df_prices[df_prices['date'] >= df_cleaned['date'].min()].copy()
    cols_to_keep = (['date', 'symbol', 'fOpen', 'fClose', 'fHigh', 'fLow',
                     'fVolume', 'change', 'changePercent',
                     'changeOverTime', 'marketChangeOverTime'])
    df_y = df_yprices[cols_to_keep].copy()

    path = Path(baseDir().path, 'StockEOD/combined', "_2021_yprices.parquet")
    df_yprices.to_parquet(path)

    df_y = (df_y.dropna(subset=['date', 'symbol'])
                .drop_duplicates(subset=['date', 'symbol'])
                .set_index(['date', 'symbol'])
                .sort_index(level=['date', 'symbol']))
    df_y['fRange'] = (df_y['fHigh'] - df_y['fLow']).round(2)

    syms = df_y.index.get_level_values('symbol').unique().tolist()
    # Forward % change over the next 1/2/3/5/7 rows, from close and open
    cols_cperc_change = ['c_perc1', 'c_perc2', 'c_perc3', 'c_perc5', 'c_perc7']
    cols_operc_change = ['o_perc1', 'o_perc2', 'o_perc3', 'o_perc5', 'o_perc7']
    all_perc_cols = cols_cperc_change + cols_operc_change
    df_y[all_perc_cols] = 0

    for sym in tqdm(syms):
        df_sub = df_y[df_y.index.get_level_values('symbol') == sym].copy()
        for col in all_perc_cols:
            # c_perc* columns derive from fClose, o_perc* from fOpen.
            # Negative periods look forward int(col[-1]) rows.
            base_col = 'fClose' if col.startswith('c') else 'fOpen'
            df_sub[col] = -df_sub[base_col].pct_change(periods=-int(col[-1]))
        df_y.loc[df_sub.index, all_perc_cols] = df_sub[all_perc_cols]

    df_y = dataTypes(df_y, parquet=True).df
    df_y.to_parquet(perc_path)

    return df_y
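
# A quick check of what -pct_change(periods=-n) actually computes, since
# the sign trick is easy to misread: the denominator is the *future*
# value, so it approximates but does not equal the true forward return.
# (Illustrative only; the series values are invented.)
def _demo_forward_pct_change():
    close = pd.Series([100.0, 110.0, 99.0, 132.0])
    fwd = -close.pct_change(periods=-1)
    # fwd[t] = (close[t+1] - close[t]) / close[t+1]
    # e.g. fwd[0] = (110 - 100) / 110 = 0.0909, not the 0.10 forward return
    true_fwd = close.shift(-1) / close - 1
    print(fwd.round(4).tolist())       # [0.0909, -0.1111, 0.25, nan]
    print(true_fwd.round(4).tolist())  # [0.1, -0.1, 0.3333, nan]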
def analyze_iex_ytd():
    """Analyze iex historical data for this year."""
    df_prices_get = serverAPI('stock_close_prices').df
    df_prices = df_prices_get.copy()
    df_prices['date'] = pd.to_datetime(df_prices['date'], unit='ms')

    dt_max = df_prices['date'].max().date()
    path = Path(baseDir().path, 'StockEOD/combined', f"_{dt_max}.parquet")
    df_prices.to_parquet(path)

    df_2021 = df_prices[df_prices['date'].dt.year >= 2021].copy()
    return df_2021
def get_company_meta_data():
    """Get company meta data, save locally, from IEX."""
    all_symbols = serverAPI('all_symbols').df
    all_cs = all_symbols[all_symbols['type'].isin(['cs', 'ad'])]
    sym_list = all_cs['symbol'].unique().tolist()

    bpath = Path(baseDir().path, 'company_stats/meta')

    for sym in tqdm(sym_list):
        try:
            ud = urlData(f"/stock/{sym}/company")
            # Bucket by first letter: company_stats/meta/a/_AAPL.parquet
            fpath_suf = f"{sym.lower()[0]}/_{sym}.parquet"
            fpath = bpath.joinpath(fpath_suf)
            write_to_parquet(ud.df, fpath)
        except Exception as e:
            print(f"Company meta stats error: {type(e)} {str(e)}")
def _write_to_local(cls, self, data):
    """Write to local dataframes."""
    syms = data.columns.get_level_values(0).unique()
    bpath = Path(baseDir().path, 'historical/each_sym_all')

    for sym in tqdm(syms):
        # Pull this symbol's slice out of the (symbol, field) column MultiIndex
        df_sym = (data.loc[:, data.columns.get_level_values(0) == sym]
                      .droplevel(0, axis='columns')
                      .reset_index().copy())
        df_sym.insert(0, 'symbol', sym)
        # Lowercase first, so the rename keys below must be lowercase too
        df_sym.columns = [col.lower() for col in df_sym.columns]
        (df_sym.rename(columns={'open': 'fOpen', 'high': 'fHigh',
                                'low': 'fLow', 'close': 'fClose',
                                'volume': 'fVolume'},
                       inplace=True))

        sym_ea_path = bpath.joinpath(sym.lower()[0], f"_{sym}.parquet")
        write_to_parquet(df_sym, sym_ea_path)
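
# The column slicing in _write_to_local expects a frame whose columns are
# a (symbol, field) MultiIndex - the shape yf.download returns for several
# tickers with group_by='ticker'. A minimal sketch on a hand-built frame
# (symbols and values invented):
def _demo_multiindex_column_slice():
    import numpy as np
    dates = pd.date_range('2021-01-04', periods=3, name='Date')
    cols = pd.MultiIndex.from_product([['AAPL', 'MSFT'], ['Open', 'Close']])
    data = pd.DataFrame(np.arange(12.0).reshape(3, 4),
                        index=dates, columns=cols)

    sym = 'AAPL'
    df_sym = (data.loc[:, data.columns.get_level_values(0) == sym]
                  .droplevel(0, axis='columns')   # drop the symbol level
                  .reset_index())                 # Date becomes a column
    df_sym.insert(0, 'symbol', sym)
    print(df_sym.columns.tolist())  # ['symbol', 'Date', 'Open', 'Close']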
def treasuries_clean_write():
    """Clean, and store daily treasury data locally."""
    tz = serverAPI('treasuries').df

    # Some rows store time as epoch milliseconds, others as strings.
    # Coerce the epoch-ms rows first, then handle the rest separately.
    tz['time_test'] = pd.to_datetime(tz['time'], unit='ms', errors='coerce')
    tz_mod = tz.dropna(subset=['time_test'])
    tz_mod = tz_mod.drop(columns=['time']).rename(columns={'time_test': 'time'})
    tz = tz[~tz.index.isin(tz_mod.index)].drop(columns=['time_test']).copy()
    tz = pd.concat([tz, tz_mod])

    # CBOE yield tickers: ^IRX = 13 week, ^FVX = 5 year,
    # ^TNX = 10 year, ^TYX = 30 year
    col_dict = ({'^IRX': 'ThreeM', '^FVX': 'FiveY',
                 '^TNX': 'TenY', '^TYX': 'ThirtyY'})
    tz.rename(columns=col_dict, inplace=True)

    tz['time'] = pd.to_datetime(tz['time'])
    tz['date'] = pd.to_datetime(tz['time'].dt.date)
    tz = tz.sort_values(by=['date'])
    # Average intraday readings into one row per day
    tz_daily = tz.groupby(by=['date']).mean(numeric_only=True)

    path_to_write = Path(baseDir().path, 'economic_data/tz_daily.parquet')
    write_to_parquet(tz_daily, path_to_write)

    return tz_daily
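
# The coerce-and-split step above is the general pattern for a column that
# mixes epoch-millisecond and string timestamps; a small self-contained
# check (values invented):
def _demo_mixed_timestamp_parse():
    s = pd.Series([1609459200000, '2021-01-04 09:30:00', 1609718400000],
                  dtype='object')
    # Rows holding epoch milliseconds parse; anything else coerces to NaT
    as_ms = pd.to_datetime(s, unit='ms', errors='coerce')
    is_ms = as_ms.notna()
    # Parse the leftover strings separately, then recombine
    parsed = pd.to_datetime(s.where(~is_ms), errors='coerce').fillna(as_ms)
    print(parsed)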
def _find_missing_hist_symbols(cls, self):
    """Finding all missing symbols from max historical."""
    bpath = Path(baseDir().path, 'historical/each_sym_all')
    self.bpath = bpath
    info_path = bpath.joinpath('info', 'info.parquet')

    if info_path.exists():
        sym_df = pd.read_parquet(info_path)
    else:
        df_stats = get_symbol_stats()
        symbols = df_stats['symbol'].dropna().unique().tolist()
        sym_df = pd.DataFrame(symbols, columns=['symbol'])

        # Recover symbols from filenames shaped like .../_{sym}.parquet
        sym_path_list = list(bpath.glob('**/*.parquet'))
        sym_list = [str(f).split('_')[-1].split('.')[0] for f in sym_path_list]
        # 1 = a local file exists for this symbol, 0 = still missing
        sym_df['missing'] = np.where(sym_df['symbol'].isin(sym_list), 1, 0)
        write_to_parquet(sym_df, info_path)

    sym_missing = sym_df[sym_df['missing'] == 0]
    # Only kick off the first 1,000 missing symbols per run
    sym_start = sym_missing['symbol'].tolist()[0:1000]

    return sym_df, sym_start
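
# The split('_') parsing above leans on the _{sym}.parquet naming
# convention and silently truncates dotted tickers; a quick check, with
# Path.stem as a hypothetical safer alternative (paths invented):
def _demo_symbol_from_fname():
    paths = [Path('historical/each_sym_all/a/_AAPL.parquet'),
             Path('historical/each_sym_all/b/_BRK.B.parquet')]
    syms = [str(f).split('_')[-1].split('.')[0] for f in paths]
    print(syms)  # ['AAPL', 'BRK'] - the dotted ticker loses its suffix
    robust = [f.stem.lstrip('_') for f in paths]
    print(robust)  # ['AAPL', 'BRK.B']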
""" Get a list of all directories, create in new dir data_test """ # %% codecell ############################## import sys import glob import os from multiuse.help_class import baseDir # %% codecell ############################## data_dir = f"{baseDir().path}/*" glob.glob(data_dir) list(os.walk(baseDir().path)) # %% codecell ##############################
pd.DataFrame.chained_isin = chained_isin
pd.set_option('display.max_columns', 50)

# %% codecell
# df_all = read_clean_combined_all(local=True)
# %% codecell
fpath = Path(baseDir().path, 'ml_data/fib_analysis/df_all_temp.parquet')
df_all = pd.read_parquet(fpath)
df_all = add_gap_col(df_all)

# %% codecell
df_all_cols = df_all.columns
cols_to_round = ([
    'fOpen', 'fLow', 'fClose', 'fHighMax', 'prev_close',
    'rsi', 'vol_avg_2m', 'fCP5', 'sma_50', 'sma_200'
])
df_all[cols_to_round] = df_all[cols_to_round].astype(np.float64).round(2)
df_all.reset_index(drop=True, inplace=True)

# 1. Period of little movement for 2+ weeks.
# 2. Period of major up movement (a rough screening sketch follows below)
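
# %% codecell
# A rough sketch of screening for those two setups - a tight base followed
# by a strong move up. Assumes df_all has 'symbol' and 'fClose' columns,
# sorted by date within each symbol; the window/band/move thresholds are
# made-up placeholders (10 trading days ~ 2 weeks, 2% band, 10% move).
def flag_base_then_breakout(df, window=10, band=0.02, move=0.10):
    """Flag rows where the prior `window` days traded in a tight band
    and the close `window` days later is at least `move` higher."""
    g = df.groupby('symbol')['fClose']
    roll_max = g.transform(lambda s: s.rolling(window).max())
    roll_min = g.transform(lambda s: s.rolling(window).min())
    tight_base = (roll_max / roll_min - 1) <= band

    fwd_ret = g.transform(lambda s: s.shift(-window) / s - 1)
    return tight_base & (fwd_ret >= move)

# df_all['base_breakout'] = flag_base_then_breakout(df_all)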
import os
from pathlib import Path

from tqdm import tqdm
import pandas as pd
import numpy as np
import talib
from talib import abstract

from multiuse.help_class import baseDir, dataTypes, getDate

# %% codecell
# Display max 100 columns
pd.set_option('display.max_columns', 100)
# Display max 100 rows
pd.set_option('display.max_rows', 100)
# %% codecell
path = Path(baseDir().path, 'historical', '2021')
price_cols = ['fOpen', 'fHigh', 'fLow', 'fClose']
cols_to_read = ['fVolume'] + price_cols

# Skip zero-byte files - pd.read_parquet raises on them
df_list = [pd.read_parquet(fpath) for fpath
           in list(path.glob('**/*.parquet'))
           if os.path.getsize(fpath) > 0]
df = pd.concat(df_list)

# %% codecell
df = df.set_index(['symbol', 'date'])
df_sub = df[cols_to_read].copy()

combined_fpath = Path(baseDir().path, 'historical', 'combined', 'sub.parquet')
combined_fpath.resolve()

df_sub = dataTypes(df_sub).df
        inf = yoptions_all[~np.isfinite(yoptions_all[col])].shape[0]
        if inf > 0:
            print(f"{col} {inf}")
    except TypeError:
        pass

# %% codecell
# ref_data = serverAPI('cboe_symref').df
# min_ref_data = ref_data[['Underlying', 'side', 'expirationDate', 'sym_suf']].copy()
# min_ref_data['contractSymbol'] = min_ref_data.apply(
#     lambda row: f"{row['Underlying']}{row['sym_suf']}", axis=1)
# deriv_all = pd.merge(yoptions_all, min_ref_data,
#                      on=['contractSymbol'], how='left')

# %% codecell
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_comb.parquet')
# write_to_parquet(deriv_all, path_to_write)

# %% codecell
df_all = pd.read_parquet(path_to_write)

# %% codecell
df_all['sym_suf'].isna().sum()

# For simplicity's sake, let's only work with cleaned data
df_mod = df_all.dropna(subset=['sym_suf'])
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_nonan.parquet')
# write_to_parquet(df_mod, path_to_write)

# %% codecell
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_comb.parquet')
df_mod = pd.read_parquet(path_to_write)
# %% codecell
# %% codecell
df_test.loc['OCGN'].nlargest(5, 'corr')
df_test
# %% codecell
# There's the question of whether to correlate percentage returns,
# or to apply a logarithm first to flatten the noise.
# My guess is that the log transform is the better idea.
scaled_price = (logprice - np.mean(logprice)) / np.sqrt(np.var(logprice))

# %% codecell
fpath = Path(baseDir().path, 'ref_data', 'peer_list', '_peers.parquet')
df_peers = pd.read_parquet(fpath)

all_syms = serverAPI('all_symbols').df
df_peers = pd.merge(df_peers, all_syms[['symbol', 'type']],
                    on='symbol', how='left')
# mask here is the custom helper from multiuse.pd_funcs,
# patched onto pd.DataFrame elsewhere in these scripts
df_peers = (df_peers.mask('corr', .95, lesser=True)
                    .mask('corr', -.95, greater=True))

# %% codecell
df_peers_idx = df_peers.set_index(['key', 'type'])
df_peers
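
# %% codecell
# logprice isn't defined in the cell above - presumably the log of a close
# series. A minimal sketch of the standardization (series invented):
prices = pd.Series([10.0, 10.5, 11.2, 10.8, 12.1])
logprice_demo = np.log(prices)
# Z-score the log prices: zero mean, unit variance (np.var uses ddof=0,
# matching the expression above)
scaled_demo = ((logprice_demo - np.mean(logprice_demo))
               / np.sqrt(np.var(logprice_demo)))
scaled_demo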
df['reportDate'] = pd.to_datetime(df['rptDate'].str[-13:-3], format='%Y-%m-%d')
cols_to_drop = [col for col in df.columns
                if 'strike' in col or 'rpt' in col]
df_sub = df.drop(columns=cols_to_drop).copy()
df_sub['expDate'] = pd.to_datetime(df_sub['expDate'], unit='ms')

# %% codecell
df_sym_sub = df_sub['Underlying']
cols_to_rename = {'reportDate': 'date', 'Underlying': 'symbol'}
df_sub.rename(columns=cols_to_rename, inplace=True)

# Read historical data collected from IEX
combined_fpath = Path(baseDir().path, 'historical', 'combined', 'sub.parquet')
df_hist = pd.read_parquet(combined_fpath)
# Only include values after 2020
df_hist = df_hist[df_hist.index.get_level_values('date') > '2020']
df_use = df_hist[df_hist.index.get_level_values('symbol').isin(
    df_sub['symbol'].tolist())].copy()
df_use['range'] = df_use['fHigh'] - df_use['fLow']

# Create percentage change columns for the following days
periods = [1, 2, 3, 5, 10]
periods_to_cols = [f"pc_{p}" for p in periods]
df_use.sort_index(level='symbol', inplace=True)
# Cycle through percentage change columns, round to 0 decimal places.
# Group by symbol so the changes never bleed across ticker boundaries.
for p, col in zip(periods, periods_to_cols):
    df_use[col] = (-df_use.groupby(level='symbol')['fClose']
                   .pct_change(periods=-p) * 100).round(0)
    url_1 = 'https://api.stocktwits.com/api/2/streams'
    url_2 = f'/symbol/{symbol}.json'
    url = f"{url_1}{url_2}"

    try:
        get = s.get(url)
    except ConnectionError:
        break

    if get.status_code == 200:
        df = pd.DataFrame(get.json()['messages'])
        df = clean_st_messages(df)

        path = Path(baseDir().path, 'all_symbol_data',
                    f"{symbol}", 'daily', f"_{dt}.parquet")
        if path.exists():
            df_old = pd.read_parquet(path)
            df_all = pd.concat([df_old, df])
            # drop_duplicates returns a copy - assign it back
            df_all = df_all.drop_duplicates(subset=['id'])
        else:
            df_all = df.copy()

        df_all = df_all.dropna().reset_index(drop=True)
        try:
            write_to_parquet(df_all, path)
            syms_collected.append(symbol)
        except Exception as e:
            print(f"Could not write symbol {symbol} to parquet: {str(e)}")
    elif get.status_code == 404:
        if get.json()['errors'][0]['message']:
import sys
import importlib
from pathlib import Path

import pandas as pd
import yfinance as yf

from multiuse.help_class import (baseDir, getDate, write_to_parquet,
                                 dataTypes, check_nan)
from multiuse.path_helpers import get_most_recent_fpath
from multiuse.pd_funcs import mask, chained_isin

from studies.add_study_cols import (add_gap_col, calc_rsi,
                                    make_moving_averages,
                                    add_fChangeP_col, add_fHighMax_col)
# Reload after editing the module, then rebind the names
importlib.reload(sys.modules['studies.add_study_cols'])
from studies.add_study_cols import (add_gap_col, calc_rsi,
                                    make_moving_averages,
                                    add_fChangeP_col, add_fHighMax_col)

from api import serverAPI

# %% codecell
# Patch the custom helpers onto pd.DataFrame
pd.DataFrame.mask = mask
pd.DataFrame.chained_isin = chained_isin

dump_path = Path(baseDir().path, 'dump', 'df_all_cleaned_max.parquet')
df_all = pd.read_parquet(dump_path).copy()

# %% codecell
path = Path(baseDir().path, 'dump', 'refact_fib_data.parquet')
# write_to_parquet(df_all, path)

# %% codecell
df_all = pd.read_parquet(path).copy()
# Someone bought/sold 800 calls at the $7 strike for RIG 2022
# We probably want the last 50 holidays and the next 50 holidays,
# refreshed every 6 months

# %% codecell ##################################
redo_otc_syms = serverAPI('redo', val='otc_ref_data')
otc_syms = serverAPI('otc_syms').df

all_syms = serverAPI('all_symbols').df
all_syms = df_create_bins(all_syms)
all_syms.dtypes

base_dir = baseDir().path
new_syms = urlData('/ref-data/symbols')
new_syms_df = new_syms.df.copy(deep=True)

new_syms_df['type'].value_counts()
all_syms['type'].value_counts()

otc_syms = urlData('/ref-data/otc/symbols').df
otc_df = otc_syms.copy(deep=True)

all_syms['bins'].value_counts()
# pd.qcut(df['ext price'], q=4)
# %% codecell
serverAPI('redo', val='GetMissingDates')
# %% codecell
serverAPI('redo', val='warrants')
# %% codecell
serverAPI('redo', val='get_missing_hist_from_yf')
# %% codecell
serverAPI('redo', val='CboeIntraday')
# %% codecell
serverAPI('redo', val='combine_all_cboe_symref')
# %% codecell
mf_url = '/ref-data/mutual-funds/symbols'
mf_syms = urlData(mf_url).df
path = Path(baseDir().path, 'tickers', 'mfund_symbols.parquet')
mf_syms.info()

# %% codecell
yall_today = serverAPI('yoptions_daily').df
# yall_all = serverAPI('yoptions_all').df
yall_dd = dd.from_pandas(yall_today, npartitions=1)

cboe_symref = serverAPI('cboe_symref_all').df
cboe_dd = dd.from_pandas(cboe_symref, npartitions=1)
cboe_dd['OSI Symbol'] = cboe_dd['OSI Symbol'].str.replace(' ', '')
# Parse the yymmdd expiration out of the integer-coded column;
# to_datetime with a format needs strings, not int64
cboe_dd['expirationDate'] = (cboe_dd['expirationDate']
                             .astype('int64').astype(str))
cboe_dd['expirationDate'] = dd.to_datetime(cboe_dd['expirationDate'],
                                           format='%y%m%d')
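
# %% codecell
# For reference, OSI option symbols pack underlying, expiration, side, and
# strike (price * 1000) into one string. A small pandas sketch of pulling
# the fields apart (symbols invented; independent of the dask pipeline):
osi_demo = pd.Series(['AAPL  220121C00150000', 'RIG   220121C00007000'])
osi_demo = osi_demo.str.replace(' ', '', regex=False)
osi_parsed = osi_demo.str.extract(r'^(?P<underlying>[A-Z.]+)'
                                  r'(?P<expiry>\d{6})'
                                  r'(?P<side>[CP])'
                                  r'(?P<strike>\d{8})$')
osi_parsed['expiry'] = pd.to_datetime(osi_parsed['expiry'], format='%y%m%d')
osi_parsed['strike'] = osi_parsed['strike'].astype(int) / 1000
osi_parsed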