def clear_yoptions_temp_unfin(testing=False):
    """Clear temp and unfin yoptions directories."""
    dt = getDate.query('iex_eod')
    yr = str(dt.year)
    path = Path(baseDir().path, 'derivatives/end_of_day/temp', yr)
    temps = list(path.glob('**/*.parquet'))
    if temps:
        for fpath in temps:
            if testing:
                print(fpath)
                break
            else:
                os.remove(fpath)

    unfin = Path(baseDir().path, 'derivatives/end_of_day/unfinished')
    unfins = list(unfin.glob('*.parquet'))
    if unfins:
        for fpath in unfins:
            if testing:
                print(fpath)
                break
            else:
                os.remove(fpath)
def yoptions_combine_last(all=False):
    """Combine all options with max date."""
    # Default is last. Change all to True for all_combined
    dt = getDate.query('iex_eod')
    fpath = Path(baseDir().path, f'derivatives/end_of_day/{str(dt.year)}')
    globs = list(fpath.glob('**/*.parquet'))

    df_list = [pd.read_parquet(path) for path in globs]
    df_all = pd.concat(df_list)
    path_suf = f"_{getDate.query('cboe')}.parquet"

    if not all:
        # Combine last (today's) data into the combined folder
        df_today = df_all[df_all['date'] == df_all['date'].max()].copy()
        df_today.drop_duplicates(subset=['contractSymbol'], inplace=True)
        path = Path(baseDir().path, 'derivatives/end_of_day/combined', path_suf)
        write_to_parquet(df_today, path)
    else:
        # Combine all data into the combined_all directory
        df_all.drop_duplicates(subset=['contractSymbol', 'date'], inplace=True)
        path = Path(baseDir().path, 'derivatives/end_of_day/combined_all', path_suf)
        write_to_parquet(df_all, path)
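# Usage sketch for yoptions_combine_last (assumes the end_of_day
# directories above are already populated):
#   yoptions_combine_last()          # combine only the most recent date
#   yoptions_combine_last(all=True)  # combine the full history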
def yoptions_still_needed(recreate=False):
    """Return a list of all syms:exp_dates that are missing."""
    ref_path = Path(baseDir().path, 'ref_data', 'syms_with_options.parquet')
    ref_df = pd.read_parquet(ref_path)

    dt = getDate.query('iex_eod')
    fsuf = f'derivatives/end_of_day/temp/{str(dt.year)}'
    path_for_temp = Path(baseDir().path, fsuf)
    paths_for_temp = list(path_for_temp.glob('**/*.parquet'))

    df_list = [pd.read_parquet(fpath) for fpath in paths_for_temp]
    df_all = pd.concat(df_list)

    # Collect each symbol's stored expiration dates into a list
    df_collected = (df_all.groupby(by=['symbol'])['expDate']
                          .agg(list)
                          .reset_index()
                          .rename(columns={'expDate': 'expDatesStored'})
                          .copy())

    # Left-merge with indicator: 'left_only' rows have no stored data
    df_comb = pd.merge(ref_df, df_collected, how='left',
                       on=['symbol'], indicator=True)
    df_left = df_comb[df_comb['_merge'] == 'left_only'].copy()

    # df_comb['expDatesNeeded'] = df_comb.apply(lambda row: list(set(row.expDates) - set(row.expDatesStored)), axis=1)
    # if recreate:
    #     df_comb = df_comb.drop(columns=['expDates', 'expDatesStored'])

    return df_left
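# The merge above uses pandas' indicator anti-join: rows tagged
# 'left_only' exist in the reference frame but have no collected data.
# A minimal, self-contained sketch with toy frames (hypothetical
# symbols, not the real parquet files):
def _demo_missing_syms_anti_join():
    """Toy example of the indicator anti-join used in yoptions_still_needed."""
    import pandas as pd
    ref_df = pd.DataFrame({'symbol': ['AAPL', 'MSFT', 'TSLA']})
    collected = pd.DataFrame({'symbol': ['AAPL'],
                              'expDatesStored': [['2021-06-18', '2021-07-16']]})
    df_comb = pd.merge(ref_df, collected, how='left',
                       on=['symbol'], indicator=True)
    # MSFT and TSLA come back as 'left_only': reference-only symbols
    return df_comb[df_comb['_merge'] == 'left_only']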
def add_sec_days_until_10q(df_all):
    """Add days until 10q filing for df_all historical prices."""
    sec_10q_path = Path(baseDir().path, 'ref_data', 'symbol_10q_ref.parquet')
    sec_ref = pd.read_parquet(sec_10q_path)
    sec_ref['filing'] = sec_ref['date']

    df_all = pd.merge(df_all, sec_ref[['symbol', 'date', 'filing']],
                      on=['symbol', 'date'], how='left')
    # Backfill so every row carries the date of its next filing
    df_all['filing'] = df_all['filing'].bfill()
    df_all['date_test'] = df_all['date'].dt.date
    df_all['filing_test'] = df_all['filing'].dt.date

    holidays_fpath = Path(baseDir().path, 'ref_data/holidays.parquet')
    holidays = pd.read_parquet(holidays_fpath)
    dt = getDate.query('sec_master')
    current_holidays = (holidays[(holidays['date'].dt.year >= dt.year)
                                 & (holidays['date'].dt.date <= dt)])
    hol_list = current_holidays['date'].dt.date.tolist()

    df_all.dropna(subset=['filing'], inplace=True)
    df_mod = df_all[['date_test', 'filing_test']].copy()
    df_mod['days_until'] = df_mod.apply(
        lambda row: np.busday_count(row['date_test'], row['filing_test'],
                                    holidays=hol_list), axis=1)
    df_all['days_until'] = df_mod['days_until']
    df_all.drop(columns=['date_test', 'filing_test'], inplace=True)

    return df_all
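# np.busday_count counts weekdays in [begindates, enddates), skipping
# anything passed via holidays. A minimal sketch with a hypothetical
# holiday list:
def _demo_busday_count():
    """Toy example: business days from a price date to its next filing."""
    import numpy as np
    from datetime import date
    hol_list = [date(2021, 7, 5)]  # observed Independence Day
    # Wed 6/30 -> Wed 7/7, minus the weekend and the holiday: 4 days
    return np.busday_count(date(2021, 6, 30), date(2021, 7, 7),
                           holidays=hol_list)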
def fib_all_clean_combine_write(dt=False, read=False, round=True):
    """Take pre_cleaned_data and fib_vals. Combine for further analysis."""
    df_all = None
    bpath = Path(baseDir().path, 'ml_data/fib_analysis')
    fib_all_path = bpath.joinpath('fib_all_cleaned_data.parquet')
    fib_vals_path = Path(baseDir().path, 'studies/fibonacci', 'fib_vals.parquet')

    if read:
        df_all = pd.read_parquet(fib_all_path)
    else:
        if not dt:
            dt = date(2021, 1, 1)
        df_pre = read_clean_combined_all(dt=dt)
        fib_df = pd.read_parquet(fib_vals_path)
        cols_to_rename = {'range': 'fib_range', 'date': 'fib_date'}
        fib_df.rename(columns=cols_to_rename, inplace=True)

        fib_cols = fib_df.columns
        fib_cols = (fib_cols[~fib_cols.isin(['symbol', 'date'])]
                    .append(pd.Index(['hit_1.618', 'hit_2.618', 'hit_4.236'])))
        df_drop = df_pre.drop(columns=fib_cols, errors='ignore').copy()
        df_all = pd.merge(df_drop, fib_df, on=['symbol'], how='left')
        write_to_parquet(df_all, fib_all_path)

    if round:
        cols_to_round = df_all.select_dtypes(include=[np.float32]).columns.tolist()
        df_all[cols_to_round] = df_all[cols_to_round].astype(np.float64)
        df_all[cols_to_round] = df_all[cols_to_round].round(3)

    return df_all
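# A plausible reason for the float64 cast above: most 3-decimal values
# have no exact float32 representation, so rounding in float64 first
# gives cleaner output. A minimal sketch:
def _demo_round_via_float64():
    """Toy example: round float32 columns through float64."""
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'val': np.array([1.23456789], dtype=np.float32)})
    cols = df.select_dtypes(include=[np.float32]).columns.tolist()
    df[cols] = df[cols].astype(np.float64).round(3)
    return df  # val -> 1.235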
def _get_missing_dates_df(cls, self, key):
    """Get missing dates."""
    key_options = ['previous', 'all', 'less_than_20']
    if str(key) not in key_options:
        self.proceed = False  # If provided key not in options

    bpath = Path(baseDir().path, 'StockEOD/missing_dates', key)
    path = get_most_recent_fpath(bpath)
    df_dates = pd.read_parquet(path)
    # Define path of null dates
    null_path = Path(baseDir().path, 'StockEOD/missing_dates/null_dates',
                     '_null_dates.parquet')

    # Default to the raw dates if there is no null-dates file
    df = df_dates
    # Get all data that isn't null/empty
    if null_path.exists():
        null_df = pd.read_parquet(null_path)
        df = (pd.merge(df_dates, null_df, how='left', indicator=True)
                .query('_merge == "left_only"')
                .drop(columns=['_merge'])
                .copy())
        # If the merging failed
        if df.empty:
            df = df_dates

    self.null_dates = []
    self.merged_df = df
    self.missing_df = self._clean_process_missing(self, df)
    self.single_df, self.multiple_df = self._get_single_multiple_dfs(
        self, self.missing_df)
def make_yfinance_dirs(temp=False):
    """Make options historical directory."""
    path = ''
    if not temp:
        path = Path(baseDir().path, 'derivatives/end_of_day')
    else:
        path = Path(baseDir().path, 'derivatives/end_of_day/temp')

    makedirs_with_permissions(path)
    make_hist_prices_dir(path)
def write_fibs_to_parquet(df_confirm_all, fib_dict_list):
    """Write fibonacci data to local dataframe."""
    path = Path(baseDir().path, 'studies/fibonacci', 'confirmed_all.parquet')
    write_to_parquet(df_confirm_all, path)

    fib_df = pd.DataFrame.from_records(fib_dict_list)
    (fib_df.insert(2, "date_range",
                   getDate.get_bus_day_diff(fib_df, 'start_date', 'end_date')))
    path = Path(baseDir().path, 'studies/fibonacci', 'fib_vals_test.parquet')
    write_to_parquet(fib_df, path)
def intraday_tick():
    """Intraday tick data."""
    bpath_t = Path(baseDir().path, 'tickers', 'sectors')
    ticks = ({
        'sector_perf': get_most_recent_fpath(bpath_t, f_pre='performance'),
        'treasuries': Path(baseDir().path, 'economic_data', 'treasuries.parquet')
    })
    return ticks
def get_options_symbols(get_fresh=False):
    """Get symbols with derivatives from IEX."""
    fpath = Path(baseDir().path, 'ref_data', 'syms_with_options.parquet')
    # Return the cached file unless a fresh pull is requested
    if fpath.is_file() and not get_fresh:
        return pd.read_parquet(fpath)

    load_dotenv()
    base_url = os.environ.get("base_url")
    url_suf = '/ref-data/options/symbols'
    payload = {'token': os.environ.get("iex_publish_api")}
    get = requests.get(f"{base_url}{url_suf}", params=payload)

    sym_df = pd.DataFrame(get.json().items())
    sym_df.columns = ['symbol', 'expDates']
    write_to_parquet(sym_df, fpath)
    return sym_df
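# pd.DataFrame(mapping.items()) turns a {symbol: expDates} payload into
# a two-column frame, which is what the response parsing above relies
# on. A minimal sketch with toy data:
def _demo_dict_items_to_df():
    """Toy example: build a symbol/expDates frame from a dict."""
    import pandas as pd
    data = {'AAPL': ['2021-06-18'], 'MSFT': ['2021-06-18', '2021-07-16']}
    sym_df = pd.DataFrame(data.items())
    sym_df.columns = ['symbol', 'expDates']
    return sym_df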
def company_10qs_ref():
    """Get ref data for company 10qs (quarterly filings)."""
    sma_api = serverAPI('sec_master_all')
    sma_df = sma_api.df.copy()
    sma_df['date'] = pd.to_datetime(sma_df['date'], unit='ms')

    forms_list = sma_df['Form Type'].value_counts().index
    # form_10 = [f for f in forms_list if '10' in str(f)]
    form_10qs = [f for f in forms_list if '10-Q' in str(f)]
    f10q_df = sma_df[sma_df['Form Type'].isin(form_10qs)].copy()

    all_syms = serverAPI('all_symbols').df
    all_syms.drop(columns=['date'], inplace=True)
    min_cik_len = all_syms['cik'].str.len().min()
    if min_cik_len < 10:
        print('Not all CIKs are 10 digits long')

    f10q_df.rename(columns={'name': 'sec_name'}, inplace=True)
    comb_df = pd.merge(f10q_df, all_syms, on=['cik'])
    tenq_df = comb_df[comb_df['Form Type'] == '10-Q'].copy()
    tenq_df.drop_duplicates(subset=['date', 'cik'], inplace=True)

    cols_to_keep = ['cik', 'symbol', 'date', 'name', 'Form Type']
    tenq_df = tenq_df[cols_to_keep]

    path = Path(baseDir().path, 'ref_data', 'symbol_10q_ref.parquet')
    write_to_parquet(tenq_df, path)
def _get_cboe_data(cls, self):
    """Get symbol volume data from cboe."""
    markets = ['cone', 'opt', 'ctwo', 'exo']
    url_base = ('https://www.cboe.com/us/options/'
                'market_statistics/symbol_data/csv/?mkt')
    # Empty data list to append, then concat dataframes to
    df_list = []
    # Iterate through market list and get data for each one
    for mar in markets:
        url = f"{url_base}={mar}"
        get = requests.get(url)
        if get.status_code == 200:
            df = pd.read_csv(BytesIO(get.content))
            df['exch'] = mar
            df_list.append(df)

    # Concatenate data for all 4 markets
    df_all = pd.concat(df_list)
    dt = getDate.query('iex_close')
    df_all.insert(1, "date", dt)
    df_all['date'] = pd.to_datetime(df_all['date'])
    df_all['Expiration'] = pd.to_datetime(df_all['Expiration'])

    self.dt = dt
    self.bpath = Path(baseDir().path, 'derivatives/cboe_intraday',
                      str(dt.year))
    self.df = df_all

    return df_all
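# Wrapping response bytes in BytesIO lets pandas parse an HTTP CSV
# without a temp file. A minimal sketch against a hypothetical URL:
def _demo_csv_over_http(url='https://example.com/data.csv'):
    """Toy example: fetch a CSV over HTTP into a DataFrame."""
    from io import BytesIO
    import pandas as pd
    import requests
    get = requests.get(url)
    if get.status_code == 200:
        return pd.read_csv(BytesIO(get.content))
    return None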
def write_combined():
    """Concat iex eod prices into one file."""
    base_dir = baseDir().path
    fpath = f"{base_dir}/iex_eod_quotes/{date.today().year}/*/**.parquet"
    choices = glob.glob(fpath)

    concat_list = [pd.read_parquet(choice) for choice in choices]
    all_df = pd.concat(concat_list)

    this_df = all_df.copy(deep=True)
    this_df['date'] = pd.to_datetime(this_df['latestUpdate'], unit='ms').dt.date
    cutoff = date(2021, 4, 7)
    this_df = this_df[this_df['date'] >= cutoff].copy(deep=True)
    # Sort descending so drop_duplicates keeps the most recent quote
    this_df.sort_values(by=['symbol', 'latestUpdate'],
                        inplace=True, ascending=False)
    this_df.drop_duplicates(subset=['symbol', 'date'], inplace=True)

    dt_counts = this_df['date'].value_counts().index
    for dt in dt_counts:
        mod_df = this_df[this_df['date'] == dt]
        mod_df.reset_index(inplace=True, drop=True)
        mod_fpath = f"{base_dir}/iex_eod_quotes/combined/_{dt}.parquet"
        write_to_parquet(mod_df, mod_fpath)
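# The sort + drop_duplicates pair above keeps one row per (symbol, date):
# with latestUpdate sorted descending, drop_duplicates' default
# keep='first' retains the freshest quote. A minimal sketch:
def _demo_latest_per_key():
    """Toy example: keep the most recent row per symbol/date."""
    import pandas as pd
    df = pd.DataFrame({'symbol': ['A', 'A', 'B'],
                       'date': ['2021-04-07'] * 3,
                       'latestUpdate': [2, 1, 5]})
    df = df.sort_values(by=['symbol', 'latestUpdate'], ascending=False)
    return df.drop_duplicates(subset=['symbol', 'date'])  # keeps 2 and 5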
def _construct_params(cls, self, filter_extremes):
    """Construct parameters used for class."""
    bpath = Path(baseDir().path, 'ref_data', 'peer_list')
    if filter_extremes:
        self.fpath = bpath.joinpath('_peers_extreme.parquet')
    else:
        self.fpath = bpath.joinpath('_peers.parquet')
def get_all_symbol_ref():
    """Get all common and OTC symbols."""
    load_dotenv()
    env = os.environ.get("env")
    df_all = None

    if env == "production":
        bpath = Path(baseDir().path, 'tickers', 'symbol_list')
        com_syms_path = bpath.joinpath('all_symbols.parquet')
        otc_syms_path = bpath.joinpath('otc_syms.parquet')

        com_df = pd.read_parquet(com_syms_path)
        otc_df = pd.read_parquet(otc_syms_path)
        otc_df.dropna(subset=['cik'], inplace=True)
        # Normalize CIKs to zero-padded 10-digit strings
        otc_df['cik'] = (otc_df['cik'].astype('int64').astype('str')
                         .str.zfill(10).astype('category')
                         .reset_index(drop=True))
        df_all = pd.concat([com_df, otc_df]).reset_index(drop=True)
    else:
        try:
            from api import serverAPI
            com_syms = serverAPI('all_symbols').df
            otc_syms = serverAPI('otc_syms').df
            df_all = pd.concat([com_syms, otc_syms]).reset_index(drop=True)
        except ModuleNotFoundError:
            help_print_arg('Tried import server api in get_all_symbols func')

    return df_all
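# SEC CIKs are zero-padded to 10 digits; the astype chain above restores
# that padding after a NaN-tainted read leaves the column as floats.
# A minimal sketch:
def _demo_cik_zfill():
    """Toy example: normalize numeric CIKs to 10-digit strings."""
    import pandas as pd
    ciks = pd.Series([320193.0, 789019.0])
    return ciks.astype('int64').astype('str').str.zfill(10)
    # -> '0000320193', '0000789019'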
def scans():
    """Scans for stocks/other items."""
    bpath = Path(baseDir().path, 'scans')
    scans = ({'top_vol': get_most_recent_fpath(bpath.joinpath('top_vol'))})
    return scans
def filter_my_stocks(cls, self):
    """Filter dataframe for my stocks."""
    path = Path(baseDir().path, 'tickers', 'my_syms.parquet')
    my_df = pd.read_parquet(path)
    # Convert local dataframe to syms to look for
    inv_list = my_df['symbol'].tolist()

    # ('form' or 'cik') evaluates to just 'form', so test both columns
    if not {'form', 'cik'}.issubset(self.df.columns):
        col_dict = {'description': 'form', 'CIK': 'cik'}
        self.df.rename(columns=col_dict, inplace=True)

    df_inv = self.df[self.df['symbol'].isin(inv_list)].copy()
    if (df_inv.shape[0] == 0) and self.testing:
        help_print_arg("AnalyzeSecRss: no matching stocks for rss feed")

    forms_to_watch = ['8-K', '3', '4']
    # df_forms = df_inv[df_inv['form'].isin(forms_to_watch)]
    msg_dict = {sym: [] for sym in inv_list}
    for index, row in df_inv.iterrows():
        if row['cik']:
            msg = f"{row['symbol']} has just filed form {row['form']}"
            msg_dict[row['symbol']].append(msg)

    self.msg_dict = msg_dict
    self.df_inv = df_inv.copy()
def _iex_intraday_m1(cls, self, df):
    """Write to local file structure."""
    cols_to_keep = ['symbol', 'dtime', 'date', 'minute', 'exchangeType']
    df_cols = df.columns
    mkt_cols = [col for col in df_cols if 'market' in str(col)]
    cols_to_keep = cols_to_keep + mkt_cols
    cols_to_keep = [col for col in cols_to_keep if col in df_cols]

    df_m1 = df[cols_to_keep].copy()
    # df_m1.rename(columns={'sym': 'symbol'}, inplace=True)
    df_m1['year'] = df_m1['date'].dt.year
    df_idx_m1 = df_m1.set_index(['symbol', 'year'])

    sym_list = df_idx_m1.index.get_level_values('symbol').unique()
    yr_list = df_idx_m1.index.get_level_values('year').unique()
    bpath = Path(baseDir().path, 'intraday', 'minute_1')

    for sym in tqdm(sym_list):
        for yr in yr_list:
            df_sym = df_idx_m1.loc[sym, yr].copy()
            sym = str(sym)
            fpath = bpath.joinpath(str(yr), sym[0].lower(), f"_{sym}.parquet")
            write_to_parquet(df_sym.reset_index(), fpath)

    return df_m1
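# Setting a (symbol, year) MultiIndex lets .loc pull each partition
# without repeated boolean masks. A minimal sketch:
def _demo_multiindex_partition():
    """Toy example: slice per-symbol/per-year partitions via .loc."""
    import pandas as pd
    df = pd.DataFrame({'symbol': ['A', 'A', 'B'],
                       'year': [2021, 2021, 2022],
                       'close': [1.0, 2.0, 3.0]})
    df_idx = df.set_index(['symbol', 'year'])
    return df_idx.loc['A', 2021]  # both rows for symbol A in 2021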
def cboe_symref_raw():
    """Read, concat, and write cboe symbol ref."""
    mkt_list = ['cone', 'opt', 'ctwo', 'exo']
    burl1 = 'https://www.cboe.com/us/options/'
    burl2 = 'market_statistics/symbol_reference/?mkt='
    url_end = '&listed=1&unit=1&closing=1'

    df_list = []
    for mkt in mkt_list:
        url = f"{burl1}{burl2}{mkt}{url_end}"
        get = requests.get(url)
        if get.status_code == 200:
            df_list.append(pd.read_csv(BytesIO(get.content)))
        else:
            help_print_arg(f"Symbol ref request failed for mkt {str(mkt)}")

    df = pd.concat(df_list)
    cols_to_drop = ['Matching Unit', 'Closing Only']
    df.drop(columns=cols_to_drop, inplace=True)
    if df['OSI Symbol'].isna().sum() != 0:
        df.dropna(subset=['OSI Symbol'], inplace=True)

    dt = getDate.query('iex_close')
    path_to_write = (Path(baseDir().path, 'ref_data/yoptions_ref/cboe_ref_raw',
                          f'_{dt}.parquet'))
    write_to_parquet(df, path_to_write)
def new_syms_ref_type(cls, self):
    """Get reference type data for new symbols."""
    iex_sup = urlData("/ref-data/symbols").df
    bpath = Path(baseDir().path, 'tickers')
    fpath = bpath.joinpath('all_symbols.parquet')
    fpath_new = bpath.joinpath('symbol_list', 'all_symbols.parquet')
    # Write to parquet file
    write_to_parquet(iex_sup, fpath)
    write_to_parquet(iex_sup, fpath_new)

    iex_sup.drop(columns=['exchangeSuffix', 'exchangeName', 'name', 'iexId',
                          'region', 'currency', 'isEnabled', 'cik', 'lei',
                          'figi'],
                 inplace=True)
    new_syms_tp = iex_sup[iex_sup['symbol'].isin(self.new_syms['symbol'])]
    new_syms_tp = new_syms_tp.reset_index(drop=True)

    dt = getDate.query('occ')
    fpath_new = f"{baseDir().path}/tickers/new_symbols/{dt}.parquet"
    write_to_parquet(new_syms_tp, fpath_new)

    return new_syms_tp
def get_all_symbols():
    """Get and write to local file all symbols."""
    bpath = Path(baseDir().path, 'tickers', 'symbol_list')
    fpath = bpath.joinpath('all_symbols.parquet')
    symbols = urlData("/ref-data/symbols").df
    write_to_parquet(symbols, fpath)
    return symbols
def get_all_syms(cls, self):
    """Get the all_syms dataframe."""
    # all_syms = serverAPI('all_symbols').df
    syms_path = Path(baseDir().path, 'tickers', 'symbol_list',
                     'all_symbols.parquet')
    all_syms = pd.read_parquet(syms_path)
    sym_list = all_syms['symbol'].tolist()
    self.all_syms, self.sym_list = all_syms, sym_list
def _write_error_dict(cls, self, error_dict):
    """Write error_dict to local df."""
    path = Path(baseDir().path, 'errors', 'iex_intraday_resample.parquet')
    df_errors = pd.DataFrame.from_dict(error_dict).T
    df_errors['date'] = getDate.query('iex_eod')
    write_to_parquet(df_errors, path, combine=False)
    self.df_errors = df_errors
def _get_dir_lists(cls, self):
    """Get json and gz lists."""
    bpath = Path(baseDir().path)
    self.all_json = list(bpath.glob('**/*.json'))
    self.all_gz = list(bpath.glob('**/*.gz'))
    self.exc_list = []
    self.fpath_exc_list = []
def clear_yoptions_dirs():
    """Remove files in yoptions due to incompatibility."""
    dt = getDate.query('iex_eod')
    fsuf = f"derivatives/end_of_day/{str(dt.year)}"
    path = Path(baseDir().path, fsuf)
    path_list = list(path.glob('**/*.parquet'))
    for fpath in path_list:
        os.remove(fpath)
class secCompanyIdx():
    """Get master parquet list from sec edgar."""
    # Store as local dataframe. Accepts either symbol or cik
    # base_dir = baseDir().path

    def __init__(self, sym=False, cik=False):
        self.construct_params(self, sym, cik)
        self.retrieve_data(self)
        self.write_or_update(self)

    @classmethod
    def construct_params(cls, self, sym, cik):
        """Construct url and local fpath."""
        # joinpath with a leading slash would resolve to the filesystem
        # root, so the relative paths below drop it
        bpath = Path(baseDir().path)
        syms_fpath = bpath.joinpath('tickers/symbol_list/all_symbols.parquet')
        all_symbols = pd.read_parquet(syms_fpath)
        # Drop cik values that are NaNs or infinite
        all_symbols.dropna(axis=0, subset=['cik'], inplace=True)
        all_symbols['cik'] = all_symbols['cik'].astype(np.uint32)

        if sym:  # Get symbol cik number for edgar lookup
            cik = (all_symbols[all_symbols['symbol'] == sym]
                   .head(1)['cik'].astype('uint32').iloc[0])
        elif cik:  # Get the common-stock symbol for the cik
            sym = (all_symbols[(all_symbols['cik'] == cik)
                               & (all_symbols['type'] == 'cs')]
                   .head(1)['symbol'].iloc[0])

        # Construct local fpath
        fsuf = f"sec/company_index/{str(cik)[-1]}/_{cik}.parquet"
        self.fpath = bpath.joinpath(fsuf)
        # Sec base url
        sec_burl = 'https://data.sec.gov/submissions/CIK'
        self.url = f"{sec_burl}{str(cik).zfill(10)}.json"
        # cik and sym
        self.sym, self.cik = sym, cik

    @classmethod
    def retrieve_data(cls, self):
        """Get data from SEC EDGAR and convert to parquet."""
        sec_get = requests.get(self.url)
        if sec_get.status_code != 200:
            time.sleep(1)  # Sleep for 1 second and retry
            sec_get = requests.get(self.url)
        self.sec_df = pd.DataFrame(sec_get.json()['filings']['recent'])

    @classmethod
    def write_or_update(cls, self):
        """Write new file or update from previous."""
        if os.path.isfile(self.fpath):
            sec_prev_df = pd.read_parquet(self.fpath)
            sec_df = pd.concat([sec_prev_df, self.sec_df])
            sec_df.drop_duplicates(subset=['accessionNumber'], inplace=True)
            self.sec_df = sec_df.reset_index(drop=True).copy(deep=True)

        write_to_parquet(self.sec_df, self.fpath)
        self.df = self.sec_df.copy(deep=True)
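# Usage sketch for secCompanyIdx (network access to SEC EDGAR assumed;
# in practice EDGAR also expects a descriptive User-Agent header):
#   idx = secCompanyIdx(sym='AAPL')  # or secCompanyIdx(cik=320193)
#   idx.df.head()                    # recent filings for that company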
def create_ysymbol_info_dirs():
    """Create local fpath directory for ysymbol info."""
    base_dir = Path(baseDir().path, 'tickers/info')
    if not base_dir.exists():
        makedirs_with_permissions(base_dir)
    make_hist_prices_dir(base_dir)
    makedirs_with_permissions(Path(base_dir, 'combined'))
    makedirs_with_permissions(Path(base_dir, 'unfinished'))
    makedirs_with_permissions(Path(base_dir, 'temp'))
def sec_ref_from_combined():
    """Make local sec ref data from combined master_idx."""
    base_dir, mast_df = baseDir().path, None
    fpath_all = f"{base_dir}/sec/daily_index/_all_combined.parquet"
    # Read sec_master_combined dataframe (the path is parquet, not json)
    mast_df = pd.read_parquet(fpath_all)
    mast_df.drop_duplicates(subset=['CIK'], inplace=True)

    sec_ref = mast_df[['CIK', 'Company Name']].copy(deep=True)
    sec_ref.reset_index(drop=True, inplace=True)

    # Define fpath of reference data
    fpath = f"{base_dir}/tickers/symbol_list/sec_ref.parquet"
    # Write reference data to local file
    write_to_parquet(sec_ref, fpath)
def get_peers():
    """Get dict for peers."""
    bpath = Path(baseDir().path, 'ref_data', 'peer_list')
    peers = ({
        'all_correlations': bpath.joinpath('_corrlist_all.parquet'),
        'extremes': bpath.joinpath('_peers_extreme.parquet'),
        'peers_80': bpath.joinpath('_peers.parquet')
    })
    return peers
def stocktwits():
    """Stocktwits data."""
    bpath = Path(baseDir().path, 'stocktwits')
    stocktwits = ({
        'trending': get_most_recent_fpath(bpath.joinpath('trending'), f_pre='_')
    })
    return stocktwits