def _get_sym_min_data(cls, self, sym, dt, bpath, verbose=False):
    """Get minute data for symbol. Write to file."""
    if not dt:
        dt = getDate.query('iex_eod')
    # Construct fpath
    fpath = bpath.joinpath(sym[0].lower(), f"_{sym}.parquet")
    # Construct url
    url_p1 = f"/stock/{sym.lower()}/chart/date/"
    url_p2 = f"{dt.strftime('%Y%m%d')}?chartByDay=false"
    url = f"{url_p1}{url_p2}"

    if verbose:  # If verbose print out key vars
        msg = f"Sym: {sym}, Date: {str(dt)}, fpath: {str(fpath)}, url: {url}"
        help_print_arg(msg)

    # Get data with requested url
    df_ud = urlData(url).df
    df_ud['dtime'] = (pd.to_datetime(df_ud['date'] + df_ud['minute'],
                                     format='%Y-%m-%d%H:%M'))
    df_ud['date'] = pd.to_datetime(df_ud['date'], format='%Y-%m-%d')
    df_ud.insert(0, 'symbol', sym)

    # Write to parquet and exit function
    df_ud['symbol'] = df_ud['symbol'].astype('category')
    (df_ud.drop(columns=['minute', 'exchangeType'],
                inplace=True, errors="ignore"))
    write_to_parquet(df_ud, fpath, combine=True)
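# A minimal, self-contained sketch (toy data, pandas only) of the datetime
# combination used in _get_sym_min_data above: IEX returns 'date' and
# 'minute' as separate strings, and their concatenation parses cleanly
# with the '%Y-%m-%d%H:%M' format.
import pandas as pd

_demo = pd.DataFrame({'date': ['2022-01-03', '2022-01-03'],
                      'minute': ['09:30', '09:31']})
_demo['dtime'] = pd.to_datetime(_demo['date'] + _demo['minute'],
                                format='%Y-%m-%d%H:%M')
# _demo['dtime'] -> [2022-01-03 09:30:00, 2022-01-03 09:31:00]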
def fib_all_clean_combine_write(dt=False, read=False, round=True):
    """Take pre_cleaned_data and fib_vals. Combine for further analysis."""
    df_all = None
    bpath = Path(baseDir().path, 'ml_data/fib_analysis')
    fib_all_path = bpath.joinpath('fib_all_cleaned_data.parquet')
    fib_vals_path = Path(baseDir().path, 'studies/fibonacci',
                         'fib_vals.parquet')

    if read:
        df_all = pd.read_parquet(fib_all_path)
    else:
        if not dt:
            dt = date(2021, 1, 1)
        df_pre = read_clean_combined_all(dt=dt)
        fib_df = pd.read_parquet(fib_vals_path)

        cols_to_rename = {'range': 'fib_range', 'date': 'fib_date'}
        fib_df.rename(columns=cols_to_rename, inplace=True)

        fib_cols = fib_df.columns
        fib_cols = (fib_cols[~fib_cols.isin(['symbol', 'date'])]
                    .append(pd.Index(['hit_1.618', 'hit_2.618', 'hit_4.236'])))
        df_drop = df_pre.drop(columns=fib_cols, errors='ignore').copy()

        df_all = pd.merge(df_drop, fib_df, on=['symbol'], how='left')
        write_to_parquet(df_all, fib_all_path)

    if round:
        cols_to_round = (df_all.select_dtypes(include=[np.float32])
                         .columns.tolist())
        df_all[cols_to_round] = df_all[cols_to_round].astype(np.float64)
        df_all[cols_to_round] = df_all[cols_to_round].round(3)

    return df_all
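# A toy run (pandas only) of the column pruning above: take the fib columns
# minus the merge keys, then extend the Index with the hit-flag columns so
# any stale copies are dropped before the merge.
import pandas as pd

_cols = pd.Index(['symbol', 'date', 'fib_range', 'ret_0.618'])
_fib_cols = _cols[~_cols.isin(['symbol', 'date'])]
_fib_cols = _fib_cols.append(pd.Index(['hit_1.618', 'hit_2.618']))
# _fib_cols -> Index(['fib_range', 'ret_0.618', 'hit_1.618', 'hit_2.618'])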
def _iex_intraday_m1(cls, self, df):
    """Write to local file structure."""
    cols_to_keep = ['symbol', 'dtime', 'date', 'minute', 'exchangeType']
    df_cols = df.columns
    mkt_cols = [col for col in df_cols if 'market' in str(col)]
    cols_to_keep = cols_to_keep + mkt_cols
    cols_to_keep = [col for col in cols_to_keep if col in df_cols]

    df_m1 = df[cols_to_keep].copy()
    # df_m1.rename(columns={'sym': 'symbol'}, inplace=True)
    df_m1['year'] = df_m1['date'].dt.year

    df_idx_m1 = df_m1.set_index(['symbol', 'year'])
    sym_list = df_idx_m1.index.get_level_values('symbol').unique()
    yr_list = df_idx_m1.index.get_level_values('year').unique()

    bpath = Path(baseDir().path, 'intraday', 'minute_1')
    for sym in tqdm(sym_list):
        for yr in yr_list:
            df_sym = df_idx_m1.loc[sym, yr].copy()
            sym = str(sym)
            fpath = bpath.joinpath(str(yr), sym[0].lower(), f"_{sym}.parquet")
            write_to_parquet(df_sym.reset_index(), fpath)

    return df_m1
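# Toy version (pandas only) of the (symbol, year) slicing in _iex_intraday_m1.
# Note that .loc raises KeyError for combinations with no rows, so a guard is
# shown here as an assumption; the function above loops the full cross
# product of symbols and years.
import pandas as pd

_df = (pd.DataFrame({'symbol': ['A', 'A', 'B'],
                     'year': [2021, 2022, 2022],
                     'close': [1.0, 2.0, 3.0]})
       .set_index(['symbol', 'year']))
for _sym in _df.index.get_level_values('symbol').unique():
    for _yr in _df.index.get_level_values('year').unique():
        try:
            _slice = _df.loc[_sym, _yr]
        except KeyError:
            continue  # e.g. ('B', 2021) has no rows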
def get_options_symbols(get_fresh=False):
    """Get symbols with derivatives from IEX."""
    fpath = Path(baseDir().path, 'ref_data', 'syms_with_options.parquet')
    # Return the cached file unless a fresh pull is requested
    if fpath.is_file() and not get_fresh:
        return pd.read_parquet(fpath)

    load_dotenv()
    base_url = os.environ.get("base_url")
    url_suf = '/ref-data/options/symbols'
    payload = {'token': os.environ.get("iex_publish_api")}
    get = requests.get(f"{base_url}{url_suf}", params=payload)

    sym_df = pd.DataFrame(get.json().items())
    sym_df.columns = ['symbol', 'expDates']
    write_to_parquet(sym_df, fpath)
    return sym_df
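# What the response turns into, sketched with a hypothetical payload: the
# JSON is a {symbol: [expiration, ...]} mapping, and .items() yields one
# (symbol, expDates) row per entry.
import pandas as pd

_resp = {'AAPL': ['20220121', '20220218'], 'MSFT': ['20220121']}
_sym_df = pd.DataFrame(_resp.items(), columns=['symbol', 'expDates'])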
def get_exact_dates(cls, self):
    """Get exact dates."""
    self.payload['range'], df_list = 'date', []
    get_errors = []
    # For all the dates needed
    for fdt in self.dts_need:
        self.payload['exactDate'] = fdt
        get = requests.get(self.url, params=self.payload)
        if get.status_code == 200:
            df_list.append(pd.DataFrame(get.json()))
        else:
            get_errors.append(f"Error with {self.url}. {get.content}")

    # Print out any errors that may have arisen
    if get_errors:
        self.class_print(get_errors)

    # Concat new dates if any were returned
    if len(df_list) > 0:
        new_df = pd.concat(df_list)
        # Concat existing df with new dates
        all_df = pd.concat([self.df, new_df])
        all_df.drop_duplicates(subset=['date'], inplace=True)
        all_df.reset_index(drop=True, inplace=True)
        write_to_parquet(all_df, self.fpath)
def get_last_range(cls, self, sym):
    """Get last month of data."""
    get = requests.get(self.url, params=self.payload)
    # If at first you don't succeed, try, try again.
    if get.status_code != 200:
        get = requests.get(self.url, params=self.payload)
    self.get = get

    if get.status_code == 200:
        try:
            df = pd.DataFrame(get.json())
        except ValueError:
            df = pd.DataFrame.from_dict(get.json(), orient='index').T
        # self.df = dataTypes(df).df

        if os.path.isfile(self.fpath):
            old_df = pd.read_parquet(self.fpath)
            df_all = pd.concat([old_df, df]).reset_index(drop=True)
            write_to_parquet(df_all, self.fpath)
            # Assign dataframe to class attribute
            self.df = df_all
        else:
            # Write dataframe to parquet file
            write_to_parquet(df, self.fpath)
            # Assign dataframe to class attribute
            self.df = df
    else:
        msg = (f"IexHistV2 for {sym} get request failed "
               f"with status_code {get.status_code}")
        help_print_arg(msg)
def _multiple_date_loop(cls, self, sym, df):
    """Loop for multiple dates. Error handling."""
    # Get a subset of the dataframe
    df_mod = df[df['symbol'] == sym]
    # Set empty df list
    df_list = []
    # Iterate through subset
    for index, row in df_mod.iterrows():
        ud = urlData(row['url'])
        # Request data again if the first time doesn't work
        if ud.df.empty:
            ud = urlData(row['url'])
            if ud.df.empty:
                self.null_dates.append(row)
            else:
                df_list.append(ud.df)
        else:
            df_list.append(ud.df)

    if df_list:  # If length > 0
        df_all = pd.concat(df_list)
        # Combine with existing data if a local file already exists
        if Path(row['path_parq']).exists():
            df_old = pd.read_parquet(row['path_parq'])
            df_all = pd.concat([df_old, df_all]).reset_index(drop=True)
        write_to_parquet(df_all, row['path_parq'])
def yoptions_combine_last(all=False):
    """Combine all options with max date."""
    # Default is last. Change all to True for all_combined
    dt = getDate.query('iex_eod')
    fpath = Path(baseDir().path, f'derivatives/end_of_day/{str(dt.year)}')
    globs = list(fpath.glob('**/*.parquet'))

    df_list = [pd.read_parquet(path) for path in globs]
    df_all = pd.concat(df_list)
    path_suf = f"_{getDate.query('cboe')}.parquet"

    # Combine last (today's) data to combined folder
    if not all:
        df_today = df_all[df_all['date'] == df_all['date'].max()].copy()
        df_today.drop_duplicates(subset=['contractSymbol'], inplace=True)
        path = Path(baseDir().path, 'derivatives/end_of_day/combined',
                    path_suf)
        write_to_parquet(df_today, path)
    elif all:  # Combine all data to combined_all directory
        df_all.drop_duplicates(subset=['contractSymbol', 'date'],
                               inplace=True)
        path = Path(baseDir().path, 'derivatives/end_of_day/combined_all',
                    path_suf)
        write_to_parquet(df_all, path)
def write_combined():
    """Concat iex eod prices into one file."""
    base_dir = baseDir().path
    fpath = f"{base_dir}/iex_eod_quotes/{date.today().year}/*/**.parquet"
    choices = glob.glob(fpath)

    concat_list = []
    for choice in choices:
        concat_list.append(pd.read_parquet(choice))
    all_df = pd.concat(concat_list)

    this_df = all_df.copy(deep=True)
    this_df['date'] = (pd.to_datetime(this_df['latestUpdate'], unit='ms')
                       .dt.date)
    cutoff = date(2021, 4, 7)
    this_df = this_df[this_df['date'] >= cutoff].copy(deep=True)

    # Keep only the newest quote per (symbol, date)
    this_df.sort_values(by=['symbol', 'latestUpdate'],
                        inplace=True, ascending=False)
    this_df.drop_duplicates(subset=['symbol', 'date'], inplace=True)

    dt_counts = this_df['date'].value_counts().index
    for dt in dt_counts:
        mod_df = this_df[this_df['date'] == dt]
        mod_df.reset_index(inplace=True, drop=True)
        mod_fpath = f"{base_dir}/iex_eod_quotes/combined/_{dt}.parquet"
        write_to_parquet(mod_df, mod_fpath)
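# The dedup step above in miniature: sort by latestUpdate (epoch ms)
# descending, then drop_duplicates keeps the first, i.e. newest, quote per
# (symbol, date). Toy data only.
import pandas as pd

_q = pd.DataFrame({'symbol': ['A', 'A'],
                   'latestUpdate': [1617800000000, 1617810000000]})
_q['date'] = pd.to_datetime(_q['latestUpdate'], unit='ms').dt.date
_q = _q.sort_values(by=['symbol', 'latestUpdate'], ascending=False)
_q = _q.drop_duplicates(subset=['symbol', 'date'])  # keeps the later tick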
def cboe_symref_raw():
    """Read, concat, and write cboe symbol ref."""
    mkt_list = ['cone', 'opt', 'ctwo', 'exo']
    burl1 = 'https://www.cboe.com/us/options/'
    burl2 = 'market_statistics/symbol_reference/?mkt='
    url_end = '&listed=1&unit=1&closing=1'

    df_list = []
    for mkt in mkt_list:
        url = f"{burl1}{burl2}{mkt}{url_end}"
        get = requests.get(url)
        if get.status_code == 200:
            df_list.append(pd.read_csv(BytesIO(get.content)))
        else:
            help_print_arg(f"Symbol ref request failed for mkt {str(mkt)}")

    df = pd.concat(df_list)
    cols_to_drop = ['Matching Unit', 'Closing Only']
    df.drop(columns=cols_to_drop, inplace=True)
    if df['OSI Symbol'].isna().sum() != 0:
        df.dropna(subset=['OSI Symbol'], inplace=True)

    dt = getDate.query('iex_close')
    path_to_write = (Path(baseDir().path, 'ref_data/yoptions_ref/cboe_ref_raw',
                          f'_{dt}.parquet'))
    write_to_parquet(df, path_to_write)
def company_10qs_ref():
    """Get ref data for company 10qs (quarterly filings)."""
    sma_api = serverAPI('sec_master_all')
    sma_df = sma_api.df.copy()
    sma_df['date'] = pd.to_datetime(sma_df['date'], unit='ms')

    forms_list = sma_df['Form Type'].value_counts().index
    form_10qs = [f for f in forms_list if '10-Q' in str(f)]
    f10q_df = sma_df[sma_df['Form Type'].isin(form_10qs)].copy()

    all_syms = serverAPI('all_symbols').df
    all_syms.drop(columns=['date'], inplace=True)
    min_cik_len = all_syms['cik'].str.len().min()
    if min_cik_len < 10:
        print('Not all CIKs are 10 digits long')

    f10q_df.rename(columns={'name': 'sec_name'}, inplace=True)
    comb_df = pd.merge(f10q_df, all_syms, on=['cik'])

    tenq_df = comb_df[comb_df['Form Type'] == '10-Q'].copy()
    tenq_df.drop_duplicates(subset=['date', 'cik'], inplace=True)

    cols_to_keep = ['cik', 'symbol', 'date', 'name', 'Form Type']
    tenq_df = tenq_df[cols_to_keep]

    path = Path(baseDir().path, 'ref_data', 'symbol_10q_ref.parquet')
    write_to_parquet(tenq_df, path)
def _process_data(cls, self):
    """Convert from xml, clean, and process."""
    df = pd.read_xml(self.get.content, xpath='.//item')

    # Strip xml namespaces from column names
    col_list = []
    for col in df.columns:
        if '}' in str(col):
            col_list.append(col.split('}')[1])
        else:
            col_list.append(col)

    df.columns = col_list
    df.drop(columns=['description', 'PauseThresholdPrice'], inplace=True)
    self.df = df

    if self.path.exists():
        df_prev = pd.read_parquet(self.path)
        subset = ['HaltTime', 'IssueSymbol']
        df_all = (pd.concat([df_prev, df])
                  .reset_index(drop=True)
                  .drop_duplicates(subset=subset))
        write_to_parquet(df_all, self.path)
    else:
        write_to_parquet(df, self.path)
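# The namespace stripping above in miniature: pd.read_xml can return
# namespace-qualified tags such as '{urn:ndaq}HaltTime', and splitting on
# '}' leaves the bare name. The namespace URI here is made up.
_cols = ['{urn:example}HaltTime', 'IssueSymbol']
_clean = [c.split('}')[1] if '}' in c else c for c in _cols]
# _clean -> ['HaltTime', 'IssueSymbol']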
def get_all_symbols():
    """Get and write to local file all symbols."""
    bpath = Path(baseDir().path, 'tickers', 'symbol_list')
    fpath = bpath.joinpath('all_symbols.parquet')
    symbols = urlData("/ref-data/symbols").df
    write_to_parquet(symbols, fpath)
    return symbols
def clean_sort_write(cls, self):
    """Clean and sort data."""
    flist = self.server_flist
    cboe = self.cboe_dict
    on_list = list(self.unmerged_df.columns.drop('dataDate'))
    top_df = pd.DataFrame()

    for fsn, fs in enumerate(flist):
        try:
            # Anti-join: keep rows in this file with no match in the prior one
            df = pd.merge(cboe[flist[fsn]], cboe[flist[(fsn - 1)]],
                          how='outer', on=on_list,
                          indicator='Exist').copy(deep=True)
            df = df[df['Exist'] == 'left_only'].copy(deep=True)
            df.drop(columns=['Exist'], inplace=True)
            df.reset_index(inplace=True, drop=True)
            df['dataDate'] = fs[-13:-3]
            top_df = pd.concat([top_df, df]).copy(deep=True)
            write_to_parquet(df, self.local_flist[fsn])
        except IndexError:
            pass

    # Drop columns from merge
    top_df.drop(columns=['dataDate_x', 'dataDate_y'], inplace=True)
    return top_df
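# The merge/indicator idiom above is an anti-join: rows of the newer file
# with no match in the older one survive the 'left_only' filter. Toy frames:
import pandas as pd

_new = pd.DataFrame({'k': [1, 2, 3]})
_old = pd.DataFrame({'k': [1, 2]})
_diff = pd.merge(_new, _old, how='outer', on=['k'], indicator='Exist')
_diff = _diff[_diff['Exist'] == 'left_only'].drop(columns=['Exist'])
# _diff['k'] -> [3]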
def otc_ref_data():
    """Get all otc reference data from IEX cloud."""
    # Use urlData function to get all otc sym ref data
    otc_syms = urlData('/ref-data/otc/symbols').df
    # Create fpath to store otc_syms
    fpath = f"{baseDir().path}/tickers/symbol_list/otc_syms.parquet"
    # Write otc symbols to local parquet file
    write_to_parquet(otc_syms, fpath)
def _write_error_dict(cls, self, error_dict):
    """Write error_dict to local df."""
    path = Path(baseDir().path, 'errors', 'iex_intraday_resample.parquet')
    df_errors = pd.DataFrame.from_dict(error_dict).T
    df_errors['date'] = getDate.query('iex_eod')
    write_to_parquet(df_errors, path, combine=False)
    self.df_errors = df_errors
def _if_file_exists(cls, self, fname, time_dict, t):
    """Check if file exists and don't overwrite if it does."""
    if not os.path.isfile(fname):
        # Minimize size of data
        time_dict[t] = time_dict[t].T.drop_duplicates().T
        write_to_parquet(time_dict[t], fname)
def _get_data(cls, self, sym, which):
    """Base function for getting company stats data."""
    url = f"/stock/{sym}/{self.stats_dict[which]['url_suffix']}"
    df = urlData(url).df
    path = Path(f"{self.fpath}/{sym[0].lower()}/_{sym}_{date.today()}.parquet")
    write_to_parquet(df, path)
    return df
def concat_and_write(cls, self):
    """Concat dataframes and write to parquet."""
    new_df = pd.concat([self.old_df, self.rec_df])
    new_df.reset_index(inplace=True, drop=True)
    write_to_parquet(new_df, self.st_fname)
    return new_df
def _write_cboe_intraday(cls, self, df_all):
    """Write cboe intraday data to local file."""
    df_all['time'] = pd.Timestamp.today(tz='US/Eastern')
    path_intra = self.bpath.joinpath(f"_{self.dt}_intraday.parquet")
    if path_intra.exists():
        df_old = pd.read_parquet(path_intra)
        df_all = pd.concat([df_old, df_all]).reset_index(drop=True)
    write_to_parquet(df_all, path_intra)
def write_fibs_to_parquet(df_confirm_all, fib_dict_list):
    """Write fibonacci data to local dataframe."""
    path = Path(baseDir().path, 'studies/fibonacci', 'confirmed_all.parquet')
    write_to_parquet(df_confirm_all, path)

    fib_df = pd.DataFrame.from_records(fib_dict_list)
    (fib_df.insert(2, "date_range",
                   getDate.get_bus_day_diff(fib_df, 'start_date', 'end_date')))
    path = Path(baseDir().path, 'studies/fibonacci', 'fib_vals_test.parquet')
    write_to_parquet(fib_df, path)
def _clean_up_and_write(cls, self, df_sym_all):
    """Clean data and write to local fpath."""
    df_sym_all.index.name = 'key'
    df_sym_all = df_sym_all.stack().to_frame()
    df_sym_all.rename(columns={0: 'corr'}, inplace=True)
    df_key_all = df_sym_all[~df_sym_all['corr'].isin([1, -1])]

    self.df = df_key_all.reset_index()
    write_to_parquet(df_key_all, self.fpath)
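# stack() in miniature: a square correlation matrix becomes one row per
# (key, column) pair, and filtering out 1/-1 drops self-correlations (at
# the cost of also dropping any perfectly correlated pair). Toy matrix:
import pandas as pd

_corr = pd.DataFrame({'A': [1.0, 0.5], 'B': [0.5, 1.0]}, index=['A', 'B'])
_corr.index.name = 'key'
_long = _corr.stack().to_frame().rename(columns={0: 'corr'})
_pairs = _long[~_long['corr'].isin([1, -1])]  # off-diagonal pairs only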
def _write_to_parquet(cls, self, df, which=False):
    """Write response to parquet."""
    path = None
    if which:
        path = self.path_dict[which]
    else:
        path = self.path_dict[self.which]
    write_to_parquet(df, path)
def write_or_update(cls, self):
    """Write new file or update from previous."""
    if os.path.isfile(self.fpath):
        sec_prev_df = pd.read_parquet(self.fpath)
        sec_df = pd.concat([sec_prev_df, self.sec_df])
        sec_df.drop_duplicates(subset=['accesssionNumber'], inplace=True)
        self.sec_df = sec_df.reset_index(drop=True).copy(deep=True)

    write_to_parquet(self.sec_df, self.fpath)
    self.df = self.sec_df.copy(deep=True)
def yoptions_combine_temp_all(keep_temps=False, keep_unfin=False,
                              verbose=False):
    """Combine temporary options with historical records."""
    dt = getDate.query('iex_eod')
    yr = dt.year
    path_base = Path(baseDir().path, 'derivatives/end_of_day')
    temps = list(Path(path_base, 'temp', str(yr)).glob('**/*.parquet'))

    for tpath in temps:
        try:
            # Derive symbol from temp fpath, construct new path to write
            sym = str(tpath.resolve()).split('_')[-1].split('.')[0]
            path_to_write = Path(path_base, str(yr), sym.lower()[0],
                                 f"_{sym}.parquet")
            if verbose:
                n_pre = "yoptions_combine_temp_all: derived symbol path is"
                help_print_arg(f"{n_pre} {sym}")
                help_print_arg(f"path_to_write: {str(path_to_write)}")
                help_print_arg(f"temp path: {str(tpath)}")

            if path_to_write.is_file():
                df_old = pd.read_parquet(path_to_write)
                df_new = pd.read_parquet(tpath)
                # Combine dataframes and write to local file
                df_all = pd.concat([df_old, df_new])
                write_to_parquet(df_all, path_to_write)
                if verbose:
                    help_print_arg(f"path_to_write for symbol {sym} exists")
                # Remove temp file
                if not keep_temps:
                    os.remove(tpath)
            else:
                df_new = pd.read_parquet(tpath)
                write_to_parquet(df_new, path_to_write)
                if verbose:
                    help_print_arg(
                        f"path_to_write for symbol {sym} did not exist")
        except Exception as e:
            if verbose:
                help_print_arg(str(e))

    if not keep_unfin:
        unfinished_paths = list(
            path_base.joinpath('unfinished').glob('*.parquet'))
        for upath in unfinished_paths:
            os.remove(upath)
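# Symbol derivation from a temp file name, as used above (the path here is
# made up): taking the last '_' segment skips the underscores inside
# 'end_of_day' and the leading '_' of the file name itself.
_p = '/data/derivatives/end_of_day/temp/2022/a/_AAPL.parquet'
_sym = _p.split('_')[-1].split('.')[0]  # 'AAPL'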
def _error_handling(cls, self, error_dict, bpath):
    """Error handling for 1 minute data."""
    df_errors = pd.DataFrame.from_dict(error_dict)
    self.df_errors = df_errors
    self.error_dict = error_dict

    bpath_e = bpath.parent.parent
    fpath_errors = bpath_e.joinpath('errors', "_minute_1_errors.parquet")
    write_to_parquet(df_errors, fpath_errors, combine=True)
def etf_list():
    """Read local etf list."""
    etf_fname = f"{baseDir().path}/tickers/symbol_list/etf_list.parquet"
    if os.path.isfile(etf_fname):
        etf_df = pd.read_parquet(etf_fname)
    else:
        symbols = urlData("/ref-data/symbols").df
        etf_df = pd.DataFrame(symbols[symbols['type'] == 'et']['symbol'])
        etf_df.reset_index(inplace=True, drop=True)
        write_to_parquet(etf_df, etf_fname)
    return etf_df
def combine_write(cls, self):
    """Concat and write to local combined df."""
    all_df = pd.concat(self.data_list)
    all_df.reset_index(drop=True, inplace=True)
    # Get date for data to use for fpath
    latest_dt = pd.to_datetime(all_df['latestUpdate'], unit='ms').dt.date[0]
    # Construct fpath
    fpath = f"{self.fpath_base}/combined/_{latest_dt}.parquet"
    # Minimize file size and write to parquet
    write_to_parquet(all_df, fpath)
def add_fib_peaks_troughs_diffs(read=False):
    """Apply distance matrix to each row. Find min differences. Local peaks/troughs."""
    df_all = None
    bpath = Path(baseDir().path, 'ml_data/fib_analysis')
    fib_all_path = bpath.joinpath('fib_all_cleaned_data.parquet')
    path_peaks_troughs = bpath.joinpath('fib_diff_peaks_troughs.parquet')

    if read:
        df_all = pd.read_parquet(path_peaks_troughs)
    else:
        df_clean = pd.read_parquet(fib_all_path)
        # Flag local peaks in fHigh and local troughs in fLow
        df_clean['fHigh_peaks'] = (np.where(df_clean.index.isin(
            find_peaks(df_clean['fHigh'])[0]), 1, 0))
        df_clean['fLow_troughs'] = (np.where(df_clean.index.isin(
            find_peaks(-df_clean['fLow'])[0]), 1, 0))

        cols_exclude = ['ext_date', 'ext_end', 'ext_cond']
        ext_ret_cols = ([col for col in df_clean.columns
                         if ((('ret_' in str(col)) | ('ext_' in str(col)))
                             & (str(col) not in cols_exclude))])
        dist_high_cols = ['symbol', 'date', 'fHigh'] + ext_ret_cols
        dist_low_cols = ['symbol', 'date', 'fLow'] + ext_ret_cols

        # Distance of every fib level from each row's fHigh/fLow
        df_clean_high_dist = (df_clean[dist_high_cols].set_index(
            ['symbol', 'date', 'fHigh']).copy())
        df_fHigh_dist = (abs(df_clean_high_dist.sub(
            df_clean_high_dist.index.get_level_values('fHigh'), axis=0)))
        df_clean_low_dist = (df_clean[dist_low_cols].set_index(
            ['symbol', 'date', 'fLow']).copy())
        df_fLow_dist = (abs(df_clean_low_dist.sub(
            df_clean_low_dist.index.get_level_values('fLow'), axis=0)))

        df_clean_idx = df_clean.set_index(['symbol', 'date'])
        df_clean_idx['fibHighMinCol'] = (df_fHigh_dist.idxmin(axis='columns')
                                         .reset_index(level='fHigh', drop=True))
        df_clean_idx['fibHighMinVal'] = (df_fHigh_dist.min(axis=1)
                                         .reset_index(level='fHigh', drop=True))
        df_clean_idx['fibLowMinCol'] = (df_fLow_dist.idxmin(axis='columns')
                                        .reset_index(level='fLow', drop=True))
        df_clean_idx['fibLowMinVal'] = (df_fLow_dist.min(axis=1)
                                        .reset_index(level='fLow', drop=True))

        df_all = df_clean_idx.reset_index()
        df_all['fibHighDiffP'] = df_all['fibHighMinVal'].div(df_all['fHigh'])
        df_all['fibLowDiffP'] = df_all['fibLowMinVal'].div(df_all['fLow'])

        write_to_parquet(df_all, path_peaks_troughs)

    return df_all
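# The nearest-fib-level trick in miniature: subtract the fHigh index level
# from every fib column, take abs, and idxmin names the closest level while
# min gives the gap. Column names here are invented for the toy.
import pandas as pd

_f = (pd.DataFrame({'fHigh': [10.0, 20.0],
                    'ret_0.382': [9.5, 25.0],
                    'ret_0.618': [11.0, 19.0]})
      .set_index('fHigh'))
_dist = abs(_f.sub(_f.index.get_level_values('fHigh'), axis=0))
_closest = _dist.idxmin(axis='columns')  # 'ret_0.382' at 10.0, 'ret_0.618' at 20.0
_gap = _dist.min(axis=1)                 # 0.5 and 1.0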
def sec_sym_list():
    """Get list of symbols from SEC."""
    # company_tickers.json maps CIK to ticker and company name
    sec_cp_url = 'https://www.sec.gov/files/company_tickers.json'
    get = requests.get(sec_cp_url)

    sec_syms = pd.DataFrame(get.json()).T
    # Zero-pad CIKs to the 10 characters EDGAR expects
    sec_syms['cik_str'] = sec_syms['cik_str'].astype('str').str.zfill(10)

    bpath = Path(baseDir().path, 'tickers', 'symbol_list')
    ss_path = bpath.joinpath('sec_syms.parquet')
    write_to_parquet(sec_syms, ss_path, combine=True, drop_duplicates=True)
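# The padding step on a known example: Apple's CIK 320193 becomes the
# 10-character '0000320193' used throughout EDGAR.
_cik = str(320193).zfill(10)  # '0000320193'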