def local_get_data(cls, self, bins_unique, testing):
    """Get historical data using imported function.

    Walks every symbol bin and dispatches each symbol to HistPricesV2
    with the range flags held on the instance.
    """
    for bin_val in bins_unique:
        part = self.df[self.df['bins'] == bin_val]
        if testing:
            # Keep test runs small: only 5 symbols per bin
            part = part.sample(n=5).copy(deep=True)
        symbols = part['symbol'].tolist()
        if self.last_month:
            for sym in symbols:
                HistPricesV2(sym, last_month=True)
        elif self.previous:
            for sym in symbols:
                try:
                    HistPricesV2(sym, previous=True)
                except NameError as ne:
                    msg = f"Master Hist Prices Error: symbol - {sym} - {str(ne)}"
                    help_print_arg(msg)
                    break
        else:
            for sym in symbols:
                HistPricesV2(sym)
    return True
def cboe_symref_raw():
    """Read, concat, and write cboe symbol ref."""
    base_url = ('https://www.cboe.com/us/options/'
                'market_statistics/symbol_reference/?mkt=')
    url_end = '&listed=1&unit=1&closing=1'
    frames = []
    # One request per cboe market
    for mkt in ['cone', 'opt', 'ctwo', 'exo']:
        resp = requests.get(f"{base_url}{mkt}{url_end}")
        if resp.status_code == 200:
            frames.append(pd.read_csv(BytesIO(resp.content)))
        else:
            help_print_arg(f"Symbol ref request failed for mkt {str(mkt)}")
    df = pd.concat(frames)
    # These columns are not used downstream
    df.drop(columns=['Matching Unit', 'Closing Only'], inplace=True)
    if df['OSI Symbol'].isna().sum() != 0:
        df.dropna(subset=['OSI Symbol'], inplace=True)
    # %% codecell
    dt = getDate.query('iex_close')
    fpath = (Path(baseDir().path, 'ref_data/yoptions_ref/cboe_ref_raw',
                  f'_{dt}.parquet'))
    write_to_parquet(df, fpath)
def _get_sym_min_data(cls, self, sym, dt, bpath, verbose=False):
    """Get minute data for symbol. Write to file."""
    if not dt:
        dt = getDate.query('iex_eod')
    # File lives under the symbol's first letter: <bpath>/<a>/_<SYM>.parquet
    fpath = bpath.joinpath(sym[0].lower(), f"_{sym}.parquet")
    # Chart-by-minute endpoint for the requested date
    url = (f"/stock/{sym.lower()}/chart/date/"
           f"{dt.strftime('%Y%m%d')}?chartByDay=false")
    if verbose:  # If verbose print out key vars
        msg = f"Sym: {sym}, Date: {str(dt)}, fpath: {str(fpath)}, url: {url}"
        help_print_arg(msg)
    # Get data with requested url
    df_ud = urlData(url).df
    # Fuse the separate date and minute columns into one timestamp
    df_ud['dtime'] = pd.to_datetime(df_ud['date'] + df_ud['minute'],
                                    format='%Y-%m-%d%H:%M')
    df_ud['date'] = pd.to_datetime(df_ud['date'], format='%Y-%m-%d')
    df_ud.insert(0, 'symbol', sym)
    df_ud['symbol'] = df_ud['symbol'].astype('category')
    df_ud.drop(columns=['minute', 'exchangeType'], inplace=True,
               errors="ignore")
    # Write to parquet and exit function
    write_to_parquet(df_ud, fpath, combine=True)
def makedirs_with_permissions(path):
    """Make directory (mode 0o777) if it does not already exist.

    Bug fix: the previous version called os.umask(0) and never restored
    the prior value, permanently changing the process-wide umask for all
    later file/directory creation. The old mask is now restored.
    """
    if not os.path.isdir(path):
        old_mask = os.umask(0)  # clear umask so mode=0o777 applies fully
        try:
            os.makedirs(path, mode=0o777)
        finally:
            os.umask(old_mask)  # always restore process umask
    else:
        help_print_arg(f"Directory already exists: {str(path)}")
def filter_my_stocks(cls, self):
    """Filter dataframe for my stocks.

    Builds self.msg_dict (symbol -> list of filing messages) and
    self.df_inv (rows of self.df matching my locally stored symbols).
    """
    path = Path(baseDir().path, 'tickers', 'my_syms.parquet')
    my_df = pd.read_parquet(path)
    # Convert local dataframe to syms to look for
    inv_list = my_df['symbol'].tolist()
    # Bug fix: ('form' or 'cik') always evaluated to just 'form', so the
    # 'cik' column presence was never tested. Rename when either column
    # is missing from the feed dataframe.
    if 'form' not in self.df.columns or 'cik' not in self.df.columns:
        col_dict = {'description': 'form', 'CIK': 'cik'}
        self.df.rename(columns=col_dict, inplace=True)
    df_inv = self.df[self.df['symbol'].isin(inv_list)].copy()
    if (df_inv.shape[0] == 0) and self.testing:
        help_print_arg("AnalyzeSecRss: no matching stocks for rss feed")
    forms_to_watch = ['8-K', '3', '4']
    # df_forms = df_inv[df_inv['form'].isin(forms_to_watch)]
    msg_dict = {sym: [] for sym in inv_list}
    for index, row in df_inv.iterrows():
        # Only rows with a non-empty cik represent real filings
        if row['cik']:
            msg = f"{row['symbol']} has just filed form {row['form']}"
            msg_dict[row['symbol']].append(msg)
    self.msg_dict = msg_dict
    self.df_inv = df_inv.copy()
def get_all_symbol_ref():
    """Get all common and OTC symbols.

    Returns a single dataframe of common + OTC symbols, or None when the
    dev-mode API import fails.
    """
    load_dotenv()
    env = os.environ.get("env")
    df_all = None
    if env == "production":
        # Production: read both symbol lists straight from local parquet
        bpath = Path(baseDir().path, 'tickers', 'symbol_list')
        com_syms_path = bpath.joinpath('all_symbols.parquet')
        otc_syms_path = bpath.joinpath('otc_syms.parquet')
        com_df = pd.read_parquet(com_syms_path)
        otc_df = pd.read_parquet(otc_syms_path)
        otc_df.dropna(subset=['cik'], inplace=True)
        # Normalize cik to a 10-char zero-padded categorical string.
        # NOTE(review): reset_index on the right-hand Series makes this
        # assignment align on position only if the left index already
        # matches 0..n-1 — confirm otc_df's index after dropna.
        otc_df['cik'] = (otc_df['cik'].astype('int64').astype('str').str.zfill(
            10).astype('category').reset_index(drop=True))
        df_all = pd.concat([com_df, otc_df]).reset_index(drop=True)
    else:
        try:
            # Dev/local: pull both lists from the server API instead
            from api import serverAPI
            com_syms = serverAPI('all_symbols').df
            otc_syms = serverAPI('otc_syms').df
            df_all = pd.concat([com_syms, otc_syms]).reset_index(drop=True)
        except ModuleNotFoundError:
            help_print_arg('Tried import server api in get_all_symbols func')
    return df_all
def get_last_range(cls, self, sym):
    """Get last month of data."""
    resp = requests.get(self.url, params=self.payload)
    # If at first you don't succeed, try, try again.
    if resp.status_code != 200:
        resp = requests.get(self.url, params=self.payload)
    self.get = resp
    if resp.status_code == 200:
        try:
            df = pd.DataFrame(resp.json())
        except ValueError:
            # Scalar payload: build a single-row frame instead
            df = pd.DataFrame.from_dict(resp.json(), orient='index').T
        # self.df = dataTypes(df).df
        if os.path.isfile(self.fpath):
            # Merge with the existing local file before writing back
            prev = pd.read_parquet(self.fpath)
            df_all = pd.concat([prev, df]).reset_index(drop=True)
            write_to_parquet(df_all, self.fpath)
            self.df = df_all
        else:
            # No local file yet - write fresh
            write_to_parquet(df, self.fpath)
            self.df = df
    else:
        msg = f"IexHistV2 for {sym} get request failed with status_code {resp.status_code}"
        help_print_arg(msg)
def clean_data(cls, self):
    """Clean dataframe - remove na columns.

    Drops mostly-empty columns, extracts CIK from the title, timestamps
    rows, and hands recent rows to AnalyzeSecRss.
    """
    df = self.df
    # Drop columns that are at least 3/4 nas
    na_cutoff = (.75 * df.shape[0])
    cols_to_drop = [col for col in df.columns
                    if df[col].isna().sum() > na_cutoff]
    df.drop(columns=cols_to_drop, inplace=True)
    # Extract CIK from title column (value inside parentheses).
    # Fix: raw string — "\(" in a plain string is an invalid escape.
    try:
        df['CIK'] = df['title'].str.extract(r"\((.*?)\)")
    except Exception as e:
        help_print_arg(f"SEC RSS CIK Error: {str(e)}")
    df['dt'] = pd.to_datetime(df['pubDate'])
    # NOTE(review): name says 15 minutes but the window is 60 — confirm
    # which was intended before changing either.
    prev_15 = (datetime.now() - timedelta(minutes=60)).time()
    # Keep only today's rows inside the recent window
    sec_df = (df[(df['dt'].dt.time > prev_15)
                 & (df['dt'].dt.date == date.today())].copy())
    self.df = df.copy()
    try:
        AnalyzeSecRss(latest=True, sec_df=sec_df)
    except Exception as e:
        help_print_arg(f"SecRss: AnalyzeSecRss Error {str(e)}")
def _display_options(cls, self, category, keyword):
    """Display categorical options.

    Sets self.fpath when category+keyword resolve; otherwise exposes the
    available options on self.options.
    """
    # category name -> {keyword: fpath} mapping from FpathDicts
    cat_dict = ({
        'peers': FpathDicts.get_peers(),
        'refs': FpathDicts.symbol_ref_data(),
        'ticks': FpathDicts.intraday_tick(),
        'warrants': FpathDicts.warrants(),
        'company_stats': FpathDicts.company_stats(),
        'scans': FpathDicts.scans(),
        'sec': FpathDicts.sec(),
        'externals': FpathDicts.externals(),
        'stocktwits': FpathDicts.stocktwits(),
        'historical': FpathDicts.historical()
    })
    self.cat_dict = cat_dict
    if category in cat_dict:
        if keyword:
            if keyword in cat_dict[category]:
                self.fpath = cat_dict[category][keyword]
            else:
                # Unknown keyword: show the valid options for the category
                self.options = cat_dict[category]
        else:
            # NOTE(review): this message fires when NO keyword was passed,
            # while an unrecognized keyword gets no message — the two
            # branches look swapped; confirm intended behavior.
            help_print_arg('Could not find your keyword')
    else:
        help_print_arg('Could not find your category')
        self.options = cat_dict.keys()
def concat_and_or_write(df_all, path, path_parq=True, path_gz=False,
                        to_parq=True, to_gz=False,
                        from_parq=True, from_gz=False, verb=False):
    """Concat and write to parquet and/or gzip file.

    Bug fixes: format sniffing previously tested str(path_parq) and
    str(from_gz) — the boolean flags — instead of the path itself, and
    the parquet read branch called pd.read_parquet() with no argument.
    """
    # Infer the on-disk format from the path suffix
    if '.parquet' in str(path):
        from_parq, from_gz = True, False
    elif '.gz' in str(path):
        from_gz, from_parq = True, False
    if verb:
        help_print_arg(path)
    if Path(path).exists() and from_parq:
        # Existing parquet: read, concat, overwrite
        df_old = pd.read_parquet(path)
        df_all = pd.concat([df_old, df_all]).reset_index(drop=True)
        df_all.to_parquet(path)
    elif Path(path).exists() and from_gz:
        # Existing gzipped json: read, concat, overwrite
        df_old = pd.read_json(path, compression='gzip')
        df_all = pd.concat([df_old, df_all]).reset_index(drop=True)
        df_all.to_json(path, compression='gzip')
    else:
        # No existing file: write fresh in the requested format
        if to_parq:
            df_all.to_parquet(path)
        elif to_gz:
            df_all.to_json(path, compression='gzip')
def __init__(self, followup=False, testing=False, options=True, other=False):
    # Flags controlling which symbol universe is set up
    self.testing, self.options, self.other = testing, options, other
    self.proceed = True
    proxies = get_sock5_nord_proxies()
    if followup and self.options:
        # Second pass: only the symbols whose options pull didn't finish
        # self.sym_df = yoptions_still_needed()
        self.sym_df = get_yoptions_unfin()
    elif not followup and self.options:
        # First pass: full cboe reference list
        self.sym_df = get_cboe_ref(ymaster=True)
        # Check if no further data needed
        if self.sym_df.empty:
            self.proceed = False
    elif not followup and other == 'yinfo':
        # Yahoo info run: use the complete common+OTC symbol reference
        self.sym_df = get_all_symbol_ref()
    else:
        help_print_arg('No SetUpYahooOptions __init__ condition satisfied')
    if self.proceed:  # Default True
        # NOTE: methods are called with self passed explicitly, matching
        # the classmethod-style pattern used throughout this file
        comb_df = self.get_bins_and_combine(self, proxies)
        self.initiate_for_loop(self, comb_df)
def send_text_messages(cls, self):
    """Send text messages to myself with relevant data."""
    for sym_key, msg in self.msg_dict.items():
        if msg:
            # Non-empty message list: text it
            send_twilio_message(msg=msg)
        elif self.testing:
            # Testing mode: log instead of sending
            help_print_arg("AnalyzeSecRss: testing msg send func")
            help_print_arg(str(msg))
def get_rss_feed(cls, self):
    """Request and retry to get data from sec."""
    resp = requests.get(self.url, headers=self.headers)
    if resp.status_code >= 400:
        # One retry before giving up with a log message
        resp = requests.get(self.url, headers=self.headers)
        if resp.status_code >= 400:
            help_print_arg('SEC RSS Feed: 2nd get request failed')
    # Parse the RSS <item> elements into a dataframe
    self.df = pd.read_xml(resp.content, xpath='.//item')
def get_data(cls, self, base_url, headers):
    """Get Alpaca symbol reference data."""
    resp = requests.get(f"{base_url}/assets", headers=headers)
    if resp.status_code < 400:
        # Minimize dtypes before storing on the instance
        self.df = dataTypes(pd.DataFrame(resp.json())).df
    else:
        help_print_arg(resp.content)
def apca_get_data(cls, self, testing):
    """Start long running apca historical data request."""
    syms = self.df['symbol']
    if testing:
        # Small random sample keeps test runs quick
        kwargs = {'sym_list': syms.sample(n=10).tolist()}
        help_print_arg(kwargs)
        rate_limit(ApcaHist, testing=True, **kwargs)
    else:
        rate_limit(ApcaHist, testing=False, sym_list=syms.tolist())
    return True
def execute_iex_stats(df, testing=False):
    """Task_functions loop for individual bin."""
    # Df arrives json-encoded across the celery task boundary
    df = pd.read_json(df)
    for _, row in df.iterrows():
        try:
            get_daily_stats(row)
        except Exception as e:
            help_print_arg(
                f"Daily Stats Error: symbol - {row['symbol']} - {str(e)}")
        if testing:
            # One row is enough for a test run
            break
def yoptions_drop_hist_dupes():
    """Cycle through yoptions hist and drop duplicates."""
    dt = getDate.query('cboe')
    year_dir = Path(baseDir().path, 'derivatives/end_of_day/', str(dt.year))
    for fpath in tqdm(list(year_dir.glob('**/*.parquet'))):
        try:
            df = pd.read_parquet(fpath)
            # A contract should only appear once per date
            df.drop_duplicates(subset=['contractSymbol', 'date'],
                               inplace=True)
            write_to_parquet(df, fpath)
        except Exception as e:
            help_print_arg(e)
def __init__(self, sym, current_day=True, ytd=False, testing=False):
    # Store flags / derive the date range for the request
    self.assign_variables(self, current_day, ytd, testing)
    self.construct_fpath(self, sym)
    # Construct parameters for request
    headers, url, params = self.construct_params(self, sym)
    # Use params to get data from Alpaca. Default is today's data
    self.get_data(self, headers, url, params)
    # If file exists, concat, otherwise just clean
    self.clean_concat_data(self)
    # Write to local parquet file — only if the request produced a frame.
    # NOTE: methods are invoked with self passed explicitly, matching the
    # classmethod-style pattern used throughout this file.
    if isinstance(self.df, pd.DataFrame):
        self.write_to_parquet(self)
    else:
        help_print_arg(f"Data Collection for symbol {sym} failed")
def initiate_exec(cls, self, cs_adr):
    """Initiate execution of cs_adr loop through.

    Splits cs_adr by bin and dispatches each chunk to a celery task, or
    runs it inline when the task module is unavailable.
    """
    bins = cs_adr['bins'].unique().tolist()
    args = [cs_adr[cs_adr['bins'] == n] for n in bins]
    execute_func = None
    for arg in args:
        try:
            from app.tasks import execute_func
            kwargs = {'df': arg.to_json()}
            execute_func.delay('execute_iex_stats', **kwargs)
        except ModuleNotFoundError:
            # No celery available - run synchronously instead
            execute_iex_stats(arg.to_json())
            help_print_arg('Execute yahoo options not found')
    # 15 minutes in the future, combine all company stats info
    # All previous symbols are assumed to have data at that point.
    # Bug fix: previously this raised NameError whenever the celery
    # import failed (or no bins existed), since execute_func was unbound.
    if execute_func is not None:
        execute_func.apply_async(args=['combine_stats'], countdown=900)
def scraped_ee_dates(verbose=False, hist=False, current_year=True):
    """Start for loop of dates to get future/past analyst estimates."""
    dt = getDate.query('iex_eod')
    bdays, pos_days = None, None
    # With >15 days left in the year, this year's calendar is enough;
    # otherwise pull the extended calendar
    if (365 - dt.timetuple().tm_yday) > 15:
        bdays = getDate.get_bus_days(this_year=True)
    else:
        bdays = getDate.get_bus_days(this_year=False)
    bdays = bdays[bdays['date'].dt.year >= dt.year].copy()
    bdays['current_date'] = pd.to_datetime(getDate.query('iex_close'))
    # Business-day distance from today for each calendar date
    bdays['bday_diff'] = (getDate.get_bus_day_diff(
        bdays, 'current_date', 'date'))
    if hist and not current_year:
        # Historical mode: any past date within 15 business days
        pos_days = bdays[bdays['bday_diff'] < 15].copy()
    elif hist and current_year:
        # Historical mode restricted to the current year
        cond1 = (bdays['bday_diff'] < 15)
        cond2 = (bdays['date'].dt.year == dt.year)
        pos_days = bdays[cond1 & cond2].copy()
    else:
        # Default: upcoming dates within the next 15 business days
        pos_days = bdays[bdays['bday_diff'].between(0, 15)].copy()
    bpath = Path(baseDir().path, 'economic_data', 'analyst_earnings')
    fpath_dir = bpath.joinpath(f"_{str(dt.year)}")
    pos_days['fpath'] = (pos_days.apply(
        lambda row: f"{fpath_dir}/_{str(row['date'].date())}.parquet",
        axis=1))
    pos_days['fpath_exists'] = (pos_days['fpath'].astype(str)
                                .map(os.path.exists))
    # Only scrape dates with no local file yet
    dt_need = pos_days[~pos_days['fpath_exists']]
    dt_list = []
    # NOTE: the loop variable shadows the earlier `dt` date on purpose?
    # confirm — `dt` is not used again after the loop
    for dt in dt_need['date']:
        try:
            ScrapedEE(dt=dt.date())
            # Random delay to avoid hammering the endpoint
            sleep(randint(5, 15))
            dt_list.append(dt.date())
        except Exception as e:
            help_print_arg(f"scraped_ee_dates {type(e)} {str(e)}")
    if verbose:
        help_print_arg(str(dt_list))
def collect_rest_of_yoptions():
    """After a period of time, collect rest of data."""
    # Follow up to the first sequence of requests
    unfin_dir = Path(baseDir().path, 'derivatives/end_of_day/unfinished')
    for fpath in list(unfin_dir.glob('*.parquet')):
        df = pd.read_parquet(fpath)
        if df.empty:
            # Nothing left for this bin - drop the placeholder file
            os.remove(fpath)
        else:
            try:
                from app.tasks import execute_func
                kwargs = {'df': df.to_json()}
                execute_func.delay('execute_yoptions', **kwargs)
            except ModuleNotFoundError:
                help_print_arg('Execute yahoo options not found')
def __init__(self, sym, testing=False, last_month=False, previous=False):
    self.testing = testing
    self.last_month, self.previous = last_month, previous
    # Check if no data path exists - default get ytd
    self.check_existing(self, sym)
    self.get_iex_params(self, sym)
    if last_month or previous:
        # Explicit range requested (last month / previous session)
        self.get_last_range(self, sym)
    elif self.need_data:
        if self.need_ytd:  # If ytd data needed
            self.get_ytd(self)
        else:  # If exact dates needed
            self.get_exact_dates(self)
    else:
        msg = 'HistPricesV2: None of the __init__ conditions satisfied'
        help_print_arg(msg)
        # Raised so calling loops (e.g. local_get_data) can catch
        # NameError and log the failing symbol
        raise NameError
def _request_data_and_store(cls, self):
    """Request data and convert to dataframe. Write locally."""
    resp = requests.get(self.url, headers=self.headers,
                        params=self.payload)
    if resp.status_code < 400:
        df = pd.DataFrame(resp.json()['data']['rows'])
        # Stamp every row with the scrape date
        df['date'] = self.dt
        self.df = df
        # Cleaning step owns the local write
        CleanScrapedEE(df, self.fpath)
        # write_to_parquet(df, self.fpath)
    else:
        msg = f"Scraped EE failed with msg {str(resp.content)}"
        help_print_arg(msg)
    self.get = resp
def paths_combine_dataframes(dirs, cb_path='', cb_all_path='', verbose=False):
    """Read dataframes and combine into combined, combined_all fpaths."""
    # Read every parquet under every directory (avoids shadowing
    # the builtin `dir` while we're at it)
    frames = [pd.read_parquet(f)
              for d in dirs
              for f in d.glob('**/*.parquet')]
    df_all = pd.concat(frames)
    # "combined" holds only the most recent date's rows
    df_cb = df_all[df_all['date'] == df_all['date'].max()]
    if verbose:
        msg = f"paths_combine_dataframes {str(df_cb['date'].max())}"
        help_print_arg(msg)
    write_to_parquet(df_cb, cb_path)
    write_to_parquet(df_all, cb_all_path)
def execute_yahoo_func(df, which='yinfo', verbose=False, **kwargs): """Execute for loop. Run from tasks execute_function.""" # Df is in json format because it's being passed from a celery task df = pd.read_json(df) # If which function to execute is passed if 'which' in kwargs.keys(): which = kwargs['which'] # Define function dict and unfinished fpath dir to store unfinished symbols func_dict = {'yinfo': ysymbols_info} unfin_dict = ({ 'yinfo': 'tickers/info/unfinished', 'yoptions': 'derivatives/end_of_day/unfinished' }) # Add all index/row errors to dict for future use error_dict = {} for index, row in df.iterrows(): try: if which == 'yinfo': func_dict[which](row['symbol']) except SOCKS5AuthError as sae: # Print error help_print_arg( f"Execute Yahoo Func: Socks 5 AuthError: {str(sae)}") try: time.sleep(.5) if which == 'yinfo': func_dict[which](row['symbol']) except Exception as e: # End loop break except TypeError as te: error_dict[index] = row if verbose: help_print_arg(f"Execute yahoo func: TypeError: {str(te)}") help_print_arg(f"{str(index)}: {str(row)}") except Exception as e: error_dict[index] = row if verbose: help_print_arg(f"Execute yahoo func: Gen Excp: {str(e)}") try: # Create dataframe from error dict df_errors = pd.DataFrame.from_dict(error_dict).T df_unfin = pd.concat([df_errors, df.iloc[index:]]).copy() # Define path to write file path = Path(baseDir().path, unfin_dict[which], f"df_bin{row['bins']}.parquet") df_unfin.to_parquet(path) except UnboundLocalError: pass
def get_nasdaq_symbol_changes():
    """Get symbol change history from nasdaq.

    Writes the symbol-change table to a dated parquet file; raises when
    the request produced no dataframe.
    """
    sym_change_url = 'https://api.nasdaq.com/api/quote/list-type-extended/symbolchangehistory'
    # Browser-like headers: the nasdaq API rejects bare requests
    nasdaq_headers = ({
        'Host': 'api.nasdaq.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:94.0) Gecko/20100101 Firefox/94.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Origin': 'https://www.nasdaq.com',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Referer': 'https://www.nasdaq.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'Sec-GPC': '1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    })
    get = requests.get(sym_change_url, headers=nasdaq_headers)
    df_sym_change = None
    if get.status_code == 200:
        df_sym_change = (pd.DataFrame(
            get.json()['data']['symbolChangeHistoryTable']['rows']))
    else:
        msg1 = 'get_nasdaq_symbol_changes failed with url'
        msg2 = f"and status code {str(get.status_code)}"
        help_print_arg(f"{msg1} {sym_change_url} {msg2}")
    dt = getDate.query('iex_close')
    path = (Path(baseDir().path, 'ref_data/symbol_ref/symbol_changes',
                 f'_{dt}.parquet'))
    if isinstance(df_sym_change, pd.DataFrame):
        write_to_parquet(df_sym_change, path)
    else:
        # Bug fix: bare `raise Exception` gave callers no context at all
        raise Exception('get_nasdaq_symbol_changes: no dataframe to write')
def _file_change_loop(cls, self, fpath_list):
    """Start the for loop for file type change."""
    errors, error_paths = [], []
    # Aggressive gc thresholds to keep memory down during the loop
    gc.set_threshold(100, 5, 5)
    for fpath in tqdm(fpath_list):
        mb = os.path.getsize(str(fpath)) / 1000000
        # Skip files >= 250 mb and apca/intraday data
        skip = (mb >= 250 or 'apca' in str(fpath)
                or 'intraday' in str(fpath))
        if not skip:
            try:
                self._read_write_file(self, fpath)
            except Exception as e:
                msg = f"{type(e)} : {str(fpath)} : {str(e)}"
                help_print_arg(msg)
                error_paths.append(fpath)
                errors.append(msg)
    self.exc_list += errors
    self.fpath_exc_list += error_paths
def get_all_int_syms(cls, self):
    """Loop through exchanges and get all international symbols.

    Stores the combined result on self.all_int_syms.
    """
    df_list = []
    for exch in self.exch_df['exchange'].tolist():
        url = f"/ref-data/exchange/{exch}/symbols"
        # Bug fix: bind before try so the handler can inspect it —
        # previously msg_3 raised NameError when urlData(url) itself
        # failed, masking the real error.
        ud = None
        try:
            ud = urlData(url)
            df = ud.df.copy()
            df_list.append(df)
            sleep(.5)
        except Exception as e:
            # If error, print error and exchange to console
            msg_1 = f"IntSyms Error getting data for exchange {exch}"
            msg_2 = f" with error type: {type(e)} and error: {str(e)}"
            help_print_arg(f"{msg_1}{msg_2}")
            if ud is not None:
                msg_3 = f"Url: {url} get.status_code: {ud.get.status_code} message: {ud.get.text}"
                help_print_arg(msg_3)
    df_all = pd.concat(df_list).reset_index(drop=True)
    self.all_int_syms = df_all
def combine_all_intraday_data(minute='minute_1'):
    """Combine all intraday data, write to file."""
    dt = getDate.query('iex_eod')
    year_dir = Path(baseDir().path, 'intraday', minute, str(dt.year))
    frames = []
    for fpath in year_dir.glob('**/*.parquet'):
        try:
            frames.append(pd.read_parquet(fpath))
        except Exception as e:
            help_print_arg(f"fpath: {str(fpath)} reason: {str(e)}")
    df_all = pd.concat(frames)
    # Result goes to intraday/combined_all/<minute>/_<date>.parquet
    out_path = year_dir.parent.parent.joinpath(
        f"combined_all/{minute}/_{dt}.parquet")
    write_to_parquet(df_all, out_path)
def get_yf_loop_missing_hist(key='less_than_20', cs=False, sym_list=None,
                             verb=False, refresh_missing_dates=True):
    """Get less_than_20 syms and call GetYfMissingDates."""
    if sym_list:
        # Caller supplied the symbols directly - use them as-is
        pass
        if verb:
            help_print_arg('get_yf_loop_missing_hist: sym_list assumed')
    elif key == 'get_ignore_ytd':
        # Symbols missing more than one day of current-year data
        df_all = read_clean_combined_all()
        dt = getDate.query('iex_eod')
        df_year = df_all[df_all['date'].dt.year == dt.year].copy(deep=True)
        vc = df_year.value_counts(subset='symbol', ascending=False)
        syms_one_miss = vc[(vc < (vc.max() - 1)) & (vc > 0)].index
        sym_list = syms_one_miss.tolist()
        if verb:
            help_print_arg('get_yf_loop_missing_hist: key==get_ignore_ytd : syms_one_miss')
    elif cs is True:
        # Common-stock mode: symbols from the "all" missing-dates file
        if refresh_missing_dates:
            MissingHistDates(cs=True)
        bpath = Path(baseDir().path, "StockEOD/missing_dates/all")
        fpath = get_most_recent_fpath(bpath)
        df_dates = pd.read_parquet(fpath)
        # Get all symbols, reduce to common stock and adr's
        sym_list = df_dates['symbol'].unique().tolist()
        if verb:
            help_print_arg('get_yf_loop_missing_hist: cs=True')
    else:
        # Default: symbols from the missing-dates file for `key`
        if refresh_missing_dates:
            MissingHistDates()
        bpath = Path(baseDir().path, f"StockEOD/missing_dates/{key}")
        fpath = get_most_recent_fpath(bpath)
        df_dates = pd.read_parquet(fpath)
        sym_list = df_dates['symbol'].unique().tolist()
        if verb:
            help_print_arg('get_yf_loop_missing_hist: sym_list from missing_dates/key')
    # Pull missing history per symbol; errors are logged, not fatal
    for sym in tqdm(sym_list):
        try:
            GetYfMissingDates(sym=sym)
        except Exception as e:
            help_print_arg(f"get_yf_loop_missing_hist error: {str(e)}")