def concat_data(cls, self, df):
    """Merge `df` with any previously written parquet file, store on self.df.

    If `self.fpath` already exists, stored rows are read back, appended
    with `df`, and de-duplicated on 'symbol' (first occurrence wins, so
    previously stored rows take precedence over new ones).
    """
    combined = df
    if os.path.isfile(self.fpath):
        prior = pd.read_parquet(self.fpath)
        combined = pd.concat([prior, df])
        combined = combined.drop_duplicates(subset='symbol')
        combined = combined.reset_index(drop=True)
    # Minimize dtypes before keeping the frame around
    self.df = dataTypes(combined).df.copy(deep=True)
def clean_concat_data(cls, self):
    """Concat self.df with any previously saved file, minimize dtypes.

    Fix: previously `df` was initialized to None and only assigned when
    `self.fpath` existed, so a missing file left None flowing into
    `dataTypes(None)` (or left self.df un-minimized). Now self.df is the
    fallback, matching the behavior of `concat_data`.
    """
    df = self.df
    if os.path.isfile(self.fpath):
        df_old = pd.read_parquet(self.fpath)
        df = pd.concat([df_old, self.df])
    # Minimize data size
    self.df = dataTypes(df).df.copy(deep=True)
def _convert_to_pandas(cls, self, sh_get):
    """Parse the raw response body into a dtype-minimized DataFrame.

    `sh_get` is a requests-style response; its `.content` bytes are
    parsed as CSV, then object columns are converted to categories.
    """
    raw = BytesIO(sh_get.content)
    frame = pd.read_csv(raw, escapechar='\n',
                        delimiter=',', skipinitialspace=False)
    # Object columns -> category to cut memory before parquet write
    return dataTypes(frame, parquet=True).df
def get_data(cls, self, base_url, headers):
    """Request Alpaca asset reference data and store it on self.df.

    On an error response (status >= 400) the body is passed to
    help_print_arg instead and self.df is left untouched.
    """
    resp = requests.get(f"{base_url}/assets", headers=headers)
    if resp.status_code >= 400:
        help_print_arg(resp.content)
        return
    frame = pd.DataFrame(resp.json())
    self.df = dataTypes(frame).df
def get_full_hist(cls, self, sym_list):
    """Pull full price history for each symbol; write one parquet per year.

    Files land at {base_dir}/StockEOD/{year}/{first letter}/_{SYM}.parquet.
    """
    for sym in sym_list:
        url = f"{self.base_url}/stock/{sym}/chart/{self.date_range}"
        resp = requests.get(url, params=self.payload)
        df = pd.DataFrame(resp.json())
        df['year'] = pd.to_datetime(df['date']).dt.year
        # Split into one frame (and one file) per calendar year
        for yr in df['year'].value_counts().index.tolist():
            yearly = df[df['year'] == yr].copy(deep=True)
            yearly.drop(columns=['year'], inplace=True)
            yearly.reset_index(drop=True, inplace=True)
            yearly = dataTypes(yearly).df
            fpath = f"{self.base_dir}/StockEOD/{yr}/{sym.lower()[0]}/_{sym}.parquet"
            write_to_parquet(yearly, fpath)
def symref_format(cls, self, df):
    """Format symbol reference data.

    Splits the 15-char OSI suffix into side/strike/expiration parts,
    derives yr/mo/day columns, renames 'Cboe Symbol' -> 'Symbol', and
    minimizes dtypes for parquet. Returns the formatted DataFrame.
    """
    df['sym_suf'] = df['OSI Symbol'].str[-15:]
    df['side'] = df['sym_suf'].str[6]
    # NOTE(review): OSI strikes carry 3 decimal digits at positions
    # 12-14 of the suffix; this keeps only position 13 (skipping 12) —
    # confirm that truncation is intentional.
    df['strike'] = (df['sym_suf'].str[7:12] + '.' + df['sym_suf'].str[13])
    df['expirationDate'] = df['sym_suf'].str[0:6]
    # Fix: the old try/except KeyError aborted the ENTIRE drop when any
    # one column was missing (leaving the helper 'sym_suf' behind);
    # errors='ignore' drops whichever of these actually exist.
    df.drop(columns=['Closing Only', 'Matching Unit', 'sym_suf'],
            inplace=True, errors='ignore')
    df['yr'] = df['expirationDate'].str[0:2]
    df['mo'] = df['expirationDate'].str[2:4]
    df['day'] = df['expirationDate'].str[4:6]
    df.rename(columns={'Cboe Symbol': 'Symbol'}, inplace=True)
    df = dataTypes(df, parquet=True).df.copy()
    return df
def _get_clean_data(cls, self, sym, period, interval, proxy):
    """Download OHLCV history from yfinance, rename to f* columns, store df_yf.

    Rows dated after the latest IEX end-of-day date are dropped so the
    yfinance frame lines up with the IEX data.
    """
    data = yf.download(tickers=sym, period=period, interval=interval,
                       group_by='ticker', auto_adjust=True,
                       prepost=False, proxy=proxy)
    df = data.reset_index()
    df.insert(1, 'symbol', sym)
    rename_map = {'Date': 'date', 'Open': 'fOpen', 'High': 'fHigh',
                  'Low': 'fLow', 'Close': 'fClose', 'Volume': 'fVolume'}
    df.rename(columns=rename_map, inplace=True)
    df = dataTypes(df, parquet=True).df
    cutoff = getDate.query('iex_eod')
    self.df_yf = df[df['date'].dt.date <= cutoff]
def merge_dfs(cls, self):
    """Inner-merge the mmo and symref frames; empty frame on TypeError.

    Joins on Symbol/Underlying, plus 'exchange' when both frames have
    it, stamps a 'rptDate' column, and minimizes dtypes.
    """
    join_cols = ['Symbol', 'Underlying']
    both_have_exchange = ('exchange' in self.mmo_df.columns
                          and 'exchange' in self.sym_df.columns)
    if both_have_exchange:
        join_cols.append('exchange')
    try:
        df = pd.merge(self.mmo_df, self.sym_df,
                      on=join_cols, how='inner')
        df.reset_index(inplace=True, drop=True)
        df['rptDate'] = getDate.query('cboe')
        # Change data types to reduce file size
        df = dataTypes(df, parquet=True).df
    except TypeError:
        df = pd.DataFrame()
    return df
def format_data(cls, self):
    """Trim unused stocktwits columns, minimize dtypes, stamp pull time."""
    unused = ['is_following', 'title', 'aliases']
    self.rec_df.drop(columns=unused, inplace=True)
    self.rec_df = dataTypes(self.rec_df).df
    # Record when the data was pulled (US/Eastern)
    self.rec_df['timestamp'] = pd.Timestamp.now('US/Eastern')
def _conver_cols_nopop(cls, self):
    """Minimize dtypes of the not-popular dataframe, in place on self.df.

    (Name keeps the existing 'conver' spelling — callers depend on it.)
    """
    minimized = dataTypes(self.df, parquet=True)
    self.df = minimized.df
def read_clean_combined_all(local=False, dt=None, filter_syms=True):
    """Read, clean, and add columns to StockEOD combined all.

    Parameters
    ----------
    local : bool
        If True, read the most recent local combined_all parquet;
        otherwise pull via serverAPI and append the 2015-2020 history.
    dt : date-like or None
        Cutoff anchor; defaults to getDate.query('iex_eod'). Only rows
        dated on/after Jan 1 of dt's year are kept.
    filter_syms : bool
        Server path only: restrict to symbols returned by
        remove_funds_spacs().

    Returns
    -------
    DataFrame sorted by symbol/date with derived columns (fRange,
    vol/mil, fChangeP, gap, gRange, cumPerc, perc5, vol_avg_2m, fCP5,
    RSI, moving averages, fHighMax) and parquet-minimized dtypes.
    """
    df_all = None

    if local:
        bpath = Path(baseDir().path, 'StockEOD/combined_all')
        fpath = get_most_recent_fpath(bpath)
        cols_to_read = ['date', 'symbol', 'fOpen', 'fHigh',
                        'fLow', 'fClose', 'fVolume']
        df_all = pd.read_parquet(fpath, columns=cols_to_read)
        # Older files may store 'date' as object/strings
        if df_all['date'].dtype == 'object':
            df_all['date'] = pd.to_datetime(df_all['date'])
        df_all.drop_duplicates(subset=['symbol', 'date'], inplace=True)
    else:
        cols_to_read = ['date', 'symbol', 'fOpen', 'fHigh',
                        'fLow', 'fClose', 'fVolume']
        df_all = serverAPI('stock_close_cb_all').df
        df_all = df_all[cols_to_read]

        if filter_syms:
            # Keep only common-stock symbols (drop funds/SPACs)
            all_cs_syms = remove_funds_spacs()
            df_all = df_all[df_all['symbol'].isin(all_cs_syms['symbol'])].copy()

        df_all['date'] = pd.to_datetime(df_all['date'])

        # Define base bpath for 2015-2020 stock data
        # NOTE(review): 'each_sym_all' appears twice in the joined path
        # below — confirm the directory layout really nests it twice.
        bpath = Path(baseDir().path, 'historical/each_sym_all')
        path = get_most_recent_fpath(bpath.joinpath('each_sym_all', 'combined_all'))
        df_hist = pd.read_parquet(path)
        # Combine 2015-2020 stock data with ytd
        df_all = pd.concat([df_hist, df_all]).copy()
        df_all.drop_duplicates(subset=['symbol', 'date'], inplace=True)
        df_all.reset_index(drop=True, inplace=True)

    if not dt:
        dt = getDate.query('iex_eod')
    # Exclude all dates from before this year (str(dt.year) compares as
    # a parsed timestamp against the datetime64 'date' column)
    df_all = (df_all[df_all['date'] >= str(dt.year)].dropna(
        subset=['fVolume']).copy())

    # Get rid of all symbols that only have 1 day of data
    df_vc = df_all['symbol'].value_counts()
    df_vc_1 = df_vc[df_vc == 1].index.tolist()
    df_all = (df_all[~df_all['symbol'].isin(df_vc_1)].reset_index(
        drop=True).copy())

    # Sort by symbol, date ascending
    df_all = df_all.sort_values(by=['symbol', 'date'], ascending=True)
    df_all['fRange'] = (df_all['fHigh'] - df_all['fLow'])
    df_all['vol/mil'] = (df_all['fVolume'].div(1000000))
    # prev_* columns shift across symbol boundaries; most derived
    # columns below guard with symbol == prev_symbol, but gRange does
    # not — see note there.
    df_all['prev_close'] = df_all['fClose'].shift(periods=1, axis=0)
    df_all['prev_symbol'] = df_all['symbol'].shift(periods=1, axis=0)

    # Add fChangeP col
    print('Fib funcs: adding fChangeP column')
    df_all = add_fChangeP_col(df_all)

    # Add gap column
    print('Fib funcs: adding gap column')
    df_all = add_gap_col(df_all)

    # Add range of gap
    # NOTE(review): prev_close on a symbol's first row belongs to the
    # PREVIOUS symbol (no symbol == prev_symbol guard here) — confirm
    # this is intended.
    df_all['gRange'] = (np.where(df_all['prev_close'] < df_all['fLow'],
                                 df_all['fHigh'] - df_all['prev_close'],
                                 df_all['fHigh'] - df_all['fLow']))
    # Cumulative percent change, NaN on each symbol's first row
    df_all['cumPerc'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                                 df_all['fChangeP'].cumsum(), np.NaN)
    # Forward-looking 5-row percent move
    df_all['perc5'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                               df_all['cumPerc'].shift(-5) - df_all['cumPerc'],
                               np.NaN)
    # ~2-month (60-row) rolling average volume
    df_all['vol_avg_2m'] = np.where(df_all['symbol'] == df_all['prev_symbol'],
                                    df_all['fVolume'].rolling(60).mean(),
                                    np.NaN)
    # Add cumulative sum of last 5 fChangeP rows
    df_all['fCP5'] = (np.where(
        df_all['symbol'] == df_all['prev_symbol'],
        df_all['fChangeP'].rolling(min_periods=1, window=5).sum(), 0))
    # De-fragment after many column inserts
    df_all = df_all.copy()

    # Calc RSI and moving averages
    print('Fib Funcs: calc_rsi')
    df_all = calc_rsi(df_all)
    print('Fib Funcs: making_moving_averages')
    df_all = make_moving_averages(df_all)

    # fHighMax funcs
    print('Fib funcs: fHighMax')
    df_all = add_fHighMax_col(df_all).copy()

    df_all = df_all.sort_values(by=['symbol', 'date'], ascending=True)

    # Round float32 columns at float64 precision before the final
    # dtype-minimization pass
    float_32s = df_all.dtypes[df_all.dtypes == np.float32].index
    for col in float_32s:
        df_all[col] = df_all[col].astype(np.float64).round(3)

    df_all = dataTypes(df_all, parquet=True).df.copy()

    return df_all