import simfin as sf
from pandas.testing import assert_frame_equal


def test_equal_subsets():
    """
    Test the signals are identical when calculated for subsets of the tickers.
    """
    # Calculate and compare signals for these tickers.
    ticker = 'SAVE'
    tickers = ['FL', 'SAVE', 'TLYS']

    # Used for calculating signals for ALL tickers.
    hub1 = sf.StockHub()

    # Used for calculating signals for SOME tickers.
    hub2 = sf.StockHub(tickers=tickers)

    # Used for calculating signals for ONE ticker.
    hub3 = sf.StockHub(tickers=[ticker])

    # Helper-function to perform the actual signal calculation and comparison.
    def _test(func_name, variant, func=None):
        # Get the object-methods for the function with the given name.
        signal_func1 = getattr(hub1, func_name)
        signal_func2 = getattr(hub2, func_name)
        signal_func3 = getattr(hub3, func_name)

        # Calculate the signals.
        df_signals1 = signal_func1(variant=variant, func=func)
        df_signals2 = signal_func2(variant=variant, func=func)
        df_signals3 = signal_func3(variant=variant, func=func)

        # Compare the signals and ensure they are identical.
        assert_frame_equal(df_signals1.loc[tickers], df_signals2)
        assert_frame_equal(df_signals1.loc[ticker], df_signals3.loc[ticker])
        assert_frame_equal(df_signals2.loc[ticker], df_signals3.loc[ticker])

    # Test for the different signal-functions, variants and functions.
    for func_name in ['val_signals', 'fin_signals', 'growth_signals']:
        for variant in ['daily', 'latest']:
            for func in [None, sf.avg_ttm_2y]:
                _test(func_name=func_name, variant=variant, func=func)
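# The subset-consistency checks above hinge on pandas' assert_frame_equal,
# which raises an AssertionError describing the first mismatch it finds.
# A minimal self-contained sketch of the same property (the frame and its
# values are made up for illustration):
import pandas as pd
from pandas.testing import assert_frame_equal

df_a = pd.DataFrame({'ROA': [0.1, 0.2, 0.3]},
                    index=['FL', 'SAVE', 'TLYS'])

# Selecting the same rows twice must give identical frames, which is the
# property the test verifies across the three StockHub instances.
assert_frame_equal(df_a.loc[['FL', 'SAVE']], df_a.loc[['FL', 'SAVE']])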
# Add this date-offset to the fundamental data such as
# Income Statements etc., because the REPORT_DATE is not
# when it was actually made available to the public,
# which can be 1, 2 or even 3 months after the Report Date.
offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.)
# every 30 days.
refresh_days = 30

# Refresh the dataset with shareprices every 10 days.
refresh_days_shareprices = 10

hub = sf.StockHub(market=market, offset=offset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

# Combine the DataFrames.
dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

# Remove all rows with only NaN values.
df = df_signals.dropna(how='all').reset_index(drop=True)

# List of the columns before removing any.
columns_before = df_signals.columns
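# A quick sketch of what the 60-day offset does: it shifts each report
# date forward, so fundamental data is only treated as "known" once the
# report was plausibly available to the public. The dates are made up
# for illustration:
import pandas as pd

report_dates = pd.to_datetime(['2018-03-31', '2018-06-30'])
public_dates = report_dates + pd.DateOffset(days=60)
print(public_dates)  # ['2018-05-30', '2018-08-29']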
import pandas as pd
import simfin as sf
# Used for column names.
from simfin.names import *

# SimFin params.
# Where all the CSVs are stored.
sf.set_data_dir('./simfin_data/')

# SimFin needs an API key (using SimFin's free key, so we only get
# stock data from before 2018).
sf.load_api_key(path='~/simfin_api_key.txt', default_key='free')

# Offset the fundamental data by 50 days; the SimFin documentation
# also recommends doing this.
dateOffset = pd.DateOffset(days=50)
refresh_days = 25
refresh_days_shareprices = 14

# Data collection from SimFin.
hub = sf.StockHub(market='us', offset=dateOffset,
                  refresh_days=refresh_days,
                  refresh_days_shareprices=refresh_days_shareprices)

# Create a pandas DataFrame for each kind of SimFin signal.
growthSignalsDf = hub.growth_signals(variant='daily')
valueSignalsDf = hub.val_signals(variant='daily')
financialSignalsDf = hub.fin_signals(variant='daily')

# Combine the 3 DataFrames into one big DataFrame.
dfs = [financialSignalsDf, growthSignalsDf, valueSignalsDf]
signalsDf = pd.concat(dfs, axis=1)

# Drop the rows where all elements are missing.
signalsDf.dropna(how='all').head()
df = signalsDf.dropna(how='all').reset_index(drop=True)

# Columns must have at least 80% non-NULL values; any that don't are
# dropped (scikit-learn cannot work well with lots of missing data).
thresh = 0.80 * len(signalsDf.dropna(how='all'))
signalsDf = signalsDf.dropna(axis='columns', thresh=thresh)
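# How the `thresh` argument works: dropna(axis='columns', thresh=N) keeps
# only columns with at least N non-NaN values, so 0.80 * len(...) enforces
# the 80% rule above. A tiny sketch with made-up data:
import numpy as np
import pandas as pd

dfDemo = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                       'b': [1, np.nan, np.nan, np.nan, 5]})
threshDemo = 0.80 * len(dfDemo)  # Requires 4 non-NaN values per column.

# Column 'b' has only 2 non-NaN values, so it is dropped.
print(dfDemo.dropna(axis='columns', thresh=threshDemo).columns.tolist())  # ['a']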
df_all = df_all.merge(
    df_companies.merge(df_industries, how='left', on=INDUSTRY_ID)
                .set_index(SIMFIN_ID),
    how='left', on=SIMFIN_ID).drop(TICKER + '_y', axis=1)
df_all[TICKER] = df_all[TICKER + '_x']
df_all = df_all.drop(TICKER + '_x', axis=1)
df_all = df_all.set_index([TICKER, REPORT_DATE])

fin_sig_list = list()
for mkt in market_list:
    hub = sf.StockHub(market=mkt, refresh_days=30,
                      refresh_days_shareprices=1)
    # Load the financial signals for the current market.
    fin_sig_list.append(hub.fin_signals(variant='quarterly'))
df_all = pd.concat([df_all, pd.concat(fin_sig_list)], axis=1)

fin_price_list = list()
for mkt in market_list:
    hub = sf.StockHub(market=mkt, refresh_days=30,
                      refresh_days_shareprices=1)
    # Load the share prices for the current market.
    fin_price_list.append(hub.load_shareprices(variant='daily'))
fin_prices = pd.concat(fin_price_list)

df_all = df_all.reset_index(level=[REPORT_DATE])
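# Why the '_x'/'_y' cleanup above is needed: when both sides of a merge
# carry the TICKER column, pandas suffixes the duplicates. A minimal
# sketch with made-up frames (plain strings stand in for the
# simfin.names constants):
import pandas as pd

left = pd.DataFrame({'SimFinId': [1], 'Ticker': ['FL']})
right = pd.DataFrame({'SimFinId': [1], 'Ticker': ['FL'], 'Sector': ['Retail']})
merged = left.merge(right, how='left', on='SimFinId')
print(merged.columns.tolist())  # ['SimFinId', 'Ticker_x', 'Ticker_y', 'Sector']

# Keep one copy and drop the suffixed duplicates, as done above.
merged['Ticker'] = merged['Ticker_x']
merged = merged.drop(['Ticker_x', 'Ticker_y'], axis=1)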
# Set the local directory where data-files are stored.
sf.set_data_dir('C:/Users/think/Desktop/UVA/2020 Spring/STAT 4996 Capstone/python code/simfin_data/')

# Set up the API key.
sf.set_api_key(api_key='free')

# Set the plotting style.
sns.set_style("whitegrid")

#-------------------------- Load data -----------------------------

offset = pd.DateOffset(days=60)

# Refresh the fundamental datasets (Income Statements etc.) every 30 days.
# Refresh the dataset with shareprices every 10 days.
hub = sf.StockHub(market='us', offset=offset,
                  refresh_days=30,
                  refresh_days_shareprices=10)

# Calculate the signal data.
df_fin_signals = hub.fin_signals(variant='daily')
df_growth_signals = hub.growth_signals(variant='daily')
df_val_signals = hub.val_signals(variant='daily')

dfs = [df_fin_signals, df_growth_signals, df_val_signals]
df_signals = pd.concat(dfs, axis=1)

#----------------------- Missing Data ----------------------------------

# First remove all rows with only NaN values.
df = df_signals.dropna(how='all').reset_index(drop=True)

# For each column, show the fraction of the rows that are NaN.
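# A sketch of that per-column NaN fraction, continuing with `df` from
# above (sorted so the sparsest columns come first):
nan_fraction = df.isnull().sum() / len(df)
print(nan_fraction.sort_values(ascending=False).head(10))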