def test_translate_currency_real(self, ticker):
    """Translated currency columns should stay within 10% of the reported *usd columns."""
    cols = ['equity', 'eps', 'revenue', 'netinccmn',
            'cashneq', 'debt', 'ebit', 'ebitda']
    loader = SF1Data(config['sf1_data_path'])
    quarterly_df = loader.load_quarterly_data(ticker, 10)
    translated = SF1Data.translate_currency(quarterly_df, cols)
    for name in cols:
        usd_col = '{}usd'.format(name)
        # relative error against the reported usd value; NaNs (missing data) ignored
        rel_err = (translated[usd_col] - translated[name]).values \
            / translated[usd_col].values
        rel_err = np.abs(rel_err)
        rel_err = rel_err[~np.isnan(rel_err)]
        assert rel_err.max() < 0.1
def test_load_base_data(self):
    """load_base_data returns a non-empty DataFrame with a fully populated ticker column."""
    data_loader = SF1Data(config['sf1_data_path'])
    df = data_loader.load_base_data()
    # isinstance is the idiomatic type check (type(...) == ... fails on subclasses)
    assert isinstance(df, pd.DataFrame)
    assert len(df) > 0
    assert 'ticker' in df.columns
    # no ticker value may be missing (clearer than `.isnull().max() == False`)
    assert not df['ticker'].isnull().any()
def test_calculate(self, tickers, columns, agg_day_counts, max_back_quarter):
    """Shape and per-window statistic-ordering checks for DailyAggQuarterFeatures."""
    feat_calc = DailyAggQuarterFeatures(
        columns=columns,
        agg_day_counts=agg_day_counts,
        max_back_quarter=max_back_quarter)
    loader = SF1Data(config['sf1_data_path'])
    X = feat_calc.calculate(loader, tickers)

    assert type(X) == pd.DataFrame
    for level in ('ticker', 'date'):
        assert level in X.index.names
    # at most max_back_quarter rows per requested ticker
    assert X.shape[0] <= max_back_quarter * len(tickers)
    expected_width = len(calc_series_stats([])) * len(columns) * len(agg_day_counts)
    assert X.shape[1] == expected_width

    # min <= mean/median <= max must hold inside every aggregation window
    for count in agg_day_counts:
        for col in columns:
            lo = X['days{}_{}_min'.format(count, col)]
            hi = X['days{}_{}_max'.format(count, col)]
            avg = X['days{}_{}_mean'.format(count, col)]
            med = X['days{}_{}_median'.format(count, col)]
            assert (hi >= lo).min()
            assert (hi >= avg).min()
            assert (hi >= med).min()
            assert (avg >= lo).min()
            assert (med >= lo).min()
def test_calculate(self, tickers):
    """QuarterlyTarget must return the marketcap of the (shifted) quarter row."""
    loader = SF1Data(config['sf1_data_path'])
    quarterly_df = loader.load_quarterly_data(tickers, quarter_count=None)

    # shift=0 on the newest row of every ticker: y is exactly that row's marketcap
    target = QuarterlyTarget('marketcap', quarter_shift=0)
    first_rows = quarterly_df.drop_duplicates('ticker', keep='first') \
        [['ticker', 'date', 'marketcap']]
    y = target.calculate(loader, first_rows[['ticker', 'date']])
    assert type(y) == pd.DataFrame
    assert 'y' in y.columns
    np.testing.assert_array_equal(y['y'].values, first_rows['marketcap'].values)

    # full frame, several shifts: y equals the per-ticker shifted marketcap
    info_df = quarterly_df[['ticker', 'date', 'marketcap']]
    for shift in (0, 1, -3):
        target = QuarterlyTarget('marketcap', quarter_shift=shift)
        y = target.calculate(loader, info_df)
        if shift == 0:
            expected = info_df['marketcap'].values
        else:
            expected = info_df.groupby('ticker')['marketcap'] \
                .shift(shift).astype('float').values
        np.testing.assert_array_equal(y['y'].values, expected)
def test_calculate(self, tickers, cat_columns):
    """BaseCompanyFeatures: encoders are fitted on first calculate and reused afterwards."""
    loaders = [
        Data(columns=[], cat_columns=cat_columns, tickers=tickers),
        SF1Data(config['sf1_data_path'])
    ]
    # iterate the list directly: it is never mutated, so the defensive
    # copy (`loaders[:]`) in the original was pointless
    for data_loader in loaders:
        fc = BaseCompanyFeatures(cat_columns=cat_columns)
        X = fc.calculate(data_loader, tickers)
        assert type(X) == pd.DataFrame
        assert 'ticker' in X.index.names

        base_data = data_loader.load_base_data()
        for col in cat_columns:
            # every category present in the data must be known to the fitted encoder
            assert len(base_data[col].unique()) == \
                len(fc.col_to_encoder[col].classes_)

        # Reuse fitted fc after the first calculate
        for col in cat_columns:
            assert col in fc.col_to_encoder

        new_X = fc.calculate(data_loader, tickers)
        for col in cat_columns:
            assert (new_X[col] == X[col]).min()

        # a wrapped loader must yield identical encodings
        wd = WrapData(data_loader, tickers)
        new_X = fc.calculate(wd, tickers)
        for col in cat_columns:
            assert (new_X[col] == X[col]).min()
def test_calculate(self, tickers, columns, quarter_counts, max_back_quarter):
    """QuarterlyFeatures sanity checks: output shape and statistic consistency."""
    fc = QuarterlyFeatures(columns=columns,
                           quarter_counts=quarter_counts,
                           max_back_quarter=max_back_quarter)
    loaders = [Data(columns), SF1Data(config['sf1_data_path'])]
    for data_loader in loaders:
        X = fc.calculate(data_loader, tickers)
        assert type(X) == pd.DataFrame
        assert 'ticker' in X.index.names
        assert 'date' in X.index.names
        if type(data_loader) == Data:
            # synthetic loader has no gaps, so the row count is exact
            assert X.shape[0] == max_back_quarter * len(tickers)
        else:
            assert X.shape[0] <= max_back_quarter * len(tickers)
        assert X.shape[1] == 2 * len(calc_series_stats([])) * \
            len(columns) * len(quarter_counts)

        # Window monotonicity: enlarging the quarter window can only
        # lower (or keep) the min and raise (or keep) the max.
        # (sort computed once; the original duplicated it and the loop.)
        sorted_quarter_counts = np.sort(quarter_counts)
        for col in columns:
            for k in range(len(sorted_quarter_counts) - 1):
                lower_count = sorted_quarter_counts[k]
                higher_count = sorted_quarter_counts[k + 1]
                l_min = 'quarter{}_{}_min'.format(lower_count, col)
                h_min = 'quarter{}_{}_min'.format(higher_count, col)
                assert (X[h_min] <= X[l_min]).min()
                l_max = 'quarter{}_{}_max'.format(lower_count, col)
                h_max = 'quarter{}_{}_max'.format(higher_count, col)
                assert (X[h_max] >= X[l_max]).min()

        # standard deviation can never be negative
        std_cols = [x for x in X.columns if '_std' in x]
        for col in std_cols:
            assert X[col].min() >= 0

        # min <= mean/median <= max inside every window
        for col in columns:
            for count in quarter_counts:
                min_col = 'quarter{}_{}_min'.format(count, col)
                max_col = 'quarter{}_{}_max'.format(count, col)
                mean_col = 'quarter{}_{}_mean'.format(count, col)
                median_col = 'quarter{}_{}_median'.format(count, col)
                assert (X[max_col] >= X[min_col]).min()
                assert (X[max_col] >= X[mean_col]).min()
                assert (X[max_col] >= X[median_col]).min()
                assert (X[mean_col] >= X[min_col]).min()
                assert (X[median_col] >= X[min_col]).min()
def test_export_load(self, tmpdir):
    """A pipeline exported to disk must reproduce its predictions after reload."""
    data_loader = SF1Data(config['sf1_data_path'])
    tickers_df = data_loader.load_base_data(
        currency='USD', scalemarketcap=['5 - Large'])
    tickers = tickers_df['ticker'].unique().tolist()[:100]

    pipeline = self._create_pipeline()
    # fit for its side effect only; the returned metrics are not under test here
    pipeline.fit(data_loader, tickers)
    df = pipeline.execute(data_loader, tickers)

    # round-trip through export_core/load (dropped leftover debug assert)
    pipeline.export_core('{}/pipeline'.format(str(tmpdir)))
    restored = BasePipeline.load('{}/pipeline.pickle'.format(str(tmpdir)))
    df1 = restored.execute(data_loader, tickers)
    np.testing.assert_array_equal(df['y'].values, df1['y'].values)
def test_calculate(self, tickers):
    """QuarterlyDiffTarget(norm=False) is the plain difference with the previous quarter row."""
    loader = SF1Data(config['sf1_data_path'])
    quarterly_df = loader.load_quarterly_data(tickers, quarter_count=None)
    info_df = quarterly_df[['ticker', 'date', 'marketcap']]

    target = QuarterlyDiffTarget('marketcap', norm=False)
    y = target.calculate(loader, info_df)
    assert type(y) == pd.DataFrame
    assert 'y' in y.columns
    assert len(y) == len(info_df)

    # rows come newest-first per ticker, so shift(-1) is the previous quarter
    current = info_df['marketcap'].astype('float')
    previous = info_df.groupby('ticker')['marketcap'].shift(-1).astype('float')
    np.testing.assert_array_equal(y['y'].values, (current - previous).values)
def test_fit_execute(self):
    """End-to-end: fit on 800 large-cap tickers, execute on the held-out rest."""
    data_loader = SF1Data(config['sf1_data_path'])
    base_df = data_loader.load_base_data(
        currency='USD', scalemarketcap=['5 - Large'])
    tickers = base_df['ticker'].unique().tolist()
    train_tickers, test_tickers = tickers[:800], tickers[800:]

    pipeline = self._create_pipeline()
    fit_result = pipeline.fit(data_loader, train_tickers)
    assert type(fit_result) == dict
    assert fit_result['metric'] < 0.5

    df = pipeline.execute(data_loader, test_tickers)
    assert type(df) == pd.DataFrame
    assert 'y' in df
    assert df['y'].mean() > 0

    # predictions line up one-to-one with the feature rows
    X = pipeline.core['feature'].calculate(data_loader, test_tickers)
    assert len(X) == len(df)
def test_translate_currency_synthetic(self, cnt):
    """Currency translation recovers *usd values on synthetic data within 10% rel. error."""
    np.random.seed(0)
    # NOTE: the order of np.random calls below is deliberate — it fixes the
    # generated sequence under the seed.
    rates = np.array(range(1, cnt + 1))
    df = pd.DataFrame()
    # exact rate relation
    df['debtusd'] = np.random.uniform(-1e5, 1e5, cnt)
    df['debt'] = df['debtusd'] * rates
    # noisy rate relation
    df['ebitusd'] = np.random.uniform(-10, 10, cnt)
    noise = np.random.uniform(-0.1, 0.1, cnt)
    df['ebit'] = df['ebitusd'] * (rates + noise)
    # randomly blank out part of the usd column
    del_proba = np.random.uniform(0, 0.3)
    drop_mask = np.random.choice([True, False], cnt,
                                 p=[del_proba, 1 - del_proba])
    df.loc[drop_mask, 'ebitusd'] = None

    trans_df = SF1Data.translate_currency(df, ['debt', 'ebit'])
    for col in ('debt', 'ebit'):
        usd = trans_df['{}usd'.format(col)]
        rel_err = np.abs((usd - trans_df[col]).values / usd.values)
        rel_err = rel_err[~np.isnan(rel_err)]
        assert rel_err.max() < 0.1
def test_calculate(self, tickers, columns, compare_quarter_idxs, max_back_quarter):
    """QuarterlyDiffFeatures output-shape checks on synthetic and real loaders."""
    fc = QuarterlyDiffFeatures(
        columns=columns,
        compare_quarter_idxs=compare_quarter_idxs,
        max_back_quarter=max_back_quarter)
    for data_loader in (Data(columns), SF1Data(config['sf1_data_path'])):
        X = fc.calculate(data_loader, tickers)
        assert type(X) == pd.DataFrame
        assert 'ticker' in X.index.names
        assert 'date' in X.index.names
        max_rows = max_back_quarter * len(tickers)
        if type(data_loader) == Data:
            # synthetic loader has no gaps, so the row count is exact
            assert X.shape[0] == max_rows
        else:
            assert X.shape[0] <= max_rows
        assert X.shape[1] == len(compare_quarter_idxs) * len(columns)
def test_load_quarterly_data(self, tickers, quarter_count, dimension):
    """Quarterly data is per-ticker, date-descending and capped at quarter_count rows."""
    data_loader = SF1Data(config['sf1_data_path'])
    quarterly_df = data_loader.load_quarterly_data(tickers, quarter_count,
                                                   dimension)
    assert type(quarterly_df) == pd.DataFrame
    assert 'ticker' in quarterly_df.columns
    assert 'date' in quarterly_df.columns

    # Data should be ordered by date inside ticker
    quarterly_df['date_'] = quarterly_df['date'].astype(np.datetime64)
    quarterly_df['def_order'] = range(len(quarterly_df))[::-1]
    expected_dates_order = quarterly_df.sort_values(
        ['ticker', 'date_'], ascending=False)['date'].values
    real_dates_order = quarterly_df.sort_values(
        ['ticker', 'def_order'], ascending=False)['date'].values
    np.testing.assert_array_equal(expected_dates_order, real_dates_order)

    # The row-count cap only applies when a limit was requested; guard
    # against quarter_count=None (mirrors the back_days guard in
    # test_load_daily_data — `cnt <= None` raises TypeError on py3).
    if quarter_count is not None:
        for cnt in quarterly_df.groupby('ticker').size():
            assert cnt <= quarter_count

    assert (quarterly_df['dimension'] == dimension).min()
def test_calculate(self, tickers):
    """FeatureMerger keeps the left frame's rows/index and appends right columns."""
    loader = SF1Data(config['sf1_data_path'])
    fc1 = QuarterlyFeatures(columns=['ebit'],
                            quarter_counts=[2],
                            max_back_quarter=10)
    fc2 = QuarterlyDiffFeatures(columns=['ebit', 'debt'],
                                compare_quarter_idxs=[1, 4],
                                max_back_quarter=10)
    fc3 = BaseCompanyFeatures(cat_columns=['sector', 'sicindustry'])

    X1 = fc1.calculate(loader, tickers)
    X2 = fc2.calculate(loader, tickers)
    X3 = fc3.calculate(loader, tickers)

    merged_on_date = FeatureMerger(fc1, fc2, on=['ticker', 'date']) \
        .calculate(loader, tickers)
    merged_on_ticker = FeatureMerger(fc1, fc3, on='ticker') \
        .calculate(loader, tickers)

    for merged, right in ((merged_on_date, X2), (merged_on_ticker, X3)):
        # left-join semantics: row count and index come from the left frame
        assert merged.shape[0] == X1.shape[0]
        assert merged.shape[1] == X1.shape[1] + right.shape[1]
        assert (merged.index == X1.index).min()
        # the leading columns must be an exact copy of the left frame
        for new_col, old_col in zip(merged.columns[:X1.shape[1]], X1.columns):
            assert (merged[new_col] == X1[old_col]).min()
def test_load_daily_data(self, tickers, back_days):
    """Daily data is date-descending per ticker, hole-free, and back_days-limited."""
    loader = SF1Data(config['sf1_data_path'])
    daily_df = loader.load_daily_data(tickers, back_days=back_days)
    assert type(daily_df) == pd.DataFrame
    assert 'ticker' in daily_df.columns
    assert 'date' in daily_df.columns

    # Rows should already be ordered by date inside each ticker
    daily_df['date_'] = daily_df['date'].astype(np.datetime64)
    daily_df['def_order'] = range(len(daily_df))[::-1]
    expected_order = daily_df.sort_values(['ticker', 'date_'],
                                          ascending=False)['date'].values
    actual_order = daily_df.sort_values(['ticker', 'def_order'],
                                        ascending=False)['date'].values
    np.testing.assert_array_equal(expected_order, actual_order)

    # No gap between consecutive rows of a ticker may exceed two weeks
    gaps = daily_df.groupby('ticker')['date_'].shift(1) - daily_df['date_']
    assert (gaps.dropna() <= np.timedelta64(14, 'D')).min()

    # When a limit was requested, every ticker returns exactly that many rows
    if back_days is not None:
        for cnt in daily_df.groupby('ticker').size():
            assert cnt == back_days
import pytest import pandas as pd import numpy as np from data import SF1Data from features import calc_series_stats, QuarterlyFeatures, BaseCompanyFeatures,\ QuarterlyDiffFeatures, FeatureMerger, \ DailyAggQuarterFeatures from utils import load_json, int_hash_of_str from synthetic_data import GeneratedData config = load_json('config.json') loaders = [GeneratedData()] if config['sf1_data_path'] is not None: loaders.append(SF1Data(config['sf1_data_path'])) @pytest.mark.parametrize(["series", "norm", "expected"], [([10, 0, 1], False, { '_mean': 3.6666666666666665, '_median': 1.0, '_max': 10.0, '_min': 0.0, '_std': 4.4969125210773475 }), ([10, -30, 1, 4, 15.2], False, { '_mean': 0.039999999999999855, '_median': 4.0, '_max': 15.2, '_min': -30.0, '_std': 15.798936673080249 }),
QuarterlyDiffFeatures, DailyAggQuarterFeatures from targets import DailyAggTarget from models import TimeSeriesOOFModel, AnsambleModel from metrics import median_absolute_relative_error, down_std_norm from pipelines import BasePipeline if __name__ == '__main__': parser = argparse.ArgumentParser() arg = parser.add_argument arg('--config_path', type=str) args = parser.parse_args() config = load_json(args.config_path) pipeline_config = config['pipelines']['marketcap_down_std'] data_loader = SF1Data(config['sf1_data_path']) tickers_df = data_loader.load_base_data( currency=pipeline_config['currency'], scalemarketcap=pipeline_config['scalemarketcap']) ticker_list = tickers_df['ticker'].unique().tolist() fc1 = QuarterlyFeatures( columns=pipeline_config['quarter_columns'], quarter_counts=pipeline_config['quarter_counts'], max_back_quarter=pipeline_config['max_back_quarter']) fc2 = BaseCompanyFeatures(cat_columns=pipeline_config['cat_columns']) fc3 = QuarterlyDiffFeatures( columns=pipeline_config['quarter_columns'], compare_quarter_idxs=pipeline_config['compare_quarter_idxs'],