Example #1
 def test_translate_currency_real(self, ticker):
     columns = ['equity','eps','revenue','netinccmn',
                 'cashneq','debt','ebit','ebitda']
     data_loader = SF1Data(config['sf1_data_path'])
     quarterly_df = data_loader.load_quarterly_data(ticker, 10)
     trans_df = SF1Data.translate_currency(quarterly_df, columns)
     for col in columns:
         diff = trans_df['{}usd'.format(col)] - trans_df[col]
         diff = np.abs(diff.values / trans_df['{}usd'.format(col)].values)
         diff = diff[~np.isnan(diff)]
         assert diff.max() < 0.1
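The test above checks that SF1Data.translate_currency brings the reported values within roughly 10% of the dataset's own *usd columns. A minimal standalone sketch of the same call, assuming config.json provides a valid sf1_data_path and using 'AAPL' purely as an illustrative ticker:

import numpy as np
from data import SF1Data
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

# Load the last 10 quarterly reports and translate selected columns to USD
quarterly_df = data_loader.load_quarterly_data(['AAPL'], 10)
trans_df = SF1Data.translate_currency(quarterly_df, ['ebit', 'debt'])

# Relative difference against the reported USD figures, as in the test
rel_diff = np.abs((trans_df['ebitusd'] - trans_df['ebit']) / trans_df['ebitusd'])
print(rel_diff.dropna().max())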
Example #2
 def test_load_base_data(self):
     data_loader = SF1Data(config['sf1_data_path'])
     df = data_loader.load_base_data()
     assert type(df) == pd.DataFrame
     assert len(df) > 0
     assert 'ticker' in df.columns
     assert df['ticker'].isnull().max() == False
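load_base_data also accepts filters; the pipeline tests further down pass currency and scalemarketcap. A short sketch, assuming the same config.json layout:

from data import SF1Data
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

# Restrict the universe to large-cap companies reporting in USD
tickers_df = data_loader.load_base_data(currency='USD',
                                        scalemarketcap=['5 - Large'])
tickers = tickers_df['ticker'].unique().tolist()
print(len(tickers))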
Example #3
    def test_calculate(self, tickers, columns, agg_day_counts,
                       max_back_quarter):
        fc = DailyAggQuarterFeatures(columns=columns,
                                     agg_day_counts=agg_day_counts,
                                     max_back_quarter=max_back_quarter)

        data_loader = SF1Data(config['sf1_data_path'])
        X = fc.calculate(data_loader, tickers)

        assert type(X) == pd.DataFrame
        assert 'ticker' in X.index.names
        assert 'date' in X.index.names

        assert X.shape[0] <= max_back_quarter * len(tickers)
        assert X.shape[1] == len(calc_series_stats([])) * \
                             len(columns) * len(agg_day_counts)

        for col in columns:
            for count in agg_day_counts:
                min_col = 'days{}_{}_min'.format(count, col)
                max_col = 'days{}_{}_max'.format(count, col)
                mean_col = 'days{}_{}_mean'.format(count, col)
                median_col = 'days{}_{}_median'.format(count, col)
                assert (X[max_col] >= X[min_col]).min()
                assert (X[max_col] >= X[mean_col]).min()
                assert (X[max_col] >= X[median_col]).min()
                assert (X[mean_col] >= X[min_col]).min()
                assert (X[median_col] >= X[min_col]).min()
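DailyAggQuarterFeatures builds one calc_series_stats block per (column, aggregation window) pair, with names such as 'days{count}_{col}_min'. A hedged usage sketch; the daily columns, window sizes and tickers below are illustrative:

from data import SF1Data
from features import DailyAggQuarterFeatures, calc_series_stats
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

fc = DailyAggQuarterFeatures(columns=['marketcap', 'pe'],
                             agg_day_counts=[100, 200],
                             max_back_quarter=10)
X = fc.calculate(data_loader, ['AAPL', 'MSFT'])

# One stats block per (column, window) pair, e.g. 'days100_marketcap_min'
expected_width = len(calc_series_stats([])) * 2 * 2
assert X.shape[1] == expected_width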
Example #4
    def test_calculate(self, tickers):
        data_loader = SF1Data(config['sf1_data_path'])
        quarterly_df = data_loader.load_quarterly_data(tickers,
                                                       quarter_count=None)
                                                       
        target = QuarterlyTarget('marketcap', quarter_shift=0)
        info_df = quarterly_df.drop_duplicates('ticker', keep='first') \
                                        [['ticker', 'date', 'marketcap']]

        y = target.calculate(data_loader, info_df[['ticker', 'date']])
        assert type(y) == pd.DataFrame
        assert 'y' in y.columns
        np.testing.assert_array_equal(y['y'].values,
                                      info_df['marketcap'].values)
        
        info_df = quarterly_df[['ticker', 'date', 'marketcap']]
        y = target.calculate(data_loader, info_df)
        np.testing.assert_array_equal(y['y'].values,
                                      info_df['marketcap'].values)


        target = QuarterlyTarget('marketcap', quarter_shift=1)
        info_df = quarterly_df[['ticker', 'date', 'marketcap']]
        y = target.calculate(data_loader, info_df)
        np.testing.assert_array_equal(y['y'].values, 
                                      info_df.groupby('ticker')['marketcap']\
                                      .shift(1).astype('float').values)


        target = QuarterlyTarget('marketcap', quarter_shift=-3)
        info_df = quarterly_df[['ticker', 'date', 'marketcap']]
        y = target.calculate(data_loader, info_df)
        np.testing.assert_array_equal(y['y'].values, 
                                      info_df.groupby('ticker')['marketcap']\
                                      .shift(-3).astype('float').values)
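QuarterlyTarget looks up one quarterly column for each (ticker, date) row; quarter_shift=0 returns the value of that same quarter, while non-zero shifts are checked above against a grouped shift. A minimal sketch, with 'AAPL' as an illustrative ticker:

from data import SF1Data
from targets import QuarterlyTarget
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

quarterly_df = data_loader.load_quarterly_data(['AAPL'], quarter_count=None)
info_df = quarterly_df[['ticker', 'date']]

# y equals the marketcap of the row's own quarter when quarter_shift=0
target = QuarterlyTarget('marketcap', quarter_shift=0)
y = target.calculate(data_loader, info_df)
print(y['y'].head())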
Example #5
    def test_calculate(self, tickers, cat_columns):
        loaders = [
            Data(columns=[], cat_columns=cat_columns, tickers=tickers),
            SF1Data(config['sf1_data_path'])
        ]
        for data_loader in loaders[:]:
            fc = BaseCompanyFeatures(cat_columns=cat_columns)
            X = fc.calculate(data_loader, tickers)

            assert type(X) == pd.DataFrame
            assert 'ticker' in X.index.names
            base_data = data_loader.load_base_data()
            for col in cat_columns:
                assert len(base_data[col].unique()) ==\
                       len(fc.col_to_encoder[col].classes_)

            # Encoders fitted during the first calculate call should be reused
            for col in cat_columns:
                assert col in fc.col_to_encoder
            new_X = fc.calculate(data_loader, tickers)
            for col in cat_columns:
                assert (new_X[col] == X[col]).min()

            wd = WrapData(data_loader, tickers)
            new_X = fc.calculate(wd, tickers)
            for col in cat_columns:
                assert (new_X[col] == X[col]).min()
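BaseCompanyFeatures label-encodes categorical company columns and keeps the fitted encoders in col_to_encoder, so repeated calculate calls reuse them. A short sketch with the categorical columns also used in Example #13; the tickers are illustrative:

from data import SF1Data
from features import BaseCompanyFeatures
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

fc = BaseCompanyFeatures(cat_columns=['sector', 'sicindustry'])
X = fc.calculate(data_loader, ['AAPL', 'MSFT'])

# Encoders fitted on the first call are kept and reused on the second one
assert 'sector' in fc.col_to_encoder
X_again = fc.calculate(data_loader, ['AAPL', 'MSFT'])
assert (X_again['sector'] == X['sector']).all()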
Example #6
    def test_calculate(self, tickers, columns, quarter_counts,
                       max_back_quarter):
        fc = QuarterlyFeatures(columns=columns,
                               quarter_counts=quarter_counts,
                               max_back_quarter=max_back_quarter)

        loaders = [Data(columns), SF1Data(config['sf1_data_path'])]
        for data_loader in loaders:
            X = fc.calculate(data_loader, tickers)

            assert type(X) == pd.DataFrame
            assert 'ticker' in X.index.names
            assert 'date' in X.index.names

            if type(data_loader) == Data:
                assert X.shape[0] == max_back_quarter * len(tickers)
            else:
                assert X.shape[0] <= max_back_quarter * len(tickers)

            assert X.shape[1] == 2 * len(calc_series_stats([])) * \
                                 len(columns) * len(quarter_counts)

            # The minimum over a smaller quarter window can not be lower than over a larger one
            sorted_quarter_counts = np.sort(quarter_counts)
            for col in columns:
                for k in range(len(sorted_quarter_counts) - 1):
                    lower_count = sorted_quarter_counts[k]
                    higher_count = sorted_quarter_counts[k + 1]
                    l_col = 'quarter{}_{}_min'.format(lower_count, col)
                    h_col = 'quarter{}_{}_min'.format(higher_count, col)

                    assert (X[h_col] <= X[l_col]).min()

            # The maximum over a smaller quarter window can not be higher than over a larger one
            sorted_quarter_counts = np.sort(quarter_counts)
            for col in columns:
                for k in range(len(sorted_quarter_counts) - 1):
                    lower_count = sorted_quarter_counts[k]
                    higher_count = sorted_quarter_counts[k + 1]
                    l_col = 'quarter{}_{}_max'.format(lower_count, col)
                    h_col = 'quarter{}_{}_max'.format(higher_count, col)

                    assert (X[h_col] >= X[l_col]).min()

            std_cols = [x for x in X.columns if '_std' in x]
            for col in std_cols:
                assert X[col].min() >= 0

            for col in columns:
                for count in quarter_counts:
                    min_col = 'quarter{}_{}_min'.format(count, col)
                    max_col = 'quarter{}_{}_max'.format(count, col)
                    mean_col = 'quarter{}_{}_mean'.format(count, col)
                    median_col = 'quarter{}_{}_median'.format(count, col)
                    assert (X[max_col] >= X[min_col]).min()
                    assert (X[max_col] >= X[mean_col]).min()
                    assert (X[max_col] >= X[median_col]).min()
                    assert (X[mean_col] >= X[min_col]).min()
                    assert (X[median_col] >= X[min_col]).min()
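QuarterlyFeatures follows the 'quarter{count}_{col}_{stat}' naming scheme, with two calc_series_stats blocks per (column, window) pair (hence the factor of 2 in the width assertion above). A hedged sketch; columns, window sizes and tickers are illustrative:

from data import SF1Data
from features import QuarterlyFeatures, calc_series_stats
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

fc = QuarterlyFeatures(columns=['ebit', 'debt'],
                       quarter_counts=[2, 4],
                       max_back_quarter=10)
X = fc.calculate(data_loader, ['AAPL', 'MSFT'])

assert X.shape[1] == 2 * len(calc_series_stats([])) * 2 * 2
print(X.filter(like='quarter4_ebit').columns.tolist())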
Example #7
    def test_export_load(self, tmpdir):
        data_loader = SF1Data(config['sf1_data_path'])
        tickers_df = data_loader.load_base_data(currency='USD',
                                                scalemarketcap=['5 - Large'])
        tickers = tickers_df['ticker'].unique().tolist()
        pipeline = self._create_pipeline()
        res = pipeline.fit(data_loader, tickers[:100])
        df = pipeline.execute(data_loader, tickers[:100])
        pipeline.export_core('{}/pipeline'.format(str(tmpdir)))
        pipeline = BasePipeline.load('{}/pipeline.pickle'.format(str(tmpdir)))
        df1 = pipeline.execute(data_loader, tickers[:100])

        np.testing.assert_array_equal(df['y'].values, df1['y'].values)
Example #8
 def test_calculate(self, tickers):
     data_loader = SF1Data(config['sf1_data_path'])
     quarterly_df = data_loader.load_quarterly_data(tickers,
                                                    quarter_count=None)
                                                  
     target = QuarterlyDiffTarget('marketcap', norm=False)
     info_df = quarterly_df[['ticker', 'date', 'marketcap']]
     y = target.calculate(data_loader, info_df)
     assert type(y) == pd.DataFrame
     assert 'y' in y.columns
     assert len(y) == len(info_df)
     gt = info_df['marketcap'].astype('float') - \
          info_df.groupby('ticker')['marketcap'].shift(-1).astype('float')
     np.testing.assert_array_equal(y['y'].values, gt.values)
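QuarterlyDiffTarget with norm=False yields the raw change of the column versus the previous quarter. A minimal sketch, assuming (as with QuarterlyTarget above) that a (ticker, date) frame is sufficient input; 'AAPL' is illustrative:

from data import SF1Data
from targets import QuarterlyDiffTarget
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

quarterly_df = data_loader.load_quarterly_data(['AAPL'], quarter_count=None)
info_df = quarterly_df[['ticker', 'date']]

target = QuarterlyDiffTarget('marketcap', norm=False)
y = target.calculate(data_loader, info_df)
print(y['y'].head())   # marketcap change versus the previous quarter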
Example #9
    def test_fit_execute(self):
        data_loader = SF1Data(config['sf1_data_path'])
        tickers_df = data_loader.load_base_data(currency='USD',
                                                scalemarketcap=['5 - Large'])
        tickers = tickers_df['ticker'].unique().tolist()
        pipeline = self._create_pipeline()
        res = pipeline.fit(data_loader, tickers[:800])

        assert type(res) == dict
        assert res['metric'] < 0.5

        df = pipeline.execute(data_loader, tickers[800:])
        assert type(df) == pd.DataFrame
        assert 'y' in df
        assert df['y'].mean() > 0
        X = pipeline.core['feature'].calculate(data_loader, tickers[800:])
        assert len(X) == len(df)
Example #10
    def test_translate_currency_synthetic(self, cnt):
        np.random.seed(0)
        currency_arr = np.array(range(1, cnt + 1))
        df = pd.DataFrame()
        df['debtusd'] = np.random.uniform(-1e5, 1e5, cnt) 
        df['debt'] = df['debtusd'] * currency_arr
        df['ebitusd'] = np.random.uniform(-10, 10, cnt)
        noise = np.random.uniform(-0.1, 0.1, cnt) 
        df['ebit'] = df['ebitusd'] * (currency_arr + noise)
        del_proba = np.random.uniform(0, 0.3)
        drop_mask = np.random.choice([True, False], cnt, 
                                     p=[del_proba, 1 - del_proba])
        df.loc[drop_mask, 'ebitusd'] = None

        trans_df = SF1Data.translate_currency(df, ['debt', 'ebit'])
        for col in ['debt', 'ebit']:
            diff = trans_df['{}usd'.format(col)] - trans_df[col]
            diff = np.abs(diff.values / trans_df['{}usd'.format(col)].values)
            diff = diff[~np.isnan(diff)]
            assert diff.max() < 0.1
Example #11
    def test_calculate(self, tickers, columns, compare_quarter_idxs,
                       max_back_quarter):
        fc = QuarterlyDiffFeatures(columns=columns,
                                   compare_quarter_idxs=compare_quarter_idxs,
                                   max_back_quarter=max_back_quarter)

        loaders = [Data(columns), SF1Data(config['sf1_data_path'])]
        for data_loader in loaders:
            X = fc.calculate(data_loader, tickers)

            assert type(X) == pd.DataFrame
            assert 'ticker' in X.index.names
            assert 'date' in X.index.names

            if type(data_loader) == Data:
                assert X.shape[0] == max_back_quarter * len(tickers)
            else:
                assert X.shape[0] <= max_back_quarter * len(tickers)

            assert X.shape[1] == len(compare_quarter_idxs) * len(columns)
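QuarterlyDiffFeatures produces one feature per (column, compare quarter index) pair, so the width is len(columns) * len(compare_quarter_idxs). A short sketch with the parameters also used in Example #13; the tickers are illustrative:

from data import SF1Data
from features import QuarterlyDiffFeatures
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

fc = QuarterlyDiffFeatures(columns=['ebit', 'debt'],
                           compare_quarter_idxs=[1, 4],
                           max_back_quarter=10)
X = fc.calculate(data_loader, ['AAPL', 'MSFT'])

# One feature per (column, compare index): 2 * 2 = 4 columns here
assert X.shape[1] == 4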
Example #12
 def test_load_quarterly_data(self, tickers, quarter_count, dimension):
     data_loader = SF1Data(config['sf1_data_path'])
     quarterly_df = data_loader.load_quarterly_data(tickers, quarter_count,
                                                    dimension)
     
     assert type(quarterly_df) == pd.DataFrame
     assert 'ticker' in quarterly_df.columns
     assert 'date' in quarterly_df.columns
     
     # Data should be ordered by date inside ticker
     quarterly_df['date_'] = quarterly_df['date'].astype(np.datetime64)
     quarterly_df['def_order'] = range(len(quarterly_df))[::-1]
     expected_dates_order = quarterly_df.sort_values(['ticker', 'date_'],
                                         ascending=False)['date'].values
     real_dates_order = quarterly_df.sort_values(['ticker', 'def_order'], 
                                         ascending=False)['date'].values          
     np.testing.assert_array_equal(expected_dates_order, real_dates_order)
                          
     for cnt in quarterly_df.groupby('ticker').size():
         assert cnt <= quarter_count
                     
     assert (quarterly_df['dimension'] == dimension).min()     
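load_quarterly_data returns at most quarter_count reports per ticker, newest first, filtered to a single reporting dimension. A short sketch; the tickers and the 'ARQ' dimension are illustrative:

from data import SF1Data
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

# Last 8 reports per ticker for one reporting dimension
quarterly_df = data_loader.load_quarterly_data(['AAPL', 'MSFT'], 8, 'ARQ')
print(quarterly_df.groupby('ticker')['date'].first())   # newest date per ticker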
Example #13
    def test_calculate(self, tickers):
        data_loader = SF1Data(config['sf1_data_path'])
        fc1 = QuarterlyFeatures(columns=['ebit'],
                                quarter_counts=[2],
                                max_back_quarter=10)

        fc2 = QuarterlyDiffFeatures(columns=['ebit', 'debt'],
                                    compare_quarter_idxs=[1, 4],
                                    max_back_quarter=10)

        fc3 = BaseCompanyFeatures(cat_columns=['sector', 'sicindustry'])

        X1 = fc1.calculate(data_loader, tickers)
        X2 = fc2.calculate(data_loader, tickers)
        X3 = fc3.calculate(data_loader, tickers)

        fm1 = FeatureMerger(fc1, fc2, on=['ticker', 'date'])
        Xm1 = fm1.calculate(data_loader, tickers)

        fm2 = FeatureMerger(fc1, fc3, on='ticker')
        Xm2 = fm2.calculate(data_loader, tickers)

        assert Xm1.shape[0] == X1.shape[0]
        assert Xm2.shape[0] == X1.shape[0]
        assert Xm1.shape[1] == X1.shape[1] + X2.shape[1]
        assert Xm2.shape[1] == X1.shape[1] + X3.shape[1]
        assert (Xm1.index == X1.index).min()
        assert (Xm2.index == X1.index).min()

        new_cols = Xm1.columns[:X1.shape[1]]
        old_cols = X1.columns
        for nc, oc in zip(new_cols, old_cols):
            assert (Xm1[nc] == X1[oc]).min()

        new_cols = Xm2.columns[:X1.shape[1]]
        old_cols = X1.columns
        for nc, oc in zip(new_cols, old_cols):
            assert (Xm2[nc] == X1[oc]).min()
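FeatureMerger joins the outputs of two feature calculators on a shared key and keeps the left side's rows: quarterly features merge on ['ticker', 'date'], company-level features on 'ticker' alone. Since a merger exposes the same calculate interface, mergers can presumably be nested to combine more than two calculators; a hedged sketch with illustrative tickers:

from data import SF1Data
from features import (QuarterlyFeatures, QuarterlyDiffFeatures,
                      BaseCompanyFeatures, FeatureMerger)
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

fc1 = QuarterlyFeatures(columns=['ebit'], quarter_counts=[2], max_back_quarter=10)
fc2 = QuarterlyDiffFeatures(columns=['ebit', 'debt'],
                            compare_quarter_idxs=[1, 4], max_back_quarter=10)
fc3 = BaseCompanyFeatures(cat_columns=['sector', 'sicindustry'])

# Merge quarterly features on (ticker, date), then attach company-level
# features on ticker only; the result keeps fc1's row index
merged = FeatureMerger(FeatureMerger(fc1, fc2, on=['ticker', 'date']),
                       fc3, on='ticker')
X = merged.calculate(data_loader, ['AAPL', 'MSFT'])
print(X.shape)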
Example #14
    def test_load_daily_data(self, tickers, back_days):
        data_loader = SF1Data(config['sf1_data_path'])
        daily_df = data_loader.load_daily_data(tickers, back_days=back_days)  
        assert type(daily_df) == pd.DataFrame
        assert 'ticker' in daily_df.columns
        assert 'date' in daily_df.columns
           
        # Data should be ordered by date inside ticker
        daily_df['date_'] = daily_df['date'].astype(np.datetime64)
        daily_df['def_order'] = range(len(daily_df))[::-1]
        expected_dates_order = daily_df.sort_values(['ticker', 'date_'],
                                            ascending=False)['date'].values
        real_dates_order = daily_df.sort_values(['ticker', 'def_order'], 
                                            ascending=False)['date'].values
        np.testing.assert_array_equal(expected_dates_order, real_dates_order)

        # There should be no large gaps between consecutive dates
        diffs = daily_df.groupby('ticker')['date_'].shift(1) - daily_df['date_']
        assert (diffs.dropna() <= np.timedelta64(14,'D')).min()
        
        if back_days is not None:
            for cnt in daily_df.groupby('ticker').size():
                assert cnt == back_days
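load_daily_data returns daily rows, newest first within each ticker; back_days bounds the history length per ticker (the test only checks the per-ticker count when back_days is not None). A short sketch with illustrative tickers:

from data import SF1Data
from utils import load_json

config = load_json('config.json')
data_loader = SF1Data(config['sf1_data_path'])

# Last 30 daily rows per ticker
daily_df = data_loader.load_daily_data(['AAPL', 'MSFT'], back_days=30)
print(daily_df.groupby('ticker').size())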
Example #15
import pytest
import pandas as pd
import numpy as np
from data import SF1Data
from features import calc_series_stats, QuarterlyFeatures, BaseCompanyFeatures,\
                     QuarterlyDiffFeatures, FeatureMerger, \
                     DailyAggQuarterFeatures
from utils import load_json, int_hash_of_str
from synthetic_data import GeneratedData
config = load_json('config.json')

loaders = [GeneratedData()]
if config['sf1_data_path'] is not None:
    loaders.append(SF1Data(config['sf1_data_path']))


@pytest.mark.parametrize(["series", "norm", "expected"],
                         [([10, 0, 1], False, {
                             '_mean': 3.6666666666666665,
                             '_median': 1.0,
                             '_max': 10.0,
                             '_min': 0.0,
                             '_std': 4.4969125210773475
                         }),
                          ([10, -30, 1, 4, 15.2], False, {
                              '_mean': 0.039999999999999855,
                              '_median': 4.0,
                              '_max': 15.2,
                              '_min': -30.0,
                              '_std': 15.798936673080249
                          }),
Example #16
                     QuarterlyDiffFeatures, DailyAggQuarterFeatures
from targets import DailyAggTarget
from models import TimeSeriesOOFModel, AnsambleModel
from metrics import median_absolute_relative_error, down_std_norm
from pipelines import BasePipeline

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--config_path', type=str)
    args = parser.parse_args()

    config = load_json(args.config_path)
    pipeline_config = config['pipelines']['marketcap_down_std']

    data_loader = SF1Data(config['sf1_data_path'])
    tickers_df = data_loader.load_base_data(
        currency=pipeline_config['currency'],
        scalemarketcap=pipeline_config['scalemarketcap'])
    ticker_list = tickers_df['ticker'].unique().tolist()

    fc1 = QuarterlyFeatures(
        columns=pipeline_config['quarter_columns'],
        quarter_counts=pipeline_config['quarter_counts'],
        max_back_quarter=pipeline_config['max_back_quarter'])

    fc2 = BaseCompanyFeatures(cat_columns=pipeline_config['cat_columns'])

    fc3 = QuarterlyDiffFeatures(
        columns=pipeline_config['quarter_columns'],
        compare_quarter_idxs=pipeline_config['compare_quarter_idxs'],