def test_calculate(self, tickers, columns, quarter_counts, max_back_quarter):
    """
    Validate QuarterlyFeatures.calculate on every data loader:
    output type/index/shape, and the statistical invariants that must
    hold between the per-window aggregate columns.
    """
    feat = QuarterlyFeatures(columns=columns,
                             quarter_counts=quarter_counts,
                             max_back_quarter=max_back_quarter)
    for loader in [Data(columns), SF1Data(config['sf1_data_path'])]:
        X = feat.calculate(loader, tickers)

        assert type(X) == pd.DataFrame
        assert 'ticker' in X.index.names
        assert 'date' in X.index.names
        # The synthetic Data loader always yields the full grid of rows;
        # real SF1 data may have fewer quarters per ticker.
        if type(loader) == Data:
            assert X.shape[0] == max_back_quarter * len(tickers)
        else:
            assert X.shape[0] <= max_back_quarter * len(tickers)
        expected_width = (2 * len(calc_series_stats([]))
                          * len(columns) * len(quarter_counts))
        assert X.shape[1] == expected_width

        ordered_counts = np.sort(quarter_counts)
        # Widening the quarter window can only lower (or keep) the minimum.
        for col in columns:
            for lo, hi in zip(ordered_counts[:-1], ordered_counts[1:]):
                lo_col = 'quarter{}_{}_min'.format(lo, col)
                hi_col = 'quarter{}_{}_min'.format(hi, col)
                assert (X[hi_col] <= X[lo_col]).min()

        # Widening the quarter window can only raise (or keep) the maximum.
        for col in columns:
            for lo, hi in zip(ordered_counts[:-1], ordered_counts[1:]):
                lo_col = 'quarter{}_{}_max'.format(lo, col)
                hi_col = 'quarter{}_{}_max'.format(hi, col)
                assert (X[hi_col] >= X[lo_col]).min()

        # Standard deviations are non-negative by definition.
        for col in [c for c in X.columns if '_std' in c]:
            assert X[col].min() >= 0

        # min <= mean/median <= max for every (column, window) pair.
        for col in columns:
            for count in quarter_counts:
                min_col = 'quarter{}_{}_min'.format(count, col)
                max_col = 'quarter{}_{}_max'.format(count, col)
                mean_col = 'quarter{}_{}_mean'.format(count, col)
                median_col = 'quarter{}_{}_median'.format(count, col)
                assert (X[max_col] >= X[min_col]).min()
                assert (X[max_col] >= X[mean_col]).min()
                assert (X[max_col] >= X[median_col]).min()
                assert (X[mean_col] >= X[min_col]).min()
                assert (X[median_col] >= X[min_col]).min()
def _create_base_components(self):
    """Build the (features, target, model) triple shared by these tests."""
    feature_cols = ['revenue', 'netinc', 'ncf', 'ebitda', 'debt', 'fcf']
    features = QuarterlyFeatures(columns=feature_cols,
                                 quarter_counts=[2, 10],
                                 max_back_quarter=1)
    target = QuarterlyTarget(col='marketcap', quarter_shift=0)
    # Out-of-fold model grouped by ticker so a company never predicts itself.
    model = GroupedOOFModel(lgbm.sklearn.LGBMRegressor(),
                            group_column='ticker',
                            fold_cnt=4)
    return features, target, model
def test_calculate(self, tickers):
    """
    FeatureMerger output must be row-aligned with the left feature set:
    same index, widths add up, and the left columns carry over unchanged.
    """
    loader = SF1Data(config['sf1_data_path'])
    fc1 = QuarterlyFeatures(columns=['ebit'],
                            quarter_counts=[2],
                            max_back_quarter=10)
    fc2 = QuarterlyDiffFeatures(columns=['ebit', 'debt'],
                                compare_quarter_idxs=[1, 4],
                                max_back_quarter=10)
    fc3 = BaseCompanyFeatures(cat_columns=['sector', 'sicindustry'])

    X1 = fc1.calculate(loader, tickers)
    X2 = fc2.calculate(loader, tickers)
    X3 = fc3.calculate(loader, tickers)

    # Merge on (ticker, date) for quarterly features, on ticker alone
    # for static per-company features.
    Xm1 = FeatureMerger(fc1, fc2, on=['ticker', 'date']).calculate(loader, tickers)
    Xm2 = FeatureMerger(fc1, fc3, on='ticker').calculate(loader, tickers)

    for merged, extra in ((Xm1, X2), (Xm2, X3)):
        assert merged.shape[0] == X1.shape[0]
        assert merged.shape[1] == X1.shape[1] + extra.shape[1]
        assert (merged.index == X1.index).min()
        # Leading columns of the merge are exactly the left feature columns.
        for nc, oc in zip(merged.columns[:X1.shape[1]], X1.columns):
            assert (merged[nc] == X1[oc]).min()
def _create_pipeline(self):
    """Assemble a minimal BasePipeline (quarterly features -> marketcap)."""
    feature_cols = ['revenue', 'netinc', 'ncf', 'ebitda', 'debt', 'fcf']
    features = QuarterlyFeatures(columns=feature_cols,
                                 quarter_counts=[2, 10],
                                 max_back_quarter=1)
    target = QuarterlyTarget(col='marketcap', quarter_shift=0)
    # Grouped OOF keeps each ticker out of its own training folds.
    model = GroupedOOFModel(lgbm.sklearn.LGBMRegressor(),
                            group_column='ticker',
                            fold_cnt=4)
    return BasePipeline(features, target, model,
                        metric=median_absolute_relative_error)
def test_execute_simple(self, data_loader):
    """
    ExecuteMergePipeline must join the outputs of its sub-pipelines on
    (ticker, date) without changing any of their predicted columns.

    NOTE(review): `tickers` is not a parameter here — presumably a
    module-level fixture/constant in this test module; confirm.
    """
    feature_cols = ['revenue', 'netinc', 'ncf', 'ebitda', 'debt', 'fcf']
    features = QuarterlyFeatures(columns=feature_cols,
                                 quarter_counts=[2, 10],
                                 max_back_quarter=1)
    # Two targets: current-quarter marketcap and one quarter back.
    curr_target = QuarterlyTarget(col='marketcap', quarter_shift=0)
    prev_target = QuarterlyTarget(col='marketcap', quarter_shift=-1)
    model = lgbm.sklearn.LGBMRegressor()

    pipeline1 = BasePipeline(feature=features, target=curr_target,
                             model=model,
                             metric=median_absolute_relative_error,
                             out_name='p1')
    pipeline2 = BasePipeline(feature=features, target=prev_target,
                             model=model,
                             metric=median_absolute_relative_error,
                             out_name='p2')
    pipeline3 = QuarterlyLoadPipeline(['ticker', 'date', 'marketcap'])

    pipeline1.fit(data_loader, tickers)
    pipeline2.fit(data_loader, tickers)

    merged_pipeline = ExecuteMergePipeline(
        pipeline_list=[pipeline1, pipeline2, pipeline3],
        on=['ticker', 'date'])

    df1 = pipeline1.execute(data_loader, tickers)
    df2 = pipeline2.execute(data_loader, tickers)
    df3 = pipeline3.execute(data_loader, tickers)
    df = merged_pipeline.execute(data_loader, tickers)

    assert type(df) == pd.DataFrame
    assert len(df) == len(df1)
    np.testing.assert_array_equal(
        df.columns, ['ticker', 'date', 'p1', 'p2', 'marketcap'])
    # Merging must not alter the individual pipelines' predictions.
    np.testing.assert_array_equal(df1['p1'], df['p1'])
    np.testing.assert_array_equal(df2['p2'], df['p2'])
# CLI: --config_path points at the JSON config holding data paths and
# the 'marketcap_down_std' pipeline section.
parser = argparse.ArgumentParser()
parser.add_argument('--config_path', type=str)
args = parser.parse_args()

config = load_json(args.config_path)
pipeline_config = config['pipelines']['marketcap_down_std']

data_loader = SF1Data(config['sf1_data_path'])
tickers_df = data_loader.load_base_data(
    currency=pipeline_config['currency'],
    scalemarketcap=pipeline_config['scalemarketcap'])
ticker_list = tickers_df['ticker'].unique().tolist()

# Feature calculators configured entirely from the pipeline config.
fc1 = QuarterlyFeatures(
    columns=pipeline_config['quarter_columns'],
    quarter_counts=pipeline_config['quarter_counts'],
    max_back_quarter=pipeline_config['max_back_quarter'])

fc2 = BaseCompanyFeatures(cat_columns=pipeline_config['cat_columns'])

fc3 = QuarterlyDiffFeatures(
    columns=pipeline_config['quarter_columns'],
    compare_quarter_idxs=pipeline_config['compare_quarter_idxs'],
    max_back_quarter=pipeline_config['max_back_quarter'])

fc4 = DailyAggQuarterFeatures(
    columns=pipeline_config['daily_agg_columns'],
    agg_day_counts=pipeline_config['agg_day_counts'],
    max_back_quarter=pipeline_config['max_back_quarter'])

feature = FeatureMerger(fc1, fc2, on='ticker')
# Categorical company attributes fed to BaseCompanyFeatures.
CAT_COLUMNS = ["sector", "sicindustry"]

# Quarterly fundamental columns used for the rolling-window statistics.
QUARTER_COLUMNS = [
    "revenue", "netinc", "ncf", "assets", "ebitda", "debt", "fcf",
    "gp", "workingcapital", "cashneq", "rnd", "sgna", "ncfx",
    "divyield", "currentratio", "netinccmn",
]

if __name__ == '__main__':
    config = load_json('config.json')
    data_loader = SF1Data(config['sf1_data_path'])
    tickers_df = data_loader.load_base_data(
        currency=CURRENCY, scalemarketcap=SCALE_MARKETCAP)
    ticker_list = tickers_df['ticker'].unique().tolist()

    fc1 = QuarterlyFeatures(columns=QUARTER_COLUMNS,
                            quarter_counts=QUARTER_COUNTS,
                            max_back_quarter=MAX_BACK_QUARTER)

    fc2 = BaseCompanyFeatures(cat_columns=CAT_COLUMNS)

    # Daily aggregates of marketcap/pe are safe here: the values are
    # normalized, so they introduce no target leakage.
    fc3 = DailyAggQuarterFeatures(columns=DAILY_AGG_COLUMNS,
                                  agg_day_counts=AGG_DAY_COUNTS,
                                  max_back_quarter=MAX_BACK_QUARTER)

    feature = FeatureMerger(fc1, fc2, on='ticker')
    feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])

    target = QuarterlyTarget(col='marketcap', quarter_shift=0)