Example #1
def test_infer_domain_no_terms(self):
    self.assertEqual(Pipeline().domain(default=GENERIC), GENERIC)
    self.assertEqual(Pipeline().domain(default=US_EQUITIES), US_EQUITIES)
Example #2
def initialize(context):
    attach_pipeline(Pipeline(), 'test')
Example #3
def run(ts):
    pipe = Pipeline(ts, domain=US_EQUITIES)
    start = self.trading_days[-5]
    end = self.trading_days[-1]
    return self.run_pipeline(pipe, start, end)
Example #4
from zipline.utils.calendars import get_calendar

universe = AverageDollarVolume(window_length=120).top(500)
trading_calendar = get_calendar('NYSE')
bundle_data = bundles.load(project_helper.EOD_BUNDLE_NAME)
engine = project_helper.build_pipeline_engine(bundle_data, trading_calendar)

# ### View Data
# With the pipeline engine built, let's get the stocks at the end of the period in the universe we're using. We'll use these tickers to generate the returns data for our risk model.

# In[8]:

universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')

universe_tickers = engine \
    .run_pipeline(
        Pipeline(screen=universe),
        universe_end_date,
        universe_end_date)\
    .index.get_level_values(1)\
    .values.tolist()

universe_tickers

# ## Get Returns
# Now that we have our pipeline built, let's access the returns data. We'll start by building a data portal.

# In[9]:

from zipline.data.data_portal import DataPortal

# Build the portal from the bundle. These are the usual zipline DataPortal
# arguments (assumed here; adjust to your own bundle and calendar).
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)
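
# With the portal in place, returns data is typically pulled by requesting a
# trailing window of closes and computing percent changes. A minimal sketch,
# assuming zipline's standard DataPortal.get_history_window API; the
# get_pricing helper name is illustrative, not from the original:

def get_pricing(data_portal, trading_calendar, assets, start_date, end_date,
                field='close'):
    # start_date/end_date are naive date strings; localize them to UTC and
    # count the sessions between them to know how many bars to request.
    end_dt = pd.Timestamp(end_date, tz='UTC')
    start_dt = pd.Timestamp(start_date, tz='UTC')
    end_loc = trading_calendar.closes.index.get_loc(end_dt)
    start_loc = trading_calendar.closes.index.get_loc(start_dt)

    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

# e.g. daily closes for the universe, then simple returns:
# returns = get_pricing(data_portal, trading_calendar, universe_tickers,
#                       '2011-01-05', '2016-01-05').pct_change()[1:].fillna(0)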
Example #5
def late_attach(context, data):
    attach_pipeline(Pipeline(), 'test')
    raise AssertionError("Shouldn't make it past attach_pipeline!")
Example #6
# AAPL split on a 4-for-1 basis on August 28, 2020, a 7-for-1 basis on June 9, 2014,
# and on a 2-for-1 basis on February 28, 2005, June 21, 2000, and June 16, 1987.

start = pd.to_datetime('2020-08-26', utc=True)
end = pd.to_datetime('2020-09-02', utc=True)

# AAPL sid 199059
print(prices(symbols(['AAPL']), start, end))

engine = make_pipeline_engine()
universe = StaticAssets(symbols(['AAPL']))
pipe = Pipeline(
    columns={
        'close': Latest([EquityPricing.close], mask=universe),
        'mkt_cap': MarketCap(mask=universe),
        'prev': Previous([USEquityPricing.close],
                         window_length=2,
                         mask=universe),
        'ret': Returns(window_length=2, mask=universe),
    },
    screen=universe,
)

stocks = engine.run_pipeline(pipe, start, end, hooks=[])
print(stocks)

expected = [[499.30, 2163847100000.00, 503.43, -0.01],
            [506.09, 2137988000000.00, 499.30, 0.01],
            [500.04, 2134533300000.00, 506.09, -0.01],
            [124.81, 2206911200000.00, 125.01, -0.00],
            [129.04, 2294818300000.00, 124.81, 0.03],
            [134.18, 2247273200000.00, 129.04, 0.04]]
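
# Sanity check (illustrative, not part of the original test data): across the
# 4-for-1 split, the 'prev' value reported on the first post-split session is
# the prior close divided by the split ratio.
assert abs(500.04 / 4 - 125.01) < 1e-9  # 2020-08-28 close -> 'prev' on 2020-08-31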
Example #7
    def test_factor_correlation_methods_two_factors(self, correlation_length):
        """
        Tests for `Factor.pearsonr` and `Factor.spearmanr` when passed another
        2D factor instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # Ensure that the correlation methods cannot be called with two 2D
        # factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.pearsonr(
                target=returns_masked_2,
                correlation_length=correlation_length,
            )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.spearmanr(
                target=returns_masked_2,
                correlation_length=correlation_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        pearson_factor = returns_5.pearsonr(
            target=returns_10,
            correlation_length=correlation_length,
        )
        spearman_factor = returns_5.spearmanr(
            target=returns_10,
            correlation_length=correlation_length,
        )

        columns = {
            'pearson_factor': pearson_factor,
            'spearman_factor': spearman_factor,
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)
        pearson_results = results['pearson_factor'].unstack()
        spearman_results = results['spearman_factor'].unstack()

        # Run a separate pipeline that calculates returns starting
        # (correlation_length - 1) days prior to our start date. This is
        # because we need (correlation_length - 1) extra days of returns to
        # compute our expected correlations.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (correlation_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, calculate the expected correlation coefficients
        # between each asset's 5 and 10 day rolling returns. Each correlation
        # is calculated over `correlation_length` days.
        expected_pearson_results = full_like(pearson_results, nan)
        expected_spearman_results = full_like(spearman_results, nan)
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[day:day +
                                                      correlation_length]
            todays_returns_10 = returns_10_results.iloc[day:day +
                                                        correlation_length]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_pearson_results[day, asset_column] = pearsonr(
                    asset_returns_5,
                    asset_returns_10,
                )[0]
                expected_spearman_results[day, asset_column] = spearmanr(
                    asset_returns_5,
                    asset_returns_10,
                )[0]

        expected_pearson_results = DataFrame(
            data=expected_pearson_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(pearson_results, expected_pearson_results)

        expected_spearman_results = DataFrame(
            data=expected_spearman_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(spearman_results, expected_spearman_results)
Example #8
def initialize(context):
    pipeline_close = attach_pipeline(Pipeline(), "test_close")
    pipeline_volume = attach_pipeline(Pipeline(), "test_volume")

    pipeline_close.add(USEquityPricing.close.latest, "close")
    pipeline_volume.add(USEquityPricing.volume.latest, "volume")
Example #9
def initialize(context):
    attach_pipeline(Pipeline(), "test")
    attach_pipeline(Pipeline(), "test")
Example #10
import pandas as pd
from zipline.pipeline import Pipeline
from zipline.pipeline.data import USEquityPricing
from sharadar.pipeline.engine import symbol, symbols, make_pipeline_engine
from zipline.pipeline.filters import StaticAssets

tickers = symbols(['TR1M', 'TR1Y', 'RATEINF'])
print(tickers)

pipe = Pipeline(
    columns={
        'Close': USEquityPricing.close.latest,
    },
    screen=StaticAssets(tickers),
)

engine = make_pipeline_engine()
pipe_start = pd.to_datetime('2020-02-03', utc=True)
pipe_end = pd.to_datetime('2020-02-07', utc=True)
stocks = engine.run_pipeline(pipe, pipe_start, pipe_end)
print("stocks.shape [close]", stocks)

print(symbol('TR1M').to_dict())
Example #11
def initialize(context):
    p = attach_pipeline(Pipeline(), "test", chunks=chunks)
    p.add(USEquityPricing.close.latest, "close")
Example #12
import pandas as pd
from zipline.pipeline import Pipeline
from zipline.pipeline.data import USEquityPricing

from sharadar.pipeline.engine import load_sharadar_bundle, symbols, make_pipeline_engine
from zipline.pipeline.filters import StaticAssets
import time
import datetime
from sharadar.pipeline.factors import DaysSinceFiling

bundle = load_sharadar_bundle()

bundle.asset_finder.retrieve_equities([199059, 199623])

spe = make_pipeline_engine()

pipe_start = pd.to_datetime('2020-02-03', utc=True)
pipe_end = pd.to_datetime('2020-02-07', utc=True)

universe = StaticAssets(symbols(['IBM', 'F', 'AAPL']))

pipe_mkt_cap = Pipeline(
    columns={
        'days_since_filing': DaysSinceFiling(mask=universe),
    },
    screen=universe,
)

start_time = time.time()
stocks = spe.run_pipeline(pipe_mkt_cap, pipe_start, pipe_end)
print("stocks.shape [mkt cap]", stocks)
Example #13
    def test_loader_given_multiple_columns(self):
        class Loader1DataSet1(DataSet):
            col1 = Column(float32)
            col2 = Column(float32)

        class Loader1DataSet2(DataSet):
            col1 = Column(float32)
            col2 = Column(float32)

        class Loader2DataSet(DataSet):
            col1 = Column(float32)
            col2 = Column(float32)

        constants1 = {
            Loader1DataSet1.col1: 1,
            Loader1DataSet1.col2: 2,
            Loader1DataSet2.col1: 3,
            Loader1DataSet2.col2: 4
        }
        loader1 = RecordingConstantLoader(constants=constants1,
                                          dates=self.dates,
                                          assets=self.assets)
        constants2 = {Loader2DataSet.col1: 5, Loader2DataSet.col2: 6}
        loader2 = RecordingConstantLoader(constants=constants2,
                                          dates=self.dates,
                                          assets=self.assets)

        engine = SimplePipelineEngine(
            lambda column: loader2
            if column.dataset == Loader2DataSet else loader1,
            self.dates,
            self.asset_finder,
        )

        pipe_col1 = RollingSumSum(inputs=[
            Loader1DataSet1.col1, Loader1DataSet2.col1, Loader2DataSet.col1
        ],
                                  window_length=2)

        pipe_col2 = RollingSumSum(inputs=[
            Loader1DataSet1.col2, Loader1DataSet2.col2, Loader2DataSet.col2
        ],
                                  window_length=3)

        pipe_col3 = RollingSumSum(inputs=[Loader2DataSet.col1],
                                  window_length=3)

        columns = OrderedDict([
            ('pipe_col1', pipe_col1),
            ('pipe_col2', pipe_col2),
            ('pipe_col3', pipe_col3),
        ])
        result = engine.run_pipeline(
            Pipeline(columns=columns),
            self.dates[2],  # index is >= the largest window length - 1
            self.dates[-1])
        min_window = min(pip_col.window_length
                         for pip_col in itervalues(columns))
        col_to_val = ChainMap(constants1, constants2)
        vals = {
            name: (sum(col_to_val[col]
                       for col in pipe_col.inputs) * pipe_col.window_length)
            for name, pipe_col in iteritems(columns)
        }

        index = MultiIndex.from_product([self.dates[2:], self.assets])
        expected = DataFrame(data={
            col:
            concatenate((full(
                (columns[col].window_length - min_window) * index.levshape[1],
                nan),
                         full((index.levshape[0] -
                               (columns[col].window_length - min_window)) *
                              index.levshape[1], val)))
            for col, val in iteritems(vals)
        },
                             index=index,
                             columns=columns)

        assert_frame_equal(result, expected)

        self.assertEqual(
            set(loader1.load_calls), {
                ColumnArgs.sorted_by_ds(Loader1DataSet1.col1,
                                        Loader1DataSet2.col1),
                ColumnArgs.sorted_by_ds(Loader1DataSet1.col2,
                                        Loader1DataSet2.col2)
            })
        self.assertEqual(set(loader2.load_calls), {
            ColumnArgs.sorted_by_ds(Loader2DataSet.col1, Loader2DataSet.col2)
        })
Example #14
    def test_masked_factor(self):
        """
        Test that a Custom Factor computes the correct values when passed a
        mask. The mask/filter should be applied prior to computing any values,
        as opposed to computing the factor across the entire universe of
        assets. Any assets that are filtered out should be filled with missing
        values.
        """
        loader = self.loader
        dates = self.dates[5:8]
        assets = self.assets
        asset_ids = self.asset_ids
        constants = self.constants
        open = USEquityPricing.open
        close = USEquityPricing.close
        engine = SimplePipelineEngine(
            lambda column: loader,
            self.dates,
            self.asset_finder,
        )

        factor1_value = constants[open]
        factor2_value = 3.0 * (constants[open] - constants[close])

        def create_expected_results(expected_value, mask):
            expected_values = where(mask, expected_value, nan)
            return DataFrame(expected_values, index=dates, columns=assets)

        cascading_mask = AssetIDPlusDay() < (asset_ids[-1] + dates[0].day)
        expected_cascading_mask_result = array(
            [[True, True, True, False], [True, True, False, False],
             [True, False, False, False]],
            dtype=bool,
        )

        alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        expected_alternating_mask_result = array(
            [[False, True, False, True], [True, False, True, False],
             [False, True, False, True]],
            dtype=bool,
        )

        masks = cascading_mask, alternating_mask
        expected_mask_results = (
            expected_cascading_mask_result,
            expected_alternating_mask_result,
        )
        for mask, expected_mask in zip(masks, expected_mask_results):
            # Test running a pipeline with a single masked factor.
            columns = {'factor1': OpenPrice(mask=mask), 'mask': mask}
            pipeline = Pipeline(columns=columns)
            results = engine.run_pipeline(pipeline, dates[0], dates[-1])

            mask_results = results['mask'].unstack()
            check_arrays(mask_results.values, expected_mask)

            factor1_results = results['factor1'].unstack()
            factor1_expected = create_expected_results(factor1_value,
                                                       mask_results)
            assert_frame_equal(factor1_results, factor1_expected)

            # Test running a pipeline with a second factor. This ensures that
            # adding another factor to the pipeline with a different window
            # length does not cause any unexpected behavior, especially when
            # both factors share the same mask.
            columns['factor2'] = RollingSumDifference(mask=mask)
            pipeline = Pipeline(columns=columns)
            results = engine.run_pipeline(pipeline, dates[0], dates[-1])

            mask_results = results['mask'].unstack()
            check_arrays(mask_results.values, expected_mask)

            factor1_results = results['factor1'].unstack()
            factor2_results = results['factor2'].unstack()
            factor1_expected = create_expected_results(factor1_value,
                                                       mask_results)
            factor2_expected = create_expected_results(factor2_value,
                                                       mask_results)
            assert_frame_equal(factor1_results, factor1_expected)
            assert_frame_equal(factor2_results, factor2_expected)
Example #15
def initialize(context):
    pipeline = attach_pipeline(Pipeline(), 'my_pipeline')
    test_factor = TestFactor()
    pipeline.add(test_factor, 'test_factor')
Example #16
def initialize(context):
    pipeline = attach_pipeline(Pipeline(), "my_pipeline")
    test_factor = TestFactor()
    pipeline.add(test_factor, "test_factor")
Example #17
    def test_compute_with_adjustments(self):
        dates, assets = self.dates, self.assets
        low, high = USEquityPricing.low, USEquityPricing.high
        apply_idxs = [3, 10, 16]

        def apply_date(idx, offset=0):
            return dates[apply_idxs[idx] + offset]

        adjustments = DataFrame.from_records(
            [
                dict(
                    kind=MULTIPLY,
                    sid=assets[1],
                    value=2.0,
                    start_date=None,
                    end_date=apply_date(0, offset=-1),
                    apply_date=apply_date(0),
                ),
                dict(
                    kind=MULTIPLY,
                    sid=assets[1],
                    value=3.0,
                    start_date=None,
                    end_date=apply_date(1, offset=-1),
                    apply_date=apply_date(1),
                ),
                dict(
                    kind=MULTIPLY,
                    sid=assets[1],
                    value=5.0,
                    start_date=None,
                    end_date=apply_date(2, offset=-1),
                    apply_date=apply_date(2),
                ),
            ]
        )
        low_base = DataFrame(self.make_frame(30.0))
        low_loader = DataFrameLoader(low, low_base.copy(), adjustments=None)

        # Pre-apply inverse of adjustments to the baseline.
        high_base = DataFrame(self.make_frame(30.0))
        high_base.iloc[:apply_idxs[0], 1] /= 2.0
        high_base.iloc[:apply_idxs[1], 1] /= 3.0
        high_base.iloc[:apply_idxs[2], 1] /= 5.0

        high_loader = DataFrameLoader(high, high_base, adjustments)
        loader = MultiColumnLoader({low: low_loader, high: high_loader})

        engine = SimplePipelineEngine(loader, self.dates, self.asset_finder)

        for window_length in range(1, 4):
            low_mavg = SimpleMovingAverage(
                inputs=[USEquityPricing.low],
                window_length=window_length,
            )
            high_mavg = SimpleMovingAverage(
                inputs=[USEquityPricing.high],
                window_length=window_length,
            )
            bounds = product_upper_triangle(range(window_length, len(dates)))
            for start, stop in bounds:
                results = engine.run_pipeline(
                    Pipeline(
                        columns={'low': low_mavg, 'high': high_mavg}
                    ),
                    dates[start],
                    dates[stop],
                )
                self.assertEqual(set(results.columns), {'low', 'high'})
                iloc_bounds = slice(start, stop + 1)  # +1 to include end date

                low_results = results.unstack()['low']
                assert_frame_equal(low_results, low_base.iloc[iloc_bounds])

                high_results = results.unstack()['high']
                assert_frame_equal(high_results, high_base.iloc[iloc_bounds])
Example #18
                                      default_domain=calendar)
        return engine.run_pipeline(*args, **kwargs)


if __name__ == '__main__':
    data_path = "/home/yuxuzi/Data/mydataset2"

    start_date, end_date = pd.Timestamp('2018-03-13'), pd.Timestamp(
        '2018-03-27')
    data_source = HDFSimpleDataSource(data_path)
    dataset = HDFSimpleDataSource.infer_dataset(data_path)

    universe = dataset.sector.latest.element_of(['A', 'B'])

    class RollingSumDifference(CustomFactor):
        window_length = 3
        inputs = [dataset.open, dataset.close]

        def compute(self, today, assets, out, open, close):
            out[:] = (open - close).sum(axis=0)

    pipe = Pipeline(
        columns={
            'close': dataset.close.latest,
            'sumdiff': RollingSumDifference(),
            'sector': dataset.sector.latest,
        },
        screen=universe,
    )
    df = data_source.run_pipeline(pipe, start_date, end_date)
    print(df)
Example #19
    def test_regression_of_returns_factor(self, returns_length,
                                          regression_length):
        """
        Tests for the built-in factor `RollingLinearRegressionOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        returns = Returns(window_length=returns_length)
        masks = self.cascading_mask, self.alternating_mask, NotSpecified
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=returns_length,
                regression_length=regression_length,
                mask=mask,
            )

            columns = {
                output: getattr(regression_factor, output)
                for output in outputs
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, 'mask')

            results = run_pipeline(pipeline, start_date, end_date)
            if mask is not NotSpecified:
                mask_results = results['mask'].unstack()
                check_arrays(mask_results.values, expected_mask)

            output_results = {}
            expected_output_results = {}
            for output in outputs:
                output_results[output] = results[output].unstack()
                expected_output_results[output] = full_like(
                    output_results[output],
                    nan,
                )

            # Run a separate pipeline that calculates returns starting
            # (regression_length - 1) days prior to our start date. This is
            # because we need (regression_length - 1) extra days of returns to
            # compute our expected regressions.
            results = run_pipeline(
                Pipeline(columns={'returns': returns}),
                dates[start_date_index - (regression_length - 1)],
                dates[end_date_index],
            )
            returns_results = results['returns'].unstack()

            # On each day, calculate the expected regression results for Y ~ X
            # where Y is the asset we are interested in and X is each other
            # asset. Each regression is calculated over `regression_length`
            # days of data.
            for day in range(num_days):
                todays_returns = returns_results.iloc[day:day +
                                                      regression_length]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    asset_column = int(asset) - 1
                    expected_regression_results = linregress(
                        y=other_asset_returns,
                        x=my_asset_returns,
                    )
                    for i, output in enumerate(outputs):
                        expected_output_results[output][day, asset_column] = \
                            expected_regression_results[i]

            for output in outputs:
                output_result = output_results[output]
                expected_output_result = DataFrame(
                    where(expected_mask, expected_output_results[output], nan),
                    index=dates[start_date_index:end_date_index + 1],
                    columns=assets,
                )
                assert_frame_equal(output_result, expected_output_result)
Example #20
    def test_generic_pipeline_with_explicit_domain(self, domain):
        calendar = domain.calendar
        pipe = Pipeline(
            {
                "open": EquityPricing.open.latest,
                "high": EquityPricing.high.latest,
                "low": EquityPricing.low.latest,
                "close": EquityPricing.close.latest,
                "volume": EquityPricing.volume.latest,
            },
            domain=domain,
        )

        sessions = self.daily_bar_sessions[calendar.name]

        # Run the pipeline for a 7 day chunk in the middle of our data.
        #
        # Using this region ensures that there are assets that never appear in
        # the pipeline both because they end too soon, and because they start
        # too late.
        start, end = sessions[[-17, -10]]
        result = self.run_pipeline(pipe, start, end)

        all_assets = self.assets_by_calendar[calendar]

        # We expect the index of the result to contain all assets that were
        # alive during the interval between our start and end (not including
        # the asset's IPO date).
        expected_assets = [
            a for a in all_assets
            if alive_in_range(a, start, end, include_asset_start_date=False)
        ]
        # off by 1 from above to be inclusive of the end date
        expected_dates = sessions[-17:-9]

        for col in pipe.columns:
            # result_data should look like this:
            #
            #     E     F     G     H     I     J     K     L     M     N     O     P # noqa
            # 24.17 25.17 26.17 27.17 28.17   NaN   NaN   NaN   NaN   NaN   NaN   NaN # noqa
            #   NaN 25.18 26.18 27.18 28.18 29.18   NaN   NaN   NaN   NaN   NaN   NaN # noqa
            #   NaN   NaN 26.23 27.23 28.23 29.23 30.23   NaN   NaN   NaN   NaN   NaN # noqa
            #   NaN   NaN   NaN 27.28 28.28 29.28 30.28 31.28   NaN   NaN   NaN   NaN # noqa
            #   NaN   NaN   NaN   NaN 28.30 29.30 30.30 31.30 32.30   NaN   NaN   NaN # noqa
            #   NaN   NaN   NaN   NaN   NaN 29.29 30.29 31.29 32.29 33.29   NaN   NaN # noqa
            #   NaN   NaN   NaN   NaN   NaN   NaN 30.27 31.27 32.27 33.27 34.27   NaN # noqa
            #   NaN   NaN   NaN   NaN   NaN   NaN   NaN 31.29 32.29 33.29 34.29 35.29 # noqa
            result_data = result[col].unstack()

            # Check indices.
            assert_equal(pd.Index(expected_assets), result_data.columns)
            assert_equal(expected_dates, result_data.index)

            # Check values.
            for asset in expected_assets:
                for date in expected_dates:
                    value = result_data.at[date, asset]
                    self.check_expected_latest_value(
                        calendar,
                        col,
                        date,
                        asset,
                        value,
                    )
Example #21
    def test_factor_regression_method_two_factors(self, regression_length):
        """
        Tests for `Factor.linear_regression` when passed another 2D factor
        instead of a Slice.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ['beta', 'alpha', 'r_value', 'p_value', 'stderr']

        # Ensure that the `linear_regression` method cannot be called with two
        # 2D factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(2),
        )
        with self.assertRaises(IncompatibleTerms):
            returns_masked_1.linear_regression(
                target=returns_masked_2,
                regression_length=regression_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        regression_factor = returns_5.linear_regression(
            target=returns_10,
            regression_length=regression_length,
        )

        columns = {
            output: getattr(regression_factor, output)
            for output in outputs
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)

        output_results = {}
        expected_output_results = {}
        for output in outputs:
            output_results[output] = results[output].unstack()
            expected_output_results[output] = full_like(
                output_results[output],
                nan,
            )

        # Run a separate pipeline that calculates returns starting
        # (regression_length - 1) days prior to our start date. This is because
        # we need (regression_length - 1) extra days of returns to compute our
        # expected regressions.
        columns = {'returns_5': returns_5, 'returns_10': returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (regression_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results['returns_5'].unstack()
        returns_10_results = results['returns_10'].unstack()

        # On each day, for each asset, calculate the expected regression
        # results of Y ~ X where Y is the asset's rolling 5 day returns and X
        # is the asset's rolling 10 day returns. Each regression is calculated
        # over `regression_length` days of data.
        for day in range(num_days):
            todays_returns_5 = returns_5_results.iloc[day:day +
                                                      regression_length]
            todays_returns_10 = returns_10_results.iloc[day:day +
                                                        regression_length]
            for asset, asset_returns_5 in todays_returns_5.iteritems():
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_regression_results = linregress(
                    y=asset_returns_5,
                    x=asset_returns_10,
                )
                for i, output in enumerate(outputs):
                    expected_output_results[output][day, asset_column] = \
                        expected_regression_results[i]

        for output in outputs:
            output_result = output_results[output]
            expected_output_result = DataFrame(
                expected_output_results[output],
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(output_result, expected_output_result)
Example #22
def run_data_pipeline(engine, universe, start_date, end_date):

    pipeline = Pipeline(screen=universe)

    sector = Sector()

    # Alpha Factors :

    pipeline.add(DownsideRisk(), 'Downside Risk (Sortino Ratio)')

    pipeline.add(Vol3M(), '3 Month Volatility')

    pipeline.add(momentum_1yr(252, universe, sector), 'Momentum_1YR')

    pipeline.add(
        mean_reversion_5day_sector_neutral_smoothed(20, universe, sector),
        'Mean_Reversion_Sector_Neutral_Smoothed')

    pipeline.add(overnight_sentiment_smoothed(2, 10, universe),
                 'Overnight_Sentiment_Smoothed')

    pipeline.add(rsi_sector_neutral(15, universe, sector),
                 'RSI_Sector_Neutral_15d')

    pipeline.add(rsi_sector_neutral(30, universe, sector),
                 'RSI_Sector_Neutral_30d')

    beta_factor = (RegressionAgainstTime(mask=universe).beta.rank().zscore())

    gamma_factor = (RegressionAgainstTime(mask=universe).gamma.rank().zscore())

    conditional_factor = (beta_factor * gamma_factor).rank().zscore()

    pipeline.add(beta_factor, 'time_beta')

    pipeline.add(gamma_factor, 'time_gamma')

    pipeline.add(conditional_factor, 'conditional_factor')

    # Universal Quant Features :

    pipeline.add(
        AnnualizedVolatility(window_length=20, mask=universe).rank().zscore(),
        'volatility_20d')

    pipeline.add(
        AnnualizedVolatility(window_length=120, mask=universe).rank().zscore(),
        'volatility_120d')

    pipeline.add(
        AverageDollarVolume(window_length=20, mask=universe).rank().zscore(),
        'adv_20d')

    pipeline.add(
        AverageDollarVolume(window_length=120, mask=universe).rank().zscore(),
        'adv_120d')

    pipeline.add(sector, 'sector_code')

    # Regime Features :

    pipeline.add(
        SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)],
                            window_length=20), 'dispersion_20d')

    pipeline.add(
        SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)],
                            window_length=120), 'dispersion_120d')

    pipeline.add(MarketVolatility(window_length=20), 'market_vol_20d')

    pipeline.add(MarketVolatility(window_length=120), 'market_vol_120d')

    # Target
    # Let's try to predict the go-forward 1-week return. When doing this, it's
    # important to quantize the target. The factor we create is the trailing
    # 5-day return.

    pipeline.add(
        Returns(window_length=5, mask=universe).quantiles(2), 'return_5d')

    pipeline.add(
        Returns(window_length=5, mask=universe).quantiles(25), 'return_5d_p')

    # Running the Pipeline

    all_factors = engine.run_pipeline(pipeline, start_date, end_date)

    # Computing Date Features

    all_factors = compute_date_features(all_factors, start_date, end_date)

    # One Hot Encoding Sectors

    all_factors = one_hot_encode_sectors(all_factors)

    # Shifted Target For Training The Model

    all_factors['target'] = all_factors.groupby(level=1)['return_5d'].shift(-5)

    return all_factors
Example #23
def make_pipeline(context):
    """
    Create our pipeline.
    """

    # Filter for primary share equities. IsPrimaryShare is a built-in filter.
    primary_share = IsPrimaryShare()

    # Not when-issued equities.
    #not_wi = ~IEXCompany.symbol.latest.endswith('.WI')
    not_wi = ~PolygonCompany.symbol.latest.endswith(".WI")

    # Equities without LP in their name, .matches does a match using a regular
    # expression
    #not_lp_name = ~IEXCompany.companyName.latest.matches('.* L[. ]?P.?$')
    not_lp_name = ~PolygonCompany.name.latest.matches(".* L[. ]?P.?$")

    # Equities whose most recent Morningstar market cap is not null have
    # fundamental data and therefore are not ETFs.
    #have_market_cap = IEXKeyStats.marketcap.latest >= 1
    have_market_cap = PolygonCompany.marketcap.latest >= 1

    # At least a certain price
    price = USEquityPricing.close.latest
    AtLeastPrice = (price >= context.MyLeastPrice)
    AtMostPrice = (price <= context.MyMostPrice)

    # Filter for stocks that pass all of our previous filters.
    tradeable_stocks = (primary_share
                        & not_wi
                        & not_lp_name
                        & have_market_cap
                        & AtLeastPrice
                        & AtMostPrice)

    LowVar = 6
    HighVar = 40

    log.info('''
Algorithm initialized variables:
 context.MaxCandidates %s
 LowVar %s
 HighVar %s''' % (context.MaxCandidates, LowVar, HighVar))

    # High dollar volume filter.
    base_universe = AverageDollarVolume(
        window_length=20,
        mask=tradeable_stocks).percentile_between(LowVar, HighVar)

    # Short close price average.
    ShortAvg = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                   window_length=3,
                                   mask=base_universe)

    # Long close price average.
    LongAvg = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                  window_length=45,
                                  mask=base_universe)

    percent_difference = (ShortAvg - LongAvg) / LongAvg

    # Filter to select securities to long.
    stocks_worst = percent_difference.bottom(context.MaxCandidates)
    securities_to_trade = (stocks_worst)

    return Pipeline(
        columns={'stocks_worst': stocks_worst},
        screen=(securities_to_trade),
    )
Example #24
def make_pipeline():
    rsi = RSI()
    return Pipeline(columns={
        'longs': rsi.top(3),
        'shorts': rsi.bottom(3),
    })
Example #25
def initialize(context):
    attach_pipeline(Pipeline(), 'test')
    pipeline_output('test')
    raise AssertionError("Shouldn't make it past pipeline_output()")
Example #26
class DummyFactor1(CustomFactor):
    inputs = []
    window_length = 1
    window_safe = False

    def compute(self, today, assets, out):
        log.info('1 {}'.format(today))
        out[:] = 0


class DummyFactor2(CustomFactor):
    inputs = []
    window_length = 1
    window_safe = False

    def compute(self, today, assets, out):
        log.info('2 {}'.format(today))
        out[:] = 0


pipe = Pipeline(
    columns={
        'close': USEquityPricing.close.latest,
        'dummy1': DummyFactor1(),
        'dummy2': DummyFactor2(),
    },
    screen=NamedUniverse(universe_name),
)
start_time = time.time()
stocks = spe.run_pipeline(pipe, pipe_start, pipe_end)
print(stocks.shape)
print("--- %s ---" % datetime.timedelta(seconds=(time.time() - start_time)))
Example #27
def initialize(context):
    p = attach_pipeline(Pipeline(), 'test', chunksize=chunksize)
    p.add(USEquityPricing.close.latest, 'close')
Example #28
def initialize(context):
    pipeline_close = attach_pipeline(Pipeline(), 'test_close')
    pipeline_volume = attach_pipeline(Pipeline(), 'test_volume')

    pipeline_close.add(USEquityPricing.close.latest, 'close')
    pipeline_volume.add(USEquityPricing.volume.latest, 'volume')
Example #29
    def test_correlation_factors(self, returns_length, correlation_length):
        """
        Tests for the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length)
        masks = (self.cascading_mask, self.alternating_mask, NotSpecified)
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )

            columns = {
                "pearson_factor": pearson_factor,
                "spearman_factor": spearman_factor,
            }
            pipeline = Pipeline(columns=columns)
            if mask is not NotSpecified:
                pipeline.add(mask, "mask")

            results = run_pipeline(pipeline, start_date, end_date)
            pearson_results = results["pearson_factor"].unstack()
            spearman_results = results["spearman_factor"].unstack()
            if mask is not NotSpecified:
                mask_results = results["mask"].unstack()
                check_arrays(mask_results.values, expected_mask)

            # Run a separate pipeline that calculates returns starting
            # (correlation_length - 1) days prior to our start date. This is
            # because we need (correlation_length - 1) extra days of returns to
            # compute our expected correlations.
            results = run_pipeline(
                Pipeline(columns={"returns": returns}),
                dates[start_date_index - (correlation_length - 1)],
                dates[end_date_index],
            )
            returns_results = results["returns"].unstack()

            # On each day, calculate the expected correlation coefficients
            # between the asset we are interested in and each other asset. Each
            # correlation is calculated over `correlation_length` days.
            expected_pearson_results = np.full_like(pearson_results, nan)
            expected_spearman_results = np.full_like(spearman_results, nan)
            for day in range(num_days):
                todays_returns = returns_results.iloc[day:day +
                                                      correlation_length]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    asset_column = int(asset) - 1
                    expected_pearson_results[day, asset_column] = pearsonr(
                        my_asset_returns,
                        other_asset_returns,
                    )[0]
                    expected_spearman_results[day, asset_column] = spearmanr(
                        my_asset_returns,
                        other_asset_returns,
                    )[0]

            expected_pearson_results = pd.DataFrame(
                data=np.where(expected_mask, expected_pearson_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(pearson_results, expected_pearson_results)

            expected_spearman_results = pd.DataFrame(
                data=np.where(expected_mask, expected_spearman_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(spearman_results, expected_spearman_results)
Example #30
    def test_compute_earnings(self, dates):

        (
            engine,
            expected_next,
            expected_next_busday_offset,
            expected_previous,
            expected_previous_busday_offset,
        ) = self.setup(dates)

        pipe = Pipeline(
            columns={
                'next': EarningsCalendar.next_announcement.latest,
                'previous': EarningsCalendar.previous_announcement.latest,
                'days_to_next': BusinessDaysUntilNextEarnings(),
                'days_since_prev': BusinessDaysSincePreviousEarnings(),
            })

        result = engine.run_pipeline(
            pipe,
            start_date=dates[0],
            end_date=dates[-1],
        )

        computed_next = result['next']
        computed_previous = result['previous']
        computed_next_busday_offset = result['days_to_next']
        computed_previous_busday_offset = result['days_since_prev']

        # NaTs in next/prev should correspond to NaNs in offsets.
        assert_series_equal(
            computed_next.isnull(),
            computed_next_busday_offset.isnull(),
        )
        assert_series_equal(
            computed_previous.isnull(),
            computed_previous_busday_offset.isnull(),
        )

        for sid in self.sids:

            assert_series_equal(
                computed_next.xs(sid, level=1),
                expected_next(sid),
                sid,
            )

            assert_series_equal(
                computed_previous.xs(sid, level=1),
                expected_previous(sid),
                sid,
            )

            assert_series_equal(
                computed_next_busday_offset.xs(sid, level=1),
                expected_next_busday_offset(sid),
                sid,
            )

            assert_series_equal(
                computed_previous_busday_offset.xs(sid, level=1),
                expected_previous_busday_offset(sid),
                sid,
            )