Example #1
    def test_bad_input(self):
        data = arange(100).reshape(self.ndates, self.nsids)
        baseline = DataFrame(data, index=self.dates, columns=self.sids)
        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
        )

        with self.assertRaises(ValueError):
            # Wrong column.
            loader.load_adjusted_array(
                US_EQUITIES,
                [USEquityPricing.open],
                self.dates,
                self.sids,
                self.mask,
            )

        with self.assertRaises(ValueError):
            # Too many columns.
            loader.load_adjusted_array(
                US_EQUITIES,
                [USEquityPricing.open, USEquityPricing.close],
                self.dates,
                self.sids,
                self.mask,
            )
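For contrast, a call that requests exactly the column the loader was constructed with succeeds; a minimal sketch, reusing the fixtures above:

        # Requesting the matching column returns {column: AdjustedArray};
        # no ValueError is raised.
        arrays = loader.load_adjusted_array(
            US_EQUITIES,
            [USEquityPricing.close],
            self.dates,
            self.sids,
            self.mask,
        )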
Example #2
    def init_class_fixtures(cls):
        super(ParameterizedFactorTestCase, cls).init_class_fixtures()
        day = cls.env.trading_day

        cls.dates = dates = date_range(
            '2015-02-01',
            '2015-02-28',
            freq=day,
            tz='UTC',
        )
        sids = cls.sids

        cls.raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float).reshape(
                len(dates),
                len(sids),
            ),
            index=dates,
            columns=cls.asset_finder.retrieve_all(sids),
        )

        close_loader = DataFrameLoader(USEquityPricing.close, cls.raw_data)
        volume_loader = DataFrameLoader(
            USEquityPricing.volume,
            cls.raw_data * 2,
        )

        cls.engine = SimplePipelineEngine(
            {
                USEquityPricing.close: close_loader,
                USEquityPricing.volume: volume_loader,
            }.__getitem__,
            cls.dates,
            cls.asset_finder,
        )
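Once the engine is wired up this way, a pipeline can be run over the fixture dates. A minimal usage sketch, assuming Pipeline is imported and continuing inside the classmethod above:

        pipe = Pipeline(columns={'close': USEquityPricing.close.latest})
        result = cls.engine.run_pipeline(pipe, cls.dates[0], cls.dates[-1])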
Example #3
    def _load_events(self, name_map, indexer, columns, dates, sids, mask):
        def to_frame(array):
            return pd.DataFrame(array, index=dates, columns=sids)

        assert indexer.shape == (len(dates), len(sids))

        out = {}
        for c in columns:
            # Array holding the value for column `c` for every event we have.
            col_array = self.events[name_map[c]]

            if not len(col_array):
                # We don't have **any** events, so return col.missing_value
                # every day for every sid. We have to special case empty events
                # because in the normal branch we depend on being able to index
                # with -1 for missing values, which fails if there are no
                # events at all.
                raw = np.full((len(dates), len(sids)),
                              c.missing_value,
                              dtype=c.dtype)
            else:
                # Slot event values into sid/date locations using `indexer`.
                # This produces a 2D array of the same shape as `indexer`,
                # which must be (len(dates), len(sids)).
                raw = col_array[indexer]

                # indexer will be -1 for locations where we don't have a known
                # value. Overwrite those locations with c.missing_value.
                raw[indexer < 0] = c.missing_value

            # Delegate the actual array formatting logic to a DataFrameLoader.
            loader = DataFrameLoader(c, to_frame(raw), adjustments=None)
            out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c]
        return out
Example #4
    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        out = {}
        for column in columns:
            fundamentals_df = self.fundamentals_reader.read(
                column.name,
                dates,
                sids,
            )
            df_loader = DataFrameLoader(column, fundamentals_df)
            out.update(
                df_loader.load_adjusted_array(domain, [column], dates, sids, mask),
            )

        return out
Example #5
    def _load_events(self, name_map, indexer, columns, dates, sids, mask):
        def to_frame(array):
            return pd.DataFrame(array, index=dates, columns=sids)

        out = {}
        for c in columns:
            raw = self.events[name_map[c]][indexer]
            # indexer will be -1 for locations where we don't have a known
            # value.
            raw[indexer < 0] = c.missing_value

            # Delegate the actual array formatting logic to a DataFrameLoader.
            loader = DataFrameLoader(c, to_frame(raw), adjustments=None)
            out[c] = loader.load_adjusted_array([c], dates, sids, mask)[c]
        return out
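The `raw[indexer < 0]` overwrite matters because NumPy reads -1 as "last element", not "missing". A small self-contained illustration, using NaN as a stand-in for `c.missing_value`:

    import numpy as np

    col_array = np.array([10.0, 20.0, 30.0])  # one value per known event
    indexer = np.array([[0, -1], [2, 1]])     # -1 marks 'no event here'

    raw = col_array[indexer]    # -1 silently selects 30.0, the *last* value
    raw[indexer < 0] = np.nan   # so negative slots must be overwritten
    # raw is now [[10., nan], [30., 20.]]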
Example #7
    def test_baseline(self):
        data = arange(100).reshape(self.ndates, self.nsids)
        baseline = DataFrame(data, index=self.dates, columns=self.sids)
        loader = DataFrameLoader(USEquityPricing.close, baseline)

        dates_slice = slice(None, 10, None)
        sids_slice = slice(1, 3, None)
        [adj_array] = loader.load_adjusted_array(
            [USEquityPricing.close],
            self.dates[dates_slice],
            self.sids[sids_slice],
            self.mask[dates_slice, sids_slice],
        ).values()

        for idx, window in enumerate(adj_array.traverse(window_length=3)):
            expected = baseline.values[dates_slice, sids_slice][idx:idx + 3]
            assert_array_equal(window, expected)
Example #8
    def test_baseline(self):
        data = arange(100).reshape(self.ndates, self.nsids)
        baseline = DataFrame(data, index=self.dates, columns=self.sids)
        loader = DataFrameLoader(USEquityPricing.close, baseline)

        dates_slice = slice(None, 10, None)
        sids_slice = slice(1, 3, None)
        [adj_array] = loader.load_adjusted_array(
            [USEquityPricing.close],
            self.dates[dates_slice],
            self.sids[sids_slice],
            self.mask[dates_slice, sids_slice],
        ).values()

        for idx, window in enumerate(adj_array.traverse(window_length=3)):
            expected = baseline.values[dates_slice, sids_slice][idx:idx + 3]
            assert_array_equal(window, expected)
Example #9
    def _load_events(self,
                     name_map,
                     indexer,
                     domain,
                     columns,
                     dates,
                     sids,
                     mask):
        def to_frame(array):
            return pd.DataFrame(array, index=dates, columns=sids)

        assert indexer.shape == (len(dates), len(sids))

        out = {}
        for c in columns:
            # Array holding the value for column `c` for every event we have.
            col_array = self.events[name_map[c]]

            if not len(col_array):
                # We don't have **any** events, so return col.missing_value
                # every day for every sid. We have to special case empty events
                # because in the normal branch we depend on being able to index
                # with -1 for missing values, which fails if there are no
                # events at all.
                raw = np.full(
                    (len(dates), len(sids)),
                    c.missing_value,
                    dtype=c.dtype,
                )
            else:
                # Slot event values into sid/date locations using `indexer`.
                # This produces a 2D array of the same shape as `indexer`,
                # which must be (len(dates), len(sids)).
                raw = col_array[indexer]

                # indexer will be -1 for locations where we don't have a known
                # value. Overwrite those locations with c.missing_value.
                raw[indexer < 0] = c.missing_value

            # Delegate the actual array formatting logic to a DataFrameLoader.
            loader = DataFrameLoader(c, to_frame(raw), adjustments=None)
            out[c] = loader.load_adjusted_array(
                domain, [c], dates, sids, mask,
            )[c]
        return out
Example #10
    def init_class_fixtures(cls):
        super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()

        day = cls.trading_calendar.day
        cls.dates = dates = date_range(
            '2015-02-01', '2015-02-28', freq=day, tz='UTC',
        )

        # Using these start and end dates because they are a contiguous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = dates[start_date_index]
        cls.pipeline_end_date = dates[end_date_index]
        cls.num_days = num_days = end_date_index - start_date_index + 1

        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_assets = num_assets = len(assets)

        cls.raw_data = raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float64_dtype).reshape(
                len(dates), len(sids),
            ),
            index=dates,
            columns=assets,
        )

        # Using mock 'close' data here because the correlation and regression
        # built-ins use USEquityPricing.close as the input to their `Returns`
        # factors. Since there is no way to change that when constructing an
        # instance of these built-ins, we need to test with mock 'close' data
        # to most accurately reflect their true behavior and results.
        close_loader = DataFrameLoader(USEquityPricing.close, raw_data)

        cls.run_pipeline = SimplePipelineEngine(
            {USEquityPricing.close: close_loader}.__getitem__,
            dates,
            cls.asset_finder,
        ).run_pipeline

        cls.cascading_mask = AssetIDPlusDay() < (
            sids[-1] + dates[start_date_index].day
        )
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets),
        )
        cls.expected_no_mask_result = full(
            shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype,
        )
Example #11
    def setUpClass(cls):
        cls.env = TradingEnvironment()
        day = cls.env.trading_day

        cls.sids = sids = Int64Index([1, 2, 3])
        cls.dates = dates = date_range(
            '2015-02-01',
            '2015-02-28',
            freq=day,
            tz='UTC',
        )

        asset_info = make_simple_equity_info(
            cls.sids,
            start_date=Timestamp('2015-01-31', tz='UTC'),
            end_date=Timestamp('2015-03-01', tz='UTC'),
        )
        cls.env.write_data(equities_df=asset_info)
        cls.asset_finder = cls.env.asset_finder

        cls.raw_data = DataFrame(
            data=arange(len(dates) * len(sids), dtype=float).reshape(
                len(dates),
                len(sids),
            ),
            index=dates,
            columns=cls.asset_finder.retrieve_all(sids),
        )

        close_loader = DataFrameLoader(USEquityPricing.close, cls.raw_data)
        volume_loader = DataFrameLoader(
            USEquityPricing.volume,
            cls.raw_data * 2,
        )

        cls.engine = SimplePipelineEngine(
            {
                USEquityPricing.close: close_loader,
                USEquityPricing.volume: volume_loader,
            }.__getitem__,
            cls.dates,
            cls.asset_finder,
        )
Example #12
    def init_instance_fixtures(self):
        super(ClosesAndVolumes, self).init_instance_fixtures()

        # View of the data on/after the split.
        self.adj_closes = adj_closes = self.closes.copy()
        adj_closes.loc[:self.split_date, self.split_asset] *= self.split_ratio
        self.adj_volumes = adj_volumes = self.volumes.copy()
        adj_volumes.loc[:self.split_date, self.split_asset] *= self.split_ratio

        self.pipeline_close_loader = DataFrameLoader(
            column=USEquityPricing.close,
            baseline=self.closes,
            adjustments=self.adjustments,
        )

        self.pipeline_volume_loader = DataFrameLoader(
            column=USEquityPricing.volume,
            baseline=self.volumes,
            adjustments=self.adjustments,
        )
Example #14
    def test_compute_with_adjustments(self):
        dates, assets = self.dates, self.assets
        low, high = USEquityPricing.low, USEquityPricing.high
        apply_idxs = [3, 10, 16]

        def apply_date(idx, offset=0):
            return dates[apply_idxs[idx] + offset]

        adjustments = DataFrame.from_records([
            dict(
                kind=MULTIPLY,
                sid=assets[1],
                value=2.0,
                start_date=None,
                end_date=apply_date(0, offset=-1),
                apply_date=apply_date(0),
            ),
            dict(
                kind=MULTIPLY,
                sid=assets[1],
                value=3.0,
                start_date=None,
                end_date=apply_date(1, offset=-1),
                apply_date=apply_date(1),
            ),
            dict(
                kind=MULTIPLY,
                sid=assets[1],
                value=5.0,
                start_date=None,
                end_date=apply_date(2, offset=-1),
                apply_date=apply_date(2),
            ),
        ])
        low_base = DataFrame(self.make_frame(30.0))
        low_loader = DataFrameLoader(low, low_base.copy(), adjustments=None)

        # Pre-apply inverse of adjustments to the baseline.
        high_base = DataFrame(self.make_frame(30.0))
        high_base.iloc[:apply_idxs[0], 1] /= 2.0
        high_base.iloc[:apply_idxs[1], 1] /= 3.0
        high_base.iloc[:apply_idxs[2], 1] /= 5.0

        high_loader = DataFrameLoader(high, high_base, adjustments)

        engine = SimplePipelineEngine(
            {
                low: low_loader,
                high: high_loader
            }.__getitem__,
            self.dates,
            self.asset_finder,
        )

        for window_length in range(1, 4):
            low_mavg = SimpleMovingAverage(
                inputs=[USEquityPricing.low],
                window_length=window_length,
            )
            high_mavg = SimpleMovingAverage(
                inputs=[USEquityPricing.high],
                window_length=window_length,
            )
            bounds = product_upper_triangle(range(window_length, len(dates)))
            for start, stop in bounds:
                results = engine.run_pipeline(
                    Pipeline(columns={
                        'low': low_mavg,
                        'high': high_mavg
                    }),
                    dates[start],
                    dates[stop],
                )
                self.assertEqual(set(results.columns), {'low', 'high'})
                iloc_bounds = slice(start, stop + 1)  # +1 to include end date

                low_results = results.unstack()['low']
                assert_frame_equal(low_results, low_base.iloc[iloc_bounds])

                high_results = results.unstack()['high']
                assert_frame_equal(high_results, high_base.iloc[iloc_bounds])
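The inverse pre-scaling works because a MULTIPLY adjustment re-multiplies earlier rows by `value` whenever a trailing window crosses its apply date, so dividing the baseline up front lands the adjusted view back on the flat 30.0 frame. The round trip in miniature:

    stored = 30.0 / 2.0       # baseline with the inverse pre-applied
    adjusted = stored * 2.0   # what a window crossing the apply date sees
    assert adjusted == 30.0   # matches the flat `low` baseline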
Example #15
    def setUp(self):
        self.env = env = trading.TradingEnvironment()
        self.dates = date_range('2014-01-01',
                                '2014-02-01',
                                freq=trading_day,
                                tz='UTC')
        asset_info = DataFrame.from_records([
            {
                'sid': 1,
                'symbol': 'A',
                'asset_type': 'equity',
                'start_date': self.dates[10],
                'end_date': self.dates[13],
                'exchange': 'TEST',
            },
            {
                'sid': 2,
                'symbol': 'B',
                'asset_type': 'equity',
                'start_date': self.dates[11],
                'end_date': self.dates[14],
                'exchange': 'TEST',
            },
            {
                'sid': 3,
                'symbol': 'C',
                'asset_type': 'equity',
                'start_date': self.dates[12],
                'end_date': self.dates[15],
                'exchange': 'TEST',
            },
        ])
        self.first_asset_start = min(asset_info.start_date)
        self.last_asset_end = max(asset_info.end_date)
        env.write_data(equities_df=asset_info)
        self.asset_finder = finder = env.asset_finder

        sids = (1, 2, 3)
        self.assets = finder.retrieve_all(sids)

        # View of the baseline data.
        self.closes = DataFrame(
            {sid: arange(1, len(self.dates) + 1) * sid for sid in sids},
            index=self.dates,
            dtype=float,
        )

        # Add a split for 'A' on its second date.
        self.split_asset = self.assets[0]
        self.split_date = self.split_asset.start_date + trading_day
        self.split_ratio = 0.5
        self.adjustments = DataFrame.from_records([{
            'sid': self.split_asset.sid,
            'value': self.split_ratio,
            'kind': MULTIPLY,
            'start_date': Timestamp('NaT'),
            'end_date': self.split_date,
            'apply_date': self.split_date,
        }])

        # View of the data on/after the split.
        self.adj_closes = adj_closes = self.closes.copy()
        adj_closes.loc[:self.split_date, self.split_asset] *= self.split_ratio

        self.pipeline_loader = DataFrameLoader(
            column=USEquityPricing.close,
            baseline=self.closes,
            adjustments=self.adjustments,
        )
Example #16
    def test_adjustments(self):
        data = arange(100).reshape(self.ndates, self.nsids)
        baseline = DataFrame(data, index=self.dates, columns=self.sids)

        # Use the dates from index 10 on and sids 1-3.
        dates_slice = slice(10, None, None)
        sids_slice = slice(1, 4, None)

        # Adjustments that should actually affect the output.
        relevant_adjustments = [
            {
                'sid': 1,
                'start_date': None,
                'end_date': self.dates[15],
                'apply_date': self.dates[16],
                'value': 0.5,
                'kind': MULTIPLY,
            },
            {
                'sid': 2,
                'start_date': self.dates[5],
                'end_date': self.dates[15],
                'apply_date': self.dates[16],
                'value': 1.0,
                'kind': ADD,
            },
            {
                'sid': 2,
                'start_date': self.dates[15],
                'end_date': self.dates[16],
                'apply_date': self.dates[17],
                'value': 1.0,
                'kind': ADD,
            },
            {
                'sid': 3,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': 99.0,
                'kind': OVERWRITE,
            },
        ]

        # These adjustments shouldn't affect the output.
        irrelevant_adjustments = [
            {  # Sid Not Requested
                'sid': 0,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Sid Unknown
                'sid': 9999,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date Not Requested
                'sid': 2,
                'start_date': self.dates[1],
                'end_date': self.dates[2],
                'apply_date': self.dates[3],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date Before Known Data
                'sid': 2,
                'start_date': self.dates[0] - (2 * trading_day),
                'end_date': self.dates[0] - trading_day,
                'apply_date': self.dates[0] - trading_day,
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date After Known Data
                'sid': 2,
                'start_date': self.dates[-1] + trading_day,
                'end_date': self.dates[-1] + (2 * trading_day),
                'apply_date': self.dates[-1] + (3 * trading_day),
                'value': -9999.0,
                'kind': OVERWRITE,
            },
        ]

        adjustments = DataFrame(relevant_adjustments + irrelevant_adjustments)
        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
            adjustments=adjustments,
        )

        expected_baseline = baseline.iloc[dates_slice, sids_slice]

        formatted_adjustments = loader.format_adjustments(
            self.dates[dates_slice],
            self.sids[sids_slice],
        )
        expected_formatted_adjustments = {
            6: [
                Float64Multiply(
                    first_row=0,
                    last_row=5,
                    first_col=0,
                    last_col=0,
                    value=0.5,
                ),
                Float64Add(
                    first_row=0,
                    last_row=5,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            7: [
                Float64Add(
                    first_row=5,
                    last_row=6,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            8: [
                Float64Overwrite(
                    first_row=6,
                    last_row=7,
                    first_col=2,
                    last_col=2,
                    value=99.0,
                )
            ],
        }
        self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

        mask = self.mask[dates_slice, sids_slice]
        with patch('zipline.pipeline.loaders.frame.AdjustedArray') as m:
            loader.load_adjusted_array(
                columns=[USEquityPricing.close],
                dates=self.dates[dates_slice],
                assets=self.sids[sids_slice],
                mask=mask,
            )

        self.assertEqual(m.call_count, 1)

        args, kwargs = m.call_args
        assert_array_equal(kwargs['data'], expected_baseline.values)
        assert_array_equal(kwargs['mask'], mask)
        self.assertEqual(kwargs['adjustments'], expected_formatted_adjustments)
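The integer keys in `expected_formatted_adjustments` are row offsets of each `apply_date` inside the requested date window, and the column numbers are offsets inside the requested sids:

    # dates_slice starts at index 10, so apply dates 16, 17, 18 land on
    # rows 6, 7, 8; sids_slice covers sids 1, 2, 3, which land on
    # columns 0, 1, 2.
    assert [16 - 10, 17 - 10, 18 - 10] == [6, 7, 8]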
Example #17
    def setUp(self):
        self.env = env = trading.TradingEnvironment()
        self.dates = date_range(
            '2014-01-01', '2014-02-01', freq=trading_day, tz='UTC'
        )
        asset_info = DataFrame.from_records([
            {
                'sid': 1,
                'symbol': 'A',
                'start_date': self.dates[10],
                'end_date': self.dates[13],
                'exchange': 'TEST',
            },
            {
                'sid': 2,
                'symbol': 'B',
                'start_date': self.dates[11],
                'end_date': self.dates[14],
                'exchange': 'TEST',
            },
            {
                'sid': 3,
                'symbol': 'C',
                'start_date': self.dates[12],
                'end_date': self.dates[15],
                'exchange': 'TEST',
            },
        ])
        self.first_asset_start = min(asset_info.start_date)
        self.last_asset_end = max(asset_info.end_date)
        env.write_data(equities_df=asset_info)
        self.asset_finder = finder = env.asset_finder

        sids = (1, 2, 3)
        self.assets = finder.retrieve_all(sids)

        # View of the baseline data.
        self.closes = DataFrame(
            {sid: arange(1, len(self.dates) + 1) * sid for sid in sids},
            index=self.dates,
            dtype=float,
        )

        # Create a data portal holding the data in self.closes
        data = {}
        for sid in sids:
            data[sid] = DataFrame({
                "open": self.closes[sid].values,
                "high": self.closes[sid].values,
                "low": self.closes[sid].values,
                "close": self.closes[sid].values,
                "volume": self.closes[sid].values,
                "day": [day.value for day in self.dates]
            })

        path = os.path.join(self.tempdir.path, "testdaily.bcolz")

        DailyBarWriterFromDataFrames(data).write(
            path,
            self.dates,
            data
        )

        daily_bar_reader = BcolzDailyBarReader(path)

        self.data_portal = DataPortal(
            self.env,
            equity_daily_reader=daily_bar_reader,
        )

        # Add a split for 'A' on its second date.
        self.split_asset = self.assets[0]
        self.split_date = self.split_asset.start_date + trading_day
        self.split_ratio = 0.5
        self.adjustments = DataFrame.from_records([
            {
                'sid': self.split_asset.sid,
                'value': self.split_ratio,
                'kind': MULTIPLY,
                'start_date': Timestamp('NaT'),
                'end_date': self.split_date,
                'apply_date': self.split_date,
            }
        ])

        # View of the data on/after the split.
        self.adj_closes = adj_closes = self.closes.copy()
        adj_closes.loc[:self.split_date, self.split_asset] *= self.split_ratio

        self.pipeline_loader = DataFrameLoader(
            column=USEquityPricing.close,
            baseline=self.closes,
            adjustments=self.adjustments,
        )
Example #18
def initialize(context):
    dates = pd.date_range('2018-01-01', '2018-09-28')
    # assets = bundle_data.asset_finder.lookup_symbols(['A', 'AAL'], as_of_date=None)
    # assets = bundle_data.asset_finder
    sids = bundle_data.asset_finder.sids
    assets = [sid(item) for item in sids]

    # The values for Column A will just be a 2D array of numbers ranging from 0 -> N-1.
    column_A_frame = pd.DataFrame(
        data=np.arange(len(dates) * len(assets), dtype=float).reshape(len(dates), len(assets)),
        index=dates,
        columns=sids,
    )

    # Column B will always provide True for the first sid and False for the second.
    column_B_frame = pd.DataFrame(data={sids[0]: True, sids[1]: False}, index=dates)

    loaders = {
        MyDataSet.column_A: DataFrameLoader(MyDataSet.column_A, column_A_frame),
        MyDataSet.column_B: DataFrameLoader(MyDataSet.column_B, column_B_frame),
    }

    def my_dispatcher(column):
        return loaders[column]

    # Set up pipeline engine

    # Loader for pricing
    pipeline_loader = USEquityPricingLoader(
        bundle_data.equity_daily_bar_reader,
        bundle_data.adjustment_reader,
    )

    def choose_loader(column):
        if column in USEquityPricing.columns:
            return pipeline_loader
        return my_dispatcher(column)

    engine = SimplePipelineEngine(
        get_loader=choose_loader,
        calendar=trading_calendar.all_sessions,
        asset_finder=bundle_data.asset_finder,
    )

    p = Pipeline(
        columns={
            'price': USEquityPricing.close.latest,
            'col_A': MyDataSet.column_A.latest,
            'col_B': MyDataSet.column_B.latest
        },
        screen=StaticAssets(assets)
    )

    df = engine.run_pipeline(
        p,
        pd.Timestamp('2018-01-05', tz='utc'),
        pd.Timestamp('2018-01-05', tz='utc'),
    )

    df = df.sort_values(by=['price'], axis=0, ascending=False)

    print(df)
Example #19
# In[10]:


class SignalData(DataSet):
    predictions = Column(dtype=float)
    domain = US_EQUITIES


# ### Define Pipeline Loaders

# While the bundle’s OHLCV data can rely on the built-in `USEquityPricingLoader`, we need to define our own `zipline.pipeline.loaders.frame.DataFrameLoader`:

# In[11]:

signal_loader = {
    SignalData.predictions: DataFrameLoader(SignalData.predictions,
                                            predictions)
}

# In fact, we need to slightly modify the Zipline library’s source code to bypass the assumption that we will only load price data. To this end, we will add a `custom_loader` parameter to the `run_algorithm` and ensure that this loader is used when the `Pipeline` needs one of `SignalData`’s `Column` instances.
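# A minimal sketch of the dispatch such a `custom_loader` enables; the
# parameter name and the `pricing_loader` fallback are assumptions about the
# modified code, not stock Zipline:

def make_get_loader(custom_loader, pricing_loader):
    def get_loader(column):
        if column in SignalData.columns:
            return custom_loader[column]
        return pricing_loader
    return get_loader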

# ## Pipeline Setup

# Our Pipeline is going to have two Boolean columns that identify the assets we would like to trade as long and short positions.
#
# To get there, we first define a `CustomFactor` called `MLSignal` that just receives the current `SignalData.predictions`. The motivation is to allow us to use some of the convenient `Factor` methods designed to rank and filter securities.

# ### Custom ML Factor

# In[12]:
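# A minimal sketch of such a factor, assuming the standard CustomFactor API
# (window_length=1, so compute simply forwards the latest prediction):

class MLSignal(CustomFactor):
    inputs = [SignalData.predictions]
    window_length = 1

    def compute(self, today, assets, out, predictions):
        out[:] = predictions[-1]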

Example #20
# Build one wide frame per fundamental field: dates down the rows, sids across
# the columns. MarketCap_frame follows the same recipe as DE_frame; it is used
# by the loader registration below.
MarketCap_frame = df[['Date', 'marketcap', 'sid']].reset_index().set_index(['Date', 'sid']).sort_index().drop(columns=['index'])
MarketCap_frame = MarketCap_frame.pivot_table(values='marketcap', index='Date', columns='sid', aggfunc='max')
MarketCap_frame = MarketCap_frame.sort_index().fillna(method='ffill')

DE_frame = df[['Date', 'de', 'sid']].reset_index().set_index(['Date', 'sid']).sort_index().drop(columns=['index'])
DE_frame = DE_frame.pivot_table(values='de', index='Date', columns='sid', aggfunc='max')
DE_frame = DE_frame.sort_index().fillna(method='ffill')


class Fundamentals(DataSet):
    DE = Column(dtype=float)
    MarketCap = Column(dtype=float)

# register the loaders
loaders[Fundamentals.DE] = DataFrameLoader(Fundamentals.DE, DE_frame)
loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame)
df_loaders = loaders
Example #21
df = pd.read_pickle(path.join(path.dirname(__file__), 'sharadar_with_sid.pkl'))

MarketCap_frame = (
    df[['MarketCap', 'sid']].
    reset_index().set_index(['Date', 'sid']).
    unstack()
)

MarketCap_frame.columns = MarketCap_frame.columns.droplevel()

PriceToBook_frame = df[['P/B', 'sid']].reset_index().set_index(['Date', 'sid']).unstack()
PriceToBook_frame.columns = PriceToBook_frame.columns.droplevel()

PriceToSales_frame = df[['P/S', 'sid']].reset_index().set_index(['Date', 'sid']).unstack()
PriceToSales_frame.columns = PriceToSales_frame.columns.droplevel()

PriceToEarnings_frame = df[['P/E', 'sid']].reset_index().set_index(['Date', 'sid']).unstack()
PriceToEarnings_frame.columns = PriceToEarnings_frame.columns.droplevel()

class Fundamentals(DataSet):
    MarketCap = Column(dtype=float)
    PriceToBook = Column(dtype=float)
    PriceToSales = Column(dtype=float)
    PriceToEarnings = Column(dtype=float)

# register the loaders
loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame)
loaders[Fundamentals.PriceToBook] = DataFrameLoader(Fundamentals.PriceToBook, PriceToBook_frame)
loaders[Fundamentals.PriceToSales] = DataFrameLoader(Fundamentals.PriceToSales, PriceToSales_frame)
loaders[Fundamentals.PriceToEarnings] = DataFrameLoader(Fundamentals.PriceToEarnings, PriceToEarnings_frame)
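These registered loaders are consumed by a dispatch callable when the engine is constructed, as in Example #18 above. A minimal sketch, where `pricing_loader` stands in for an assumed bundle pricing loader:

    def choose_loader(column):
        if column in USEquityPricing.columns:
            return pricing_loader   # assumed USEquityPricingLoader
        return loaders[column]      # the DataFrameLoaders registered above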
Example #22
    def test_adjustments(self):
        data = np.arange(100).reshape(self.ndates, self.nsids)
        baseline = pd.DataFrame(data, index=self.dates, columns=self.sids)

        # Use the dates from index 10 on and sids 1-3.
        dates_slice = slice(10, None, None)
        sids_slice = slice(1, 4, None)

        # Adjustments that should actually affect the output.
        relevant_adjustments = [
            {
                "sid": 1,
                "start_date": None,
                "end_date": self.dates[15],
                "apply_date": self.dates[16],
                "value": 0.5,
                "kind": MULTIPLY,
            },
            {
                "sid": 2,
                "start_date": self.dates[5],
                "end_date": self.dates[15],
                "apply_date": self.dates[16],
                "value": 1.0,
                "kind": ADD,
            },
            {
                "sid": 2,
                "start_date": self.dates[15],
                "end_date": self.dates[16],
                "apply_date": self.dates[17],
                "value": 1.0,
                "kind": ADD,
            },
            {
                "sid": 3,
                "start_date": self.dates[16],
                "end_date": self.dates[17],
                "apply_date": self.dates[18],
                "value": 99.0,
                "kind": OVERWRITE,
            },
        ]

        # These adjustments shouldn't affect the output.
        irrelevant_adjustments = [
            {  # Sid Not Requested
                "sid": 0,
                "start_date": self.dates[16],
                "end_date": self.dates[17],
                "apply_date": self.dates[18],
                "value": -9999.0,
                "kind": OVERWRITE,
            },
            {  # Sid Unknown
                "sid": 9999,
                "start_date": self.dates[16],
                "end_date": self.dates[17],
                "apply_date": self.dates[18],
                "value": -9999.0,
                "kind": OVERWRITE,
            },
            {  # Date Not Requested
                "sid": 2,
                "start_date": self.dates[1],
                "end_date": self.dates[2],
                "apply_date": self.dates[3],
                "value": -9999.0,
                "kind": OVERWRITE,
            },
            {  # Date Before Known Data
                "sid": 2,
                "start_date": self.dates[0] - (2 * self.trading_day),
                "end_date": self.dates[0] - self.trading_day,
                "apply_date": self.dates[0] - self.trading_day,
                "value": -9999.0,
                "kind": OVERWRITE,
            },
            {  # Date After Known Data
                "sid": 2,
                "start_date": self.dates[-1] + self.trading_day,
                "end_date": self.dates[-1] + (2 * self.trading_day),
                "apply_date": self.dates[-1] + (3 * self.trading_day),
                "value": -9999.0,
                "kind": OVERWRITE,
            },
        ]

        adjustments = pd.DataFrame(relevant_adjustments +
                                   irrelevant_adjustments)
        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
            adjustments=adjustments,
        )

        expected_baseline = baseline.iloc[dates_slice, sids_slice]

        formatted_adjustments = loader.format_adjustments(
            self.dates[dates_slice],
            self.sids[sids_slice],
        )
        expected_formatted_adjustments = {
            6: [
                Float64Multiply(
                    first_row=0,
                    last_row=5,
                    first_col=0,
                    last_col=0,
                    value=0.5,
                ),
                Float64Add(
                    first_row=0,
                    last_row=5,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            7: [
                Float64Add(
                    first_row=5,
                    last_row=6,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            8: [
                Float64Overwrite(
                    first_row=6,
                    last_row=7,
                    first_col=2,
                    last_col=2,
                    value=99.0,
                )
            ],
        }
        assert formatted_adjustments == expected_formatted_adjustments

        mask = self.mask[dates_slice, sids_slice]
        with patch("zipline.pipeline.loaders.frame.AdjustedArray") as m:
            loader.load_adjusted_array(
                US_EQUITIES,
                columns=[USEquityPricing.close],
                dates=self.dates[dates_slice],
                sids=self.sids[sids_slice],
                mask=mask,
            )

        assert m.call_count == 1

        args, kwargs = m.call_args
        assert_array_equal(kwargs["data"], expected_baseline.values)
        assert kwargs["adjustments"] == expected_formatted_adjustments
Example #23
class Fundamentals(DataSet):
    locals().update({
        name: Column(dtype=float) for name in sharadar_f1_top25
    })
    locals().update({
        name: Column(dtype=object) for name in sharadar_tickers
    })

#class Fundamentals(DataSet):
#    for name  in sharadar_f1_top25:
#        name = Column(dtype=float)

#class Fundamentals(DataSet):
    #DE = Column(dtype=float)
    #MarketCap = Column(dtype=float)
    #EUSD = Column(dtype=float)
    #DNC = Column(dtype=float)

# register the loaders
for name in sharadar_f1_top25:
    loaders[Fundamentals.get_column(name)] = DataFrameLoader(Fundamentals.get_column(name), fundy_frames[name])

for name in sharadar_tickers:
    loaders[Fundamentals.get_column(name)] = DataFrameLoader(Fundamentals.get_column(name), fundy_frames[name])

#loaders[Fundamentals.DE]        = DataFrameLoader(Fundamentals.DE, DE_frame)
#loaders[Fundamentals.MarketCap] = DataFrameLoader(Fundamentals.MarketCap, MarketCap_frame)
#loaders[Fundamentals.EUSD]      = DataFrameLoader(Fundamentals.EUSD, EUSD_frame)
#loaders[Fundamentals.DNC]       = DataFrameLoader(Fundamentals.DNC, DNC_frame)

df_loaders = loaders
Example #24
    def test_adjustments(self):
        data = arange(100).reshape(self.ndates, self.nsids)
        baseline = DataFrame(data, index=self.dates, columns=self.sids)

        # Use the dates from index 10 on and sids 1-3.
        dates_slice = slice(10, None, None)
        sids_slice = slice(1, 4, None)

        # Adjustments that should actually affect the output.
        relevant_adjustments = [
            {
                'sid': 1,
                'start_date': None,
                'end_date': self.dates[15],
                'apply_date': self.dates[16],
                'value': 0.5,
                'kind': MULTIPLY,
            },
            {
                'sid': 2,
                'start_date': self.dates[5],
                'end_date': self.dates[15],
                'apply_date': self.dates[16],
                'value': 1.0,
                'kind': ADD,
            },
            {
                'sid': 2,
                'start_date': self.dates[15],
                'end_date': self.dates[16],
                'apply_date': self.dates[17],
                'value': 1.0,
                'kind': ADD,
            },
            {
                'sid': 3,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': 99.0,
                'kind': OVERWRITE,
            },
        ]

        # These adjustments shouldn't affect the output.
        irrelevant_adjustments = [
            {  # Sid Not Requested
                'sid': 0,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Sid Unknown
                'sid': 9999,
                'start_date': self.dates[16],
                'end_date': self.dates[17],
                'apply_date': self.dates[18],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date Not Requested
                'sid': 2,
                'start_date': self.dates[1],
                'end_date': self.dates[2],
                'apply_date': self.dates[3],
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date Before Known Data
                'sid': 2,
                'start_date': self.dates[0] - (2 * trading_day),
                'end_date': self.dates[0] - trading_day,
                'apply_date': self.dates[0] - trading_day,
                'value': -9999.0,
                'kind': OVERWRITE,
            },
            {  # Date After Known Data
                'sid': 2,
                'start_date': self.dates[-1] + trading_day,
                'end_date': self.dates[-1] + (2 * trading_day),
                'apply_date': self.dates[-1] + (3 * trading_day),
                'value': -9999.0,
                'kind': OVERWRITE,
            },
        ]

        adjustments = DataFrame(relevant_adjustments + irrelevant_adjustments)
        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
            adjustments=adjustments,
        )

        expected_baseline = baseline.iloc[dates_slice, sids_slice]

        formatted_adjustments = loader.format_adjustments(
            self.dates[dates_slice],
            self.sids[sids_slice],
        )
        expected_formatted_adjustments = {
            6: [
                Float64Multiply(
                    first_row=0,
                    last_row=5,
                    first_col=0,
                    last_col=0,
                    value=0.5,
                ),
                Float64Add(
                    first_row=0,
                    last_row=5,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            7: [
                Float64Add(
                    first_row=5,
                    last_row=6,
                    first_col=1,
                    last_col=1,
                    value=1.0,
                ),
            ],
            8: [
                Float64Overwrite(
                    first_row=6,
                    last_row=7,
                    first_col=2,
                    last_col=2,
                    value=99.0,
                )
            ],
        }
        self.assertEqual(formatted_adjustments, expected_formatted_adjustments)

        mask = self.mask[dates_slice, sids_slice]
        with patch('zipline.pipeline.loaders.frame.adjusted_array') as m:
            loader.load_adjusted_array(
                columns=[USEquityPricing.close],
                dates=self.dates[dates_slice],
                assets=self.sids[sids_slice],
                mask=mask,
            )

        self.assertEqual(m.call_count, 1)

        args, kwargs = m.call_args
        assert_array_equal(kwargs['data'], expected_baseline.values)
        assert_array_equal(kwargs['mask'], mask)
        self.assertEqual(kwargs['adjustments'], expected_formatted_adjustments)
Example #25
def prepare_data(bundle_data):
    """
    This function takes a data bundle and matches fundamental data points to the correct asset objects.
    :param bundle_data: The data bundle that you ingested from SEP
    :return: A dictionary of loaders to be used within a data pipeline, and a DataSet class with the correct columns
    """
    """
    Enter the name of the data points you wish to use in the backtest here. The names need to match the name of the
    appropriate CSV file found in processed_data/fundamentals
    """
    data_points = ['pe1', 'de', 'earnings_growth', 'marketcap']

    # Specify where our CSV files live
    fundamentals_directory = '../processed_data/fundamentals/'
    pricing_directory = '../processed_data/pricing/daily/'

    # pricing_assets is an ordered dict that contains the name of every security in the pricing directory
    pricing_assets = helper_functions.get_pricing_securities(pricing_directory)
    """
    fundamental_assets is an ordered dict that contains the name of every security in the fundamentals directory
    dates is a list of dates that the fundamentals directory is indexed by
    """
    fundamental_assets, dates = helper_functions.get_dates(
        fundamentals_directory)

    # Securities that are in both pricing_assets, and fundamental_assets
    tickers = helper_functions.get_tickers_in_both(pricing_assets,
                                                   fundamental_assets)

    date_stamps = helper_functions.convert_to_date_stamps(dates)

    data_frames = {}

    for data in data_points:
        # creates a dataframe for each data point, puts it in the data_frames dict
        data_frames[data] = helper_functions.make_frame(
            data, fundamentals_directory, tickers)

    for data_frame in data_frames:
        """
        `assets` becomes a list of Asset objects and `sids` the Int64Index of
        their sids; one pass is enough because every frame shares the same
        ticker columns.
        """
        assets = bundle_data.asset_finder.lookup_symbols(
            [ticker for ticker in data_frames[data_frame].columns],
            as_of_date=None)
        sids = pd.Int64Index([asset.sid for asset in assets])
        break

    class MyDataSet(DataSet):
        """
        We need to create an attribute for each needed data point within MyDataSet, before __new__() runs...
        This is so MyDataSet converts the Column types into BoundColumn types.
        """
        for point in data_points:
            locals()[point] = Column(dtype=float)

    """
    We are finally ready to create a dictionary of data frame loaders, with corresponding BoundColumn attributes
    within our MyDataSet class. 
    """
    data_frame_loaders = {}

    for data_frame in data_frames:
        """
        Reindexes the dataframe indexes with date_stamps instead of dates, and replaces the column names (which are
        currently strings) with SIDS.
        """
        data_frames[data_frame].index = date_stamps
        data_frames[data_frame].columns = sids

    for attr in data_frames:
        """
        Fills data_frame_loaders with key/value pairs of:
        MyDataSet.attribute_name: DataFrameLoader(attribute_name, frame)
        """
        data_frame_loaders[getattr(MyDataSet, attr)] = DataFrameLoader(
            getattr(MyDataSet, attr), data_frames[attr])

    return data_frame_loaders, MyDataSet
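A usage sketch for the return values; the wiring below is hypothetical and serves only fundamentals columns:

    data_frame_loaders, MyDataSet = prepare_data(bundle_data)

    def choose_loader(column):
        # Pricing columns would need their own loader (e.g. USEquityPricingLoader).
        return data_frame_loaders[column]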