Esempio n. 1
0
    def test_compute_date_range_chunks(self, chunksize, expected):
        """Chunk the January 2017 sessions and compare against ``expected``."""
        # 2017-01-03 through 2017-01-31 covers 20 business days.
        first_session = T("2017-01-03")
        last_session = T("2017-01-31")
        sessions = self.calendar.all_sessions

        actual = list(
            compute_date_range_chunks(
                sessions,
                first_session,
                last_session,
                chunksize,
            )
        )

        assert actual == expected
Esempio n. 2
0
    def test_compute_date_range_chunks(self, chunksize, expected):
        """Verify the (start, end) chunks computed for January 2017."""
        # The window below spans 20 business days.
        window_start = T('2017-01-03')
        window_end = T('2017-01-31')
        sessions = self.calendar.all_sessions

        chunk_iter = compute_date_range_chunks(
            sessions, window_start, window_end, chunksize
        )

        self.assertListEqual(list(chunk_iter), expected)
Esempio n. 3
0
    def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
        """
        Run ``pipeline`` chunk by chunk and concatenate the per-chunk results.

        The ``start_date``..``end_date`` range is split against
        ``self._calendar`` by ``compute_date_range_chunks`` using
        ``chunksize``; ``self.run_pipeline`` is invoked once per resulting
        (start, end) pair and the outputs are combined with
        ``categorical_df_concat(..., inplace=True)``.
        """
        results = []
        for chunk_start, chunk_end in compute_date_range_chunks(
                self._calendar, start_date, end_date, chunksize):
            results.append(self.run_pipeline(pipeline, chunk_start, chunk_end))

        return categorical_df_concat(results, inplace=True)
Esempio n. 4
0
    def test_compute_date_range_chunks_invalid_input(self):
        """Bad start/end dates must raise with the documented messages."""
        # Case 1: the start date (a Sunday) is missing from the calendar.
        missing_start_msg = "'Start date 2017-05-07 is not found in calendar.'"
        with self.assertRaises(KeyError) as raised:
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T('2017-05-07'),
                T('2017-06-01'),
                None,
            )
        self.assertEqual(str(raised.exception), missing_start_msg)

        # Case 2: the end date (a Saturday) is missing from the calendar.
        missing_end_msg = "'End date 2017-05-27 is not found in calendar.'"
        with self.assertRaises(KeyError) as raised:
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T('2017-05-01'),
                T('2017-05-27'),
                None,
            )
        self.assertEqual(str(raised.exception), missing_end_msg)

        # Case 3: the end date comes before the start date.
        reversed_msg = (
            "End date 2017-05-01 cannot precede start date 2017-06-01."
        )
        with self.assertRaises(ValueError) as raised:
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T('2017-06-01'),
                T('2017-05-01'),
                None,
            )
        self.assertEqual(str(raised.exception), reversed_msg)
Esempio n. 5
0
    def test_compute_date_range_chunks_invalid_input(self):
        """
        Check that invalid date inputs raise with the expected messages.

        Covers a start date missing from the calendar, an end date missing
        from the calendar, and an end date that precedes the start date.

        ``pytest.raises(match=...)`` treats its argument as a regular
        expression, so the patterns below are raw strings with the trailing
        period escaped; an unescaped ``.`` would match any character and
        make the message check looser than intended.
        """
        # Start date not found in calendar
        err_msg = r"'Start date 2017-05-07 is not found in calendar\.'"
        with pytest.raises(KeyError, match=err_msg):
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T("2017-05-07"),  # Sunday
                T("2017-06-01"),
                None,
            )

        # End date not found in calendar
        err_msg = r"'End date 2017-05-27 is not found in calendar\.'"
        with pytest.raises(KeyError, match=err_msg):
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T("2017-05-01"),
                T("2017-05-27"),  # Saturday
                None,
            )

        # End date before start date
        err_msg = r"End date 2017-05-01 cannot precede start date 2017-06-01\."
        with pytest.raises(ValueError, match=err_msg):
            compute_date_range_chunks(
                self.calendar.all_sessions,
                T("2017-06-01"),
                T("2017-05-01"),
                None,
            )
Esempio n. 6
0
    def test_compute_date_range_chunks(self, chunksize, expected):
        """Compare the chunked January 2017 date ranges with ``expected``."""
        # 20 business days fall between these two dates (inclusive).
        range_start = T('2017-01-03')
        range_end = T('2017-01-31')
        all_sessions = self.calendar.all_sessions

        computed = compute_date_range_chunks(
            all_sessions, range_start, range_end, chunksize
        )

        self.assertListEqual(list(computed), expected)
Esempio n. 7
0
    def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
        """
        Run ``pipeline`` over date chunks and return the combined result.

        ``compute_date_range_chunks`` splits ``start_date``..``end_date``
        against ``self._calendar`` using ``chunksize``; each (start, end)
        pair is passed to ``self.run_pipeline``. A lone chunk is returned
        as-is, otherwise the chunks are merged with
        ``categorical_df_concat(..., inplace=True)``.
        """
        date_chunks = compute_date_range_chunks(
            self._calendar,
            start_date,
            end_date,
            chunksize,
        )

        frames = []
        for chunk_start, chunk_end in date_chunks:
            frames.append(self.run_pipeline(pipeline, chunk_start, chunk_end))

        if len(frames) != 1:
            return categorical_df_concat(frames, inplace=True)

        # OPTIMIZATION: with a single chunk there is nothing to concatenate,
        # so skip the extra copy `categorical_df_concat` would make.
        return frames[0]
Esempio n. 8
0
    def test_compute_date_range_chunks_invalid_input(self):
        """Each invalid (start, end) pair must raise the expected error."""
        # (exception type, start date, end date, expected message)
        cases = (
            # Start date not found in calendar (2017-05-07 is a Sunday).
            (
                KeyError,
                '2017-05-07',
                '2017-06-01',
                "'Start date 2017-05-07 is not found in calendar.'",
            ),
            # End date not found in calendar (2017-05-27 is a Saturday).
            (
                KeyError,
                '2017-05-01',
                '2017-05-27',
                "'End date 2017-05-27 is not found in calendar.'",
            ),
            # End date before start date.
            (
                ValueError,
                '2017-06-01',
                '2017-05-01',
                "End date 2017-05-01 cannot precede start date 2017-06-01.",
            ),
        )

        for exc_type, start, end, message in cases:
            with self.assertRaises(exc_type) as ctx:
                compute_date_range_chunks(
                    self.calendar.all_sessions,
                    T(start),
                    T(end),
                    None
                )
            self.assertEqual(str(ctx.exception), message)
Esempio n. 9
0
    def run_chunked_pipeline(self,
                             pipeline,
                             start_date,
                             end_date,
                             chunksize,
                             hooks=None):
        """
        Run ``pipeline`` between ``start_date`` and ``end_date``, processing
        the range in date chunks of ``chunksize`` days each.

        Executing in chunks lowers memory consumption and, depending on what
        the pipeline contains, may also lower computation time.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to execute.
        start_date : pd.Timestamp
            First date to run the pipeline for.
        end_date : pd.Timestamp
            Last date to run the pipeline for.
        chunksize : int
            How many days to execute per chunk.
        hooks : list[implements(PipelineHooks)], optional
            Hooks used to instrument Pipeline execution.

        Returns
        -------
        result : pd.DataFrame
            A frame holding the computed results.

            Columns of ``result`` correspond to the entries of
            `pipeline.columns`, which should be a dictionary mapping strings to
            instances of :class:`zipline.pipeline.term.Term`.

            For every date from ``start_date`` to ``end_date``, ``result``
            contains one row per asset that passed `pipeline.screen`. A
            screen of ``None`` means a row is returned for every asset that
            existed on each day.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        """
        sessions = self.resolve_domain(pipeline).all_sessions()
        date_chunks = compute_date_range_chunks(
            sessions,
            start_date,
            end_date,
            chunksize,
        )
        hooks = self._resolve_hooks(hooks)

        # Bind the implementation once, before entering the hook context.
        run_impl = self._run_pipeline_impl
        frames = []
        with hooks.running_pipeline(pipeline, start_date, end_date):
            for chunk_start, chunk_end in date_chunks:
                frames.append(
                    run_impl(pipeline, chunk_start, chunk_end, hooks=hooks)
                )

        if len(frames) != 1:
            return categorical_df_concat(frames, inplace=True)

        # OPTIMIZATION: a single chunk needs no concatenation, so avoid the
        # extra copy `categorical_df_concat` would make.
        return frames[0]
Esempio n. 10
0
    def run_chunked_pipeline(self,
                             pipeline,
                             start_date,
                             end_date,
                             chunksize,
                             hooks=None):
        """
        Compute values for ``pipeline`` from ``start_date`` to ``end_date``, in
        date chunks of size ``chunksize``.

        Chunked execution reduces memory consumption, and may reduce
        computation time depending on the contents of your pipeline.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to run.
        start_date : pd.Timestamp
            The start date to run the pipeline for.
        end_date : pd.Timestamp
            The end date to run the pipeline for.
        chunksize : int
            The number of days to execute at a time.
        hooks : list[implements(PipelineHooks)], optional
            Hooks for instrumenting Pipeline execution.

        Returns
        -------
        result : pd.DataFrame
            A frame of computed results.

            The ``result`` columns correspond to the entries of
            `pipeline.columns`, which should be a dictionary mapping strings to
            instances of :class:`zipline.pipeline.Term`.

            For each date between ``start_date`` and ``end_date``, ``result``
            will contain a row for each asset that passed `pipeline.screen`.
            A screen of ``None`` indicates that a row should be returned for
            each asset that existed each day.

            If every chunk is empty, the first (empty) chunk is returned
            unchanged.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        """
        domain = self.resolve_domain(pipeline)
        ranges = compute_date_range_chunks(
            domain.all_sessions(),
            start_date,
            end_date,
            chunksize,
        )
        hooks = self._resolve_hooks(hooks)

        run_pipeline = partial(self._run_pipeline_impl, pipeline, hooks=hooks)
        with hooks.running_pipeline(pipeline, start_date, end_date):
            chunks = [run_pipeline(s, e) for s, e in ranges]

        if len(chunks) == 1:
            # Nothing to concatenate; hand back the only chunk directly.
            return chunks[0]

        # Filter out empty chunks. Empty dataframes lose dtype information,
        # which makes concatenation fail.
        nonempty_chunks = [c for c in chunks if len(c)]

        if chunks and not nonempty_chunks:
            # Every chunk was empty, so filtering left nothing to pass to
            # ``concat``. Return the first empty chunk, mirroring what the
            # single-chunk path above does for an empty result.
            return chunks[0]

        # pandas would fill missing columns with NaT
        return concat(nonempty_chunks)