Esempio n. 1
0
    def test_categorical_df_concat_value_error(self):
        """
        Check that ``categorical_df_concat`` rejects inconsistent inputs.

        Two bad inputs are exercised: a pair of frames whose ``B`` column
        differs in dtype, and a pair whose second column differs in name.
        Each call must raise ``ValueError`` with the same message.
        """
        def build_frame(labels, column, numbers, dtype):
            # Every frame pairs a categorical 'A' column with one numeric
            # column whose name and dtype vary per case.
            return pd.DataFrame({
                'A': pd.Series(labels, dtype='category'),
                column: pd.Series(numbers, dtype=dtype),
            })

        bad_inputs = (
            # Same column names, but 'B' is int64 in one frame and float64
            # in the other.
            [
                build_frame(['a', 'b', 'c'], 'B', [100, 102, 103], 'int64'),
                build_frame(['c', 'b', 'd'], 'B', [103, 102, 104], 'float64'),
            ],
            # Same dtypes, but the numeric column is 'B' in one frame and
            # 'X' in the other.
            [
                build_frame(['a', 'b', 'c'], 'B', [100, 102, 103], 'int64'),
                build_frame(['c', 'b', 'd'], 'X', [103, 102, 104], 'int64'),
            ],
        )
        expected_message = (
            "Input DataFrames must have the same columns/dtypes."
        )

        for frames in bad_inputs:
            with self.assertRaises(ValueError) as caught:
                categorical_df_concat(frames)
            self.assertEqual(str(caught.exception), expected_message)
Esempio n. 2
0
    def test_categorical_df_concat_value_error(self):
        """
        ``categorical_df_concat`` must refuse frames that do not line up.

        Covers a dtype mismatch in column ``B`` and a column-name mismatch
        (``B`` versus ``X``); both must raise ``ValueError`` with an
        identical message.
        """
        message = "Input DataFrames must have the same columns/dtypes."

        def categorical(values):
            return pd.Series(values, dtype="category")

        def numeric(values, dtype):
            return pd.Series(values, dtype=dtype)

        def assert_rejected(frames):
            # The call must raise, and the message must match exactly.
            with self.assertRaises(ValueError) as ctx:
                categorical_df_concat(frames)
            self.assertEqual(str(ctx.exception), message)

        mismatched_dtypes = [
            pd.DataFrame({
                "A": categorical(["a", "b", "c"]),
                "B": numeric([100, 102, 103], "int64"),
            }),
            pd.DataFrame({
                "A": categorical(["c", "b", "d"]),
                "B": numeric([103, 102, 104], "float64"),
            }),
        ]
        mismatched_column_names = [
            pd.DataFrame({
                "A": categorical(["a", "b", "c"]),
                "B": numeric([100, 102, 103], "int64"),
            }),
            pd.DataFrame({
                "A": categorical(["c", "b", "d"]),
                "X": numeric([103, 102, 104], "int64"),
            }),
        ]

        assert_rejected(mismatched_dtypes)
        assert_rejected(mismatched_column_names)
    def test_categorical_df_concat_value_error(self):
        """
        Check that ``categorical_df_concat`` rejects inconsistent inputs.

        Two bad inputs are exercised: frames whose ``B`` column differs in
        dtype, and frames whose second column differs in name. Each call must
        raise ``ValueError`` whose message is exactly the expected text.
        """
        # Imported locally so this test does not depend on a module-level
        # ``import re``.
        import re

        mismatched_dtypes = [
            pd.DataFrame({
                "A": pd.Series(["a", "b", "c"], dtype="category"),
                "B": pd.Series([100, 102, 103], dtype="int64"),
            }),
            pd.DataFrame({
                "A": pd.Series(["c", "b", "d"], dtype="category"),
                "B": pd.Series([103, 102, 104], dtype="float64"),
            }),
        ]
        mismatched_column_names = [
            pd.DataFrame({
                "A": pd.Series(["a", "b", "c"], dtype="category"),
                "B": pd.Series([100, 102, 103], dtype="int64"),
            }),
            pd.DataFrame({
                "A": pd.Series(["c", "b", "d"], dtype="category"),
                "X": pd.Series([103, 102, 104], dtype="int64"),
            }),
        ]

        # ``match`` is applied as a regular-expression search, so the literal
        # message (which ends in ".") is escaped and anchored at both ends to
        # require the whole message rather than a loose substring match.
        expected_message = "^" + re.escape(
            "Input DataFrames must have the same columns/dtypes."
        ) + "$"

        with pytest.raises(ValueError, match=expected_message):
            categorical_df_concat(mismatched_dtypes)

        with pytest.raises(ValueError, match=expected_message):
            categorical_df_concat(mismatched_column_names)
    def test_categorical_df_concat_value_error(self):
        """
        Verify the error raised by ``categorical_df_concat`` on bad input.

        The first input mixes an int64 ``B`` column with a float64 one; the
        second pairs a ``B`` column with an ``X`` column. Both calls must
        raise ``ValueError`` with the same message.
        """
        mismatched_dtypes = [
            pd.DataFrame(dict(
                A=pd.Series(['a', 'b', 'c'], dtype='category'),
                B=pd.Series([100, 102, 103], dtype='int64'),
            )),
            pd.DataFrame(dict(
                A=pd.Series(['c', 'b', 'd'], dtype='category'),
                B=pd.Series([103, 102, 104], dtype='float64'),
            )),
        ]
        mismatched_column_names = [
            pd.DataFrame(dict(
                A=pd.Series(['a', 'b', 'c'], dtype='category'),
                B=pd.Series([100, 102, 103], dtype='int64'),
            )),
            pd.DataFrame(dict(
                A=pd.Series(['c', 'b', 'd'], dtype='category'),
                X=pd.Series([103, 102, 104], dtype='int64'),
            )),
        ]

        error_text = "Input DataFrames must have the same columns/dtypes."

        # Both inputs are checked the same way, dtype mismatch first.
        for bad_frames in (mismatched_dtypes, mismatched_column_names):
            with self.assertRaises(ValueError) as raised:
                categorical_df_concat(bad_frames)
            self.assertEqual(str(raised.exception), error_text)
Esempio n. 5
0
    def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
        """
        Run ``pipeline`` over chunked date ranges and combine the results.

        The ranges come from ``compute_date_range_chunks`` applied to
        ``self._calendar``, ``start_date``, ``end_date`` and ``chunksize``.
        ``self.run_pipeline`` is called once per range, and the per-range
        results are passed to ``categorical_df_concat`` with ``inplace=True``.
        """
        date_ranges = compute_date_range_chunks(
            self._calendar, start_date, end_date, chunksize,
        )

        frames = []
        for chunk_start, chunk_end in date_ranges:
            frames.append(self.run_pipeline(pipeline, chunk_start, chunk_end))

        return categorical_df_concat(frames, inplace=True)
    def test_categorical_df_concat(self):
        """
        Check ``categorical_df_concat`` on three frames with matching columns.

        Each input frame has categorical columns ``A`` and ``C`` and an int64
        column ``B``. The result is compared against a frame holding the rows
        of all three inputs in order, with each input's 0..2 index repeated,
        and the categories of ``A`` and ``C`` are compared separately.
        """
        inp = [
            pd.DataFrame(
                {
                    'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                    'B': pd.Series([100, 102, 103], dtype='int64'),
                    'C': pd.Series(['x', 'x', 'x'], dtype='category'),
                }
            ),
            pd.DataFrame(
                {
                    'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                    'B': pd.Series([103, 102, 104], dtype='int64'),
                    'C': pd.Series(['y', 'y', 'y'], dtype='category'),
                }
            ),
            pd.DataFrame(
                {
                    'A': pd.Series(['a', 'b', 'd'], dtype='category'),
                    'B': pd.Series([101, 102, 104], dtype='int64'),
                    'C': pd.Series(['z', 'z', 'z'], dtype='category'),
                }
            ),
        ]
        result = categorical_df_concat(inp)

        expected = pd.DataFrame(
            {
                'A': pd.Series(
                    ['a', 'b', 'c', 'c', 'b', 'd', 'a', 'b', 'd'],
                    dtype='category'
                ),
                'B': pd.Series(
                    [100, 102, 103, 103, 102, 104, 101, 102, 104],
                    dtype='int64'
                ),
                'C': pd.Series(
                    ['x', 'x', 'x', 'y', 'y', 'y', 'z', 'z', 'z'],
                    dtype='category'
                ),
            },
        )
        # ``pd.Int64Index`` was removed in pandas 2.0; an int64-typed
        # ``pd.Index`` builds the same index on old and new pandas alike.
        expected.index = pd.Index([0, 1, 2, 0, 1, 2, 0, 1, 2], dtype='int64')
        assert_equal(expected, result)
        assert_equal(
            expected['A'].cat.categories,
            result['A'].cat.categories
        )
        assert_equal(
            expected['C'].cat.categories,
            result['C'].cat.categories
        )
Esempio n. 7
0
    def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
        """
        Run ``pipeline`` over chunked date ranges and combine the results.

        The ranges come from ``compute_date_range_chunks`` applied to
        ``self._calendar``, ``start_date``, ``end_date`` and ``chunksize``,
        and ``self.run_pipeline`` is called once per range. When exactly one
        result is produced it is returned directly; otherwise the results are
        passed to ``categorical_df_concat`` with ``inplace=True``.
        """
        date_ranges = compute_date_range_chunks(
            self._calendar, start_date, end_date, chunksize,
        )

        results = []
        for chunk_start, chunk_end in date_ranges:
            results.append(self.run_pipeline(pipeline, chunk_start, chunk_end))

        if len(results) != 1:
            return categorical_df_concat(results, inplace=True)

        # OPTIMIZATION: a lone chunk is handed back as-is so that
        # `categorical_df_concat` does not make an extra copy of it.
        return results[0]
Esempio n. 8
0
    def test_categorical_df_concat(self):
        """
        Check ``categorical_df_concat`` on three frames with matching columns.

        Each input frame has categorical columns ``A`` and ``C`` and an int64
        column ``B``. The result is compared against a frame holding the rows
        of all three inputs in order, with each input's 0..2 index repeated,
        and the categories of ``A`` and ``C`` are compared separately.
        """
        inp = [
            pd.DataFrame({
                'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                'B': pd.Series([100, 102, 103], dtype='int64'),
                'C': pd.Series(['x', 'x', 'x'], dtype='category'),
            }),
            pd.DataFrame({
                'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                'B': pd.Series([103, 102, 104], dtype='int64'),
                'C': pd.Series(['y', 'y', 'y'], dtype='category'),
            }),
            pd.DataFrame({
                'A': pd.Series(['a', 'b', 'd'], dtype='category'),
                'B': pd.Series([101, 102, 104], dtype='int64'),
                'C': pd.Series(['z', 'z', 'z'], dtype='category'),
            }),
        ]
        result = categorical_df_concat(inp)

        expected = pd.DataFrame({
            'A': pd.Series(['a', 'b', 'c', 'c', 'b', 'd', 'a', 'b', 'd'],
                           dtype='category'),
            'B': pd.Series([100, 102, 103, 103, 102, 104, 101, 102, 104],
                           dtype='int64'),
            'C': pd.Series(['x', 'x', 'x', 'y', 'y', 'y', 'z', 'z', 'z'],
                           dtype='category'),
        })
        # ``pd.Int64Index`` was removed in pandas 2.0; an int64-typed
        # ``pd.Index`` builds the same index on old and new pandas alike.
        expected.index = pd.Index([0, 1, 2, 0, 1, 2, 0, 1, 2], dtype='int64')
        assert_equal(expected, result)
        assert_equal(expected['A'].cat.categories, result['A'].cat.categories)
        assert_equal(expected['C'].cat.categories, result['C'].cat.categories)
Esempio n. 9
0
    def test_categorical_df_concat(self):
        """
        Check ``categorical_df_concat`` on three frames with matching columns.

        Each input frame has categorical columns ``A`` and ``C`` and an int64
        column ``B``. The result is compared against a frame holding the rows
        of all three inputs in order, with each input's 0..2 index repeated,
        and the categories of ``A`` and ``C`` are compared separately.
        """
        inp = [
            pd.DataFrame({
                "A": pd.Series(["a", "b", "c"], dtype="category"),
                "B": pd.Series([100, 102, 103], dtype="int64"),
                "C": pd.Series(["x", "x", "x"], dtype="category"),
            }),
            pd.DataFrame({
                "A": pd.Series(["c", "b", "d"], dtype="category"),
                "B": pd.Series([103, 102, 104], dtype="int64"),
                "C": pd.Series(["y", "y", "y"], dtype="category"),
            }),
            pd.DataFrame({
                "A": pd.Series(["a", "b", "d"], dtype="category"),
                "B": pd.Series([101, 102, 104], dtype="int64"),
                "C": pd.Series(["z", "z", "z"], dtype="category"),
            }),
        ]
        result = categorical_df_concat(inp)

        expected = pd.DataFrame({
            "A": pd.Series(["a", "b", "c", "c", "b", "d", "a", "b", "d"],
                           dtype="category"),
            "B": pd.Series([100, 102, 103, 103, 102, 104, 101, 102, 104],
                           dtype="int64"),
            "C": pd.Series(["x", "x", "x", "y", "y", "y", "z", "z", "z"],
                           dtype="category"),
        })
        # ``pd.Int64Index`` was removed in pandas 2.0; an int64-typed
        # ``pd.Index`` builds the same index on old and new pandas alike.
        expected.index = pd.Index([0, 1, 2, 0, 1, 2, 0, 1, 2], dtype="int64")
        assert_equal(expected, result)
        assert_equal(expected["A"].cat.categories, result["A"].cat.categories)
        assert_equal(expected["C"].cat.categories, result["C"].cat.categories)
Esempio n. 10
0
    def run_chunked_pipeline(self,
                             pipeline,
                             start_date,
                             end_date,
                             chunksize,
                             hooks=None):
        """
        Run ``pipeline`` from ``start_date`` to ``end_date`` one date chunk at
        a time, each chunk covering ``chunksize`` days.

        Executing in chunks lowers memory consumption and, depending on what
        the pipeline contains, may also lower computation time.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to run.
        start_date : pd.Timestamp
            First date to run the pipeline for.
        end_date : pd.Timestamp
            Last date to run the pipeline for.
        chunksize : int
            Number of days to execute per chunk.
        hooks : list[implements(PipelineHooks)], optional
            Hooks used to instrument Pipeline execution.

        Returns
        -------
        result : pd.DataFrame
            A frame of computed results.

            Its columns correspond to the entries of `pipeline.columns`,
            which should be a dictionary mapping strings to instances of
            :class:`zipline.pipeline.term.Term`.

            For every date between ``start_date`` and ``end_date`` the frame
            holds one row per asset that passed `pipeline.screen`. A screen
            of ``None`` means a row is returned for each asset that existed
            on each day.

        See Also
        --------
        :meth:`zipline.pipeline.engine.PipelineEngine.run_pipeline`
        """
        sessions = self.resolve_domain(pipeline).all_sessions()
        date_ranges = compute_date_range_chunks(
            sessions, start_date, end_date, chunksize,
        )
        hooks = self._resolve_hooks(hooks)

        run_chunk = self._run_pipeline_impl
        # Every chunk is executed inside a single ``running_pipeline`` hook
        # context spanning the full requested date range.
        with hooks.running_pipeline(pipeline, start_date, end_date):
            frames = []
            for chunk_start, chunk_end in date_ranges:
                frames.append(
                    run_chunk(pipeline, chunk_start, chunk_end, hooks=hooks)
                )

        if len(frames) != 1:
            return categorical_df_concat(frames, inplace=True)

        # OPTIMIZATION: a lone chunk is handed back as-is so that
        # `categorical_df_concat` does not make an extra copy of it.
        return frames[0]