    def test_bad_input(self):
        # Raw string: the message is matched as a regex, so the parens
        # must stay escaped.
        msg = r"Mask shape \(2, 3\) != data shape \(5, 5\)"
        data = arange(25).reshape(5, 5)
        bad_mask = array([[0, 1, 1], [0, 0, 1]], dtype=bool)

        with self.assertRaisesRegex(ValueError, msg):
            adjusted_array(data, bad_mask, {})
    def load_adjusted_array(self, columns, dates, assets, mask):
        # load_adjusted_array is called with dates on which the user's algo
        # will be shown data, which means we need to return the data that would
        # be known at the start of each date.  We assume that the latest data
        # known on day N is the data from day (N - 1), so we shift all query
        # dates back by a day.
        start_date, end_date = _shift_dates(
            self._calendar, dates[0], dates[-1], shift=1,
        )

        raw_arrays = self.raw_price_loader.load_raw_arrays(
            columns,
            start_date,
            end_date,
            assets,
        )
        adjustments = self.adjustments_loader.load_adjustments(
            columns,
            dates,
            assets,
        )
        adjusted_arrays = [
            adjusted_array(raw_array, mask, col_adjustments)
            for raw_array, col_adjustments in zip(raw_arrays, adjustments)
        ]

        return dict(zip(columns, adjusted_arrays))
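    # A minimal sketch (an assumption, not the original helper) of what
    # `_shift_dates` plausibly does, given a sorted pd.DatetimeIndex
    # `calendar` of sessions: move both query bounds back by `shift`
    # sessions, so that data shown on day N comes from day (N - shift).
    # Boundary checks (e.g. shifting off the front of the calendar) omitted.
    def _shift_dates_sketch(calendar, start_date, end_date, shift):
        start_idx = calendar.get_loc(start_date)
        end_idx = calendar.get_loc(end_date)
        return calendar[start_idx - shift], calendar[end_idx - shift]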
    def load_adjusted_array(self, columns, mask):
        """
        Load data from our stored baseline.
        """
        if len(columns) != 1:
            raise ValueError(
                "Can't load multiple columns with DataFrameLoader"
            )
        elif columns[0] != self.column:
            raise ValueError("Can't load unknown column %s" % columns[0])

        dates, assets, mask_values = mask.index, mask.columns, mask.values

        date_indexer = self.dates.get_indexer(dates)
        assets_indexer = self.assets.get_indexer(assets)

        # Boolean arrays with True on matched entries
        good_dates = (date_indexer != -1)
        good_assets = (assets_indexer != -1)

        return [adjusted_array(
            # Pull out requested columns/rows from our baseline data.
            data=self.baseline[ix_(date_indexer, assets_indexer)],
            # Mask out requested columns/rows that didn't match.
            mask=(good_assets & good_dates[:, None]) & mask_values,
            adjustments=self.format_adjustments(dates, assets),
        )]
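    # Hedged illustration of the indexer logic above: pd.Index.get_indexer
    # returns -1 for labels it cannot find, so the flags computed above mark
    # exactly the requested dates/assets that exist in the baseline, e.g.:
    #
    #     pd.Index(['A', 'B', 'C']).get_indexer(['B', 'X'])  # -> [1, -1]
    #
    # `good_dates[:, None]` broadcasts the per-date flags across the asset
    # axis, so `good_assets & good_dates[:, None]` is a full
    # (n_dates, n_assets) validity mask, AND-ed with the caller's mask.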
    def load_adjusted_array(self, columns, dates, assets, mask):
        """
        Load data from our stored baseline.
        """
        if len(columns) != 1:
            raise ValueError(
                "Can't load multiple columns with DataFrameLoader"
            )
        elif columns[0] != self.column:
            raise ValueError("Can't load unknown column %s" % columns[0])

        date_indexer = self.dates.get_indexer(dates)
        assets_indexer = self.assets.get_indexer(assets)

        # Boolean arrays with True on matched entries
        good_dates = (date_indexer != -1)
        good_assets = (assets_indexer != -1)

        arrays = [adjusted_array(
            # Pull out requested columns/rows from our baseline data.
            data=self.baseline[ix_(date_indexer, assets_indexer)],
            # Mask out requested columns/rows that didn't match.
            mask=(good_assets & good_dates[:, None]) & mask,
            adjustments=self.format_adjustments(dates, assets),
        )]
        return dict(zip(columns, arrays))
    def test_inspect(self):
        data = arange(15, dtype=float).reshape(5, 3)
        adj_array = adjusted_array(
            data,
            NOMASK,
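            # Keyed by the row index at which the adjustment is applied;
            # positional args map to (first_row, last_row, first_col,
            # last_col, value), as the expected output below confirms.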
            {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
        )

        expected = dedent(
            """\
            Adjusted Array:

            Data:
            array([[  0.,   1.,   2.],
                   [  3.,   4.,   5.],
                   [  6.,   7.,   8.],
                   [  9.,  10.,  11.],
                   [ 12.,  13.,  14.]])

            Adjustments:
            {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
            """
        )
        self.assertEqual(expected, adj_array.inspect())
    def load_adjusted_array(self, columns, dates, assets, mask):
        # load_adjusted_array is called with dates on which the user's algo
        # will be shown data, which means we need to return the data that would
        # be known at the start of each date.  We assume that the latest data
        # known on day N is the data from day (N - 1), so we shift all query
        # dates back by a day.
        start_date, end_date = _shift_dates(
            self._calendar, dates[0], dates[-1], shift=1,
        )

        raw_arrays = self.raw_price_loader.load_raw_arrays(
            columns,
            start_date,
            end_date,
            assets,
        )

        adjustments = self.adjustments_loader.load_adjustments(
            columns,
            dates,
            assets,
        )

        return [
            adjusted_array(raw_array, mask, col_adjustments)
            for raw_array, col_adjustments in zip(raw_arrays, adjustments)
        ]
    def test_array_views_arent_writable(self):
        data = arange(30, dtype=float).reshape(6, 5)
        adj_array = adjusted_array(data, NOMASK, {})

        for frame in adj_array.traverse(3):
            with self.assertRaises(ValueError):
                frame[0, 0] = 5.0
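    # Note: the windows yielded by traverse() are read-only numpy views
    # (their writeable flag is cleared), presumably so that mutating one
    # window cannot corrupt the buffer shared across windows; hence the
    # in-place assignment above raises ValueError.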
    def test_no_adjustments(self, name, data, lookback, adjustments, expected):
        array = adjusted_array(
            data,
            NOMASK,
            adjustments,
        )
        for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
            window_iter = array.traverse(lookback)
            for yielded, expected_yield in zip_longest(window_iter, expected):
                assert_array_equal(yielded, expected_yield)
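    # Hedged sketch (not from the original tests) of what traverse() yields
    # when there are no adjustments, assuming it walks rolling windows of
    # `lookback` rows down the data:
    #
    #     data = arange(30, dtype=float).reshape(6, 5)
    #     windows = list(adjusted_array(data, NOMASK, {}).traverse(3))
    #     # 4 windows; windows[i] equals data[i:i + 3]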
    def test_invalid_lookback(self):
        data = arange(30, dtype=float).reshape(6, 5)
        adj_array = adjusted_array(data, NOMASK, {})

        with self.assertRaises(WindowLengthTooLong):
            adj_array.traverse(7)

        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(0)

        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(-1)
    def test_overwrite_adjustment_cases(self,
                                        name,
                                        data,
                                        lookback,
                                        adjustments,
                                        expected):
        array = adjusted_array(
            data,
            NOMASK,
            adjustments,
        )
        for _ in range(2):  # Iterate 2x to ensure adjusted_arrays are re-usable.
            window_iter = array.traverse(lookback)
            for yielded, expected_yield in zip_longest(window_iter, expected):
                assert_array_equal(yielded, expected_yield)
    def load_adjusted_array(self, columns, mask):
        dates, assets = mask.index, mask.columns
        raw_arrays = self.raw_price_loader.load_raw_arrays(
            columns,
            dates,
            assets,
        )
        adjustments = self.adjustments_loader.load_adjustments(
            columns,
            dates,
            assets,
        )

        return [
            adjusted_array(raw_array, mask.values, col_adjustments)
            for raw_array, col_adjustments in zip(raw_arrays, adjustments)
        ]
    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, resources = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        assets = list(map(int, assets))  # coerce from numpy.int64
        fields = list(map(dataset_name, columns))
        query_fields = fields + [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        def where(e):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            q : Expr
                The query to run.
            """
            ts = e[TS_FIELD_NAME]
            # Hack to get the lower bound of the query: this must be
            # evaluated eagerly (hence the odo call below), because the
            # data for `ts` would otherwise be removed from scope too early.
            lower = odo(ts[ts <= dates[0]].max(), pd.Timestamp)
            selection = ts <= dates[-1]
            if have_sids:
                selection &= e[SID_FIELD_NAME].isin(assets)
            if lower is not pd.NaT:
                selection &= ts >= lower

            return e[selection][query_fields]

        extra_kwargs = {'d': resources} if resources else {}
        materialized_expr = odo(where(expr), pd.DataFrame, **extra_kwargs)
        materialized_deltas = (
            odo(where(deltas), pd.DataFrame, **extra_kwargs)
            if deltas is not None else
            pd.DataFrame(columns=query_fields)
        )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        if have_sids:
            # Unstack by the sid so that we get a multi-index on the columns
            # of datacolumn, sid.
            sparse_output = sparse_output.set_index(
                [TS_FIELD_NAME, SID_FIELD_NAME],
            ).unstack()
            sparse_deltas = non_novel_deltas.set_index(
                [TS_FIELD_NAME, SID_FIELD_NAME],
            ).unstack()

            dense_output = sparse_output.reindex(dates, method='ffill')
            cols = dense_output.columns
            dense_output = dense_output.reindex(
                columns=pd.MultiIndex.from_product(
                    (cols.levels[0], assets),
                    names=cols.names,
                ),
            )

            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not
                # have sid information so that the nan-aware reductions
                # still work.  A future change to the engine would be to add
                # first-class support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            sparse_output = sparse_output.set_index(TS_FIELD_NAME)
            dense_output = sparse_output.reindex(dates, method='ffill')
            sparse_deltas = non_novel_deltas.set_index(TS_FIELD_NAME)
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, adjusted_array(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output.index,
                    column_idx,
                    column_name,
                    assets,
                    sparse_deltas,
                )
            )
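    # Hedged sketch (an assumption, not zipline's actual implementation) of
    # the `repeat_last_axis` used by `column_view` above: a zero-stride view
    # that presents a length-N macro series as `count` identical columns
    # without copying, which is why the `copy` in the compose() above is
    # needed to obtain a concrete ndarray.
    def repeat_last_axis_sketch(array, count):
        from numpy.lib.stride_tricks import as_strided
        return as_strided(
            array,
            shape=array.shape + (count,),
            # Zero stride on the new axis: every column aliases the same data.
            strides=array.strides + (0,),
        )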