Esempio n. 1
0
def overwrite_from_dates(asof, dense_dates, sparse_dates, asset_idx, value):
    """Construct a `Float64Overwrite` with the correct
    start and end date based on the asof date of the delta,
    the dense_dates, and the sparse_dates.

    Parameters
    ----------
    asof : datetime
        The asof date of the delta.
    dense_dates : pd.DatetimeIndex
        The dates requested by the loader.
    sparse_dates : pd.DatetimeIndex
        The dates that appeared in the dataset.
    asset_idx : tuple of int
        The index of the asset in the block. If this is a tuple, then this
        is treated as the first and last index to use.
    value : np.float64
        The value to overwrite with.

    Yields
    ------
    overwrite : Float64Overwrite
        The overwrite that will apply the new value to the data. Nothing is
        yielded when the computed row range is empty.

    Notes
    -----
    The overwrite covers the dense dates from ``asof`` up to, but not
    including, the next sparse date after ``asof``. When there is no later
    sparse date the overwrite runs through the end of ``dense_dates``.
    """
    first_row = dense_dates.searchsorted(asof)
    next_idx = sparse_dates.searchsorted(asof, 'right')
    if next_idx == len(sparse_dates):
        # No sparse date follows ``asof``; indexing ``sparse_dates`` here
        # would raise IndexError, so apply through the last dense date.
        last_row = len(dense_dates) - 1
    else:
        # Stop just before the row of the next sparse date.
        last_row = dense_dates.searchsorted(sparse_dates[next_idx]) - 1

    if first_row > last_row:
        # Empty range: nothing to overwrite.
        return

    first, last = asset_idx
    yield Float64Overwrite(first_row, last_row, first, last, value)
Esempio n. 2
0
def overwrite_from_dates(asof, dense_dates, sparse_dates, asset_idx, value):
    """Yield the `Float64Overwrite` that forward-fills ``value`` from the
    delta's asof date, given the dense_dates and the sparse_dates.

    Parameters
    ----------
    asof : datetime
        The asof date of the delta.
    dense_dates : pd.DatetimeIndex
        The dates requested by the loader.
    sparse_dates : pd.DatetimeIndex
        The dates that appeared in the dataset.
    asset_idx : tuple of int
        The first and last column index of the asset in the block.
    value : np.float64
        The value to overwrite with.

    Yields
    ------
    overwrite : Float64Overwrite
        The overwrite that will apply the new value to the data. Nothing is
        yielded for a ``NaT`` asof or for an empty row range.

    Notes
    -----
    The overwrite forward-fills every dense date that lies between the
    asof date and the next sparse date after the asof date.

    For example:
    let ``asof = pd.Timestamp('2014-01-02')``,
        ``dense_dates = pd.date_range('2014-01-01', '2014-01-05')``
        ``sparse_dates = pd.to_datetime(['2014-01', '2014-02', '2014-04'])``

    Then the overwrite will apply to indexes: 1, 2, 3, 4
    """
    if asof is pd.NaT:
        # The groupby over the deltas can produce a NaT key; it does not
        # correspond to a real delta, so there is nothing to emit.
        return

    first_row = dense_dates.searchsorted(asof)

    following = sparse_dates.searchsorted(asof.asm8, 'right')
    if following < len(sparse_dates):
        # A later sparse date exists: the overwrite ends on the dense row
        # immediately before that date.
        last_row = dense_dates.searchsorted(sparse_dates[following]) - 1
    else:
        # Nothing follows in the sparse dates: run to the final dense row.
        last_row = len(dense_dates) - 1

    if last_row < first_row:
        return

    first_col, last_col = asset_idx
    yield Float64Overwrite(first_row, last_row, first_col, last_col, value)
Esempio n. 3
0
class AdjustedArrayTestCase(TestCase):
    """Tests for ``AdjustedArray``: traversal, copying, adjustment
    application, inspection, and label/adjustment updates.
    """

    def test_traverse_invalidating(self):
        """Traversing with ``copy=False`` mutates the input data in place
        and a later ``traverse`` call raises ``ValueError``.
        """
        data = arange(5 * 3, dtype='f8').reshape(5, 3)
        original_data = data.copy()
        adjustments = {2: [Float64Multiply(0, 4, 0, 2, 2.0)]}
        adjusted_array = AdjustedArray(data, adjustments, float('nan'))

        for _ in adjusted_array.traverse(1, copy=False):
            pass

        assert_equal(data, original_data * 2)

        with self.assertRaises(ValueError) as e:
            adjusted_array.traverse(1)

        assert_equal(
            str(e.exception),
            'cannot traverse invalidated AdjustedArray',
        )

    def test_copy(self):
        """Copies taken before an in-place traversal stay usable and keep
        the original data; copying after the traversal raises
        ``ValueError``.
        """
        data = arange(5 * 3, dtype='f8').reshape(5, 3)
        original_data = data.copy()
        adjustments = {2: [Float64Multiply(0, 4, 0, 2, 2.0)]}
        adjusted_array = AdjustedArray(data, adjustments, float('nan'))
        traverse_copy = adjusted_array.copy()
        clean_copy = adjusted_array.copy()

        a_it = adjusted_array.traverse(2, copy=False)
        b_it = traverse_copy.traverse(2, copy=False)
        for a, b in zip(a_it, b_it):
            assert_equal(a, b)

        with self.assertRaises(ValueError) as e:
            adjusted_array.copy()

        assert_equal(
            str(e.exception),
            'cannot copy invalidated AdjustedArray',
        )

        # the clean copy should have the original data even though the
        # original adjusted array has its data mutated in place
        assert_equal(clean_copy.data, original_data)
        assert_equal(adjusted_array.data, original_data * 2)

    @parameterized.expand(
        chain(
            _gen_unadjusted_cases(
                'float',
                make_input=as_dtype(float64_dtype),
                make_expected_output=as_dtype(float64_dtype),
                missing_value=default_missing_value_for_dtype(float64_dtype),
            ),
            _gen_unadjusted_cases(
                'datetime',
                make_input=as_dtype(datetime64ns_dtype),
                make_expected_output=as_dtype(datetime64ns_dtype),
                missing_value=default_missing_value_for_dtype(
                    datetime64ns_dtype),
            ),
            # Test passing an array of strings to AdjustedArray.
            _gen_unadjusted_cases(
                'bytes_ndarray',
                make_input=as_dtype(bytes_dtype),
                make_expected_output=as_labelarray(bytes_dtype, b''),
                missing_value=b'',
            ),
            _gen_unadjusted_cases(
                'unicode_ndarray',
                make_input=as_dtype(unicode_dtype),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value=u'',
            ),
            _gen_unadjusted_cases(
                'object_ndarray',
                make_input=lambda a: a.astype(unicode).astype(object),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value='',
            ),
            # Test passing a LabelArray directly to AdjustedArray.
            _gen_unadjusted_cases(
                'bytes_labelarray',
                make_input=as_labelarray(bytes_dtype, b''),
                make_expected_output=as_labelarray(bytes_dtype, b''),
                missing_value=b'',
            ),
            _gen_unadjusted_cases(
                'unicode_labelarray',
                make_input=as_labelarray(unicode_dtype, None),
                make_expected_output=as_labelarray(unicode_dtype, None),
                missing_value=u'',
            ),
            _gen_unadjusted_cases(
                'object_labelarray',
                make_input=(lambda a: LabelArray(
                    a.astype(unicode).astype(object), u'')),
                make_expected_output=as_labelarray(unicode_dtype, ''),
                missing_value='',
            ),
        ))
    def test_no_adjustments(self, name, data, lookback, adjustments,
                            missing_value, perspective_offset,
                            expected_output):
        """Windows yielded by ``traverse`` match the expected output for
        each generated unadjusted case.

        NOTE(review): ``perspective_offset`` is accepted but not forwarded
        to ``traverse`` here, and the plain ``zip`` stops at the shorter of
        the two sequences, so a length mismatch would go unnoticed.
        """

        array = AdjustedArray(data, adjustments, missing_value)
        for _ in range(2):  # Iterate 2x ensure adjusted_arrays are re-usable.
            in_out = zip(array.traverse(lookback), expected_output)
            for yielded, expected_yield in in_out:
                check_arrays(yielded, expected_yield)

    @parameterized.expand(_gen_multiplicative_adjustment_cases(float64_dtype))
    def test_multiplicative_adjustments(self, name, data, lookback,
                                        adjustments, missing_value,
                                        perspective_offset, expected):
        """Windows yielded by ``traverse`` match ``expected`` for each
        generated multiplicative-adjustment case.

        ``zip_longest`` is used so that a differing number of yielded and
        expected windows is also compared.
        """

        array = AdjustedArray(data, adjustments, missing_value)
        for _ in range(2):  # Iterate 2x ensure adjusted_arrays are re-usable.
            window_iter = array.traverse(
                lookback,
                perspective_offset=perspective_offset,
            )
            for yielded, expected_yield in zip_longest(window_iter, expected):
                check_arrays(yielded, expected_yield)

    @parameterized.expand(
        chain(
            _gen_overwrite_adjustment_cases(bool_dtype),
            _gen_overwrite_adjustment_cases(int64_dtype),
            _gen_overwrite_adjustment_cases(float64_dtype),
            _gen_overwrite_adjustment_cases(datetime64ns_dtype),
            _gen_overwrite_1d_array_adjustment_case(float64_dtype),
            _gen_overwrite_1d_array_adjustment_case(datetime64ns_dtype),
            _gen_overwrite_1d_array_adjustment_case(bool_dtype),
            # There are six cases here:
            # Using np.bytes/np.unicode/object arrays as inputs.
            # Passing np.bytes/np.unicode/object arrays to LabelArray,
            # and using those as input.
            #
            # The outputs should always be LabelArrays.
            _gen_unadjusted_cases(
                'bytes_ndarray',
                make_input=as_dtype(bytes_dtype),
                make_expected_output=as_labelarray(bytes_dtype, b''),
                missing_value=b'',
            ),
            _gen_unadjusted_cases(
                'unicode_ndarray',
                make_input=as_dtype(unicode_dtype),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value=u'',
            ),
            _gen_unadjusted_cases(
                'object_ndarray',
                make_input=lambda a: a.astype(unicode).astype(object),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value=u'',
            ),
            _gen_unadjusted_cases(
                'bytes_labelarray',
                make_input=as_labelarray(bytes_dtype, b''),
                make_expected_output=as_labelarray(bytes_dtype, b''),
                missing_value=b'',
            ),
            _gen_unadjusted_cases(
                'unicode_labelarray',
                make_input=as_labelarray(unicode_dtype, u''),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value=u'',
            ),
            _gen_unadjusted_cases(
                'object_labelarray',
                make_input=(lambda a: LabelArray(
                    a.astype(unicode).astype(object),
                    None,
                )),
                make_expected_output=as_labelarray(unicode_dtype, u''),
                missing_value=None,
            ),
        ))
    def test_overwrite_adjustment_cases(self, name, baseline, lookback,
                                        adjustments, missing_value,
                                        perspective_offset, expected):
        """Windows yielded by ``traverse`` match ``expected`` for each
        generated overwrite case and for the string/LabelArray cases.
        """
        array = AdjustedArray(baseline, adjustments, missing_value)

        for _ in range(2):  # Iterate 2x ensure adjusted_arrays are re-usable.
            window_iter = array.traverse(
                lookback,
                perspective_offset=perspective_offset,
            )
            for yielded, expected_yield in zip_longest(window_iter, expected):
                check_arrays(yielded, expected_yield)

    def test_object1darrayoverwrite(self):
        """Apply ``Object1DArrayOverwrite`` adjustments to a ``LabelArray``
        baseline and check the first four length-3 windows.
        """
        pairs = [u + l for u, l in product(ascii_uppercase, ascii_lowercase)]
        categories = pairs + ['~' + c for c in pairs]
        baseline = LabelArray(
            array([[''.join((r, c)) for c in 'abc'] for r in ascii_uppercase]),
            None,
            categories,
        )
        full_expected = baseline.copy()

        def flip(cs):
            # Prefix a label with '~' unless it is missing or already
            # prefixed.
            if cs is None:
                return None
            if cs[0] != '~':
                return '~' + cs
            return cs

        def make_overwrite(fr, lr, fc, lc):
            # Rows are addressed by uppercase letter and columns by
            # lowercase letter; convert both to zero-based indices.
            fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
            fr -= ord('A')
            lr -= ord('A')
            fc -= ord('a')
            lc -= ord('a')

            return Object1DArrayOverwrite(
                fr,
                lr,
                fc,
                lc,
                baseline[fr:lr + 1, fc].map(flip),
            )

        overwrites = {
            3: [make_overwrite('A', 'B', 'a', 'a')],
            4: [make_overwrite('A', 'C', 'b', 'c')],
            5: [make_overwrite('D', 'D', 'a', 'b')],
        }

        it = AdjustedArray(baseline, overwrites, None).traverse(3)

        window = next(it)
        expected = full_expected[:3]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:2, 0] = LabelArray(['~Aa', '~Ba'], None)
        expected = full_expected[1:4]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:3, 1:3] = LabelArray(
            [['~Ab', '~Ac'], ['~Bb', '~Bc'], ['~Cb', '~Cb']], None)
        expected = full_expected[2:5]
        check_arrays(window, expected)

        window = next(it)
        full_expected[3, :2] = '~Da'
        expected = full_expected[3:6]
        check_arrays(window, expected)

    def test_invalid_lookback(self):
        """On a 6-row array, a lookback of 7 raises ``WindowLengthTooLong``
        and lookbacks of 0 and -1 raise ``WindowLengthNotPositive``.
        """

        data = arange(30, dtype=float).reshape(6, 5)
        adj_array = AdjustedArray(data, {}, float('nan'))

        with self.assertRaises(WindowLengthTooLong):
            adj_array.traverse(7)

        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(0)

        with self.assertRaises(WindowLengthNotPositive):
            adj_array.traverse(-1)

    def test_array_views_arent_writable(self):
        """Assigning into a window yielded by ``traverse`` raises
        ``ValueError``.
        """

        data = arange(30, dtype=float).reshape(6, 5)
        adj_array = AdjustedArray(data, {}, float('nan'))

        for frame in adj_array.traverse(3):
            with self.assertRaises(ValueError):
                frame[0, 0] = 5.0

    def test_inspect(self):
        """``inspect()`` returns the expected text rendering of the data
        and the adjustments.
        """
        data = arange(15, dtype=float).reshape(5, 3)
        adj_array = AdjustedArray(
            data,
            {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
            float('nan'),
        )

        expected = dedent("""\
            Adjusted Array (float64):

            Data:
            array([[  0.,   1.,   2.],
                   [  3.,   4.,   5.],
                   [  6.,   7.,   8.],
                   [  9.,  10.,  11.],
                   [ 12.,  13.,  14.]])

            Adjustments:
            {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
            """)
        got = adj_array.inspect()
        self.assertEqual(expected, got)

    def test_update_labels(self):
        """``update_labels`` applies the mapping function to both the
        baseline labels and the values held by the adjustments.
        """
        data = array([
            ['aaa', 'bbb', 'ccc'],
            ['ddd', 'eee', 'fff'],
            ['ggg', 'hhh', 'iii'],
            ['jjj', 'kkk', 'lll'],
            ['mmm', 'nnn', 'ooo'],
        ])
        label_array = LabelArray(data, missing_value='')

        adj_array = AdjustedArray(
            data=label_array,
            adjustments={4: [ObjectOverwrite(2, 3, 0, 0, 'ppp')]},
            missing_value='',
        )

        expected_data = array([
            ['aaa-foo', 'bbb-foo', 'ccc-foo'],
            ['ddd-foo', 'eee-foo', 'fff-foo'],
            ['ggg-foo', 'hhh-foo', 'iii-foo'],
            ['jjj-foo', 'kkk-foo', 'lll-foo'],
            ['mmm-foo', 'nnn-foo', 'ooo-foo'],
        ])
        expected_label_array = LabelArray(expected_data, missing_value='')

        expected_adj_array = AdjustedArray(
            data=expected_label_array,
            adjustments={4: [ObjectOverwrite(2, 3, 0, 0, 'ppp-foo')]},
            missing_value='',
        )

        adj_array.update_labels(lambda x: x + '-foo')

        # Check that the mapped AdjustedArray has the expected baseline
        # values and adjustment values.
        check_arrays(adj_array.data, expected_adj_array.data)
        self.assertEqual(adj_array.adjustments, expected_adj_array.adjustments)

    # Adjustment fixtures referenced by the ``test_update_adjustments``
    # parameters below.
    A = Float64Multiply(0, 4, 1, 1, 0.5)
    B = Float64Overwrite(3, 3, 4, 4, 4.2)
    C = Float64Multiply(0, 2, 0, 0, 0.14)
    D = Float64Overwrite(0, 3, 0, 0, 4.0)
    E = Float64Overwrite(0, 0, 1, 1, 3.7)
    F = Float64Multiply(0, 4, 3, 3, 10.0)
    G = Float64Overwrite(5, 5, 4, 4, 1.7)
    H = Float64Multiply(0, 4, 2, 2, 0.99)
    S = Float64Multiply(0, 1, 4, 4, 5.06)

    @parameterized.expand([(
        # Initial adjustments
        {
            1: [A, B],
            2: [C],
            4: [D],
        },

        # Adjustments to add
        {
            1: [E],
            2: [F, G],
            3: [H, S],
        },

        # Expected adjustments with 'append'
        {
            1: [A, B, E],
            2: [C, F, G],
            3: [H, S],
            4: [D],
        },

        # Expected adjustments with 'prepend'
        {
            1: [E, A, B],
            2: [F, G, C],
            3: [H, S],
            4: [D],
        },
    )])
    def test_update_adjustments(self, initial_adjustments, adjustments_to_add,
                                expected_adjustments_with_append,
                                expected_adjustments_with_prepend):
        """For both the 'append' and 'prepend' methods,
        ``update_adjustments`` merges the added adjustments into the
        existing per-index lists in the expected order.
        """
        methods = ['append', 'prepend']
        expected_outputs = [
            expected_adjustments_with_append, expected_adjustments_with_prepend
        ]

        for method, expected_output in zip(methods, expected_outputs):
            # A fresh AdjustedArray is built per method so the two updates
            # do not affect each other.
            data = arange(30, dtype=float).reshape(6, 5)
            adjusted_array = AdjustedArray(data, initial_adjustments,
                                           float('nan'))

            adjusted_array.update_adjustments(adjustments_to_add, method)
            self.assertEqual(adjusted_array.adjustments, expected_output)
Esempio n. 4
0
    def test_adjustments(self):
        """Check how ``DataFrameLoader`` formats adjustments and what it
        hands to ``adjusted_array`` when loading.

        A baseline frame and a mix of relevant and irrelevant adjustment
        rows are given to the loader. The formatted adjustments for the
        requested dates/sids must equal the expected mapping, and the
        patched ``adjusted_array`` must be called exactly once with the
        sliced baseline values, the mask, and that same mapping.
        """
        dates = self.dates
        sids = self.sids

        def adjustment(sid, start_date, end_date, apply_date, value, kind):
            # Build one adjustment row; keys are inserted in the same order
            # as the literal rows this helper stands in for.
            return {
                'sid': sid,
                'start_date': start_date,
                'end_date': end_date,
                'apply_date': apply_date,
                'value': value,
                'kind': kind,
            }

        baseline = DataFrame(
            arange(100).reshape(self.ndates, self.nsids),
            index=dates,
            columns=sids,
        )

        # Use the dates from index 10 on and sids 1-3.
        requested_dates = slice(10, None)
        requested_sids = slice(1, 4)

        # Adjustments that should actually affect the output.
        relevant = [
            adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
            adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
            adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
            adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
        ]

        # These adjustments shouldn't affect the output.
        irrelevant = [
            # Sid Not Requested
            adjustment(
                0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE,
            ),
            # Sid Unknown
            adjustment(
                9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE,
            ),
            # Date Not Requested
            adjustment(
                2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE,
            ),
            # Date Before Known Data
            adjustment(
                2,
                dates[0] - (2 * trading_day),
                dates[0] - trading_day,
                dates[0] - trading_day,
                -9999.0,
                OVERWRITE,
            ),
            # Date After Known Data
            adjustment(
                2,
                dates[-1] + trading_day,
                dates[-1] + (2 * trading_day),
                dates[-1] + (3 * trading_day),
                -9999.0,
                OVERWRITE,
            ),
        ]

        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
            adjustments=DataFrame(relevant + irrelevant),
        )

        expected_baseline = baseline.iloc[requested_dates, requested_sids]

        actual_formatted = loader.format_adjustments(
            dates[requested_dates],
            sids[requested_sids],
        )
        expected_formatted = {
            6: [
                Float64Multiply(
                    first_row=0, last_row=5, first_col=0, last_col=0,
                    value=0.5,
                ),
                Float64Add(
                    first_row=0, last_row=5, first_col=1, last_col=1,
                    value=1.0,
                ),
            ],
            7: [
                Float64Add(
                    first_row=5, last_row=6, first_col=1, last_col=1,
                    value=1.0,
                ),
            ],
            8: [
                Float64Overwrite(
                    first_row=6, last_row=7, first_col=2, last_col=2,
                    value=99.0,
                ),
            ],
        }
        self.assertEqual(actual_formatted, expected_formatted)

        mask = self.mask[requested_dates, requested_sids]
        with patch('zipline.pipeline.loaders.frame.adjusted_array') as m:
            loader.load_adjusted_array(
                columns=[USEquityPricing.close],
                dates=dates[requested_dates],
                assets=sids[requested_sids],
                mask=mask,
            )

        self.assertEqual(m.call_count, 1)

        # Only the keyword arguments of the single call are inspected.
        _, call_kwargs = m.call_args
        assert_array_equal(call_kwargs['data'], expected_baseline.values)
        assert_array_equal(call_kwargs['mask'], mask)
        self.assertEqual(call_kwargs['adjustments'], expected_formatted)
Esempio n. 5
0
    def test_adjustments(self):
        """Check how ``DataFrameLoader`` formats adjustments and what it
        hands to ``AdjustedArray`` when loading.

        A baseline frame and a mix of relevant and irrelevant adjustment
        rows are given to the loader. The formatted adjustments for the
        requested dates/sids must equal the expected mapping, and the
        patched ``AdjustedArray`` must be called exactly once with the
        sliced baseline values and that same mapping.
        """
        dates = self.dates
        sids = self.sids
        trading_day = self.trading_day

        def adjustment(sid, start_date, end_date, apply_date, value, kind):
            # Build one adjustment row; keys are inserted in the same order
            # as the literal rows this helper stands in for.
            return {
                "sid": sid,
                "start_date": start_date,
                "end_date": end_date,
                "apply_date": apply_date,
                "value": value,
                "kind": kind,
            }

        baseline = pd.DataFrame(
            np.arange(100).reshape(self.ndates, self.nsids),
            index=dates,
            columns=sids,
        )

        # Use the dates from index 10 on and sids 1-3.
        requested_dates = slice(10, None)
        requested_sids = slice(1, 4)

        # Adjustments that should actually affect the output.
        relevant = [
            adjustment(1, None, dates[15], dates[16], 0.5, MULTIPLY),
            adjustment(2, dates[5], dates[15], dates[16], 1.0, ADD),
            adjustment(2, dates[15], dates[16], dates[17], 1.0, ADD),
            adjustment(3, dates[16], dates[17], dates[18], 99.0, OVERWRITE),
        ]

        # These adjustments shouldn't affect the output.
        irrelevant = [
            # Sid Not Requested
            adjustment(
                0, dates[16], dates[17], dates[18], -9999.0, OVERWRITE,
            ),
            # Sid Unknown
            adjustment(
                9999, dates[16], dates[17], dates[18], -9999.0, OVERWRITE,
            ),
            # Date Not Requested
            adjustment(
                2, dates[1], dates[2], dates[3], -9999.0, OVERWRITE,
            ),
            # Date Before Known Data
            adjustment(
                2,
                dates[0] - (2 * trading_day),
                dates[0] - trading_day,
                dates[0] - trading_day,
                -9999.0,
                OVERWRITE,
            ),
            # Date After Known Data
            adjustment(
                2,
                dates[-1] + trading_day,
                dates[-1] + (2 * trading_day),
                dates[-1] + (3 * trading_day),
                -9999.0,
                OVERWRITE,
            ),
        ]

        loader = DataFrameLoader(
            USEquityPricing.close,
            baseline,
            adjustments=pd.DataFrame(relevant + irrelevant),
        )

        expected_baseline = baseline.iloc[requested_dates, requested_sids]

        actual_formatted = loader.format_adjustments(
            dates[requested_dates],
            sids[requested_sids],
        )
        expected_formatted = {
            6: [
                Float64Multiply(
                    first_row=0, last_row=5, first_col=0, last_col=0,
                    value=0.5,
                ),
                Float64Add(
                    first_row=0, last_row=5, first_col=1, last_col=1,
                    value=1.0,
                ),
            ],
            7: [
                Float64Add(
                    first_row=5, last_row=6, first_col=1, last_col=1,
                    value=1.0,
                ),
            ],
            8: [
                Float64Overwrite(
                    first_row=6, last_row=7, first_col=2, last_col=2,
                    value=99.0,
                ),
            ],
        }
        assert actual_formatted == expected_formatted

        mask = self.mask[requested_dates, requested_sids]
        with patch("zipline.pipeline.loaders.frame.AdjustedArray") as m:
            loader.load_adjusted_array(
                US_EQUITIES,
                columns=[USEquityPricing.close],
                dates=dates[requested_dates],
                sids=sids[requested_sids],
                mask=mask,
            )

        assert m.call_count == 1

        # Only the keyword arguments of the single call are inspected.
        _, call_kwargs = m.call_args
        assert_array_equal(call_kwargs["data"], expected_baseline.values)
        assert call_kwargs["adjustments"] == expected_formatted
Esempio n. 6
0
class TestAdjustedArray:
    def test_traverse_invalidating(self):
        """An in-place (``copy=False``) traversal writes into the input buffer
        and leaves the AdjustedArray unusable for further traversals."""
        buffer = np.arange(5 * 3, dtype="f8").reshape(5, 3)
        pristine = buffer.copy()
        adjusted_array = AdjustedArray(
            buffer,
            {2: [Float64Multiply(0, 4, 0, 2, 2.0)]},
            float("nan"),
        )

        # Run the iterator to completion; only its side effects matter here.
        for _window in adjusted_array.traverse(1, copy=False):
            pass

        # The multiply was applied directly to the array we passed in.
        assert_equal(buffer, pristine * 2)

        with pytest.raises(
            ValueError,
            match="cannot traverse invalidated AdjustedArray",
        ):
            adjusted_array.traverse(1)

    def test_copy(self):
        """Copies taken before an in-place traversal remain usable and
        independent; copying after such a traversal raises."""
        buffer = np.arange(5 * 3, dtype="f8").reshape(5, 3)
        pristine = buffer.copy()
        adjusted_array = AdjustedArray(
            buffer,
            {2: [Float64Multiply(0, 4, 0, 2, 2.0)]},
            float("nan"),
        )
        # Both copies are taken before the source array is traversed in place.
        traverse_copy = adjusted_array.copy()
        clean_copy = adjusted_array.copy()

        paired_windows = zip(
            adjusted_array.traverse(2, copy=False),
            traverse_copy.traverse(2, copy=False),
        )
        for from_source, from_copy in paired_windows:
            assert_equal(from_source, from_copy)

        with pytest.raises(ValueError, match="cannot copy invalidated AdjustedArray"):
            adjusted_array.copy()

        # The untouched copy still holds the baseline values, whereas the
        # source array had its data doubled in place by the traversal.
        assert_equal(clean_copy.data, pristine)
        assert_equal(adjusted_array.data, pristine * 2)

    @pytest.mark.parametrize(
        """name, data, lookback, adjustments, missing_value,\
            perspective_offset, expected_output""",
        chain(
            _gen_unadjusted_cases(
                "float",
                make_input=as_dtype(float64_dtype),
                make_expected_output=as_dtype(float64_dtype),
                missing_value=default_missing_value_for_dtype(float64_dtype),
            ),
            _gen_unadjusted_cases(
                "datetime",
                make_input=as_dtype(datetime64ns_dtype),
                make_expected_output=as_dtype(datetime64ns_dtype),
                missing_value=default_missing_value_for_dtype(datetime64ns_dtype),
            ),
            # Test passing an array of strings to AdjustedArray.
            _gen_unadjusted_cases(
                "bytes_ndarray",
                make_input=as_dtype(bytes_dtype),
                make_expected_output=as_labelarray(bytes_dtype, b""),
                missing_value=b"",
            ),
            _gen_unadjusted_cases(
                "unicode_ndarray",
                make_input=as_dtype(unicode_dtype),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
            _gen_unadjusted_cases(
                "object_ndarray",
                make_input=lambda a: a.astype(unicode).astype(object),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
            # Test passing a LabelArray directly to AdjustedArray.
            _gen_unadjusted_cases(
                "bytes_labelarray",
                make_input=as_labelarray(bytes_dtype, b""),
                make_expected_output=as_labelarray(bytes_dtype, b""),
                missing_value=b"",
            ),
            _gen_unadjusted_cases(
                "unicode_labelarray",
                make_input=as_labelarray(unicode_dtype, None),
                make_expected_output=as_labelarray(unicode_dtype, None),
                missing_value="",
            ),
            _gen_unadjusted_cases(
                "object_labelarray",
                make_input=(lambda a: LabelArray(a.astype(unicode).astype(object), "")),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
        ),
    )
    def test_no_adjustments(
        self,
        name,
        data,
        lookback,
        adjustments,
        missing_value,
        perspective_offset,
        expected_output,
    ):
        """Traverse an AdjustedArray built from each parametrized case and
        compare every yielded window with the matching expected window.

        ``name`` and ``perspective_offset`` are supplied by the
        parametrization but are not used by this test; the traversal is run
        with ``traverse(lookback)`` only.
        """
        array = AdjustedArray(data, adjustments, missing_value)
        for _ in range(2):  # Iterate 2x ensure adjusted_arrays are re-usable.
            # zip_longest, as in the sibling adjustment tests: a plain zip
            # stops at the shorter side, so a traversal that yields too few
            # or too many windows would go unnoticed.
            in_out = zip_longest(array.traverse(lookback), expected_output)
            for yielded, expected_yield in in_out:
                check_arrays(yielded, expected_yield)

    @pytest.mark.parametrize(
        "name, data, lookback, adjustments, missing_value,\
        perspective_offset, expected",
        _gen_multiplicative_adjustment_cases(float64_dtype),
    )
    def test_multiplicative_adjustments(
        self,
        name,
        data,
        lookback,
        adjustments,
        missing_value,
        perspective_offset,
        expected,
    ):
        """Compare every window yielded for a multiplicative-adjustment case
        with the expected window, over two consecutive traversals."""
        array = AdjustedArray(data, adjustments, missing_value)

        def fresh_windows():
            return array.traverse(lookback, perspective_offset=perspective_offset)

        # Second pass checks that the same AdjustedArray can be traversed again.
        for _pass in range(2):
            # zip_longest makes a length mismatch show up as a None on one side.
            for yielded, expected_yield in zip_longest(fresh_windows(), expected):
                check_arrays(yielded, expected_yield)

    @pytest.mark.parametrize(
        "name, baseline, lookback, adjustments,\
        missing_value, perspective_offset, expected",
        chain(
            _gen_overwrite_adjustment_cases(bool_dtype),
            _gen_overwrite_adjustment_cases(int64_dtype),
            _gen_overwrite_adjustment_cases(float64_dtype),
            _gen_overwrite_adjustment_cases(datetime64ns_dtype),
            _gen_overwrite_1d_array_adjustment_case(float64_dtype),
            _gen_overwrite_1d_array_adjustment_case(datetime64ns_dtype),
            _gen_overwrite_1d_array_adjustment_case(bool_dtype),
            # Six string-like cases follow:
            #   * np.bytes / np.unicode / object ndarrays passed in directly;
            #   * the same three kinds wrapped in a LabelArray first.
            #
            # In every one of them the output is expected to be a LabelArray.
            _gen_unadjusted_cases(
                "bytes_ndarray",
                make_input=as_dtype(bytes_dtype),
                make_expected_output=as_labelarray(bytes_dtype, b""),
                missing_value=b"",
            ),
            _gen_unadjusted_cases(
                "unicode_ndarray",
                make_input=as_dtype(unicode_dtype),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
            _gen_unadjusted_cases(
                "object_ndarray",
                make_input=lambda a: a.astype(unicode).astype(object),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
            _gen_unadjusted_cases(
                "bytes_labelarray",
                make_input=as_labelarray(bytes_dtype, b""),
                make_expected_output=as_labelarray(bytes_dtype, b""),
                missing_value=b"",
            ),
            _gen_unadjusted_cases(
                "unicode_labelarray",
                make_input=as_labelarray(unicode_dtype, ""),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value="",
            ),
            _gen_unadjusted_cases(
                "object_labelarray",
                make_input=(
                    lambda a: LabelArray(
                        a.astype(unicode).astype(object),
                        None,
                    )
                ),
                make_expected_output=as_labelarray(unicode_dtype, ""),
                missing_value=None,
            ),
        ),
    )
    def test_overwrite_adjustment_cases(
        self,
        name,
        baseline,
        lookback,
        adjustments,
        missing_value,
        perspective_offset,
        expected,
    ):
        """Compare every window yielded for an overwrite (or unadjusted
        string-like) case with the expected window, over two traversals."""
        array = AdjustedArray(baseline, adjustments, missing_value)
        traverse_options = {"perspective_offset": perspective_offset}

        # Traversing twice checks that the AdjustedArray can be reused.
        for _ in range(2):
            windows = array.traverse(lookback, **traverse_options)
            # Pair with zip_longest so unequal lengths surface as a None.
            paired = zip_longest(windows, expected)
            for yielded, expected_yield in paired:
                check_arrays(yielded, expected_yield)

    def test_object1darrayoverwrite(self):
        """Check Object1DArrayOverwrite against a LabelArray baseline.

        The baseline has one row per uppercase letter and the columns
        ``a``, ``b``, ``c``; each cell is the row letter followed by the
        column letter (``"Aa"``, ``"Ab"``, ...). Every overwrite carries the
        "flipped" (``~``-prefixed) labels of its first column, and the
        expected windows below have that single column of values written
        into every column from ``first_col`` through ``last_col``.
        """
        pairs = [
            upper + lower
            for upper, lower in product(ascii_uppercase, ascii_lowercase)
        ]
        categories = pairs + ["~" + c for c in pairs]
        baseline = LabelArray(
            np.array([["".join((r, c)) for c in "abc"] for r in ascii_uppercase]),
            None,
            categories,
        )
        full_expected = baseline.copy()

        def flip(cs):
            # Prefix a label with "~" unless it is missing or already flipped.
            if cs is None:
                return None
            if cs[0] != "~":
                return "~" + cs
            return cs

        def make_overwrite(fr, lr, fc, lc):
            # Translate row letters ("A".."Z") and column letters ("a".."c")
            # into zero-based indices.
            fr, lr, fc, lc = map(ord, (fr, lr, fc, lc))
            fr -= ord("A")
            lr -= ord("A")
            fc -= ord("a")
            lc -= ord("a")

            return Object1DArrayOverwrite(
                fr,
                lr,
                fc,
                lc,
                # Values come from the first column only.
                baseline[fr : lr + 1, fc].map(flip),
            )

        overwrites = {
            3: [make_overwrite("A", "B", "a", "a")],
            4: [make_overwrite("A", "C", "b", "c")],
            5: [make_overwrite("D", "D", "a", "b")],
        }

        it = AdjustedArray(baseline, overwrites, None).traverse(3)

        # First window: nothing has been overwritten yet.
        window = next(it)
        expected = full_expected[:3]
        check_arrays(window, expected)

        window = next(it)
        full_expected[0:2, 0] = LabelArray(["~Aa", "~Ba"], None)
        expected = full_expected[1:4]
        check_arrays(window, expected)

        window = next(it)
        # Column b's flipped labels land in both column b and column c.
        # Rows A and B are outside every window checked from here on, but
        # the fixture is kept consistent with row C (and with row D below).
        full_expected[0:3, 1:3] = LabelArray(
            [["~Ab", "~Ab"], ["~Bb", "~Bb"], ["~Cb", "~Cb"]], None
        )
        expected = full_expected[2:5]
        check_arrays(window, expected)

        window = next(it)
        full_expected[3, :2] = "~Da"
        expected = full_expected[3:6]
        check_arrays(window, expected)

    def test_invalid_lookback(self):
        """``traverse`` rejects window lengths that are too long for the data
        or that are not positive."""
        data = np.arange(30, dtype=float).reshape(6, 5)
        adj_array = AdjustedArray(data, {}, float("nan"))

        # (window length, exception expected from traverse)
        rejected_lengths = (
            (7, WindowLengthTooLong),  # the data has only 6 rows
            (0, WindowLengthNotPositive),
            (-1, WindowLengthNotPositive),
        )
        for window_length, expected_error in rejected_lengths:
            with pytest.raises(expected_error):
                adj_array.traverse(window_length)

    def test_array_views_arent_writable(self):
        """Assigning into any window yielded by ``traverse`` raises ValueError."""
        n_rows, n_cols = 6, 5
        data = np.arange(n_rows * n_cols, dtype=float).reshape(n_rows, n_cols)
        adj_array = AdjustedArray(data, {}, float("nan"))

        for window in adj_array.traverse(3):
            with pytest.raises(ValueError):
                window[0, 0] = 5.0

    def test_inspect(self):
        """Check that ``inspect()`` returns exactly the expected text for a
        small float64 array with a single multiplicative adjustment."""
        data = np.arange(15, dtype=float).reshape(5, 3)
        adj_array = AdjustedArray(
            data,
            {4: [Float64Multiply(2, 3, 0, 0, 4.0)]},
            float("nan"),
        )
        # NOTE: the "Adjustments" line of the literal below is split with a
        # backslash-newline, which Python drops from the string. The
        # continuation therefore has to start at column 0: any indentation
        # placed there would end up in the middle of that line, where
        # dedent() does not remove it.
        expected = dedent(
            """\
            Adjusted Array (float64):

            Data:
            array([[ 0.,  1.,  2.],
                   [ 3.,  4.,  5.],
                   [ 6.,  7.,  8.],
                   [ 9., 10., 11.],
                   [12., 13., 14.]])

            Adjustments:
            {4: [Float64Multiply(first_row=2, last_row=3, first_col=0, \
last_col=0, value=4.000000)]}
            """
        )
        got = adj_array.inspect()
        assert expected == got

    def test_update_labels(self):
        """``update_labels`` applies the given function to both the baseline
        labels and the values carried by the adjustments."""

        def build(rows, overwrite_value):
            # Wrap the labels in a LabelArray-backed AdjustedArray with a
            # single ObjectOverwrite registered under key 4.
            return AdjustedArray(
                data=LabelArray(np.array(rows), missing_value=""),
                adjustments={4: [ObjectOverwrite(2, 3, 0, 0, overwrite_value)]},
                missing_value="",
            )

        adj_array = build(
            [
                ["aaa", "bbb", "ccc"],
                ["ddd", "eee", "fff"],
                ["ggg", "hhh", "iii"],
                ["jjj", "kkk", "lll"],
                ["mmm", "nnn", "ooo"],
            ],
            "ppp",
        )
        expected_adj_array = build(
            [
                ["aaa-foo", "bbb-foo", "ccc-foo"],
                ["ddd-foo", "eee-foo", "fff-foo"],
                ["ggg-foo", "hhh-foo", "iii-foo"],
                ["jjj-foo", "kkk-foo", "lll-foo"],
                ["mmm-foo", "nnn-foo", "ooo-foo"],
            ],
            "ppp-foo",
        )

        def add_suffix(label):
            return label + "-foo"

        adj_array.update_labels(add_suffix)

        # Both the baseline data and the adjustment payloads should now
        # match an array that was built with the suffixed labels directly.
        check_arrays(adj_array.data, expected_adj_array.data)
        assert adj_array.adjustments == expected_adj_array.adjustments

    # Adjustment fixtures referenced by name in the parametrized cases of
    # test_update_adjustments. Positional arguments are
    # (first_row, last_row, first_col, last_col, value).
    A = Float64Multiply(0, 4, 1, 1, 0.5)
    B = Float64Overwrite(3, 3, 4, 4, 4.2)
    C = Float64Multiply(0, 2, 0, 0, 0.14)
    D = Float64Overwrite(0, 3, 0, 0, 4.0)
    E = Float64Overwrite(0, 0, 1, 1, 3.7)
    F = Float64Multiply(0, 4, 3, 3, 10.0)
    G = Float64Overwrite(5, 5, 4, 4, 1.7)
    H = Float64Multiply(0, 4, 2, 2, 0.99)
    S = Float64Multiply(0, 1, 4, 4, 5.06)

    @pytest.mark.parametrize(
        "initial_adjustments, adjustments_to_add,\
        expected_adjustments_with_append, expected_adjustments_with_prepend",
        [
            (
                # initial_adjustments
                {
                    1: [A, B],
                    2: [C],
                    4: [D],
                },
                # adjustments_to_add
                {
                    1: [E],
                    2: [F, G],
                    3: [H, S],
                },
                # expected result when the method is 'append'
                {
                    1: [A, B, E],
                    2: [C, F, G],
                    3: [H, S],
                    4: [D],
                },
                # expected result when the method is 'prepend'
                {
                    1: [E, A, B],
                    2: [F, G, C],
                    3: [H, S],
                    4: [D],
                },
            )
        ],
    )
    def test_update_adjustments(
        self,
        initial_adjustments,
        adjustments_to_add,
        expected_adjustments_with_append,
        expected_adjustments_with_prepend,
    ):
        """``update_adjustments`` merges new adjustments into the existing
        ones, with the new entries placed after ('append') or before
        ('prepend') the entries already stored under the same key."""
        expected_by_method = (
            ("append", expected_adjustments_with_append),
            ("prepend", expected_adjustments_with_prepend),
        )

        for method, expected_output in expected_by_method:
            # A new AdjustedArray is built for each method under test.
            adjusted_array = AdjustedArray(
                np.arange(30, dtype=float).reshape(6, 5),
                initial_adjustments,
                float("nan"),
            )

            adjusted_array.update_adjustments(adjustments_to_add, method)
            assert adjusted_array.adjustments == expected_output