Example #1
    def test_percentile_nasty_partitions(self):
        # Test percentile with nasty partitions: divide up 5 assets into
        # quartiles.
        # There isn't a nice mathematical definition of correct behavior here,
        # so for now we guarantee the behavior of numpy.nanpercentile.  This is
        # mostly for regression testing in case we write our own specialized
        # percentile calculation at some point in the future.

        data = arange(25, dtype=float).reshape(5, 5) % 4
        quartiles = range(4)
        filter_names = ['pct_' + str(q) for q in quartiles]

        graph = TermGraph({
            name: self.f.percentile_between(q * 25.0, (q + 1) * 25.0)
            for name, q in zip(filter_names, quartiles)
        })
        results = self.run_graph(
            graph,
            initial_workspace={self.f: data},
            mask=self.build_mask(ones((5, 5))),
        )

        for name, quartile in zip(filter_names, quartiles):
            result = results[name]
            lower = quartile * 25.0
            upper = (quartile + 1) * 25.0
            expected = and_(
                nanpercentile(data, lower, axis=1, keepdims=True) <= data,
                data <= nanpercentile(data, upper, axis=1, keepdims=True),
            )
            check_arrays(result, expected)
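The guarantee above can be checked without the pipeline machinery at all. A minimal numpy-only sketch of the same band-membership test (illustrative, not zipline's implementation):

    import numpy as np

    data = np.arange(25, dtype=float).reshape(5, 5) % 4
    for q in range(4):
        lower = np.nanpercentile(data, q * 25.0, axis=1, keepdims=True)
        upper = np.nanpercentile(data, (q + 1) * 25.0, axis=1, keepdims=True)
        member = (lower <= data) & (data <= upper)
        # Bands are computed row-relative, so each row always has at least
        # one member in every quartile band.
        assert member.any(axis=1).all()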
Example #2
    def test_single_factor(self):
        loader = self.loader
        assets = self.assets
        engine = SimplePipelineEngine(
            lambda column: loader, self.dates, self.asset_finder,
        )
        result_shape = (num_dates, num_assets) = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        factor = RollingSumDifference()
        expected_result = -factor.window_length

        # Since every asset will pass the screen, these should be equivalent.
        pipelines = [
            Pipeline(columns={'f': factor}),
            Pipeline(
                columns={'f': factor},
                screen=factor.eq(expected_result),
            ),
        ]

        for p in pipelines:
            result = engine.run_pipeline(p, dates[0], dates[-1])
            self.assertEqual(set(result.columns), {'f'})
            assert_multi_index_is_product(
                self, result.index, dates, assets
            )

            check_arrays(
                result['f'].unstack().values,
                full(result_shape, expected_result, dtype=float),
            )
Example #3
    def test_engine_with_multicolumn_loader(self):
        open_, close = USEquityPricing.open, USEquityPricing.close

        loader = MultiColumnLoader({
            open_: ConstantLoader(dates=self.dates,
                                  assets=self.assets,
                                  constants={open_: 1}),
            close: ConstantLoader(dates=self.dates,
                                  assets=self.assets,
                                  constants={close: 2})
        })

        engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)

        factor = RollingSumDifference()

        result = engine.factor_matrix({'f': factor},
                                      self.dates[2],
                                      self.dates[-1])
        self.assertIsNotNone(result)
        self.assertEqual({'f'}, set(result.columns))

        # (open - close) * window = (1 - 2) * 3 = -3
        # skipped 2 from the start, so that the window is full
        check_arrays(result['f'],
                     Series([-3] * len(self.assets) * (len(self.dates) - 2)))
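The -3 in the expected Series is just the constant arithmetic from the comment; a quick standalone check (array sizes are illustrative):

    import numpy as np

    window_length, num_assets = 3, 4
    opens = np.ones((window_length, num_assets))          # open == 1
    closes = np.full((window_length, num_assets), 2.0)    # close == 2
    # RollingSumDifference sums (open - close) over its window.
    assert ((opens - closes).sum(axis=0) == -3).all()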
Example #4
    def test_engine_with_multicolumn_loader(self):
        open_ = USEquityPricing.open
        close = USEquityPricing.close
        volume = USEquityPricing.volume

        # Test for thirty days up to the second to last day that we think all
        # the assets existed.  If we test the last day of our calendar, no
        # assets will be in our output, because their end dates are all on
        # the last date in our calendar.
        dates_to_test = self.dates[-32:-2]

        constants = {open_: 1, close: 2, volume: 3}
        loader = ConstantLoader(constants=constants, dates=self.dates, assets=self.assets)
        engine = SimplePipelineEngine(lambda column: loader, self.dates, self.asset_finder)

        sumdiff = RollingSumDifference()

        result = engine.run_pipeline(
            Pipeline(
                columns={"sumdiff": sumdiff, "open": open_.latest, "close": close.latest, "volume": volume.latest}
            ),
            dates_to_test[0],
            dates_to_test[-1],
        )
        self.assertIsNotNone(result)
        self.assertEqual({"sumdiff", "open", "close", "volume"}, set(result.columns))

        result_index = self.assets * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(result["sumdiff"], Series(index=result_index, data=full(result_shape, -3)))

        for name, const in [("open", 1), ("close", 2), ("volume", 3)]:
            check_arrays(result[name], Series(index=result_index, data=full(result_shape, const)))
Example #5
    def test_engine_with_multicolumn_loader(self):
        open_, close = USEquityPricing.open, USEquityPricing.close

        # Test for thirty days up to the second to last day that we think all
        # the assets existed.  If we test the last day of our calendar, no
        # assets will be in our output, because their end dates are all on
        # the last date in our calendar.
        dates_to_test = self.dates[-32:-2]

        loader = MultiColumnLoader({
            open_: ConstantLoader(dates=self.dates,
                                  assets=self.assets,
                                  constants={open_: 1}),
            close: ConstantLoader(dates=self.dates,
                                  assets=self.assets,
                                  constants={close: 2})
        })

        engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)

        factor = RollingSumDifference()

        result = engine.factor_matrix({'f': factor},
                                      dates_to_test[0],
                                      dates_to_test[-1])
        self.assertIsNotNone(result)
        self.assertEqual({'f'}, set(result.columns))

        result_index = self.assets * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(
            result['f'],
            Series(index=result_index, data=full(result_shape, -3)),
        )
Example #6
    def test_rolling_and_nonrolling(self):
        open_ = USEquityPricing.open
        close = USEquityPricing.close
        volume = USEquityPricing.volume

        # Test for thirty days up to the last day that we think all
        # the assets existed.
        dates_to_test = self.dates[-30:]

        constants = {open_: 1, close: 2, volume: 3}
        loader = ConstantLoader(constants=constants, dates=self.dates, assets=self.assets)
        engine = SimplePipelineEngine(lambda column: loader, self.dates, self.asset_finder)

        sumdiff = RollingSumDifference()

        result = engine.run_pipeline(
            Pipeline(
                columns={"sumdiff": sumdiff, "open": open_.latest, "close": close.latest, "volume": volume.latest}
            ),
            dates_to_test[0],
            dates_to_test[-1],
        )
        self.assertIsNotNone(result)
        self.assertEqual({"sumdiff", "open", "close", "volume"}, set(result.columns))

        result_index = self.assets * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(result["sumdiff"], Series(index=result_index, data=full(result_shape, -3)))

        for name, const in [("open", 1), ("close", 2), ("volume", 3)]:
            check_arrays(result[name], Series(index=result_index, data=full(result_shape, const)))
Example #7
    def test_masked_rankdata_2d(self,
                                seed_value,
                                method,
                                use_mask,
                                set_missing,
                                ascending):
        eyemask = ~eye(5, dtype=bool)
        nomask = ones((5, 5), dtype=bool)

        seed(seed_value)
        asfloat = randn(5, 5) * seed_value
        asdatetime = asfloat.copy().view('datetime64[ns]')

        mask = eyemask if use_mask else nomask
        if set_missing:
            asfloat[:, 2] = nan
            asdatetime[:, 2] = np_NaT

        float_result = masked_rankdata_2d(
            data=asfloat,
            mask=mask,
            missing_value=nan,
            method=method,
            ascending=True,
        )
        datetime_result = masked_rankdata_2d(
            data=asdatetime,
            mask=mask,
            missing_value=np_NaT,
            method=method,
            ascending=True,
        )

        check_arrays(float_result, datetime_result)
Example #8
    def test_masked_rankdata_2d(self, seed_value, method, use_mask,
                                set_missing, ascending):
        eyemask = ~eye(5, dtype=bool)
        nomask = ones((5, 5), dtype=bool)

        seed(seed_value)
        asfloat = randn(5, 5) * seed_value
        asdatetime = asfloat.copy().view('datetime64[ns]')

        mask = eyemask if use_mask else nomask
        if set_missing:
            asfloat[:, 2] = nan
            asdatetime[:, 2] = np_NaT

        float_result = masked_rankdata_2d(
            data=asfloat,
            mask=mask,
            missing_value=nan,
            method=method,
            ascending=True,
        )
        datetime_result = masked_rankdata_2d(
            data=asdatetime,
            mask=mask,
            missing_value=np_NaT,
            method=method,
            ascending=True,
        )

        check_arrays(float_result, datetime_result)
Example #9
    def test_isnull_datetime_dtype(self):
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[eye(5, dtype=bool)] = NaTns

        graph = TermGraph(
            {
                'isnull': factor.isnull(),
                'notnull': factor.notnull(),
            }
        )

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))
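The expected grids follow from numpy's NaT semantics; a self-contained sketch of the datetime case:

    import numpy as np

    data = np.arange(25).reshape(5, 5).astype('datetime64[ns]')
    data[np.eye(5, dtype=bool)] = np.datetime64('NaT')
    # NaT, like nan, never compares equal to itself, so isnull/notnull must
    # detect it with isnat rather than an equality check.
    assert (np.isnat(data) == np.eye(5, dtype=bool)).all()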
Example #10
    def test_isnull_int_dtype(self, custom_missing_value):

        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        data = arange(25).reshape(5, 5)
        data[eye(5, dtype=bool)] = custom_missing_value

        graph = TermGraph(
            {
                'isnull': factor.isnull(),
                'notnull': factor.notnull(),
            }
        )

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))
Example #11
    def test_single_factor(self):
        loader = self.loader
        finder = self.asset_finder
        assets = self.assets
        engine = SimplePipelineEngine(
            lambda column: loader,
            self.dates,
            self.asset_finder,
        )
        result_shape = (num_dates, num_assets) = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        factor = RollingSumDifference()
        expected_result = -factor.window_length

        # Since every asset will pass the screen, these should be equivalent.
        pipelines = [
            Pipeline(columns={'f': factor}),
            Pipeline(
                columns={'f': factor},
                screen=factor.eq(expected_result),
            ),
        ]

        for p in pipelines:
            result = engine.run_pipeline(p, dates[0], dates[-1])
            self.assertEqual(set(result.columns), {'f'})
            assert_multi_index_is_product(self, result.index, dates,
                                          finder.retrieve_all(assets))

            check_arrays(
                result['f'].unstack().values,
                full(result_shape, expected_result),
            )
Example #12
    def test_percentile_nasty_partitions(self):
        # Test percentile with nasty partitions: divide up 5 assets into
        # quartiles.
        # There isn't a nice mathematical definition of correct behavior here,
        # so for now we guarantee the behavior of numpy.nanpercentile.  This is
        # mostly for regression testing in case we write our own specialized
        # percentile calculation at some point in the future.

        data = arange(25, dtype=float).reshape(5, 5) % 4
        quartiles = range(4)
        filter_names = ['pct_' + str(q) for q in quartiles]

        graph = TermGraph(
            {
                name: self.f.percentile_between(q * 25.0, (q + 1) * 25.0)
                for name, q in zip(filter_names, quartiles)
            }
        )
        results = self.run_graph(
            graph,
            initial_workspace={self.f: data},
            mask=self.build_mask(ones((5, 5))),
        )

        for name, quartile in zip(filter_names, quartiles):
            result = results[name]
            lower = quartile * 25.0
            upper = (quartile + 1) * 25.0
            expected = and_(
                nanpercentile(data, lower, axis=1, keepdims=True) <= data,
                data <= nanpercentile(data, upper, axis=1, keepdims=True),
            )
            check_arrays(result, expected)
Example #13
    def test_engine_with_multicolumn_loader(self):
        open_, close = USEquityPricing.open, USEquityPricing.close

        loader = MultiColumnLoader({
            open_:
            ConstantLoader(dates=self.dates,
                           assets=self.assets,
                           constants={open_: 1}),
            close:
            ConstantLoader(dates=self.dates,
                           assets=self.assets,
                           constants={close: 2})
        })

        engine = SimpleFFCEngine(loader, self.dates, self.asset_finder)

        factor = RollingSumDifference()

        result = engine.factor_matrix({'f': factor}, self.dates[2],
                                      self.dates[-1])
        self.assertIsNotNone(result)
        self.assertEqual({'f'}, set(result.columns))

        # (open - close) * window = (1 - 2) * 3 = -3
        # skipped 2 from the start, so that the window is full
        check_arrays(result['f'],
                     Series([-3] * len(self.assets) * (len(self.dates) - 2)))
Example #14
    def test_rank_after_mask(self):
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0],
                      [1, 2, 3, 0, 1],
                      [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3],
                      [0, 1, 2, 3, 0]], dtype=float)
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {self.f: data, Mask(): mask_data}

        graph = TermGraph(
            {
                "ascending_nomask": self.f.rank(ascending=True),
                "ascending_mask": self.f.rank(ascending=True, mask=Mask()),
                "descending_nomask": self.f.rank(ascending=False),
                "descending_mask": self.f.rank(ascending=False, mask=Mask()),
            }
        )

        expected = {
            "ascending_nomask": array(
                [
                    [1.0, 3.0, 4.0, 5.0, 2.0],
                    [2.0, 4.0, 5.0, 1.0, 3.0],
                    [3.0, 5.0, 1.0, 2.0, 4.0],
                    [4.0, 1.0, 2.0, 3.0, 5.0],
                    [1.0, 3.0, 4.0, 5.0, 2.0],
                ]
            ),
            "descending_nomask": array(
                [
                    [4.0, 3.0, 2.0, 1.0, 5.0],
                    [3.0, 2.0, 1.0, 5.0, 4.0],
                    [2.0, 1.0, 5.0, 4.0, 3.0],
                    [1.0, 5.0, 4.0, 3.0, 2.0],
                    [4.0, 3.0, 2.0, 1.0, 5.0],
                ]
            ),
            # Diagonal should be all nans, and anything whose rank was
            # greater than the diagonal's in the unmasked calc should go
            # down by 1.
            "ascending_mask": array(
                [
                    [nan, 2.0, 3.0, 4.0, 1.0],
                    [2.0, nan, 4.0, 1.0, 3.0],
                    [2.0, 4.0, nan, 1.0, 3.0],
                    [3.0, 1.0, 2.0, nan, 4.0],
                    [1.0, 2.0, 3.0, 4.0, nan],
                ]
            ),
            "descending_mask": array(
                [
                    [nan, 3.0, 2.0, 1.0, 4.0],
                    [2.0, nan, 1.0, 4.0, 3.0],
                    [2.0, 1.0, nan, 4.0, 3.0],
                    [1.0, 4.0, 3.0, nan, 2.0],
                    [4.0, 3.0, 2.0, 1.0, nan],
                ]
            ),
        }

        results = self.run_graph(graph, initial_workspace, mask=self.build_mask(ones((5, 5))))
        for method in results:
            check_arrays(expected[method], results[method])
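The masked grids above can be reproduced outside the engine by ranking with nans in the masked slots. A sketch using pandas (method='first' matches the ordinal tie-breaking shown above):

    import numpy as np
    import pandas as pd

    data = np.array([[0, 1, 2, 3, 0],
                     [1, 2, 3, 0, 1],
                     [2, 3, 0, 1, 2],
                     [3, 0, 1, 2, 3],
                     [0, 1, 2, 3, 0]], dtype=float)
    masked = data.copy()
    masked[np.eye(5, dtype=bool)] = np.nan   # masked slots rank as missing
    # nans get no rank, and ranks above the removed entry's drop by one,
    # reproducing the "ascending_mask" grid above.
    ranks = pd.DataFrame(masked).rank(axis=1, method='first').values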
Example #15
 def check(terms):
     results = self.run_terms(
         terms,
         initial_workspace={self.f: data},
         mask=self.build_mask(ones((5, 5))),
     )
     for method in terms:
         check_arrays(results[method], expected_ranks[method])
Example #16
 def check(terms):
     results = self.run_terms(
         terms,
         initial_workspace={self.f: data},
         mask=self.build_mask(ones((5, 5))),
     )
     for method in terms:
         check_arrays(results[method], expected_ranks[method])
Example #17
 def check_output(self, expr, expected):
     result = expr._compute(
         [self.fake_raw_data[input_] for input_ in expr.inputs],
         self.mask.index,
         self.mask.columns,
         self.mask.values,
     )
     check_arrays(result, expected)
Example #18
 def check_output(self, expr, expected):
     result = expr._compute(
         [self.fake_raw_data[input_] for input_ in expr.inputs],
         self.mask.index,
         self.mask.columns,
         self.mask.values,
     )
     check_arrays(result, expected)
Example #19
    def test_rolling_and_nonrolling(self):
        open_ = USEquityPricing.open
        close = USEquityPricing.close
        volume = USEquityPricing.volume

        # Test for thirty days up to the last day that we think all
        # the assets existed.
        dates_to_test = self.dates[-30:]

        constants = {open_: 1, close: 2, volume: 3}
        loader = PrecomputedLoader(
            constants=constants,
            dates=self.dates,
            sids=self.asset_ids,
        )
        engine = SimplePipelineEngine(
            lambda column: loader, self.dates, self.asset_finder,
        )

        sumdiff = RollingSumDifference()

        result = engine.run_pipeline(
            Pipeline(
                columns={
                    'sumdiff': sumdiff,
                    'open': open_.latest,
                    'close': close.latest,
                    'volume': volume.latest,
                },
            ),
            dates_to_test[0],
            dates_to_test[-1]
        )
        self.assertIsNotNone(result)
        self.assertEqual(
            {'sumdiff', 'open', 'close', 'volume'},
            set(result.columns)
        )

        result_index = self.asset_ids * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(
            result['sumdiff'],
            Series(
                index=result_index,
                data=full(result_shape, -3, dtype=float),
            ),
        )

        for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
            check_arrays(
                result[name],
                Series(
                    index=result_index,
                    data=full(result_shape, const, dtype=float),
                ),
            )
Example #20
 def check(terms):
     graph = TermGraph(terms)
     results = self.run_graph(
         graph,
         initial_workspace={f: data},
         mask=self.build_mask(ones((5, 5))),
     )
     for method in terms:
         check_arrays(results[method], expected_ranks[method])
Example #21
 def check(terms):
     graph = TermGraph(terms)
     results = self.run_graph(
         graph,
         initial_workspace={f: data},
         mask=self.build_mask(ones((5, 5))),
     )
     for method in terms:
         check_arrays(results[method], expected_ranks[method])
Example #22
    def test_notnan(self):
        data = self.randn_data(seed=10)
        diag = eye(*data.shape, dtype=bool)
        data[diag] = nan

        results = self.run_graph(
            TermGraph({'notnan': self.f.notnan()}),
            initial_workspace={self.f: data},
        )
        check_arrays(results['notnan'], ~diag)
Example #23
    def test_notnan(self):
        data = self.randn_data(seed=10)
        diag = eye(*data.shape, dtype=bool)
        data[diag] = nan

        results = self.run_graph(
            TermGraph({'notnan': self.f.notnan()}),
            initial_workspace={self.f: data},
        )
        check_arrays(results['notnan'], ~diag)
Example #24
    def test_isfinite(self):
        data = self.randn_data(seed=10)
        data[:, 0] = nan
        data[:, 2] = inf
        data[:, 4] = -inf

        results = self.run_graph(
            TermGraph({'isfinite': self.f.isfinite()}),
            initial_workspace={self.f: data},
        )
        check_arrays(results['isfinite'], isfinite(data))
Example #25
    def test_isfinite(self):
        data = self.randn_data(seed=10)
        data[:, 0] = nan
        data[:, 2] = inf
        data[:, 4] = -inf

        results = self.run_graph(
            TermGraph({'isfinite': self.f.isfinite()}),
            initial_workspace={self.f: data},
        )
        check_arrays(results['isfinite'], isfinite(data))
Example #26
    def test_top_and_bottom(self):
        data = self.randn_data(seed=5)  # Fix a seed for determinism.

        mask_data = ones_like(data, dtype=bool)
        mask_data[:, 0] = False

        nan_data = data.copy()
        nan_data[:, 0] = nan

        mask = Mask()
        workspace = {self.f: data, mask: mask_data}

        methods = ['top', 'bottom']
        counts = 2, 3, 10
        term_combos = list(product(methods, counts, [True, False]))

        def termname(method, count, masked):
            return '_'.join([method, str(count), 'mask' if masked else ''])

        # Add a term for each permutation of top/bottom, count, and
        # mask/no_mask.
        terms = {}
        for method, count, masked in term_combos:
            kwargs = {'N': count}
            if masked:
                kwargs['mask'] = mask
            term = getattr(self.f, method)(**kwargs)
            terms[termname(method, count, masked)] = term

        results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

        def expected_result(method, count, masked):
            # Ranking with a mask is equivalent to ranking with nans applied on
            # the masked values.
            to_rank = nan_data if masked else data

            if method == 'top':
                return rowwise_rank(-to_rank) < count
            elif method == 'bottom':
                return rowwise_rank(to_rank) < count

        for method, count, masked in term_combos:
            result = results[termname(method, count, masked)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(count, data.shape[1])),
            )

            expected = expected_result(method, count, masked)
            check_arrays(result, expected)
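rowwise_rank is a test helper used above; a minimal stand-in consistent with how the expectations use it (an assumption, not the library's code):

    import numpy as np

    def rowwise_rank(a):
        # Double argsort yields each element's 0-based ordinal rank within
        # its row; a stable sort keeps ties in column order, and nans sort
        # (and therefore rank) last.
        return a.argsort(axis=1, kind='stable').argsort(axis=1, kind='stable')

    # The five smallest entries of each row, i.e. a `bottom(5)` filter:
    sample = np.random.randn(4, 10)
    bottom5 = rowwise_rank(sample) < 5
    assert (bottom5.sum(axis=1) == 5).all()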
Example #27
    def test_top_and_bottom(self):
        data = self.randn_data(seed=5)  # Fix a seed for determinism.

        mask_data = ones_like(data, dtype=bool)
        mask_data[:, 0] = False

        nan_data = data.copy()
        nan_data[:, 0] = nan

        mask = Mask()
        workspace = {self.f: data, mask: mask_data}

        methods = ['top', 'bottom']
        counts = 2, 3, 10
        term_combos = list(product(methods, counts, [True, False]))

        def termname(method, count, masked):
            return '_'.join([method, str(count), 'mask' if masked else ''])

        # Add a term for each permutation of top/bottom, count, and
        # mask/no_mask.
        terms = {}
        for method, count, masked in term_combos:
            kwargs = {'N': count}
            if masked:
                kwargs['mask'] = mask
            term = getattr(self.f, method)(**kwargs)
            terms[termname(method, count, masked)] = term

        results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

        def expected_result(method, count, masked):
            # Ranking with a mask is equivalent to ranking with nans applied on
            # the masked values.
            to_rank = nan_data if masked else data

            if method == 'top':
                return rowwise_rank(-to_rank) < count
            elif method == 'bottom':
                return rowwise_rank(to_rank) < count

        for method, count, masked in term_combos:
            result = results[termname(method, count, masked)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(count, data.shape[1])),
            )

            expected = expected_result(method, count, masked)
            check_arrays(result, expected)
Example #28
    def test_sequenced_filter_order_independent(self):
        data = self.arange_data() % 5
        results = self.run_terms(
            {
                # Sequencing is equivalent to &ing for commutative filters.
                'sequenced': (1.5 < self.f).then(self.f < 3.5),
                'anded': (1.5 < self.f) & (self.f < 3.5),
            },
            initial_workspace={self.f: data},
        )
        expected = (1.5 < data) & (data < 3.5)

        check_arrays(results['sequenced'], expected)
        check_arrays(results['anded'], expected)
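The equivalence being tested is plain boolean algebra for elementwise filters; a numpy sketch of the expected array:

    import numpy as np

    data = np.arange(25, dtype=float).reshape(5, 5) % 5
    first = 1.5 < data
    # Applying the second comparison only where the first passed is the
    # same as &-ing the two filters, because both are elementwise.
    sequenced = np.where(first, data < 3.5, False)
    assert (sequenced == (first & (data < 3.5))).all()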
Example #29
    def test_engine_with_multicolumn_loader(self):
        open_ = USEquityPricing.open
        close = USEquityPricing.close
        volume = USEquityPricing.volume

        # Test for thirty days up to the second to last day that we think all
        # the assets existed.  If we test the last day of our calendar, no
        # assets will be in our output, because their end dates are all on
        # the last date in our calendar.
        dates_to_test = self.dates[-32:-2]

        constants = {open_: 1, close: 2, volume: 3}
        loader = ConstantLoader(
            constants=constants,
            dates=self.dates,
            assets=self.assets,
        )
        engine = SimplePipelineEngine(lambda column: loader, self.dates, self.asset_finder)

        sumdiff = RollingSumDifference()

        result = engine.run_pipeline(
            Pipeline(
                columns={
                    'sumdiff': sumdiff,
                    'open': open_.latest,
                    'close': close.latest,
                    'volume': volume.latest,
                },
            ),
            dates_to_test[0],
            dates_to_test[-1]
        )
        self.assertIsNotNone(result)
        self.assertEqual(
            {'sumdiff', 'open', 'close', 'volume'},
            set(result.columns)
        )

        result_index = self.assets * len(dates_to_test)
        result_shape = (len(result_index),)
        check_arrays(
            result['sumdiff'],
            Series(index=result_index, data=full(result_shape, -3)),
        )

        for name, const in [('open', 1), ('close', 2), ('volume', 3)]:
            check_arrays(
                result[name],
                Series(index=result_index, data=full(result_shape, const)),
            )
Example #30
    def test_sequenced_filter_order_independent(self):
        data = self.arange_data() % 5
        results = self.run_terms(
            {
                # Sequencing is equivalent to &ing for commutative filters.
                'sequenced': (1.5 < self.f).then(self.f < 3.5),
                'anded': (1.5 < self.f) & (self.f < 3.5),
            },
            initial_workspace={self.f: data},
        )
        expected = (1.5 < data) & (data < 3.5)

        check_arrays(results['sequenced'], expected)
        check_arrays(results['anded'], expected)
Example #31
    def test_rank_after_mask(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {f: data, Mask(): mask_data}

        graph = TermGraph({
            "ascending_nomask":
            f.rank(ascending=True),
            "ascending_mask":
            f.rank(ascending=True, mask=Mask()),
            "descending_nomask":
            f.rank(ascending=False),
            "descending_mask":
            f.rank(ascending=False, mask=Mask()),
        })

        expected = {
            "ascending_nomask":
            array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.],
                   [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.],
                   [1., 3., 4., 5., 2.]]),
            "descending_nomask":
            array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.],
                   [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.],
                   [4., 3., 2., 1., 5.]]),
            # Diagonal should be all nans, and anything whose rank was
            # greater than the diagonal's in the unmasked calc should go
            # down by 1.
            "ascending_mask":
            array([[nan, 2., 3., 4., 1.], [2., nan, 4., 1., 3.],
                   [2., 4., nan, 1., 3.], [3., 1., 2., nan, 4.],
                   [1., 2., 3., 4., nan]]),
            "descending_mask":
            array([[nan, 3., 2., 1., 4.], [2., nan, 1., 4., 3.],
                   [2., 1., nan, 4., 3.], [1., 4., 3., nan, 2.],
                   [4., 3., 2., 1., nan]]),
        }

        results = self.run_graph(
            graph,
            initial_workspace,
            mask=self.build_mask(ones((5, 5))),
        )
        for method in results:
            check_arrays(expected[method], results[method])
Example #32
    def test_masking(self, dtype, missing_value, window_length):
        missing_value = value_with_dtype(dtype, missing_value)
        baseline_ints = arange(15).reshape(5, 3)
        baseline = baseline_ints.astype(dtype)
        mask = (baseline_ints % 2).astype(bool)
        masked_baseline = where(mask, baseline, missing_value)

        array = AdjustedArray(
            baseline,
            mask,
            adjustments={},
            missing_value=missing_value,
        )

        gen_expected = moving_window(masked_baseline, window_length)
        gen_actual = array.traverse(window_length)
        for expected, actual in zip(gen_expected, gen_actual):
            check_arrays(expected, actual)
Example #33
    def test_masking(self, dtype, missing_value, window_length):
        missing_value = coerce_to_dtype(dtype, missing_value)
        baseline_ints = arange(15).reshape(5, 3)
        baseline = baseline_ints.astype(dtype)
        mask = (baseline_ints % 2).astype(bool)
        masked_baseline = where(mask, baseline, missing_value)

        array = AdjustedArray(
            baseline,
            mask,
            adjustments={},
            missing_value=missing_value,
        )

        gen_expected = moving_window(masked_baseline, window_length)
        gen_actual = array.traverse(window_length)
        for expected, actual in zip(gen_expected, gen_actual):
            check_arrays(expected, actual)
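moving_window here is another test utility; a minimal stand-in that yields the same row-slabs (a sketch under that assumption):

    import numpy as np

    def moving_window(a, window):
        # Yield each contiguous `window`-row slab of `a`, oldest first.
        for i in range(a.shape[0] - window + 1):
            yield a[i:i + window]

    # A 5x3 baseline traversed with window 2 yields four slabs of shape (2, 3).
    slabs = list(moving_window(np.arange(15).reshape(5, 3), 2))
    assert len(slabs) == 4 and slabs[0].shape == (2, 3)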
Example #34
    def test_bottom(self):
        counts = 2, 3, 10
        data = self.randn_data(seed=5)  # Arbitrary seed choice.
        results = self.run_terms(
            terms={'bottom_' + str(c): self.f.bottom(c) for c in counts},
            initial_workspace={self.f: data},
        )
        for c in counts:
            result = results['bottom_' + str(c)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(c, data.shape[1])),
            )

            # Check that the bottom `c` assets passed.
            expected = rowwise_rank(data) < c
            check_arrays(result, expected)
Example #35
    def test_bottom(self):
        counts = 2, 3, 10
        data = self.randn_data(seed=5)  # Arbitrary seed choice.
        results = self.run_graph(
            TermGraph({'bottom_' + str(c): self.f.bottom(c)
                       for c in counts}),
            initial_workspace={self.f: data},
        )
        for c in counts:
            result = results['bottom_' + str(c)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(c, data.shape[1])),
            )

            # Check that the bottom `c` assets passed.
            expected = rowwise_rank(data) < c
            check_arrays(result, expected)
Example #36
    def test_top(self):
        counts = 2, 3, 10
        data = self.randn_data(seed=5)  # Arbitrary seed choice.
        results = self.run_terms(
            terms={'top_' + str(c): self.f.top(c)
                   for c in counts},
            initial_workspace={self.f: data},
        )
        for c in counts:
            result = results['top_' + str(c)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(c, data.shape[1])),
            )

            # Check that the top `c` assets passed.
            expected = rowwise_rank(-data) < c
            check_arrays(result, expected)
Example #37
    def test_percentile_after_mask(self):
        f_input = eye(5)
        g_input = arange(25, dtype=float).reshape(5, 5)
        initial_mask = self.build_mask(ones((5, 5)))

        custom_mask = self.f < 1
        without_mask = self.g.percentile_between(80, 100)
        with_mask = self.g.percentile_between(80, 100, mask=custom_mask)

        graph = TermGraph(
            {
                'custom_mask': custom_mask,
                'without': without_mask,
                'with': with_mask,
            }
        )

        results = self.run_graph(
            graph,
            initial_workspace={self.f: f_input, self.g: g_input},
            mask=initial_mask,
        )

        # First should pass everything but the diagonal.
        check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

        # Second should pass the largest value each day.  Each row is strictly
        # increasing, so we always select the last value.
        expected_without = array(
            [[0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1]],
            dtype=bool,
        )
        check_arrays(results['without'], expected_without)

        # When sequencing, we should remove the diagonal as an option before
        # computing percentiles.  On the last day, we should get the
        # second-largest value, rather than the largest.
        expected_with = array(
            [[0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 1, 0]],  # Different from previous!
            dtype=bool,
        )
        check_arrays(results['with'], expected_with)
Example #38
    def test_sequenced_filter_order_dependent(self):

        first = self.f < 1
        f_input = eye(5)

        second = self.g.percentile_between(80, 100)
        g_input = arange(25, dtype=float).reshape(5, 5)

        initial_mask = self.build_mask(ones((5, 5)))

        terms = {
            'first': first,
            'second': second,
            'sequenced': first.then(second),
        }

        results = self.run_terms(
            terms,
            initial_workspace={self.f: f_input, self.g: g_input},
            mask=initial_mask,
        )

        # First should pass everything but the diagonal.
        check_arrays(results['first'], ~eye(5, dtype=bool))

        # Second should pass the largest value each day.  Each row is strictly
        # increasing, so we always select the last value.
        expected_second = array(
            [[0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1]],
            dtype=bool,
        )
        check_arrays(results['second'], expected_second)

        # When sequencing, we should remove the diagonal as an option before
        # computing percentiles.  On the last day, we should get the
        # second-largest value, rather than the largest.
        expected_sequenced = array(
            [[0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1],
             [0, 0, 0, 1, 0]],  # Different from previous!
            dtype=bool,
        )
        check_arrays(results['sequenced'], expected_sequenced)
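The order dependence comes from recomputing the percentile cutoff after the first filter removes the diagonal. A numpy sketch of the arithmetic on the last row, following the nanpercentile definition used in these tests:

    import numpy as np

    g = np.arange(25, dtype=float).reshape(5, 5)
    masked = g.copy()
    masked[np.eye(5, dtype=bool)] = np.nan   # the first filter removes these
    cutoff = np.nanpercentile(masked, 80.0, axis=1, keepdims=True)
    # nan >= cutoff is False, so the diagonal can never pass.  On the last
    # row the cutoff is 22.4, selecting 23 (the second-largest), not 24.
    assert (masked >= cutoff)[4].tolist() == [False, False, False, True, False]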
Example #39
    def test_multiple_rolling_factors(self):

        loader = self.loader
        finder = self.asset_finder
        assets = self.assets
        engine = SimplePipelineEngine(
            lambda column: loader, self.dates, self.asset_finder,
        )
        shape = num_dates, num_assets = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        short_factor = RollingSumDifference(window_length=3)
        long_factor = RollingSumDifference(window_length=5)
        high_factor = RollingSumDifference(
            window_length=3,
            inputs=[USEquityPricing.open, USEquityPricing.high],
        )

        pipeline = Pipeline(
            columns={
                'short': short_factor,
                'long': long_factor,
                'high': high_factor,
            }
        )
        results = engine.run_pipeline(pipeline, dates[0], dates[-1])

        self.assertEqual(set(results.columns), {'short', 'high', 'long'})
        assert_multi_index_is_product(
            self, results.index, dates, finder.retrieve_all(assets)
        )

        # row-wise sum over an array whose values are all (1 - 2)
        check_arrays(
            results['short'].unstack().values,
            full(shape, -short_factor.window_length),
        )
        check_arrays(
            results['long'].unstack().values,
            full(shape, -long_factor.window_length),
        )
        # row-wise sum over an array whose values are all (1 - 3)
        check_arrays(
            results['high'].unstack().values,
            full(shape, -2 * high_factor.window_length),
        )
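All three expectations reduce to the constant arithmetic in the comments. A standalone check with illustrative sizes, assuming the loader supplies open=1, close=2, high=3 as the comments imply:

    import numpy as np

    opens = np.ones((5, 4))            # open == 1 every day
    closes = np.full((5, 4), 2.0)      # close == 2
    highs = np.full((5, 4), 3.0)       # high == 3
    assert ((opens - closes)[-3:].sum(axis=0) == -3).all()   # window 3
    assert ((opens - closes)[-5:].sum(axis=0) == -5).all()   # window 5
    assert ((opens - highs)[-3:].sum(axis=0) == -6).all()    # 3 * (1 - 3)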
Example #40
    def test_sequenced_filter_order_dependent(self):

        first = self.f < 1
        f_input = eye(5)

        second = self.g.percentile_between(80, 100)
        g_input = arange(25, dtype=float).reshape(5, 5)

        initial_mask = self.build_mask(ones((5, 5)))

        terms = {
            'first': first,
            'second': second,
            'sequenced': first.then(second),
        }

        results = self.run_terms(
            terms,
            initial_workspace={
                self.f: f_input,
                self.g: g_input
            },
            mask=initial_mask,
        )

        # First should pass everything but the diagonal.
        check_arrays(results['first'], ~eye(5, dtype=bool))

        # Second should pass the largest value each day.  Each row is strictly
        # increasing, so we always select the last value.
        expected_second = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]],
            dtype=bool,
        )
        check_arrays(results['second'], expected_second)

        # When sequencing, we should remove the diagonal as an option before
        # computing percentiles.  On the last day, we should get the
        # second-largest value, rather than the largest.
        expected_sequenced = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 1, 0]],  # Different from previous!
            dtype=bool,
        )
        check_arrays(results['sequenced'], expected_sequenced)
Example #41
    def test_percentile_after_mask(self):
        f_input = eye(5)
        g_input = arange(25, dtype=float).reshape(5, 5)
        initial_mask = self.build_mask(ones((5, 5)))

        custom_mask = self.f < 1
        without_mask = self.g.percentile_between(80, 100)
        with_mask = self.g.percentile_between(80, 100, mask=custom_mask)

        graph = TermGraph({
            'custom_mask': custom_mask,
            'without': without_mask,
            'with': with_mask,
        })

        results = self.run_graph(
            graph,
            initial_workspace={
                self.f: f_input,
                self.g: g_input
            },
            mask=initial_mask,
        )

        # First should pass everything but the diagonal.
        check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

        # Second should pass the largest value each day.  Each row is strictly
        # increasing, so we always select the last value.
        expected_without = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]],
            dtype=bool,
        )
        check_arrays(results['without'], expected_without)

        # When sequencing, we should remove the diagonal as an option before
        # computing percentiles.  On the last day, we should get the
        # second-largest value, rather than the largest.
        expected_with = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 1, 0]],  # Different from previous!
            dtype=bool,
        )
        check_arrays(results['with'], expected_with)
Example #42
    def test_multiple_rolling_factors(self):

        loader = self.loader
        finder = self.asset_finder
        assets = self.assets
        engine = SimplePipelineEngine(
            lambda column: loader,
            self.dates,
            self.asset_finder,
        )
        shape = num_dates, num_assets = (5, len(assets))
        dates = self.dates[10:10 + num_dates]

        short_factor = RollingSumDifference(window_length=3)
        long_factor = RollingSumDifference(window_length=5)
        high_factor = RollingSumDifference(
            window_length=3,
            inputs=[USEquityPricing.open, USEquityPricing.high],
        )

        pipeline = Pipeline(columns={
            'short': short_factor,
            'long': long_factor,
            'high': high_factor,
        })
        results = engine.run_pipeline(pipeline, dates[0], dates[-1])

        self.assertEqual(set(results.columns), {'short', 'high', 'long'})
        assert_multi_index_is_product(self, results.index, dates,
                                      finder.retrieve_all(assets))

        # row-wise sum over an array whose values are all (1 - 2)
        check_arrays(
            results['short'].unstack().values,
            full(shape, -short_factor.window_length),
        )
        check_arrays(
            results['long'].unstack().values,
            full(shape, -long_factor.window_length),
        )
        # row-wise sum over an array whose values are all (1 - 3)
        check_arrays(
            results['high'].unstack().values,
            full(shape, -2 * high_factor.window_length),
        )
Example #43
 def check_output(self, expr, expected):
     result = expr.compute_from_arrays(
         [self.fake_raw_data[input_] for input_ in expr.inputs],
         self.mask,
     )
     check_arrays(result, expected)
Example #44
 def check_output(self, expr, expected):
     result = expr.compute_from_arrays([self.fake_raw_data[input_] for input_ in expr.inputs], self.mask)
     check_arrays(result, expected)
Example #45
    def test_percentile_between(self):

        quintiles = range(5)
        filter_names = ['pct_' + str(q) for q in quintiles]
        # list, so it can be iterated over more than once below
        iter_quintiles = list(zip(filter_names, quintiles))

        graph = TermGraph({
            name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
            for name, q in zip(filter_names, quintiles)
        })

        # Test with 5 columns and no NaNs.
        eye5 = eye(5, dtype=float64)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye5},
            mask=self.build_mask(ones((5, 5))),
        )
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # There are four 0s and one 1 in each row, so the first 4
                # quintiles should be all the locations with zeros in the input
                # array.
                check_arrays(result, ~eye5.astype(bool))
            else:
                # The top quintile should match the sole 1 in each row.
                check_arrays(result, eye5.astype(bool))

        # Test with 6 columns, no NaNs, and one masked entry per day.
        eye6 = eye(6, dtype=float64)
        mask = array(
            [[1, 1, 1, 1, 1, 0], [0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1],
             [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 1]],
            dtype=bool)

        results = self.run_graph(graph,
                                 initial_workspace={self.f: eye6},
                                 mask=self.build_mask(mask))
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool)))
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))

        # Test with 6 columns, no mask, and one NaN per day.  Should have the
        # same outcome as if we had masked the NaNs.
        # In particular, the NaNs should never pass any filters.
        eye6_withnans = eye6.copy()
        putmask(eye6_withnans, ~mask, nan)
        results = self.run_graph(graph,
                                 initial_workspace={self.f: eye6_withnans},
                                 mask=self.build_mask(ones((6, 6))))
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool)))
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))
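The eye(5) expectations can be reproduced with plain numpy using the same nanpercentile band membership as Example #1 (a sketch, not zipline's implementation):

    import numpy as np

    eye5 = np.eye(5)
    for q in range(5):
        lo = np.nanpercentile(eye5, q * 20.0, axis=1, keepdims=True)
        hi = np.nanpercentile(eye5, (q + 1) * 20.0, axis=1, keepdims=True)
        passed = (lo <= eye5) & (eye5 <= hi)
        # Four 0s per row fill quintiles 0-3; the lone 1 is the top quintile.
        expected = np.eye(5, dtype=bool) if q == 4 else ~np.eye(5, dtype=bool)
        assert (passed == expected).all()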
Example #46
    def test_percentile_between(self):

        quintiles = range(5)
        filter_names = ['pct_' + str(q) for q in quintiles]
        # list, so it can be iterated over more than once below
        iter_quintiles = list(zip(filter_names, quintiles))

        graph = TermGraph(
            {
                name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
                for name, q in zip(filter_names, quintiles)
            }
        )

        # Test with 5 columns and no NaNs.
        eye5 = eye(5, dtype=float64)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye5},
            mask=self.build_mask(ones((5, 5))),
        )
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # There are four 0s and one 1 in each row, so the first 4
                # quintiles should be all the locations with zeros in the input
                # array.
                check_arrays(result, ~eye5.astype(bool))
            else:
                # The top quintile should match the sole 1 in each row.
                check_arrays(result, eye5.astype(bool))

        # Test with 6 columns, no NaNs, and one masked entry per day.
        eye6 = eye(6, dtype=float64)
        mask = array([[1, 1, 1, 1, 1, 0],
                      [0, 1, 1, 1, 1, 1],
                      [1, 0, 1, 1, 1, 1],
                      [1, 1, 0, 1, 1, 1],
                      [1, 1, 1, 0, 1, 1],
                      [1, 1, 1, 1, 0, 1]], dtype=bool)

        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye6},
            mask=self.build_mask(mask)
        )
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool)))
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))

        # Test with 6 columns, no mask, and one NaN per day.  Should have the
        # same outcome as if we had masked the NaNs.
        # In particular, the NaNs should never pass any filters.
        eye6_withnans = eye6.copy()
        putmask(eye6_withnans, ~mask, nan)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye6_withnans},
            mask=self.build_mask(ones((6, 6)))
        )
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool)))
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))