Ejemplo n.º 1
0
    def test_reuse_atomic_terms(self):
        """
        Check that a raw input shared by two factors is scheduled only once.

        ``SomeDataSet.bar`` feeds both factors, so the resolution order is
        expected to hold six terms, each input ahead of the factor using it.
        """
        foo = SomeDataSet.foo
        bar = SomeDataSet.bar
        buzz = SomeDataSet.buzz
        f1 = SomeFactor([foo, bar])
        f2 = SomeOtherFactor([bar, buzz])

        ordering = list(TermGraph(to_dict([f1, f2])).ordered())

        # Six entries means the shared ``bar`` input was not duplicated.
        self.assertEqual(len(ordering), 6)

        # Position of the first occurrence of every term in the ordering.
        position = {}
        for term in ordering:
            position[term] = ordering.index(term)

        self.assertEqual(position[AssetExists()], 0)

        # Every factor has to be computed after each of its inputs.
        for factor, dependencies in ((f1, (foo, bar)), (f2, (bar, buzz))):
            for dependency in dependencies:
                self.assertLess(position[dependency], position[factor])
Ejemplo n.º 2
0
    def test_single_factor_instance_args(self):
        """
        Check dependency resolution for one factor whose inputs and window
        length are passed to the constructor.
        """
        bar = SomeDataSet.bar
        buzz = SomeDataSet.buzz
        factor = SomeFactor([bar, buzz], window_length=5)
        graph = TermGraph(to_dict([factor]))

        ordering = list(graph.ordered())

        # Expect AssetExists(), the two inputs, and the factor itself.
        self.assertEqual(len(ordering), 4)
        first, second, third, last = ordering

        self.assertIs(first, AssetExists())
        self.assertEqual(graph.extra_rows[AssetExists()], 4)

        # The two inputs may come in either order; the factor comes last and
        # must compare equal to a freshly constructed equivalent factor.
        self.assertEqual({second, third}, {bar, buzz})
        self.assertEqual(last, SomeFactor([bar, buzz], window_length=5))

        for column in (bar, buzz):
            self.assertEqual(graph.extra_rows[column], 4)
Ejemplo n.º 3
0
    def test_reuse_atomic_terms(self):
        """
        Check that raw inputs are not duplicated in the dependency graph.

        Both factors read ``SomeDataSet.bar``; the resolution order should
        still contain only six terms, with inputs preceding their factors.
        """
        foo, bar, buzz = SomeDataSet.foo, SomeDataSet.bar, SomeDataSet.buzz
        f1 = SomeFactor([foo, bar])
        f2 = SomeOtherFactor([bar, buzz])
        graph = TermGraph(to_dict([f1, f2]))

        ordering = list(graph.ordered())

        # A seventh entry would mean ``bar`` had been added twice.
        self.assertEqual(len(ordering), 6)

        # Look up where each term first appears in the ordering.
        slots = {}
        for term in ordering:
            slots[term] = ordering.index(term)

        self.assertEqual(slots[AssetExists()], 0)

        # f1 has to wait for foo and bar.
        for needed in (foo, bar):
            self.assertLess(slots[needed], slots[f1])

        # f2 has to wait for bar and buzz.
        for needed in (bar, buzz):
            self.assertLess(slots[needed], slots[f2])
Ejemplo n.º 4
0
    def test_single_factor_instance_args(self):
        """
        Check the resolution order and extra rows for a single factor built
        with explicit constructor arguments.
        """
        bar, buzz = SomeDataSet.bar, SomeDataSet.buzz
        terms = to_dict([SomeFactor([bar, buzz], window_length=5)])
        graph = TermGraph(terms)

        ordering = list(graph.ordered())

        # One entry each for AssetExists(), bar, buzz and the factor.
        self.assertEqual(len(ordering), 4)

        root = AssetExists()
        self.assertIs(ordering[0], root)
        self.assertEqual(graph.extra_rows[AssetExists()], 4)

        # The inputs occupy the two middle slots, in either order.
        middle = {ordering[1], ordering[2]}
        self.assertEqual(middle, {bar, buzz})

        # The factor is resolved last and equals a freshly built twin.
        rebuilt = SomeFactor([bar, buzz], window_length=5)
        self.assertEqual(ordering[-1], rebuilt)

        self.assertEqual(graph.extra_rows[bar], 4)
        self.assertEqual(graph.extra_rows[buzz], 4)
Ejemplo n.º 5
0
    def test_reuse_loadable_terms(self):
        """
        Check that a raw input shared by two factors appears only once in the
        dependency graph.
        """
        factors = [
            SomeFactor([SomeDataSet.foo, SomeDataSet.bar]),
            SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz]),
        ]

        ordering = list(TermGraph(to_dict(factors)).ordered())

        # ``bar`` is shared, so there are six entries and all are distinct.
        self.assertEqual(len(ordering), 6)
        distinct_terms = set(ordering)
        self.assertEqual(len(distinct_terms), 6)

        self.check_dependency_order(ordering)
Ejemplo n.º 6
0
    def test_reuse_atomic_terms(self):
        """
        Check that raw inputs are listed only once in the dependency graph,
        even when more than one factor consumes them.
        """
        first_factor = SomeFactor([SomeDataSet.foo, SomeDataSet.bar])
        second_factor = SomeOtherFactor([SomeDataSet.bar, SomeDataSet.buzz])
        terms = to_dict([first_factor, second_factor])

        ordering = list(TermGraph(terms).ordered())

        # Six entries, all different: ``bar`` was not added twice.
        for observed in (len(ordering), len(set(ordering))):
            self.assertEqual(observed, 6)

        self.check_dependency_order(ordering)
Ejemplo n.º 7
0
    def test_bollinger_bands(self, window_length, k, mask_sid):
        """
        Run a BollingerBands term over ``self.closes(mask_sid)`` and compare
        its upper, middle and lower outputs with ``self.expected``.
        """
        closes = self.closes(mask_sid)
        bands = BollingerBands(window_length=window_length, k=k)

        # Seed the close-price column with an AdjustedArray built from the
        # closes, an all-True boolean array, an empty dict and NaN.
        all_true = np.full_like(closes, True, dtype=bool)
        workspace = {
            USEquityPricing.close: AdjustedArray(closes, all_true, {}, np.nan),
        }

        outputs = self.run_graph(
            TermGraph({'f': bands}),
            initial_workspace=workspace,
            mask_sid=mask_sid,
        )
        result = outputs['f']

        expected_upper, expected_middle, expected_lower = self.expected(
            window_length, k, closes,
        )

        assert_equal(result.upper, expected_upper)
        assert_equal(result.middle, expected_middle)
        assert_equal(result.lower, expected_lower)
Ejemplo n.º 8
0
    def test_percentile_nasty_partitions(self):
        # Percentiles over an awkward partition: 5 assets split into
        # quartiles.  No clean mathematical definition of "correct" exists
        # for this case, so the test pins the behavior of
        # numpy.nanpercentile.  It mainly serves as a regression test should
        # a specialized percentile calculation be written later.
        data = arange(25, dtype=float).reshape(5, 5) % 4

        # (filter name, lower bound, upper bound) for each quartile.
        bounds = [
            ('pct_' + str(q), q * 25.0, (q + 1) * 25.0)
            for q in range(4)
        ]

        terms = {}
        for name, lower, upper in bounds:
            terms[name] = self.f.percentile_between(lower, upper)

        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={self.f: data},
            mask=self.build_mask(ones((5, 5))),
        )

        for name, lower, upper in bounds:
            # Row-wise cutoffs, kept 2-D so they broadcast against ``data``.
            lower_cutoff = nanpercentile(data, lower, axis=1, keepdims=True)
            upper_cutoff = nanpercentile(data, upper, axis=1, keepdims=True)
            expected = and_(lower_cutoff <= data, data <= upper_cutoff)
            check_arrays(results[name], expected)
Ejemplo n.º 9
0
    def test_window_safe(self, factor_len):
        """
        Feed an all-True filter into a windowed CustomFactor that sums its
        input column-wise, and expect every output value to be ``factor_len``.
        """
        class InputFilter(Filter):
            inputs = ()
            window_length = 0

        class TestFactor(CustomFactor):
            dtype = float64_dtype
            inputs = (InputFilter(), )
            window_length = factor_len

            def compute(self, today, assets, out, filter_):
                # Add up each column of the window.
                out[:] = np_sum(filter_, axis=0)

        # All-True data set shaped (days, securities).
        all_true = full(self.default_shape, True, dtype=bool)

        results = self.run_graph(
            TermGraph({'windowsafe': TestFactor()}),
            initial_workspace={InputFilter(): all_true},
        )

        # The output has (days - factor_len + 1) rows and one column per
        # security.
        num_days = self.default_shape[0]
        num_securities = self.default_shape[1]
        expected = full(
            (num_days - factor_len + 1, num_securities),
            factor_len,
            dtype=float64,
        )
        check_arrays(results['windowsafe'], expected)
Ejemplo n.º 10
0
    def test_at_least_N(self):
        # With a window_length of K, AtLeastN should return 1 if N or more
        # 1's exist in the lookback window.  This makes it a smoothing filter
        # with customizable "stickiness".
        data = array(
            [[1, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 0],
             [1, 1, 1, 1, 0, 0],
             [1, 1, 1, 0, 0, 0],
             [1, 1, 0, 0, 0, 0],
             [1, 0, 0, 0, 0, 0]],
            dtype=bool,
        )

        # Expected output for a window of 4, keyed by the required count N.
        expected_by_n = {
            1: array(
                [[1, 1, 1, 1, 1, 1],
                 [1, 1, 1, 1, 1, 1],
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 0, 0]],
                dtype=bool,
            ),
            2: array(
                [[1, 1, 1, 1, 1, 1],
                 [1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 0, 0],
                 [1, 1, 1, 0, 0, 0]],
                dtype=bool,
            ),
            3: array(
                [[1, 1, 1, 1, 1, 0],
                 [1, 1, 1, 1, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0]],
                dtype=bool,
            ),
            4: array(
                [[1, 1, 1, 1, 0, 0],
                 [1, 1, 1, 0, 0, 0],
                 [1, 1, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0]],
                dtype=bool,
            ),
        }

        class Input(Filter):
            inputs = ()
            window_length = 0

        def at_least(n):
            return AtLeastN(inputs=[Input()], window_length=4, N=n)

        terms = {
            'AllButOne': at_least(3),
            'AllButTwo': at_least(2),
            'AnyEquiv': at_least(1),
            'AllEquiv': at_least(4),
            'Any': Any(inputs=[Input()], window_length=4),
            'All': All(inputs=[Input()], window_length=4)
        }

        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={Input(): data},
            mask=self.build_mask(ones(shape=data.shape)),
        )

        # Any/All are checked against the same arrays as AtLeastN with
        # N=1 and N=4 respectively.
        checks = (
            ('Any', 1),
            ('AnyEquiv', 1),
            ('AllButTwo', 2),
            ('AllButOne', 3),
            ('All', 4),
            ('AllEquiv', 4),
        )
        for term_name, n in checks:
            check_arrays(results[term_name], expected_by_n[n])
Ejemplo n.º 11
0
 def check_terms(self, terms, expected, initial_workspace, mask):
     """
     Build a TermGraph from ``terms``, run it with ``initial_workspace`` and
     ``mask``, and check each result array against ``expected``.
     """
     results = self.run_graph(TermGraph(terms), initial_workspace, mask)
     # dzip_exact pairs up the result and expectation stored under each key.
     paired = dzip_exact(results, expected)
     for _, (actual, wanted) in paired.items():
         check_arrays(actual, wanted)
Ejemplo n.º 12
0
 def check(terms):
     """
     Run ``terms`` against ``data`` under an all-ones 5x5 mask and compare
     each output with its entry in ``expected_ranks``.
     """
     outputs = self.run_graph(
         TermGraph(terms),
         initial_workspace={self.f: data},
         mask=self.build_mask(ones((5, 5))),
     )
     for name in terms:
         check_arrays(outputs[name], expected_ranks[name])
Ejemplo n.º 13
0
    def test_notnan(self):
        """
        ``notnan()`` is expected to be False exactly on the main diagonal,
        where NaNs are written into the input.
        """
        data = self.randn_data(seed=10)
        nan_locations = eye(*data.shape, dtype=bool)
        data[nan_locations] = nan

        graph = TermGraph({'notnan': self.f.notnan()})
        results = self.run_graph(graph, initial_workspace={self.f: data})

        check_arrays(results['notnan'], ~nan_locations)
Ejemplo n.º 14
0
    def test_isfinite(self):
        """
        ``isfinite()`` is expected to match ``numpy.isfinite`` on data holding
        NaN, +inf and -inf columns.
        """
        data = self.randn_data(seed=10)
        # Overwrite whole columns with each kind of non-finite value.
        for column, value in ((0, nan), (2, inf), (4, -inf)):
            data[:, column] = value

        graph = TermGraph({'isfinite': self.f.isfinite()})
        results = self.run_graph(graph, initial_workspace={self.f: data})

        check_arrays(results['isfinite'], isfinite(data))
Ejemplo n.º 15
0
    def test_any(self):

        # FUN FACT: The inputs and outputs here are exactly the negation of
        # the inputs and outputs for test_all. This isn't a coincidence.
        #
        # By de Morgan's Laws, we have::
        #
        #     ~(a & b) == (~a | ~b)
        #
        # negating both sides, we have::
        #
        #      (a & b) == ~(~a | ~b)
        #
        # Since all(a, b) is isomorphic to (a & b), and any(a, b) is isomorphic
        # to (a | b), we have::
        #
        #     all(a, b) == ~(any(~a, ~b))
        #
        data = array(
            [[0, 0, 0, 0, 0, 0],
             [1, 0, 0, 0, 0, 0],
             [0, 1, 0, 0, 0, 0],
             [0, 0, 1, 0, 0, 0],
             [0, 0, 0, 1, 0, 0],
             [0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 1]],
            dtype=bool,
        )

        # With a window_length of N, 1's should be "sticky" for the (N - 1)
        # days after the 1 in the base data.

        # Because of the way ``self.run_graph`` works, the same number of
        # output rows is computed for all inputs.  The window-3 expectation
        # therefore holds only the last 4 rows, even though there is enough
        # input data to compute 5.
        expected_by_window = {
            '3': array(
                [[1, 1, 1, 0, 0, 0],
                 [0, 1, 1, 1, 0, 0],
                 [0, 0, 1, 1, 1, 0],
                 [0, 0, 0, 1, 1, 1]],
                dtype=bool,
            ),
            '4': array(
                [[1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 1, 0, 0],
                 [0, 1, 1, 1, 1, 0],
                 [0, 0, 1, 1, 1, 1]],
                dtype=bool,
            ),
        }

        class Input(Filter):
            inputs = ()
            window_length = 0

        terms = {
            '3': Any(inputs=[Input()], window_length=3),
            '4': Any(inputs=[Input()], window_length=4),
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={Input(): data},
            mask=self.build_mask(ones(shape=data.shape)),
        )

        for window in ('3', '4'):
            check_arrays(results[window], expected_by_window[window])
Ejemplo n.º 16
0
    def test_top_and_bottom(self):
        """
        Check ``top``/``bottom`` filters for several counts, both with and
        without a mask that excludes the first column.
        """
        data = self.randn_data(seed=5)  # Fixed seed keeps this deterministic.

        # The mask drops column 0; the NaN-ed copy is used to build the
        # expected results for the masked terms.
        mask_data = ones_like(data, dtype=bool)
        mask_data[:, 0] = False
        nan_data = data.copy()
        nan_data[:, 0] = nan

        mask = Mask()
        workspace = {self.f: data, mask: mask_data}

        def termname(method, count, masked):
            suffix = 'mask' if masked else ''
            return '_'.join([method, str(count), suffix])

        # Every combination of top/bottom, count, and mask/no_mask.
        term_combos = list(
            product(['top', 'bottom'], (2, 3, 10), [True, False])
        )

        terms = {}
        for method, count, masked in term_combos:
            make_filter = getattr(self.f, method)
            if masked:
                term = make_filter(N=count, mask=mask)
            else:
                term = make_filter(N=count)
            terms[termname(method, count, masked)] = term

        results = self.run_graph(TermGraph(terms), initial_workspace=workspace)

        def expected_result(method, count, masked):
            # Ranking with a mask is equivalent to ranking with nans applied
            # on the masked values.
            if masked:
                to_rank = nan_data
            else:
                to_rank = data

            if method == 'top':
                return rowwise_rank(-to_rank) < count
            if method == 'bottom':
                return rowwise_rank(to_rank) < count
            return None

        num_assets = data.shape[1]
        for method, count, masked in term_combos:
            result = results[termname(method, count, masked)]

            # Check that `min(count, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            expected_passed = full_like(passed_per_day, min(count, num_assets))
            check_arrays(passed_per_day, expected_passed)

            check_arrays(result, expected_result(method, count, masked))
Ejemplo n.º 17
0
        def check(terms):
            """
            Run ``terms`` with the factor and both classifier inputs under an
            all-ones 5x5 mask, then compare each output with its entry in
            ``expected_grouped_ranks``.
            """
            workspace = {
                f: data,
                c: classifier_data,
                str_c: string_classifier_data,
            }
            outputs = self.run_graph(
                TermGraph(terms),
                initial_workspace=workspace,
                mask=self.build_mask(ones((5, 5))),
            )

            for name in terms:
                check_arrays(outputs[name], expected_grouped_ranks[name])
Ejemplo n.º 18
0
    def test_percentile_after_mask(self):
        """
        Check that a custom mask is applied before percentiles are computed.
        """
        f_input = eye(5)
        g_input = arange(25, dtype=float).reshape(5, 5)

        custom_mask = self.f < 1
        terms = {
            'custom_mask': custom_mask,
            'without': self.g.percentile_between(80, 100),
            'with': self.g.percentile_between(80, 100, mask=custom_mask),
        }

        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={self.f: f_input, self.g: g_input},
            mask=self.build_mask(ones((5, 5))),
        )

        # The custom mask should pass everything but the diagonal.
        check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

        # Without the mask, the largest value passes each day.  Every row is
        # strictly increasing, so that is always the last column.
        last_column = [0, 0, 0, 0, 1]
        expected_without = array([last_column] * 5, dtype=bool)
        check_arrays(results['without'], expected_without)

        # With the mask, the diagonal is removed as an option before the
        # percentiles are computed.  On the last day the second-largest value
        # is therefore selected instead of the largest.
        second_to_last_column = [0, 0, 0, 1, 0]
        expected_with = array(
            [last_column] * 4 + [second_to_last_column],
            dtype=bool,
        )
        check_arrays(results['with'], expected_with)
Ejemplo n.º 19
0
    def test_isnan(self):
        """
        ``isnan()`` and ``isnull()`` are both expected to be True exactly on
        the main diagonal, where NaNs are written into the input.
        """
        data = self.randn_data(seed=10)
        nan_locations = eye(*data.shape, dtype=bool)
        data[nan_locations] = nan

        terms = {
            'isnan': self.f.isnan(),
            'isnull': self.f.isnull(),
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={self.f: data},
        )

        for name in ('isnan', 'isnull'):
            check_arrays(results[name], nan_locations)
Ejemplo n.º 20
0
    def test_rank_after_mask(self, name, factor_dtype):
        """
        Check ascending and descending ranks, with and without a mask that
        excludes the main diagonal, against hand-written expectations.
        """
        f = F(dtype=factor_dtype)

        # Same values as arange(25).reshape(5, 5).transpose() % 4
        data = array(
            [[0, 1, 2, 3, 0],
             [1, 2, 3, 0, 1],
             [2, 3, 0, 1, 2],
             [3, 0, 1, 2, 3],
             [0, 1, 2, 3, 0]],
            dtype=factor_dtype,
        )
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {f: data, Mask(): mask_data}

        graph = TermGraph({
            "ascending_nomask": f.rank(ascending=True),
            "ascending_mask": f.rank(ascending=True, mask=Mask()),
            "descending_nomask": f.rank(ascending=False),
            "descending_mask": f.rank(ascending=False, mask=Mask()),
        })

        expected = {
            "ascending_nomask": array(
                [[1.0, 3.0, 4.0, 5.0, 2.0],
                 [2.0, 4.0, 5.0, 1.0, 3.0],
                 [3.0, 5.0, 1.0, 2.0, 4.0],
                 [4.0, 1.0, 2.0, 3.0, 5.0],
                 [1.0, 3.0, 4.0, 5.0, 2.0]],
            ),
            "descending_nomask": array(
                [[4.0, 3.0, 2.0, 1.0, 5.0],
                 [3.0, 2.0, 1.0, 5.0, 4.0],
                 [2.0, 1.0, 5.0, 4.0, 3.0],
                 [1.0, 5.0, 4.0, 3.0, 2.0],
                 [4.0, 3.0, 2.0, 1.0, 5.0]],
            ),
            # With the mask, the diagonal should be all nans, and anything
            # ranked after the diagonal entry in the unmasked calculation
            # should move down by 1.
            "ascending_mask": array(
                [[nan, 2.0, 3.0, 4.0, 1.0],
                 [2.0, nan, 4.0, 1.0, 3.0],
                 [2.0, 4.0, nan, 1.0, 3.0],
                 [3.0, 1.0, 2.0, nan, 4.0],
                 [1.0, 2.0, 3.0, 4.0, nan]],
            ),
            "descending_mask": array(
                [[nan, 3.0, 2.0, 1.0, 4.0],
                 [2.0, nan, 1.0, 4.0, 3.0],
                 [2.0, 1.0, nan, 4.0, 3.0],
                 [1.0, 4.0, 3.0, nan, 2.0],
                 [4.0, 3.0, 2.0, 1.0, nan]],
            ),
        }

        results = self.run_graph(
            graph,
            initial_workspace,
            mask=self.build_mask(ones((5, 5))),
        )
        for term_name in results:
            check_arrays(expected[term_name], results[term_name])
Ejemplo n.º 21
0
    def test_single_factor(self):
        """
        Check dependency resolution for a single factor, once for every
        factor yielded by ``gen_equivalent_factors``.
        """
        def check_output(graph):
            ordering = list(graph.ordered())

            self.assertEqual(len(ordering), 4)
            self.check_dependency_order(ordering)

            # All four expected terms must be present.
            expected_terms = (
                AssetExists(),
                SomeDataSet.foo,
                SomeDataSet.bar,
                SomeFactor(),
            )
            for term in expected_terms:
                self.assertIn(term, ordering)

            # Both raw inputs are expected to carry 4 extra rows.
            for column in (SomeDataSet.foo, SomeDataSet.bar):
                self.assertEqual(graph.node[column]['extra_rows'], 4)

        for factor in gen_equivalent_factors():
            graph = TermGraph(to_dict([factor]))
            check_output(graph)
Ejemplo n.º 22
0
    def test_bottom(self):
        """
        Check ``bottom(c)`` for several counts against a row-wise rank of
        the input data.
        """
        counts = 2, 3, 10
        data = self.randn_data(seed=5)  # Arbitrary seed choice.

        terms = {}
        for c in counts:
            terms['bottom_' + str(c)] = self.f.bottom(c)

        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={self.f: data},
        )

        num_assets = data.shape[1]
        for c in counts:
            result = results['bottom_' + str(c)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            expected_passed = full_like(passed_per_day, min(c, num_assets))
            check_arrays(passed_per_day, expected_passed)

            # Check that the bottom `c` assets passed.
            check_arrays(result, rowwise_rank(data) < c)
Ejemplo n.º 23
0
    def test_all(self):

        data = array(
            [[1, 1, 1, 1, 1, 1],
             [0, 1, 1, 1, 1, 1],
             [1, 0, 1, 1, 1, 1],
             [1, 1, 0, 1, 1, 1],
             [1, 1, 1, 0, 1, 1],
             [1, 1, 1, 1, 0, 1],
             [1, 1, 1, 1, 1, 0]],
            dtype=bool,
        )

        # With a window_length of N, 0's should be "sticky" for the (N - 1)
        # days after the 0 in the base data.

        # Because of the way ``self.run_graph`` works, the same number of
        # output rows is computed for all inputs.  The window-3 expectation
        # therefore holds only the last 4 rows, even though there is enough
        # input data to compute 5.
        expected_by_window = {
            '3': array(
                [[0, 0, 0, 1, 1, 1],
                 [1, 0, 0, 0, 1, 1],
                 [1, 1, 0, 0, 0, 1],
                 [1, 1, 1, 0, 0, 0]],
                dtype=bool,
            ),
            '4': array(
                [[0, 0, 0, 1, 1, 1],
                 [0, 0, 0, 0, 1, 1],
                 [1, 0, 0, 0, 0, 1],
                 [1, 1, 0, 0, 0, 0]],
                dtype=bool,
            ),
        }

        class Input(Filter):
            inputs = ()
            window_length = 0

        terms = {
            '3': All(inputs=[Input()], window_length=3),
            '4': All(inputs=[Input()], window_length=4),
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={Input(): data},
            mask=self.build_mask(ones(shape=data.shape)),
        )

        for window in ('3', '4'):
            check_arrays(results[window], expected_by_window[window])
Ejemplo n.º 24
0
    def test_isnull_datetime_dtype(self):
        """
        ``isnull``/``notnull`` on a datetime64[ns] factor are expected to
        flag exactly the entries that were set to ``NaTns``.
        """
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        # NaTns on the main diagonal, arange-derived values elsewhere.
        nat_locations = eye(5, dtype=bool)
        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[nat_locations] = NaTns

        terms = {
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )

        check_arrays(results['isnull'], nat_locations)
        check_arrays(results['notnull'], ~nat_locations)
Ejemplo n.º 25
0
    def test_isnull_int_dtype(self, custom_missing_value):
        """
        ``isnull``/``notnull`` on an int64 factor are expected to flag exactly
        the entries equal to the factor's custom ``missing_value``.
        """
        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        # The missing value goes on the main diagonal of the input.
        missing_locations = eye(5, dtype=bool)
        data = arange(25).reshape(5, 5)
        data[missing_locations] = custom_missing_value

        terms = {
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )

        check_arrays(results['isnull'], missing_locations)
        check_arrays(results['notnull'], ~missing_locations)
Ejemplo n.º 26
0
    def test_normalizations_hand_computed(self):
        """
        Check ``demean`` against the hand-computed example in factor.demean.
        """
        f = self.f
        m = Mask()
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        factor_data = array(
            [[1.0, 2.0, 3.0, 4.0],
             [1.5, 2.5, 3.5, 1.0],
             [2.0, 3.0, 4.0, 1.5],
             [2.5, 3.5, 1.0, 2.0]],
        )
        # False on the main diagonal, True elsewhere.
        filter_data = array(
            [[False, True, True, True],
             [True, False, True, True],
             [True, True, False, True],
             [True, True, True, False]],
            dtype=bool,
        )
        # The first two columns are labelled 1, the last two 2, every day.
        classifier_data = array([[1, 1, 2, 2]] * 4, dtype=int64_dtype)
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        terms = {
            'vanilla': f.demean(),
            'masked': f.demean(mask=m),
            'grouped': f.demean(groupby=c),
            'grouped_str': f.demean(groupby=str_c),
            'grouped_masked': f.demean(mask=m, groupby=c),
            'grouped_masked_str': f.demean(mask=m, groupby=str_c),
        }

        expected = {
            'vanilla': array(
                [[-1.500, -0.500, 0.500, 1.500],
                 [-0.625, 0.375, 1.375, -1.125],
                 [-0.625, 0.375, 1.375, -1.125],
                 [0.250, 1.250, -1.250, -0.250]],
            ),
            'masked': array(
                [[nan, -1.000, 0.000, 1.000],
                 [-0.500, nan, 1.500, -1.000],
                 [-0.166, 0.833, nan, -0.666],
                 [0.166, 1.166, -1.333, nan]],
            ),
            'grouped': array(
                [[-0.500, 0.500, -0.500, 0.500],
                 [-0.500, 0.500, 1.250, -1.250],
                 [-0.500, 0.500, 1.250, -1.250],
                 [-0.500, 0.500, -0.500, 0.500]],
            ),
            'grouped_masked': array(
                [[nan, 0.000, -0.500, 0.500],
                 [0.000, nan, 1.250, -1.250],
                 [-0.500, 0.500, nan, 0.000],
                 [-0.500, 0.500, 0.000, nan]],
            ),
        }
        # Changing the classifier dtype shouldn't affect anything.
        expected['grouped_str'] = expected['grouped']
        expected['grouped_masked_str'] = expected['grouped_masked']

        workspace = {
            f: factor_data,
            c: classifier_data,
            str_c: string_classifier_data,
            m: filter_data,
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace=workspace,
            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        )

        # The hand-computed values aren't very precise (in particular,
        # repeating decimals are truncated at 3 places), hence the loose
        # tolerance.  This only asserts that the example isn't misleading
        # by being totally wrong.
        for key, (actual, wanted) in dzip_exact(results, expected).items():
            check_allclose(
                actual,
                wanted,
                atol=0.001,
                err_msg="Mismatch for %r" % key,
            )
Ejemplo n.º 27
0
    def test_normalizations(self, seed_value, normalizer_name_and_func,
                            add_nulls_to_factor):
        """
        Compare a normalization method of ``self.f`` with a reference
        function, for every combination of mask and groupby classifier.

        ``normalizer_name_and_func`` is a ``(name, func)`` pair: ``name`` is
        the method looked up on ``self.f`` and ``func`` is the reference used
        to build the expected arrays.
        """
        name, func = normalizer_name_and_func
        shape = (7, 7)

        def nan_outside(keep, values):
            # Keep ``values`` where ``keep`` is True; NaN everywhere else.
            return where(keep, values, nan)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
        # Falses on main diagonal.
        eyemask = self.eye_mask(shape=shape)
        # Falses on other diagonal.
        eyemask_T = eyemask.T
        # Falses on both diagonals.
        xmask = eyemask & eyemask_T

        # Block of random data, optionally with NaNs where eyemask is False.
        factor_data = self.randn_data(seed=seed_value, shape=shape)
        if add_nulls_to_factor:
            factor_data = nan_outside(eyemask, factor_data)

        # Cycles of 0, 1, 2, 0, 1, 2, ...
        raw_labels = self.arange_data(shape=shape, dtype=int)
        classifier_data = (raw_labels + seed_value) % 3
        # With -1s on main diagonal.
        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
        # With -1s on opposite diagonal.
        classifier_data_eyenulls_T = where(eyemask_T, classifier_data, -1)
        # With -1s on both diagonals.
        classifier_data_xnulls = where(xmask, classifier_data, -1)

        f = self.f
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
        method = getattr(f, name)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
            'grouped': method(groupby=c),
            'grouped_with_nulls': method(groupby=c_with_nulls),
            'both': method(mask=m, groupby=c),
            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
        }

        expected = {
            'vanilla': apply_along_axis(func, 1, factor_data),
            'masked': nan_outside(
                eyemask,
                grouped_apply(factor_data, eyemask, func),
            ),
            'grouped': grouped_apply(factor_data, classifier_data, func),
            # If the classifier has nulls, the output should have NaNs in
            # the corresponding locations.
            'grouped_with_nulls': nan_outside(
                eyemask_T,
                grouped_apply(factor_data, classifier_data_eyenulls_T, func),
            ),
            # Passing a mask with a classifier should behave as though the
            # classifier had nulls where the mask was False.
            'both': nan_outside(
                eyemask,
                grouped_apply(factor_data, classifier_data_eyenulls, func),
            ),
            'both_with_nulls': nan_outside(
                xmask,
                grouped_apply(factor_data, classifier_data_xnulls, func),
            ),
        }

        workspace = {
            f: factor_data,
            c: classifier_data,
            c_with_nulls: classifier_data_eyenulls_T,
            Mask(): eyemask,
        }
        results = self.run_graph(
            TermGraph(terms),
            initial_workspace=workspace,
            mask=self.build_mask(nomask),
        )

        for key, wanted in expected.items():
            check_arrays(wanted, results[key])
Ejemplo n.º 28
0
    def test_percentile_between(self):
        """
        Check ``percentile_between`` quintile filters in three scenarios:
        5 columns without NaNs, 6 columns with one masked entry per day, and
        6 columns with one NaN per day and no mask.
        """
        quintiles = range(5)
        filter_names = ['pct_' + str(q) for q in quintiles]
        # Materialize the pairs: they are iterated several times below, and a
        # bare ``zip`` is a one-shot iterator on Python 3, which would leave
        # every loop after the first one silently empty.
        named_quintiles = list(zip(filter_names, quintiles))

        graph = TermGraph({
            name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
            for name, q in named_quintiles
        })

        def check_quintiles(results, expected_lower, expected_top):
            # Quintiles 0-3 share one expectation; quintile 4 has its own.
            for name, quintile in named_quintiles:
                if quintile < 4:
                    check_arrays(results[name], expected_lower)
                else:
                    check_arrays(results[name], expected_top)

        # Test with 5 columns and no NaNs.
        eye5 = eye(5, dtype=float64)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye5},
            mask=self.build_mask(ones((5, 5))),
        )
        # There are four 0s and one 1 in each row, so the first 4 quintiles
        # should be all the locations with zeros in the input array, and the
        # top quintile should match the sole 1 in each row.
        check_quintiles(
            results,
            expected_lower=~eye5.astype(bool),
            expected_top=eye5.astype(bool),
        )

        # Test with 6 columns, no NaNs, and one masked entry per day.
        eye6 = eye(6, dtype=float64)
        mask = array(
            [[1, 1, 1, 1, 1, 0], [0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1],
             [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 1]],
            dtype=bool)

        # The lower quintiles should keep all values that were 0 in the base
        # data and were 1 in the mask; the top quintile should keep all the
        # 1s in the base data.
        expected_lower = mask & (~eye6.astype(bool))
        expected_top = eye6.astype(bool)

        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye6},
            mask=self.build_mask(mask),
        )
        check_quintiles(results, expected_lower, expected_top)

        # Test with 6 columns, no mask, and one NaN per day.  Should have the
        # same outcome as if we had masked the NaNs.
        # In particular, the NaNs should never pass any filters.
        eye6_withnans = eye6.copy()
        putmask(eye6_withnans, ~mask, nan)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye6_withnans},
            mask=self.build_mask(ones((6, 6))),
        )
        check_quintiles(results, expected_lower, expected_top)