Beispiel #1
0
    def test_percentile_after_mask(self):
        f_input = eye(5)
        g_input = arange(25, dtype=float).reshape(5, 5)
        initial_mask = self.build_mask(ones((5, 5)))

        custom_mask = self.f < 1
        without_mask = self.g.percentile_between(80, 100)
        with_mask = self.g.percentile_between(80, 100, mask=custom_mask)

        graph = TermGraph({
            'custom_mask': custom_mask,
            'without': without_mask,
            'with': with_mask,
        })

        results = self.run_graph(
            graph,
            initial_workspace={
                self.f: f_input,
                self.g: g_input
            },
            mask=initial_mask,
        )

        # First should pass everything but the diagonal.
        check_arrays(results['custom_mask'], ~eye(5, dtype=bool))

        # Second should pass the largest value each day.  Each row is strictly
        # increasing, so we always select the last value.
        expected_without = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]],
            dtype=bool,
        )
        check_arrays(results['without'], expected_without)

        # When sequencing, we should remove the diagonal as an option before
        # computing percentiles.  On the last day, we should get the
        # second-largest value, rather than the largest.
        expected_with = array(
            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1],
             [0, 0, 0, 0, 1], [0, 0, 0, 1, 0]],  # Different from previous!
            dtype=bool,
        )
        check_arrays(results['with'], expected_with)
Beispiel #2
0
    def test_single_factor(self):
        """
        Test dependency resolution for a single factor.
        """
        def check_output(graph):

            resolution_order = list(graph.ordered())

            self.assertEqual(len(resolution_order), 4)
            self.check_dependency_order(resolution_order)
            self.assertIn(AssetExists(), resolution_order)
            self.assertIn(SomeDataSet.foo, resolution_order)
            self.assertIn(SomeDataSet.bar, resolution_order)
            self.assertIn(SomeFactor(), resolution_order)

            self.assertEqual(graph.node[SomeDataSet.foo]['extra_rows'], 4)
            self.assertEqual(graph.node[SomeDataSet.bar]['extra_rows'], 4)

        for foobar in gen_equivalent_factors():
            check_output(TermGraph(to_dict([foobar])))
Beispiel #3
0
    def test_bottom(self):
        counts = 2, 3, 10
        data = self.randn_data(seed=5)  # Arbitrary seed choice.
        results = self.run_graph(
            TermGraph({'bottom_' + str(c): self.f.bottom(c)
                       for c in counts}),
            initial_workspace={self.f: data},
        )
        for c in counts:
            result = results['bottom_' + str(c)]

            # Check that `min(c, num_assets)` assets passed each day.
            passed_per_day = result.sum(axis=1)
            check_arrays(
                passed_per_day,
                full_like(passed_per_day, min(c, data.shape[1])),
            )

            # Check that the bottom `c` assets passed.
            expected = rowwise_rank(data) < c
            check_arrays(result, expected)
Beispiel #4
0
    def test_single_factor_instance_args(self):
        """
        Test dependency resolution for a single factor with arguments passed to
        the constructor.
        """
        bar, buzz = SomeDataSet.bar, SomeDataSet.buzz
        graph = TermGraph(to_dict([SomeFactor([bar, buzz], window_length=5)]))

        resolution_order = list(graph.ordered())

        # SomeFactor, its inputs, and AssetExists()
        self.assertEqual(len(resolution_order), 4)
        self.check_dependency_order(resolution_order)
        self.assertIn(AssetExists(), resolution_order)
        self.assertEqual(graph.extra_rows[AssetExists()], 4)

        self.assertIn(bar, resolution_order)
        self.assertIn(buzz, resolution_order)
        self.assertIn(SomeFactor([bar, buzz], window_length=5),
                      resolution_order)
        self.assertEqual(graph.extra_rows[bar], 4)
        self.assertEqual(graph.extra_rows[buzz], 4)
Beispiel #5
0
    def test_all(self):

        data = array(
            [[1, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1],
             [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 1],
             [1, 1, 1, 1, 1, 0]],
            dtype=bool)

        # With a window_length of N, 0's should be "sticky" for the (N - 1)
        # days after the 0 in the base data.

        # Note that, the way ``self.run_graph`` works, we compute the same
        # number of output rows for all inputs, so we only get the last 4
        # outputs for expected_3 even though we have enought input data to
        # compute 5 rows.
        expected_3 = array([[0, 0, 0, 1, 1, 1], [1, 0, 0, 0, 1, 1],
                            [1, 1, 0, 0, 0, 1], [1, 1, 1, 0, 0, 0]],
                           dtype=bool)

        expected_4 = array([[0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1],
                            [1, 0, 0, 0, 0, 1], [1, 1, 0, 0, 0, 0]],
                           dtype=bool)

        class Input(Filter):
            inputs = ()
            window_length = 0

        results = self.run_graph(
            TermGraph({
                '3': All(inputs=[Input()], window_length=3),
                '4': All(inputs=[Input()], window_length=4),
            }),
            initial_workspace={Input(): data},
            mask=self.build_mask(ones(shape=data.shape)),
        )

        check_arrays(results['3'], expected_3)
        check_arrays(results['4'], expected_4)
Beispiel #6
0
    def test_isnull_datetime_dtype(self):
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[eye(5, dtype=bool)] = NaTns

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))
Beispiel #7
0
    def test_isnull_int_dtype(self, custom_missing_value):
        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        data = arange(25).reshape(5, 5)
        data[eye(5, dtype=bool)] = custom_missing_value

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))
Beispiel #8
0
    def test_percentile_between(self):

        quintiles = range(5)
        filter_names = ['pct_' + str(q) for q in quintiles]
        iter_quintiles = zip(filter_names, quintiles)

        graph = TermGraph({
            name: self.f.percentile_between(q * 20.0, (q + 1) * 20.0)
            for name, q in zip(filter_names, quintiles)
        })

        # Test with 5 columns and no NaNs.
        eye5 = eye(5, dtype=float64)
        results = self.run_graph(
            graph,
            initial_workspace={self.f: eye5},
            mask=self.build_mask(ones((5, 5))),
        )
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # There are four 0s and one 1 in each row, so the first 4
                # quintiles should be all the locations with zeros in the input
                # array.
                check_arrays(result, ~eye5.astype(bool))
            else:
                # The top quintile should match the sole 1 in each row.
                check_arrays(result, eye5.astype(bool))

        # Test with 6 columns, no NaNs, and one masked entry per day.
        eye6 = eye(6, dtype=float64)
        mask = array(
            [[1, 1, 1, 1, 1, 0], [0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1],
             [1, 1, 0, 1, 1, 1], [1, 1, 1, 0, 1, 1], [1, 1, 1, 1, 0, 1]],
            dtype=bool)

        results = self.run_graph(graph,
                                 initial_workspace={self.f: eye6},
                                 mask=self.build_mask(mask))
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool))),
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))

        # Test with 6 columns, no mask, and one NaN per day.  Should have the
        # same outcome as if we had masked the NaNs.
        # In particular, the NaNs should never pass any filters.
        eye6_withnans = eye6.copy()
        putmask(eye6_withnans, ~mask, nan)
        results = self.run_graph(graph,
                                 initial_workspace={self.f: eye6},
                                 mask=self.build_mask(mask))
        for name, quintile in iter_quintiles:
            result = results[name]
            if quintile < 4:
                # Should keep all values that were 0 in the base data and were
                # 1 in the mask.
                check_arrays(result, mask & (~eye6.astype(bool))),
            else:
                # Should keep all the 1s in the base data.
                check_arrays(result, eye6.astype(bool))
Beispiel #9
0
    def test_normalizations_hand_computed(self):
        """
        Test the hand-computed example in factor.demean.
        """
        f = self.f
        m = Mask()
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        factor_data = array([[1.0, 2.0, 3.0, 4.0], [1.5, 2.5, 3.5, 1.0],
                             [2.0, 3.0, 4.0, 1.5], [2.5, 3.5, 1.0, 2.0]], )
        filter_data = array(
            [[False, True, True, True], [True, False, True, True],
             [True, True, False, True], [True, True, True, False]],
            dtype=bool,
        )
        classifier_data = array(
            [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
            dtype=int64_dtype,
        )
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        terms = {
            'vanilla': f.demean(),
            'masked': f.demean(mask=m),
            'grouped': f.demean(groupby=c),
            'grouped_str': f.demean(groupby=str_c),
            'grouped_masked': f.demean(mask=m, groupby=c),
            'grouped_masked_str': f.demean(mask=m, groupby=str_c),
        }
        expected = {
            'vanilla':
            array([[-1.500, -0.500, 0.500, 1.500],
                   [-0.625, 0.375, 1.375, -1.125],
                   [-0.625, 0.375, 1.375, -1.125],
                   [0.250, 1.250, -1.250, -0.250]], ),
            'masked':
            array(
                [[nan, -1.000, 0.000, 1.000], [-0.500, nan, 1.500, -1.000],
                 [-0.166, 0.833, nan, -0.666], [0.166, 1.166, -1.333, nan]], ),
            'grouped':
            array([[-0.500, 0.500, -0.500, 0.500],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, -0.500, 0.500]], ),
            'grouped_masked':
            array([[nan, 0.000, -0.500, 0.500], [0.000, nan, 1.250, -1.250],
                   [-0.500, 0.500, nan, 0.000], [-0.500, 0.500, 0.000, nan]])
        }
        # Changing the classifier dtype shouldn't affect anything.
        expected['grouped_str'] = expected['grouped']
        expected['grouped_masked_str'] = expected['grouped_masked']

        graph = TermGraph(terms)
        results = self.run_graph(
            graph,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                str_c: string_classifier_data,
                m: filter_data,
            },
            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        )

        for key, (res, exp) in dzip_exact(results, expected).items():
            check_allclose(
                res,
                exp,
                # The hand-computed values aren't very precise (in particular,
                # we truncate repeating decimals at 3 places) This is just
                # asserting that the example isn't misleading by being totally
                # wrong.
                atol=0.001,
                err_msg="Mismatch for %r" % key)
Beispiel #10
0
    def test_normalizations(self, seed_value, normalizer_name_and_func,
                            add_nulls_to_factor):

        name, func = normalizer_name_and_func

        shape = (7, 7)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
        # Falses on main diagonal.
        eyemask = self.eye_mask(shape=shape)
        # Falses on other diagonal.
        eyemask_T = eyemask.T
        # Falses on both diagonals.
        xmask = eyemask & eyemask_T

        # Block of random data.
        factor_data = self.randn_data(seed=seed_value, shape=shape)
        if add_nulls_to_factor:
            factor_data = where(eyemask, factor_data, nan)

        # Cycles of 0, 1, 2, 0, 1, 2, ...
        classifier_data = (
            (self.arange_data(shape=shape, dtype=int) + seed_value) % 3)
        # With -1s on main diagonal.
        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
        # With -1s on opposite diagonal.
        classifier_data_eyenulls_T = where(eyemask_T, classifier_data, -1)
        # With -1s on both diagonals.
        classifier_data_xnulls = where(xmask, classifier_data, -1)

        f = self.f
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
        method = getattr(f, name)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
            'grouped': method(groupby=c),
            'grouped_with_nulls': method(groupby=c_with_nulls),
            'both': method(mask=m, groupby=c),
            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
        }

        expected = {
            'vanilla':
            apply_along_axis(
                func,
                1,
                factor_data,
            ),
            'masked':
            where(
                eyemask,
                grouped_apply(factor_data, eyemask, func),
                nan,
            ),
            'grouped':
            grouped_apply(
                factor_data,
                classifier_data,
                func,
            ),
            # If the classifier has nulls, we should get NaNs in the
            # corresponding locations in the output.
            'grouped_with_nulls':
            where(
                eyemask_T,
                grouped_apply(factor_data, classifier_data_eyenulls_T, func),
                nan,
            ),
            # Passing a mask with a classifier should behave as though the
            # classifier had nulls where the mask was False.
            'both':
            where(
                eyemask,
                grouped_apply(
                    factor_data,
                    classifier_data_eyenulls,
                    func,
                ),
                nan,
            ),
            'both_with_nulls':
            where(
                xmask,
                grouped_apply(
                    factor_data,
                    classifier_data_xnulls,
                    func,
                ),
                nan,
            )
        }

        graph = TermGraph(terms)
        results = self.run_graph(
            graph,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                c_with_nulls: classifier_data_eyenulls_T,
                Mask(): eyemask,
            },
            mask=self.build_mask(nomask),
        )

        for key in expected:
            check_arrays(expected[key], results[key])