Example #1
import numpy as np
from numpy import nanmean, nanstd


def fast_corr(m0, m1):
    """NaN-aware column-wise correlation of m0 with m1, vectorized for speed."""
    nan = np.nan
    isnan = np.isnan
    N, M = m0.shape
    out = np.full(M, nan)
    allowed_missing_count = int(0.25 * N)

    independent = np.where(  # shape: (N, M)
        isnan(m0),
        nan,
        m1,
    )
    ind_residual = independent - nanmean(independent, axis=0)  # shape: (N, M)
    covariances = nanmean(ind_residual * m0, axis=0)  # shape: (M,)

    # corr(x,y) = cov(x,y)/std(x)/std(y)
    std_v = nanstd(m0, axis=0)  # std(X); could reuse ind_residual for a possible speedup
    np.divide(covariances, std_v, out=out)
    std_v = nanstd(m1, axis=0)  # std(Y)
    np.divide(out, std_v, out=out)

    # handle NaNs
    nanlocs = isnan(independent).sum(axis=0) > allowed_missing_count
    out[nanlocs] = nan
    return out
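A minimal usage sketch for the function above; the array shapes, the random inputs, and the NaN placement are illustrative assumptions, not part of the original code:

import numpy as np

rng = np.random.RandomState(0)
m0 = rng.randn(250, 4)      # e.g. 250 observations for 4 columns
m1 = rng.randn(250, 4)      # second variable, aligned column-for-column
m0[:10, 2] = np.nan         # a few missing values are tolerated; columns with
                            # more than 25% missing come back as NaN

print(fast_corr(m0, m1))    # shape (4,): one correlation per column pair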
Example #2
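    # Note: `evaluate` here is numexpr's evaluate; BBSTD, RSI_LOWER, RSI_UPPER,
    # and TREND_FOLLOW are constants defined elsewhere in the surrounding module.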
    def compute(self, today, asset_ids, out, close):
        diffs = np.diff(close, axis=0)
        ups = nanmean(np.clip(diffs, 0, np.inf), axis=0)
        downs = abs(nanmean(np.clip(diffs, -np.inf, 0), axis=0))
        rsi = np.zeros(len(out))
        evaluate(
            "100 - (100 / (1 + (ups / downs)))",
            local_dict={
                'ups': ups,
                'downs': downs
            },
            global_dict={},
            out=rsi,
        )

        difference = BBSTD * nanstd(close, axis=0)
        middle = nanmean(close, axis=0)
        upper = middle + difference
        lower = middle - difference

        for i in range(len(out)):
            out[i] = 0
            if rsi[i] < RSI_LOWER:
                out[i] += RSI_LOWER / rsi[i]
            elif rsi[i] > RSI_UPPER:
                out[i] -= rsi[i] / RSI_UPPER

            prices = close[:, i]
            if prices[-1] < lower[i]:
                out[i] += lower[i] / prices[-1]
            elif prices[-1] > upper[i]:
                out[i] -= prices[-1] / upper[i]

            if TREND_FOLLOW:
                out[i] *= -1
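For readability: the numexpr expression above is equivalent to the plain-NumPy line below (a sketch reusing the ups and downs arrays from the example). numexpr is presumably used because it evaluates the whole expression in a single pass without materializing NumPy's intermediate temporaries.

# Plain-NumPy equivalent of the numexpr RSI expression (sketch).
rsi = 100.0 - (100.0 / (1.0 + ups / downs))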
Example #3
    def get_simple_transform(self,
                             asset,
                             transform_name,
                             dt,
                             data_frequency,
                             bars=None):
        if transform_name == "returns":
            # returns is always calculated over the last 2 days, regardless
            # of the simulation's data frequency.
            hst = self.get_history_window([asset],
                                          dt,
                                          2,
                                          "1d",
                                          "price",
                                          ffill=True)[asset]

            return (hst.iloc[-1] - hst.iloc[0]) / hst.iloc[0]

        if bars is None:
            raise ValueError("bars cannot be None!")

        if data_frequency == "minute":
            freq_str = "1m"
            calculated_bar_count = int(
                self._get_minute_count_for_transform(dt, bars))
        else:
            freq_str = "1d"
            calculated_bar_count = bars

        price_arr = self.get_history_window([asset],
                                            dt,
                                            calculated_bar_count,
                                            freq_str,
                                            "price",
                                            ffill=True)[asset]

        if transform_name == "mavg":
            return nanmean(price_arr)
        elif transform_name == "stddev":
            return nanstd(price_arr, ddof=1)
        elif transform_name == "vwap":
            volume_arr = self.get_history_window([asset],
                                                 dt,
                                                 calculated_bar_count,
                                                 freq_str,
                                                 "volume",
                                                 ffill=True)[asset]

            vol_sum = nansum(volume_arr)

            try:
                ret = nansum(price_arr * volume_arr) / vol_sum
            except ZeroDivisionError:
                ret = np.nan

            return ret
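The vwap branch is the standard volume-weighted average price, sum(price * volume) / sum(volume). A tiny worked sketch with assumed toy data:

import numpy as np

prices = np.array([10.0, 10.5, 11.0])
volumes = np.array([100.0, 200.0, 0.0])

vwap = np.nansum(prices * volumes) / np.nansum(volumes)  # (1000 + 2100) / 300 = 10.33...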
Example #4
    def compute(self, today, assets, out, close, k):
        difference = k * nanstd(close, axis=0)
        out.middle = middle = nanmean(close, axis=0)
        out.upper = middle + difference
        out.lower = middle - difference
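Here out exposes named output slots (middle, upper, lower), so this is a factor with multiple outputs. A standalone sketch of the same Bollinger-band arithmetic, with the window data and k=2 as assumptions:

import numpy as np

close = 10 + np.random.RandomState(1).rand(20, 3)  # 20 bars x 3 assets, toy data
k = 2.0

middle = np.nanmean(close, axis=0)
difference = k * np.nanstd(close, axis=0)
upper = middle + difference
lower = middle - difference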
Example #5
    def compute(self, today, assets, out, returns, annualization_factor):
        out[:] = nanstd(returns, axis=0) * (annualization_factor ** 0.5)
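This scales per-period volatility by the square root of the annualization factor. A sketch with assumed daily toy data, using the conventional 252 trading days:

import numpy as np

daily_returns = np.random.RandomState(2).randn(60, 5) * 0.01  # toy data
annualization_factor = 252

annual_vol = np.nanstd(daily_returns, axis=0) * annualization_factor ** 0.5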
Example #6
def zscore(row):
    return (row - nanmean(row)) / nanstd(row)
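A usage sketch applying the helper above to each row of a 2-D array (toy data; reuses the zscore function from the example):

import numpy as np
from numpy import nanmean, nanstd

data = np.array([[1.0, 2.0, np.nan, 4.0],
                 [10.0, 20.0, 30.0, 40.0]])

# Standardize each row to mean 0 and (population) std 1, ignoring NaNs.
standardized = np.apply_along_axis(zscore, 1, data)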
Example #7
class FactorTestCase(BasePipelineTestCase):
    def init_instance_fixtures(self):
        super(FactorTestCase, self).init_instance_fixtures()
        self.f = F()

    def test_bad_input(self):
        with self.assertRaises(UnknownRankMethod):
            self.f.rank("not a real rank method")

    @parameter_space(method_name=['isnan', 'notnan', 'isfinite'])
    def test_float64_only_ops(self, method_name):
        class NotFloat(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        nf = NotFloat()
        meth = getattr(nf, method_name)
        with self.assertRaises(TypeError):
            meth()

    @parameter_space(custom_missing_value=[-1, 0])
    def test_isnull_int_dtype(self, custom_missing_value):
        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        data = arange(25).reshape(5, 5)
        data[eye(5, dtype=bool)] = custom_missing_value

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    def test_isnull_datetime_dtype(self):
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[eye(5, dtype=bool)] = NaTns

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    @for_each_factor_dtype
    def test_rank_ascending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        expected_ranks = {
            'ordinal':
            array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.],
                   [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.],
                   [1., 3., 4., 5., 2.]]),
            'average':
            array([[1.5, 3., 4., 5., 1.5], [2.5, 4., 5., 1., 2.5],
                   [3.5, 5., 1., 2., 3.5], [4.5, 1., 2., 3., 4.5],
                   [1.5, 3., 4., 5., 1.5]]),
            'min':
            array([[1., 3., 4., 5., 1.], [2., 4., 5., 1., 2.],
                   [3., 5., 1., 2., 3.], [4., 1., 2., 3., 4.],
                   [1., 3., 4., 5., 1.]]),
            'max':
            array([[2., 3., 4., 5., 2.], [3., 4., 5., 1., 3.],
                   [4., 5., 1., 2., 4.], [5., 1., 2., 3., 5.],
                   [2., 3., 4., 5., 2.]]),
            'dense':
            array([[1., 2., 3., 4., 1.], [2., 3., 4., 1., 2.],
                   [3., 4., 1., 2., 3.], [4., 1., 2., 3., 4.],
                   [1., 2., 3., 4., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({meth: f.rank(method=meth) for meth in expected_ranks})
        check({
            meth: f.rank(method=meth, ascending=True)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank()})
        check({'ordinal': f.rank(ascending=True)})

    @for_each_factor_dtype
    def test_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)
        expected_ranks = {
            'ordinal':
            array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.],
                   [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.],
                   [4., 3., 2., 1., 5.]]),
            'average':
            array([[4.5, 3., 2., 1., 4.5], [3.5, 2., 1., 5., 3.5],
                   [2.5, 1., 5., 4., 2.5], [1.5, 5., 4., 3., 1.5],
                   [4.5, 3., 2., 1., 4.5]]),
            'min':
            array([[4., 3., 2., 1., 4.], [3., 2., 1., 5., 3.],
                   [2., 1., 5., 4., 2.], [1., 5., 4., 3., 1.],
                   [4., 3., 2., 1., 4.]]),
            'max':
            array([[5., 3., 2., 1., 5.], [4., 2., 1., 5., 4.],
                   [3., 1., 5., 4., 3.], [2., 5., 4., 3., 2.],
                   [5., 3., 2., 1., 5.]]),
            'dense':
            array([[4., 3., 2., 1., 4.], [3., 2., 1., 4., 3.],
                   [2., 1., 4., 3., 2.], [1., 4., 3., 2., 1.],
                   [4., 3., 2., 1., 4.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({
            meth: f.rank(method=meth, ascending=False)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank(ascending=False)})

    @for_each_factor_dtype
    def test_rank_after_mask(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {f: data, Mask(): mask_data}

        graph = TermGraph({
            "ascending_nomask":
            f.rank(ascending=True),
            "ascending_mask":
            f.rank(ascending=True, mask=Mask()),
            "descending_nomask":
            f.rank(ascending=False),
            "descending_mask":
            f.rank(ascending=False, mask=Mask()),
        })

        expected = {
            "ascending_nomask":
            array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.],
                   [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.],
                   [1., 3., 4., 5., 2.]]),
            "descending_nomask":
            array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.],
                   [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.],
                   [4., 3., 2., 1., 5.]]),
            # Diagonal should be all nans, and anything whose rank was less
            # than the diagonal in the unmasked calc should go down by 1.
            "ascending_mask":
            array([[nan, 2., 3., 4., 1.], [2., nan, 4., 1., 3.],
                   [2., 4., nan, 1., 3.], [3., 1., 2., nan, 4.],
                   [1., 2., 3., 4., nan]]),
            "descending_mask":
            array([[nan, 3., 2., 1., 4.], [2., nan, 1., 4., 3.],
                   [2., 1., nan, 4., 3.], [1., 4., 3., nan, 2.],
                   [4., 3., 2., 1., nan]]),
        }

        results = self.run_graph(
            graph,
            initial_workspace,
            mask=self.build_mask(ones((5, 5))),
        )
        for method in results:
            check_arrays(expected[method], results[method])

    @for_each_factor_dtype
    def test_grouped_rank_ascending(self, name, factor_dtype=float64_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array(
            [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0],
             [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]],
            dtype=int64_dtype)
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal':
            array([[1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.],
                   [2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.],
                   [1., 1., 3., 2., 2.]]),
            'average':
            array([[1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5],
                   [2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5],
                   [1.5, 1., 3., 2., 1.5]]),
            'min':
            array([[1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.],
                   [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 3., 2., 1.]]),
            'max':
            array([[2., 1., 3., 2., 2.], [2., 2., 3., 1., 2.],
                   [3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.],
                   [2., 1., 3., 2., 2.]]),
            'dense':
            array([[1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.],
                   [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 2., 2., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        # Not specifying the ascending param should default to True.
        check({
            meth: f.rank(method=meth, groupby=c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=c, ascending=True)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=True)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c)})
        check({'ordinal': f.rank(groupby=str_c)})
        check({'ordinal': f.rank(groupby=c, ascending=True)})
        check({'ordinal': f.rank(groupby=str_c, ascending=True)})

    @for_each_factor_dtype
    def test_grouped_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array(
            [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0],
             [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]],
            dtype=int64_dtype)

        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal':
            array([[2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.],
                   [1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.],
                   [2., 2., 1., 1., 3.]]),
            'average':
            array([[2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5],
                   [1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5],
                   [2.5, 2., 1., 1., 2.5]]),
            'min':
            array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.],
                   [2., 2., 1., 1., 2.]]),
            'max':
            array([[3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.],
                   [2., 1., 3., 2., 2.], [2., 2., 3., 1., 2.],
                   [3., 2., 1., 1., 3.]]),
            'dense':
            array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.],
                   [2., 2., 1., 1., 2.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        check({
            meth: f.rank(method=meth, groupby=c, ascending=False)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=False)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c, ascending=False)})
        check({'ordinal': f.rank(groupby=str_c, ascending=False)})

    @parameterized.expand([
        # Test cases computed by doing:
        # from numpy.random import seed, randn
        # from talib import RSI
        # seed(seed_value)
        # data = abs(randn(15, 3))
        # expected = [RSI(data[:, i])[-1] for i in range(3)]
        (100, array([41.032913785966, 51.553585468393, 51.022005016446])),
        (101, array([43.506969935466, 46.145367530182, 50.57407044197])),
        (102, array([46.610102205934, 47.646892444315, 52.13182788538])),
    ])
    def test_rsi(self, seed_value, expected):

        rsi = RSI()

        today = datetime64(1, 'ns')
        assets = arange(3)
        out = empty((3, ), dtype=float)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(15, 3))

        rsi.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    @parameterized.expand([
        (100, 15),
        (101, 4),
        (102, 100),
    ])
    def test_returns(self, seed_value, window_length):

        returns = Returns(window_length=window_length)

        today = datetime64(1, 'ns')
        assets = arange(3)
        out = empty((3, ), dtype=float)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(window_length, 3))

        # Calculate the expected returns
        expected = (test_data[-1] - test_data[0]) / test_data[0]

        returns.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    def gen_ranking_cases():
        seeds = range(int(1e4), int(1e5), int(1e4))
        methods = ('ordinal', 'average')
        use_mask_values = (True, False)
        set_missing_values = (True, False)
        ascending_values = (True, False)
        return product(
            seeds,
            methods,
            use_mask_values,
            set_missing_values,
            ascending_values,
        )

    @parameterized.expand(gen_ranking_cases())
    def test_masked_rankdata_2d(self, seed_value, method, use_mask,
                                set_missing, ascending):
        eyemask = ~eye(5, dtype=bool)
        nomask = ones((5, 5), dtype=bool)

        seed(seed_value)
        asfloat = randn(5, 5) * seed_value
        asdatetime = asfloat.copy().view('datetime64[ns]')

        mask = eyemask if use_mask else nomask
        if set_missing:
            asfloat[:, 2] = nan
            asdatetime[:, 2] = NaTns

        float_result = masked_rankdata_2d(
            data=asfloat,
            mask=mask,
            missing_value=nan,
            method=method,
            ascending=True,
        )
        datetime_result = masked_rankdata_2d(
            data=asdatetime,
            mask=mask,
            missing_value=NaTns,
            method=method,
            ascending=True,
        )

        check_arrays(float_result, datetime_result)

    def test_normalizations_hand_computed(self):
        """
        Test the hand-computed example in factor.demean.
        """
        f = self.f
        m = Mask()
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        factor_data = array([[1.0, 2.0, 3.0, 4.0], [1.5, 2.5, 3.5, 1.0],
                             [2.0, 3.0, 4.0, 1.5], [2.5, 3.5, 1.0, 2.0]], )
        filter_data = array(
            [[False, True, True, True], [True, False, True, True],
             [True, True, False, True], [True, True, True, False]],
            dtype=bool,
        )
        classifier_data = array(
            [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
            dtype=int64_dtype,
        )
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        terms = {
            'vanilla': f.demean(),
            'masked': f.demean(mask=m),
            'grouped': f.demean(groupby=c),
            'grouped_str': f.demean(groupby=str_c),
            'grouped_masked': f.demean(mask=m, groupby=c),
            'grouped_masked_str': f.demean(mask=m, groupby=str_c),
        }
        expected = {
            'vanilla':
            array([[-1.500, -0.500, 0.500, 1.500],
                   [-0.625, 0.375, 1.375, -1.125],
                   [-0.625, 0.375, 1.375, -1.125],
                   [0.250, 1.250, -1.250, -0.250]], ),
            'masked':
            array(
                [[nan, -1.000, 0.000, 1.000], [-0.500, nan, 1.500, -1.000],
                 [-0.166, 0.833, nan, -0.666], [0.166, 1.166, -1.333, nan]], ),
            'grouped':
            array([[-0.500, 0.500, -0.500, 0.500],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, -0.500, 0.500]], ),
            'grouped_masked':
            array([[nan, 0.000, -0.500, 0.500], [0.000, nan, 1.250, -1.250],
                   [-0.500, 0.500, nan, 0.000], [-0.500, 0.500, 0.000, nan]])
        }
        # Changing the classifier dtype shouldn't affect anything.
        expected['grouped_str'] = expected['grouped']
        expected['grouped_masked_str'] = expected['grouped_masked']

        graph = TermGraph(terms)
        results = self.run_graph(
            graph,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                str_c: string_classifier_data,
                m: filter_data,
            },
            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        )

        for key, (res, exp) in dzip_exact(results, expected).items():
            check_allclose(
                res,
                exp,
                # The hand-computed values aren't very precise (in particular,
                # we truncate repeating decimals at 3 places). This is just
                # asserting that the example isn't misleading by being totally
                # wrong.
                atol=0.001,
                err_msg="Mismatch for %r" % key)

    @parameter_space(
        seed_value=range(1, 2),
        normalizer_name_and_func=[
            ('demean', lambda row: row - nanmean(row)),
            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
        ],
        add_nulls_to_factor=(
            False,
            True,
        ),
    )
    def test_normalizations_randomized(self, seed_value,
                                       normalizer_name_and_func,
                                       add_nulls_to_factor):

        name, func = normalizer_name_and_func

        shape = (7, 7)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
        # Falses on main diagonal.
        eyemask = self.eye_mask(shape=shape)
        # Falses on other diagonal.
        eyemask90 = rot90(eyemask)
        # Falses on both diagonals.
        xmask = eyemask & eyemask90

        # Block of random data.
        factor_data = self.randn_data(seed=seed_value, shape=shape)
        if add_nulls_to_factor:
            factor_data = where(eyemask, factor_data, nan)

        # Cycles of 0, 1, 2, 0, 1, 2, ...
        classifier_data = (
            (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) %
            3)
        # With -1s on main diagonal.
        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
        # With -1s on opposite diagonal.
        classifier_data_eyenulls90 = where(eyemask90, classifier_data, -1)
        # With -1s on both diagonals.
        classifier_data_xnulls = where(xmask, classifier_data, -1)

        f = self.f
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
        method = getattr(f, name)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
            'grouped': method(groupby=c),
            'grouped_with_nulls': method(groupby=c_with_nulls),
            'both': method(mask=m, groupby=c),
            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
        }

        expected = {
            'vanilla':
            apply_along_axis(
                func,
                1,
                factor_data,
            ),
            'masked':
            where(
                eyemask,
                grouped_apply(factor_data, eyemask, func),
                nan,
            ),
            'grouped':
            grouped_apply(
                factor_data,
                classifier_data,
                func,
            ),
            # If the classifier has nulls, we should get NaNs in the
            # corresponding locations in the output.
            'grouped_with_nulls':
            where(
                eyemask90,
                grouped_apply(factor_data, classifier_data_eyenulls90, func),
                nan,
            ),
            # Passing a mask with a classifier should behave as though the
            # classifier had nulls where the mask was False.
            'both':
            where(
                eyemask,
                grouped_apply(
                    factor_data,
                    classifier_data_eyenulls,
                    func,
                ),
                nan,
            ),
            'both_with_nulls':
            where(
                xmask,
                grouped_apply(
                    factor_data,
                    classifier_data_xnulls,
                    func,
                ),
                nan,
            )
        }

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                c_with_nulls: classifier_data_eyenulls90,
                Mask(): eyemask,
            },
            mask=self.build_mask(nomask),
        )

    @parameter_space(method_name=['demean', 'zscore'])
    def test_cant_normalize_non_float(self, method_name):
        class DateFactor(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        d = DateFactor()
        with self.assertRaises(TypeError) as e:
            getattr(d, method_name)()

        errmsg = str(e.exception)
        expected = (
            "{normalizer}() is only defined on Factors of dtype float64,"
            " but it was called on a Factor of dtype datetime64[ns].").format(
                normalizer=method_name)

        self.assertEqual(errmsg, expected)

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_unmasked(self, seed):
        permute = partial(permute_rows, seed)

        shape = (6, 6)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs.
        factor_data = permute(log1p(arange(36, dtype=float).reshape(shape)))

        f = self.f

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '2': f.quantiles(bins=2),
                '3': f.quantiles(bins=3),
                '6': f.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
            },
            expected={
                # The values in the input are all increasing, so the first half
                # of each row should be in the bottom bucket, and the second
                # half should be in the top bucket.
                '2':
                permuted_array([[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1],
                                [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1],
                                [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]]),
                # Similar for three buckets.
                '3':
                permuted_array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                                [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                                [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]),
                # In the limiting case, we just have every column different.
                '6':
                permuted_array([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5],
                                [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5],
                                [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_masked(self, seed):
        permute = partial(permute_rows, seed)

        # 7 x 7 so that we divide evenly into 2/3/6-tiles after including the
        # nan value in each row.
        shape = (7, 7)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs.
        factor_data = permute(log1p(arange(49, dtype=float).reshape(shape)))
        factor_data_w_nans = where(
            permute(rot90(self.eye_mask(shape=shape))),
            factor_data,
            nan,
        )
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        f_nans = OtherF()
        m = Mask()

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))

        self.check_terms(
            terms={
                '2_masked': f.quantiles(bins=2, mask=m),
                '3_masked': f.quantiles(bins=3, mask=m),
                '6_masked': f.quantiles(bins=6, mask=m),
                '2_nans': f_nans.quantiles(bins=2),
                '3_nans': f_nans.quantiles(bins=3),
                '6_nans': f_nans.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
                f_nans: factor_data_w_nans,
                m: mask_data,
            },
            expected={
                # Expected results here are the same as in
                # test_quantiles_unmasked, except with diagonals of -1s
                # interpolated to match the effects of masking and/or input
                # nans.
                '2_masked':
                permuted_array([[-1, 0, 0, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1],
                                [0, 0, -1, 0, 1, 1, 1], [0, 0, 0, -1, 1, 1, 1],
                                [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, 1, 1, -1, 1],
                                [0, 0, 0, 1, 1, 1, -1]]),
                '3_masked':
                permuted_array([[-1, 0, 0, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2],
                                [0, 0, -1, 1, 1, 2, 2], [0, 0, 1, -1, 1, 2, 2],
                                [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, 1, 2, -1, 2],
                                [0, 0, 1, 1, 2, 2, -1]]),
                '6_masked':
                permuted_array([[-1, 0, 1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5],
                                [0, 1, -1, 2, 3, 4, 5], [0, 1, 2, -1, 3, 4, 5],
                                [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, 3, 4, -1, 5],
                                [0, 1, 2, 3, 4, 5, -1]]),
                '2_nans':
                permuted_array([[0, 0, 0, 1, 1, 1, -1], [0, 0, 0, 1, 1, -1, 1],
                                [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, -1, 1, 1, 1],
                                [0, 0, -1, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1],
                                [-1, 0, 0, 0, 1, 1, 1]]),
                '3_nans':
                permuted_array([[0, 0, 1, 1, 2, 2, -1], [0, 0, 1, 1, 2, -1, 2],
                                [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, -1, 1, 2, 2],
                                [0, 0, -1, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2],
                                [-1, 0, 0, 1, 1, 2, 2]]),
                '6_nans':
                permuted_array([[0, 1, 2, 3, 4, 5, -1], [0, 1, 2, 3, 4, -1, 5],
                                [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, -1, 3, 4, 5],
                                [0, 1, -1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5],
                                [-1, 0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantiles_uneven_buckets(self):
        permute = partial(permute_rows, 5)
        shape = (5, 5)

        factor_data = permute(log1p(arange(25, dtype=float).reshape(shape)))
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        m = Mask()

        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '3_masked': f.quantiles(bins=3, mask=m),
                '7_masked': f.quantiles(bins=7, mask=m),
            },
            initial_workspace={
                f: factor_data,
                m: mask_data,
            },
            expected={
                '3_masked':
                permuted_array([[-1, 0, 0, 1, 2], [0, -1, 0, 1, 2],
                                [0, 0, -1, 1, 2], [0, 0, 1, -1, 2],
                                [0, 0, 1, 2, -1]]),
                '7_masked':
                permuted_array([[-1, 0, 2, 4, 6], [0, -1, 2, 4, 6],
                                [0, 2, -1, 4, 6], [0, 2, 4, -1, 6],
                                [0, 2, 4, 6, -1]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantile_helpers(self):
        f = self.f
        m = Mask()

        self.assertIs(f.quartiles(), f.quantiles(bins=4))
        self.assertIs(f.quartiles(mask=m), f.quantiles(bins=4, mask=m))
        self.assertIsNot(f.quartiles(), f.quartiles(mask=m))

        self.assertIs(f.quintiles(), f.quantiles(bins=5))
        self.assertIs(f.quintiles(mask=m), f.quantiles(bins=5, mask=m))
        self.assertIsNot(f.quintiles(), f.quintiles(mask=m))

        self.assertIs(f.deciles(), f.quantiles(bins=10))
        self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m))
        self.assertIsNot(f.deciles(), f.deciles(mask=m))
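The expected_ranks tables in the tests above match scipy.stats.rankdata's tie-breaking methods. A quick sketch reproducing the first row of the ascending case (toy data, assuming scipy is installed):

from scipy.stats import rankdata

row = [0, 1, 2, 3, 0]  # first row of the test data above
for method in ('ordinal', 'average', 'min', 'max', 'dense'):
    print(method, rankdata(row, method=method))
# e.g. ordinal -> [1, 3, 4, 5, 2]; average -> [1.5, 3., 4., 5., 1.5]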