Example 1
    def compute(self, today, asset_ids, out, close):
        # Wilder-style RSI: average gains vs. average losses over the window.
        diffs = np.diff(close, axis=0)
        ups = nanmean(np.clip(diffs, 0, np.inf), axis=0)
        downs = abs(nanmean(np.clip(diffs, -np.inf, 0), axis=0))
        rsi = np.zeros(len(out))
        evaluate(
            "100 - (100 / (1 + (ups / downs)))",
            local_dict={
                'ups': ups,
                'downs': downs
            },
            global_dict={},
            out=rsi,
        )

        # Bollinger bands: BBSTD standard deviations around the mean close.
        difference = BBSTD * nanstd(close, axis=0)
        middle = nanmean(close, axis=0)
        upper = middle + difference
        lower = middle - difference

        for i in range(len(out)):
            out[i] = 0
            # RSI component: reward oversold names, penalize overbought ones.
            if rsi[i] < RSI_LOWER:
                out[i] += RSI_LOWER / rsi[i]
            elif rsi[i] > RSI_UPPER:
                out[i] -= rsi[i] / RSI_UPPER

            # Bollinger component: reward closes below the lower band,
            # penalize closes above the upper band.
            prices = close[:, i]
            if prices[-1] < lower[i]:
                out[i] += lower[i] / prices[-1]
            elif prices[-1] > upper[i]:
                out[i] -= prices[-1] / upper[i]

            # Flip the signal for trend-following rather than mean-reversion.
            if TREND_FOLLOW:
                out[i] *= -1
Example 2
def fast_corr(m0, m1):
    """Improving the speed of correlation"""
    nan = np.nan
    isnan = np.isnan
    N, M = m0.shape
    out = np.full(M, nan)
    allowed_missing_count = int(0.25 * N)

    independent = np.where(  # shape: (N, M)
        isnan(m0),
        nan,
        m1,
    )
    ind_residual = independent - nanmean(independent, axis=0)  # shape: (N, M)
    covariances = nanmean(ind_residual * m0, axis=0)  # shape: (M,)

    # corr(x,y) = cov(x,y)/std(x)/std(y)
    std_v = nanstd(m0, axis=0)  # std(X)  could reuse ind_residual for possible speedup
    np.divide(covariances, std_v, out=out)
    std_v = nanstd(m1, axis=0)  # std(Y)
    np.divide(out, std_v, out=out)

    # handle NaNs
    nanlocs = isnan(independent).sum(axis=0) > allowed_missing_count
    out[nanlocs] = nan
    return out
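
A quick sanity check (a hypothetical snippet, not from the original source): on NaN-free inputs, fast_corr should agree with plain column-wise Pearson correlation, since it uses population (ddof=0) statistics throughout.

import numpy as np
from numpy import nanmean, nanstd  # fast_corr resolves these as module globals

rng = np.random.RandomState(42)
m0 = rng.randn(100, 4)  # "dependent" columns
m1 = rng.randn(100, 4)  # "independent" columns

result = fast_corr(m0, m1)
expected = np.array(
    [np.corrcoef(m0[:, i], m1[:, i])[0, 1] for i in range(4)]
)
assert np.allclose(result, expected)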
Example 3
 def compute(self, today, assets, out, closes):
     diffs = diff(closes, axis=0)
     ups = nanmean(clip(diffs, 0, inf), axis=0)
     downs = abs(nanmean(clip(diffs, -inf, 0), axis=0))
     return evaluate(
         "100 - (100 / (1 + (ups / downs)))",
         local_dict={'ups': ups, 'downs': downs},
         global_dict={},
         out=out,
     )
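
The evaluate call above is numexpr-style expression evaluation of the Wilder RSI formula, RSI = 100 - 100 / (1 + RS) with RS = ups / downs. A minimal standalone sketch (assuming numexpr as the evaluator):

import numpy as np
from numexpr import evaluate

ups = np.array([0.5, 1.0])     # mean upward move per asset
downs = np.array([0.25, 1.0])  # mean downward move per asset
out = np.empty(2)
evaluate(
    "100 - (100 / (1 + (ups / downs)))",
    local_dict={'ups': ups, 'downs': downs},
    global_dict={},
    out=out,
)
print(out)  # [66.67, 50.0] -- RS of 2.0 and 1.0 respectively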
Example 4
    def compute(self, today, assets, out, closes, sectors):
        res = np.zeros(closes.shape[1])
        change_ratio = np.diff(closes, axis=0) / closes[:-1]
        latest_sectors = sectors[-1]

        stock_in_sector = latest_sectors == self.sector_code
        change_ratio_in_sector = change_ratio[:, stock_in_sector]

        # Sector return: the average daily return across stocks in the sector.
        sector_returns = nanmean(change_ratio_in_sector, axis=1).reshape(-1, 1)

        allowed_missing = int(self.window_length * 0.25)
        # Regress each stock's returns on the sector's average return to get
        # each stock's beta, i.e. its exposure to the sector.
        beta = vectorized_beta(
            dependents=change_ratio_in_sector,
            independent=sector_returns,
            allowed_missing=allowed_missing,
        )
        # Write the betas back; stocks outside the sector keep 0.
        res[stock_in_sector] = beta
        out[:] = res
Example 5
    def get_simple_transform(self,
                             asset,
                             transform_name,
                             dt,
                             data_frequency,
                             bars=None):
        if transform_name == "returns":
            # returns is always calculated over the last 2 days, regardless
            # of the simulation's data frequency.
            hst = self.get_history_window([asset],
                                          dt,
                                          2,
                                          "1d",
                                          "price",
                                          ffill=True)[asset]

            return (hst.iloc[-1] - hst.iloc[0]) / hst.iloc[0]

        if bars is None:
            raise ValueError("bars cannot be None!")

        if data_frequency == "minute":
            freq_str = "1m"
            calculated_bar_count = int(
                self._get_minute_count_for_transform(dt, bars))
        else:
            freq_str = "1d"
            calculated_bar_count = bars

        price_arr = self.get_history_window([asset],
                                            dt,
                                            calculated_bar_count,
                                            freq_str,
                                            "price",
                                            ffill=True)[asset]

        if transform_name == "mavg":
            return nanmean(price_arr)
        elif transform_name == "stddev":
            return nanstd(price_arr, ddof=1)
        elif transform_name == "vwap":
            volume_arr = self.get_history_window([asset],
                                                 dt,
                                                 calculated_bar_count,
                                                 freq_str,
                                                 "volume",
                                                 ffill=True)[asset]

            vol_sum = nansum(volume_arr)

            try:
                ret = nansum(price_arr * volume_arr) / vol_sum
            except ZeroDivisionError:
                ret = np.nan

            return ret
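
For intuition, the VWAP branch weights each price by its bar's volume. A toy illustration (numbers are hypothetical); note that, as written, a NaN price drops out of the numerator while its volume still counts in the denominator:

import numpy as np
from numpy import nansum

prices = np.array([10.0, 10.5, np.nan, 11.0])
volumes = np.array([100.0, 200.0, 50.0, 0.0])
vwap = nansum(prices * volumes) / nansum(volumes)
print(vwap)  # 3100.0 / 350.0 ~= 8.857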
Example 6
def fast_cov(m0, m1):
    """Improving the speed of cov()"""
    nan = np.nan
    isnan = np.isnan
    N, M = m0.shape
    allowed_missing_count = int(0.25 * N)

    independent = np.where(  # shape: (N, M)
        isnan(m0),
        nan,
        m1,
    )
    ind_residual = independent - nanmean(independent, axis=0)  # shape: (N, M)
    covariances = nanmean(ind_residual * m0, axis=0)           # shape: (M,)

    nanlocs = isnan(independent).sum(axis=0) > allowed_missing_count
    covariances[nanlocs] = nan
    return covariances
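
As with fast_corr, a hypothetical sanity check (not from the original source): on complete data the result should match the population covariance of each column pair.

import numpy as np
from numpy import nanmean  # fast_cov resolves this as a module global

rng = np.random.RandomState(0)
m0 = rng.randn(50, 3)
m1 = rng.randn(50, 3)

result = fast_cov(m0, m1)
expected = np.array(
    [np.cov(m0[:, i], m1[:, i], ddof=0)[0, 1] for i in range(3)]
)
assert np.allclose(result, expected)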
Example 7
    def zscore(self, mask=NotSpecified, groupby=NotSpecified):
        """
        Construct a Factor that Z-Scores each day's results.

        The Z-Score of a row is defined as::

            (row - row.mean()) / row.stddev()

        If ``mask`` is supplied, ignore values where ``mask`` returns False
        when computing row means and standard deviations, and output NaN
        anywhere the mask is False.

        If ``groupby`` is supplied, compute by partitioning each row based on
        the values produced by ``groupby``, z-scoring the partitioned arrays,
        and stitching the sub-results back together.

        Parameters
        ----------
        mask : zipline.pipeline.Filter, optional
            A Filter defining values to ignore when Z-Scoring.
        groupby : zipline.pipeline.Classifier, optional
            A classifier defining partitions over which to compute Z-Scores.

        Returns
        -------
        zscored : zipline.pipeline.Factor
            A Factor that z-scores the output of ``self``.

        Notes
        -----
        Mean and standard deviation are sensitive to the magnitudes of
        outliers. When working with a factor that can potentially produce
        large outliers, it is often useful to use the ``mask`` parameter to
        discard values at the extremes of the distribution::

            >>> base = MyFactor(...)
            >>> normalized = base.zscore(mask=base.percentile_between(1, 99))

        ``zscore()`` is only supported on Factors of dtype float64.

        Example
        -------
        See :meth:`~zipline.pipeline.factors.Factor.demean` for an in-depth
        example of the semantics for ``mask`` and ``groupby``.

        See Also
        --------
        :meth:`pandas.DataFrame.groupby`
        """
        return GroupedRowTransform(
            transform=lambda row: (row - nanmean(row)) / nanstd(row),
            factor=self,
            mask=mask,
            groupby=groupby,
        )
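
For intuition, the row transform passed to GroupedRowTransform behaves like the following standalone NumPy computation (an illustrative sketch, not the pipeline API itself):

import numpy as np
from numpy import nanmean, nanstd

row = np.array([1.0, 2.0, np.nan, 4.0])
zscored = (row - nanmean(row)) / nanstd(row)  # NaNs are ignored by the stats
print(zscored)  # the NaN entry stays NaN in the output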
Example 8
 def compute(self, today, assets, out, close, volume):
     out[:] = nanmean(close * volume, axis=0)
Example 9
 def compute(self, today, assets, out, data):
     out[:] = nanmean(data, axis=0)
Example 10
    def demean(self, mask=NotSpecified, groupby=NotSpecified):
        """
        Construct a Factor that computes ``self`` and subtracts the mean from
        each row of the result.

        If ``mask`` is supplied, ignore values where ``mask`` returns False
        when computing row means, and output NaN anywhere the mask is False.

        If ``groupby`` is supplied, compute by partitioning each row based on
        the values produced by ``groupby``, de-meaning the partitioned arrays,
        and stitching the sub-results back together.

        Parameters
        ----------
        mask : zipline.pipeline.Filter, optional
            A Filter defining values to ignore when computing means.
        groupby : zipline.pipeline.Classifier, optional
            A classifier defining partitions over which to compute means.

        Example
        -------
        Let ``f`` be a Factor which would produce the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13    1.0    2.0    3.0    4.0
            2017-03-14    1.5    2.5    3.5    1.0
            2017-03-15    2.0    3.0    4.0    1.5
            2017-03-16    2.5    3.5    1.0    2.0

        Let ``c`` be a Classifier producing the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13      1      1      2      2
            2017-03-14      1      1      2      2
            2017-03-15      1      1      2      2
            2017-03-16      1      1      2      2

        Let ``m`` be a Filter producing the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13  False   True   True   True
            2017-03-14   True  False   True   True
            2017-03-15   True   True  False   True
            2017-03-16   True   True   True  False

        Then ``f.demean()`` will subtract the mean from each row produced by
        ``f``.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13 -1.500 -0.500  0.500  1.500
            2017-03-14 -0.625  0.375  1.375 -1.125
            2017-03-15 -0.625  0.375  1.375 -1.125
            2017-03-16  0.250  1.250 -1.250 -0.250

        ``f.demean(mask=m)`` will subtract the mean from each row, but means
        will be calculated ignoring values on the diagonal, and NaNs will be
        written to the diagonal in the output. Diagonal values are ignored
        because they are the locations where the mask ``m`` produced False.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13    NaN -1.000  0.000  1.000
            2017-03-14 -0.500    NaN  1.500 -1.000
            2017-03-15 -0.166  0.833    NaN -0.666
            2017-03-16  0.166  1.166 -1.333    NaN

        ``f.demean(groupby=c)`` will subtract the group-mean of AAPL/MSFT and
        MCD/BK from their respective entries. AAPL and MSFT are grouped
        together because both assets always produce 1 in the output of the
        classifier ``c``. Similarly, MCD and BK are grouped together because
        they always produce 2.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13 -0.500  0.500 -0.500  0.500
            2017-03-14 -0.500  0.500  1.250 -1.250
            2017-03-15 -0.500  0.500  1.250 -1.250
            2017-03-16 -0.500  0.500 -0.500  0.500

        ``f.demean(mask=m, groupby=c)`` will also subtract the group-mean of
        AAPL/MSFT and MCD/BK, but means will be calculated ignoring values on
        the diagonal, and NaNs will be written to the diagonal in the output.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13    NaN  0.000 -0.500  0.500
            2017-03-14  0.000    NaN  1.250 -1.250
            2017-03-15 -0.500  0.500    NaN  0.000
            2017-03-16 -0.500  0.500  0.000    NaN

        Notes
        -----
        Mean is sensitive to the magnitudes of outliers. When working with a
        factor that can potentially produce large outliers, it is often useful
        to use the ``mask`` parameter to discard values at the extremes of the
        distribution::

            >>> base = MyFactor(...)
            >>> normalized = base.demean(mask=base.percentile_between(1, 99))

        ``demean()`` is only supported on Factors of dtype float64.

        See Also
        --------
        :meth:`pandas.DataFrame.groupby`
        """
        return GroupedRowTransform(
            transform=lambda row: row - nanmean(row),
            factor=self,
            mask=mask,
            groupby=groupby,
        )
Example 11
def vectorized_beta(dependents, independent, allowed_missing, out=None):
    """
    Compute slopes of linear regressions between columns of ``dependents`` and
    ``independent``.

    Parameters
    ----------
    dependents : np.array[N, M]
        Array with columns of data to be regressed against ``independent``.
    independent : np.array[N, 1]
        Independent variable of the regression
    allowed_missing : int
        Number of allowed missing (NaN) observations per column. Columns with
        more than this many missing observations (in either input) will output
        NaN as the regression coefficient.
    out : np.array[M] or None, optional
        Output array into which to write results.  If None, a new array is
        created and returned.

    Returns
    -------
    slopes : np.array[M]
        Linear regression coefficients for each column of ``dependents``.
    """
    # Cache these as locals since we're going to call them multiple times.
    nan = np.nan
    isnan = np.isnan
    N, M = dependents.shape

    if out is None:
        out = np.full(M, nan)

    # Copy N times as a column vector and fill with nans to have the same
    # missing value pattern as the dependent variable.
    #
    # PERF_TODO: We could probably avoid the space blowup by doing this in
    # Cython.

    # shape: (N, M)
    independent = np.where(
        isnan(dependents),
        nan,
        independent,
    )

    # Calculate beta as Cov(X, Y) / Cov(X, X).
    # https://en.wikipedia.org/wiki/Simple_linear_regression#Fitting_the_regression_line  # noqa
    #
    # NOTE: The usual formula for covariance is::
    #
    #    mean((X - mean(X)) * (Y - mean(Y)))
    #
    # However, we don't actually need to take the mean of both sides of the
    # product, because of the following equivalence::
    #
    # Let X_res = (X - mean(X)).
    # We have:
    #
    #     mean(X_res * (Y - mean(Y))) = mean(X_res * (Y - mean(Y)))
    #                             (1) = mean((X_res * Y) - (X_res * mean(Y)))
    #                             (2) = mean(X_res * Y) - mean(X_res * mean(Y))
    #                             (3) = mean(X_res * Y) - mean(X_res) * mean(Y)
    #                             (4) = mean(X_res * Y) - 0 * mean(Y)
    #                             (5) = mean(X_res * Y)
    #
    #
    # The tricky step in the above derivation is step (4). We know that
    # mean(X_res) is zero because, for any X:
    #
    #     mean(X - mean(X)) = mean(X) - mean(X) = 0.
    #
    # The upshot of this is that we only have to center one of `independent`
    # and `dependent` when calculating covariances. Since we need the centered
    # `independent` to calculate its variance in the next step, we choose to
    # center `independent`.

    # shape: (N, M)
    ind_residual = independent - nanmean(independent, axis=0)

    # shape: (M,)
    covariances = nanmean(ind_residual * dependents, axis=0)

    # We end up with different variances in each column here because each
    # column may have a different subset of the data dropped due to missing
    # data in the corresponding dependent column.
    # shape: (M,)
    independent_variances = nanmean(ind_residual ** 2, axis=0)

    # shape: (M,)
    np.divide(covariances, independent_variances, out=out)

    # Write nans back to locations where we have more than the allowed number
    # of missing entries.
    nanlocs = isnan(independent).sum(axis=0) > allowed_missing
    out[nanlocs] = nan

    return out
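
A quick consistency check (hypothetical, assuming complete data): each slope should match an ordinary least-squares fit of the column against the independent series.

import numpy as np
from numpy import nanmean  # vectorized_beta resolves this as a module global

rng = np.random.RandomState(7)
independent = rng.randn(60, 1)
dependents = 2.0 * independent + 0.1 * rng.randn(60, 3)  # true slope ~2.0

betas = vectorized_beta(dependents, independent, allowed_missing=0)
expected = np.array(
    [np.polyfit(independent[:, 0], dependents[:, i], 1)[0] for i in range(3)]
)
assert np.allclose(betas, expected)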
Example 12
 def demean(row):
     return row - nanmean(row)
Example 13
class FactorTestCase(BasePipelineTestCase):
    def init_instance_fixtures(self):
        super(FactorTestCase, self).init_instance_fixtures()
        self.f = F()

    def test_bad_input(self):
        with self.assertRaises(UnknownRankMethod):
            self.f.rank("not a real rank method")

    @parameter_space(method_name=['isnan', 'notnan', 'isfinite'])
    def test_float64_only_ops(self, method_name):
        class NotFloat(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        nf = NotFloat()
        meth = getattr(nf, method_name)
        with self.assertRaises(TypeError):
            meth()

    @parameter_space(custom_missing_value=[-1, 0])
    def test_isnull_int_dtype(self, custom_missing_value):
        class CustomMissingValue(Factor):
            dtype = int64_dtype
            window_length = 0
            missing_value = custom_missing_value
            inputs = ()

        factor = CustomMissingValue()

        data = arange(25).reshape(5, 5)
        data[eye(5, dtype=bool)] = custom_missing_value

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    def test_isnull_datetime_dtype(self):
        class DatetimeFactor(Factor):
            dtype = datetime64ns_dtype
            window_length = 0
            inputs = ()

        factor = DatetimeFactor()

        data = arange(25).reshape(5, 5).astype('datetime64[ns]')
        data[eye(5, dtype=bool)] = NaTns

        graph = TermGraph({
            'isnull': factor.isnull(),
            'notnull': factor.notnull(),
        })

        results = self.run_graph(
            graph,
            initial_workspace={factor: data},
            mask=self.build_mask(ones((5, 5))),
        )
        check_arrays(results['isnull'], eye(5, dtype=bool))
        check_arrays(results['notnull'], ~eye(5, dtype=bool))

    @for_each_factor_dtype
    def test_rank_ascending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        expected_ranks = {
            'ordinal':
            array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.],
                   [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.],
                   [1., 3., 4., 5., 2.]]),
            'average':
            array([[1.5, 3., 4., 5., 1.5], [2.5, 4., 5., 1., 2.5],
                   [3.5, 5., 1., 2., 3.5], [4.5, 1., 2., 3., 4.5],
                   [1.5, 3., 4., 5., 1.5]]),
            'min':
            array([[1., 3., 4., 5., 1.], [2., 4., 5., 1., 2.],
                   [3., 5., 1., 2., 3.], [4., 1., 2., 3., 4.],
                   [1., 3., 4., 5., 1.]]),
            'max':
            array([[2., 3., 4., 5., 2.], [3., 4., 5., 1., 3.],
                   [4., 5., 1., 2., 4.], [5., 1., 2., 3., 5.],
                   [2., 3., 4., 5., 2.]]),
            'dense':
            array([[1., 2., 3., 4., 1.], [2., 3., 4., 1., 2.],
                   [3., 4., 1., 2., 3.], [4., 1., 2., 3., 4.],
                   [1., 2., 3., 4., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({meth: f.rank(method=meth) for meth in expected_ranks})
        check({
            meth: f.rank(method=meth, ascending=True)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank()})
        check({'ordinal': f.rank(ascending=True)})

    @for_each_factor_dtype
    def test_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)
        expected_ranks = {
            'ordinal':
            array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.],
                   [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.],
                   [4., 3., 2., 1., 5.]]),
            'average':
            array([[4.5, 3., 2., 1., 4.5], [3.5, 2., 1., 5., 3.5],
                   [2.5, 1., 5., 4., 2.5], [1.5, 5., 4., 3., 1.5],
                   [4.5, 3., 2., 1., 4.5]]),
            'min':
            array([[4., 3., 2., 1., 4.], [3., 2., 1., 5., 3.],
                   [2., 1., 5., 4., 2.], [1., 5., 4., 3., 1.],
                   [4., 3., 2., 1., 4.]]),
            'max':
            array([[5., 3., 2., 1., 5.], [4., 2., 1., 5., 4.],
                   [3., 1., 5., 4., 3.], [2., 5., 4., 3., 2.],
                   [5., 3., 2., 1., 5.]]),
            'dense':
            array([[4., 3., 2., 1., 4.], [3., 2., 1., 4., 3.],
                   [2., 1., 4., 3., 2.], [1., 4., 3., 2., 1.],
                   [4., 3., 2., 1., 4.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={f: data},
                mask=self.build_mask(ones((5, 5))),
            )
            for method in terms:
                check_arrays(results[method], expected_ranks[method])

        check({
            meth: f.rank(method=meth, ascending=False)
            for meth in expected_ranks
        })
        # Not passing a method should default to ordinal.
        check({'ordinal': f.rank(ascending=False)})

    @for_each_factor_dtype
    def test_rank_after_mask(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)
        mask_data = ~eye(5, dtype=bool)
        initial_workspace = {f: data, Mask(): mask_data}

        graph = TermGraph({
            "ascending_nomask":
            f.rank(ascending=True),
            "ascending_mask":
            f.rank(ascending=True, mask=Mask()),
            "descending_nomask":
            f.rank(ascending=False),
            "descending_mask":
            f.rank(ascending=False, mask=Mask()),
        })

        expected = {
            "ascending_nomask":
            array([[1., 3., 4., 5., 2.], [2., 4., 5., 1., 3.],
                   [3., 5., 1., 2., 4.], [4., 1., 2., 3., 5.],
                   [1., 3., 4., 5., 2.]]),
            "descending_nomask":
            array([[4., 3., 2., 1., 5.], [3., 2., 1., 5., 4.],
                   [2., 1., 5., 4., 3.], [1., 5., 4., 3., 2.],
                   [4., 3., 2., 1., 5.]]),
            # Diagonal should be all nans, and anything whose rank was less
            # than the diagonal in the unmasked calc should go down by 1.
            "ascending_mask":
            array([[nan, 2., 3., 4., 1.], [2., nan, 4., 1., 3.],
                   [2., 4., nan, 1., 3.], [3., 1., 2., nan, 4.],
                   [1., 2., 3., 4., nan]]),
            "descending_mask":
            array([[nan, 3., 2., 1., 4.], [2., nan, 1., 4., 3.],
                   [2., 1., nan, 4., 3.], [1., 4., 3., nan, 2.],
                   [4., 3., 2., 1., nan]]),
        }

        results = self.run_graph(
            graph,
            initial_workspace,
            mask=self.build_mask(ones((5, 5))),
        )
        for method in results:
            check_arrays(expected[method], results[method])

    @for_each_factor_dtype
    def test_grouped_rank_ascending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array(
            [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0],
             [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]],
            dtype=int64_dtype)
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal':
            array([[1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.],
                   [2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.],
                   [1., 1., 3., 2., 2.]]),
            'average':
            array([[1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5],
                   [2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5],
                   [1.5, 1., 3., 2., 1.5]]),
            'min':
            array([[1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.],
                   [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 3., 2., 1.]]),
            'max':
            array([[2., 1., 3., 2., 2.], [2., 2., 3., 1., 2.],
                   [3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.],
                   [2., 1., 3., 2., 2.]]),
            'dense':
            array([[1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.],
                   [2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 2., 2., 1.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        # Not specifying the ascending param should default to True.
        check({
            meth: f.rank(method=meth, groupby=c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=c, ascending=True)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=True)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c)})
        check({'ordinal': f.rank(groupby=str_c)})
        check({'ordinal': f.rank(groupby=c, ascending=True)})
        check({'ordinal': f.rank(groupby=str_c, ascending=True)})

    @for_each_factor_dtype
    def test_grouped_rank_descending(self, name, factor_dtype):

        f = F(dtype=factor_dtype)
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        # Generated with:
        # data = arange(25).reshape(5, 5).transpose() % 4
        data = array([[0, 1, 2, 3, 0], [1, 2, 3, 0, 1], [2, 3, 0, 1, 2],
                      [3, 0, 1, 2, 3], [0, 1, 2, 3, 0]],
                     dtype=factor_dtype)

        # Generated with:
        # classifier_data = arange(25).reshape(5, 5).transpose() % 2
        classifier_data = array(
            [[0, 1, 0, 1, 0], [1, 0, 1, 0, 1], [0, 1, 0, 1, 0],
             [1, 0, 1, 0, 1], [0, 1, 0, 1, 0]],
            dtype=int64_dtype)

        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        expected_grouped_ranks = {
            'ordinal':
            array([[2., 2., 1., 1., 3.], [2., 1., 1., 2., 3.],
                   [1., 1., 3., 2., 2.], [1., 2., 3., 1., 2.],
                   [2., 2., 1., 1., 3.]]),
            'average':
            array([[2.5, 2., 1., 1., 2.5], [2.5, 1., 1., 2., 2.5],
                   [1.5, 1., 3., 2., 1.5], [1.5, 2., 3., 1., 1.5],
                   [2.5, 2., 1., 1., 2.5]]),
            'min':
            array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 3., 2., 1.], [1., 2., 3., 1., 1.],
                   [2., 2., 1., 1., 2.]]),
            'max':
            array([[3., 2., 1., 1., 3.], [3., 1., 1., 2., 3.],
                   [2., 1., 3., 2., 2.], [2., 2., 3., 1., 2.],
                   [3., 2., 1., 1., 3.]]),
            'dense':
            array([[2., 2., 1., 1., 2.], [2., 1., 1., 2., 2.],
                   [1., 1., 2., 2., 1.], [1., 2., 2., 1., 1.],
                   [2., 2., 1., 1., 2.]]),
        }

        def check(terms):
            graph = TermGraph(terms)
            results = self.run_graph(
                graph,
                initial_workspace={
                    f: data,
                    c: classifier_data,
                    str_c: string_classifier_data,
                },
                mask=self.build_mask(ones((5, 5))),
            )

            for method in terms:
                check_arrays(results[method], expected_grouped_ranks[method])

        check({
            meth: f.rank(method=meth, groupby=c, ascending=False)
            for meth in expected_grouped_ranks
        })
        check({
            meth: f.rank(method=meth, groupby=str_c, ascending=False)
            for meth in expected_grouped_ranks
        })

        # Not passing a method should default to ordinal
        check({'ordinal': f.rank(groupby=c, ascending=False)})
        check({'ordinal': f.rank(groupby=str_c, ascending=False)})

    @parameterized.expand([
        # Test cases computed by doing:
        # from numpy.random import seed, randn
        # from talib import RSI
        # seed(seed_value)
        # data = abs(randn(15, 3))
        # expected = [RSI(data[:, i])[-1] for i in range(3)]
        (100, array([41.032913785966, 51.553585468393, 51.022005016446])),
        (101, array([43.506969935466, 46.145367530182, 50.57407044197])),
        (102, array([46.610102205934, 47.646892444315, 52.13182788538])),
    ])
    def test_rsi(self, seed_value, expected):

        rsi = RSI()

        today = datetime64(1, 'ns')
        assets = arange(3)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(15, 3))

        out = empty((3, ), dtype=float)
        rsi.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    @parameterized.expand([
        (100, 15),
        (101, 4),
        (102, 100),
    ])
    def test_returns(self, seed_value, window_length):

        returns = Returns(window_length=window_length)

        today = datetime64(1, 'ns')
        assets = arange(3)

        seed(seed_value)  # Seed so we get deterministic results.
        test_data = abs(randn(window_length, 3))

        # Calculate the expected returns.
        expected = (test_data[-1] - test_data[0]) / test_data[0]

        out = empty((3, ), dtype=float)
        returns.compute(today, assets, out, test_data)

        check_allclose(expected, out)

    def gen_ranking_cases():
        seeds = range(int(1e4), int(1e5), int(1e4))
        methods = ('ordinal', 'average')
        use_mask_values = (True, False)
        set_missing_values = (True, False)
        ascending_values = (True, False)
        return product(
            seeds,
            methods,
            use_mask_values,
            set_missing_values,
            ascending_values,
        )

    @parameterized.expand(gen_ranking_cases())
    def test_masked_rankdata_2d(self, seed_value, method, use_mask,
                                set_missing, ascending):
        eyemask = ~eye(5, dtype=bool)
        nomask = ones((5, 5), dtype=bool)

        seed(seed_value)
        asfloat = (randn(5, 5) * seed_value)
        asdatetime = (asfloat).copy().view('datetime64[ns]')

        mask = eyemask if use_mask else nomask
        if set_missing:
            asfloat[:, 2] = nan
            asdatetime[:, 2] = NaTns

        float_result = masked_rankdata_2d(
            data=asfloat,
            mask=mask,
            missing_value=nan,
            method=method,
            ascending=ascending,
        )
        datetime_result = masked_rankdata_2d(
            data=asdatetime,
            mask=mask,
            missing_value=NaTns,
            method=method,
            ascending=ascending,
        )

        check_arrays(float_result, datetime_result)

    def test_normalizations_hand_computed(self):
        """
        Test the hand-computed example in factor.demean.
        """
        f = self.f
        m = Mask()
        c = C()
        str_c = C(dtype=categorical_dtype, missing_value=None)

        factor_data = array([[1.0, 2.0, 3.0, 4.0], [1.5, 2.5, 3.5, 1.0],
                             [2.0, 3.0, 4.0, 1.5], [2.5, 3.5, 1.0, 2.0]], )
        filter_data = array(
            [[False, True, True, True], [True, False, True, True],
             [True, True, False, True], [True, True, True, False]],
            dtype=bool,
        )
        classifier_data = array(
            [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
            dtype=int64_dtype,
        )
        string_classifier_data = LabelArray(
            classifier_data.astype(str).astype(object),
            missing_value=None,
        )

        terms = {
            'vanilla': f.demean(),
            'masked': f.demean(mask=m),
            'grouped': f.demean(groupby=c),
            'grouped_str': f.demean(groupby=str_c),
            'grouped_masked': f.demean(mask=m, groupby=c),
            'grouped_masked_str': f.demean(mask=m, groupby=str_c),
        }
        expected = {
            'vanilla':
            array([[-1.500, -0.500, 0.500, 1.500],
                   [-0.625, 0.375, 1.375, -1.125],
                   [-0.625, 0.375, 1.375, -1.125],
                   [0.250, 1.250, -1.250, -0.250]], ),
            'masked':
            array(
                [[nan, -1.000, 0.000, 1.000], [-0.500, nan, 1.500, -1.000],
                 [-0.166, 0.833, nan, -0.666], [0.166, 1.166, -1.333, nan]], ),
            'grouped':
            array([[-0.500, 0.500, -0.500, 0.500],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, 1.250, -1.250],
                   [-0.500, 0.500, -0.500, 0.500]], ),
            'grouped_masked':
            array([[nan, 0.000, -0.500, 0.500], [0.000, nan, 1.250, -1.250],
                   [-0.500, 0.500, nan, 0.000], [-0.500, 0.500, 0.000, nan]])
        }
        # Changing the classifier dtype shouldn't affect anything.
        expected['grouped_str'] = expected['grouped']
        expected['grouped_masked_str'] = expected['grouped_masked']

        graph = TermGraph(terms)
        results = self.run_graph(
            graph,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                str_c: string_classifier_data,
                m: filter_data,
            },
            mask=self.build_mask(self.ones_mask(shape=factor_data.shape)),
        )

        for key, (res, exp) in dzip_exact(results, expected).items():
            check_allclose(
                res,
                exp,
                # The hand-computed values aren't very precise (in particular,
                # we truncate repeating decimals at 3 places). This is just
                # asserting that the example isn't misleading by being totally
                # wrong.
                atol=0.001,
                err_msg="Mismatch for %r" % key)

    @parameter_space(
        seed_value=range(1, 2),
        normalizer_name_and_func=[
            ('demean', lambda row: row - nanmean(row)),
            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
        ],
        add_nulls_to_factor=(
            False,
            True,
        ),
    )
    def test_normalizations_randomized(self, seed_value,
                                       normalizer_name_and_func,
                                       add_nulls_to_factor):

        name, func = normalizer_name_and_func

        shape = (7, 7)

        # All Trues.
        nomask = self.ones_mask(shape=shape)
        # Falses on main diagonal.
        eyemask = self.eye_mask(shape=shape)
        # Falses on other diagonal.
        eyemask90 = rot90(eyemask)
        # Falses on both diagonals.
        xmask = eyemask & eyemask90

        # Block of random data.
        factor_data = self.randn_data(seed=seed_value, shape=shape)
        if add_nulls_to_factor:
            factor_data = where(eyemask, factor_data, nan)

        # Cycles of 0, 1, 2, 0, 1, 2, ...
        classifier_data = (
            (self.arange_data(shape=shape, dtype=int64_dtype) + seed_value) %
            3)
        # With -1s on main diagonal.
        classifier_data_eyenulls = where(eyemask, classifier_data, -1)
        # With -1s on opposite diagonal.
        classifier_data_eyenulls90 = where(eyemask90, classifier_data, -1)
        # With -1s on both diagonals.
        classifier_data_xnulls = where(xmask, classifier_data, -1)

        f = self.f
        c = C()
        c_with_nulls = OtherC()
        m = Mask()
        method = getattr(f, name)
        terms = {
            'vanilla': method(),
            'masked': method(mask=m),
            'grouped': method(groupby=c),
            'grouped_with_nulls': method(groupby=c_with_nulls),
            'both': method(mask=m, groupby=c),
            'both_with_nulls': method(mask=m, groupby=c_with_nulls),
        }

        expected = {
            'vanilla':
            apply_along_axis(
                func,
                1,
                factor_data,
            ),
            'masked':
            where(
                eyemask,
                grouped_apply(factor_data, eyemask, func),
                nan,
            ),
            'grouped':
            grouped_apply(
                factor_data,
                classifier_data,
                func,
            ),
            # If the classifier has nulls, we should get NaNs in the
            # corresponding locations in the output.
            'grouped_with_nulls':
            where(
                eyemask90,
                grouped_apply(factor_data, classifier_data_eyenulls90, func),
                nan,
            ),
            # Passing a mask with a classifier should behave as though the
            # classifier had nulls where the mask was False.
            'both':
            where(
                eyemask,
                grouped_apply(
                    factor_data,
                    classifier_data_eyenulls,
                    func,
                ),
                nan,
            ),
            'both_with_nulls':
            where(
                xmask,
                grouped_apply(
                    factor_data,
                    classifier_data_xnulls,
                    func,
                ),
                nan,
            )
        }

        self.check_terms(
            terms=terms,
            expected=expected,
            initial_workspace={
                f: factor_data,
                c: classifier_data,
                c_with_nulls: classifier_data_eyenulls90,
                Mask(): eyemask,
            },
            mask=self.build_mask(nomask),
        )

    @parameter_space(method_name=['demean', 'zscore'])
    def test_cant_normalize_non_float(self, method_name):
        class DateFactor(Factor):
            dtype = datetime64ns_dtype
            inputs = ()
            window_length = 0

        d = DateFactor()
        with self.assertRaises(TypeError) as e:
            getattr(d, method_name)()

        errmsg = str(e.exception)
        expected = (
            "{normalizer}() is only defined on Factors of dtype float64,"
            " but it was called on a Factor of dtype datetime64[ns].").format(
                normalizer=method_name)

        self.assertEqual(errmsg, expected)

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_unmasked(self, seed):
        permute = partial(permute_rows, seed)

        shape = (6, 6)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs
        factor_data = permute(log1p(arange(36, dtype=float).reshape(shape)))

        f = self.f

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '2': f.quantiles(bins=2),
                '3': f.quantiles(bins=3),
                '6': f.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
            },
            expected={
                # The values in the input are all increasing, so the first half
                # of each row should be in the bottom bucket, and the second
                # half should be in the top bucket.
                '2':
                permuted_array([[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1],
                                [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1],
                                [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]]),
                # Similar for three buckets.
                '3':
                permuted_array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                                [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                                [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]),
                # In the limiting case, we just have every column different.
                '6':
                permuted_array([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5],
                                [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5],
                                [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    @parameter_space(seed=[1, 2, 3])
    def test_quantiles_masked(self, seed):
        permute = partial(permute_rows, seed)

        # 7 x 7 so that we divide evenly into 2/3/6-tiles after including the
        # nan value in each row.
        shape = (7, 7)

        # Shuffle the input rows to verify that we don't depend on the order.
        # Take the log to ensure that we don't depend on linear scaling or
        # integrality of inputs
        factor_data = permute(log1p(arange(49, dtype=float).reshape(shape)))
        factor_data_w_nans = where(
            permute(rot90(self.eye_mask(shape=shape))),
            factor_data,
            nan,
        )
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        f_nans = OtherF()
        m = Mask()

        # Apply the same shuffle we applied to the input rows to our
        # expectations. Doing it this way makes it obvious that our
        # expectation corresponds to our input, while still testing against
        # a range of input orderings.
        permuted_array = compose(permute, partial(array, dtype=int64_dtype))

        self.check_terms(
            terms={
                '2_masked': f.quantiles(bins=2, mask=m),
                '3_masked': f.quantiles(bins=3, mask=m),
                '6_masked': f.quantiles(bins=6, mask=m),
                '2_nans': f_nans.quantiles(bins=2),
                '3_nans': f_nans.quantiles(bins=3),
                '6_nans': f_nans.quantiles(bins=6),
            },
            initial_workspace={
                f: factor_data,
                f_nans: factor_data_w_nans,
                m: mask_data,
            },
            expected={
                # Expected results here are the same as in
                # test_quantiles_unmasked, except with diagonals of -1s
                # interpolated to match the effects of masking and/or input
                # nans.
                '2_masked':
                permuted_array([[-1, 0, 0, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1],
                                [0, 0, -1, 0, 1, 1, 1], [0, 0, 0, -1, 1, 1, 1],
                                [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, 1, 1, -1, 1],
                                [0, 0, 0, 1, 1, 1, -1]]),
                '3_masked':
                permuted_array([[-1, 0, 0, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2],
                                [0, 0, -1, 1, 1, 2, 2], [0, 0, 1, -1, 1, 2, 2],
                                [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, 1, 2, -1, 2],
                                [0, 0, 1, 1, 2, 2, -1]]),
                '6_masked':
                permuted_array([[-1, 0, 1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5],
                                [0, 1, -1, 2, 3, 4, 5], [0, 1, 2, -1, 3, 4, 5],
                                [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, 3, 4, -1, 5],
                                [0, 1, 2, 3, 4, 5, -1]]),
                '2_nans':
                permuted_array([[0, 0, 0, 1, 1, 1, -1], [0, 0, 0, 1, 1, -1, 1],
                                [0, 0, 0, 1, -1, 1, 1], [0, 0, 0, -1, 1, 1, 1],
                                [0, 0, -1, 0, 1, 1, 1], [0, -1, 0, 0, 1, 1, 1],
                                [-1, 0, 0, 0, 1, 1, 1]]),
                '3_nans':
                permuted_array([[0, 0, 1, 1, 2, 2, -1], [0, 0, 1, 1, 2, -1, 2],
                                [0, 0, 1, 1, -1, 2, 2], [0, 0, 1, -1, 1, 2, 2],
                                [0, 0, -1, 1, 1, 2, 2], [0, -1, 0, 1, 1, 2, 2],
                                [-1, 0, 0, 1, 1, 2, 2]]),
                '6_nans':
                permuted_array([[0, 1, 2, 3, 4, 5, -1], [0, 1, 2, 3, 4, -1, 5],
                                [0, 1, 2, 3, -1, 4, 5], [0, 1, 2, -1, 3, 4, 5],
                                [0, 1, -1, 2, 3, 4, 5], [0, -1, 1, 2, 3, 4, 5],
                                [-1, 0, 1, 2, 3, 4, 5]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantiles_uneven_buckets(self):
        permute = partial(permute_rows, 5)
        shape = (5, 5)

        factor_data = permute(log1p(arange(25, dtype=float).reshape(shape)))
        mask_data = permute(self.eye_mask(shape=shape))

        f = F()
        m = Mask()

        permuted_array = compose(permute, partial(array, dtype=int64_dtype))
        self.check_terms(
            terms={
                '3_masked': f.quantiles(bins=3, mask=m),
                '7_masked': f.quantiles(bins=7, mask=m),
            },
            initial_workspace={
                f: factor_data,
                m: mask_data,
            },
            expected={
                '3_masked':
                permuted_array([[-1, 0, 0, 1, 2], [0, -1, 0, 1, 2],
                                [0, 0, -1, 1, 2], [0, 0, 1, -1, 2],
                                [0, 0, 1, 2, -1]]),
                '7_masked':
                permuted_array([[-1, 0, 2, 4, 6], [0, -1, 2, 4, 6],
                                [0, 2, -1, 4, 6], [0, 2, 4, -1, 6],
                                [0, 2, 4, 6, -1]]),
            },
            mask=self.build_mask(self.ones_mask(shape=shape)),
        )

    def test_quantile_helpers(self):
        f = self.f
        m = Mask()

        self.assertIs(f.quartiles(), f.quantiles(bins=4))
        self.assertIs(f.quartiles(mask=m), f.quantiles(bins=4, mask=m))
        self.assertIsNot(f.quartiles(), f.quartiles(mask=m))

        self.assertIs(f.quintiles(), f.quantiles(bins=5))
        self.assertIs(f.quintiles(mask=m), f.quantiles(bins=5, mask=m))
        self.assertIsNot(f.quintiles(), f.quintiles(mask=m))

        self.assertIs(f.deciles(), f.quantiles(bins=10))
        self.assertIs(f.deciles(mask=m), f.quantiles(bins=10, mask=m))
        self.assertIsNot(f.deciles(), f.deciles(mask=m))
Example 14
 def zscore(row):
     return (row - nanmean(row)) / nanstd(row)
Example 15
 def compute(self, today, assets, out, close, k):
     difference = k * nanstd(close, axis=0)
     out.middle = middle = nanmean(close, axis=0)
     out.upper = middle + difference
     out.lower = middle - difference