class Filter(CompositeTerm):
    """
    Pipeline API expression producing boolean-valued outputs.

    Operator methods are not written out by hand: one method per entry of
    ``FILTER_BINOPS`` is generated with ``binary_operator`` and injected into
    the class namespace while the class body executes.
    """
    dtype = bool_dtype

    # Inside a class body ``locals()`` is the namespace the class is built
    # from, so updating it defines attributes on the class.  Note that
    # ``clsdict`` itself also ends up as a class attribute.
    clsdict = locals()
    # Forward operators, named by ``method_name_for_op(op)``.
    clsdict.update(
        {
            method_name_for_op(op): binary_operator(op)
            for op in FILTER_BINOPS
        }
    )
    # Reflected operators, named by ``method_name_for_op(op, commute=True)``.
    # They reuse the forward implementation; this presumes every operator in
    # FILTER_BINOPS is commutative -- TODO confirm.
    clsdict.update(
        {
            method_name_for_op(op, commute=True): binary_operator(op)
            for op in FILTER_BINOPS
        }
    )

    __invert__ = unary_operator('~')

    def _validate(self):
        """
        Run the superclass validation, then require ``self.dtype`` to be
        ``bool_dtype``.

        Returns
        -------
        The value returned by the superclass ``_validate``.

        Raises
        ------
        UnsupportedDataType
            If ``self.dtype`` is not ``bool_dtype``.
        """
        # Run superclass validation first so that we handle `dtype not passed`
        # before this.
        retval = super(Filter, self)._validate()
        if self.dtype != bool_dtype:
            raise UnsupportedDataType(
                typename=type(self).__name__,
                dtype=self.dtype
            )
        return retval
def binary_operator(op):
    """
    Build the method that implements binary operator ``op`` on a Factor.

    The function returned here is intended to be installed on a Factor
    subclass under a name such as ``__add__``.

    Parameters
    ----------
    op : str
        Operator symbol.  It is interpolated verbatim into the expression
        string handed to the result type.

    Returns
    -------
    binary_operator : callable
        A ``(self, other)`` method.  When called it returns an instance of
        ``binop_return_type(op)``, or raises ``BadBinaryOperator`` if
        ``other`` is not a NumExprFactor, a Factor or a Number.
    """
    # Getter for the reflected ("commuted") method of this operator.  It lets
    # us hand the work to a NumExprFactor found on the right-hand side.
    reflected_getter = attrgetter(method_name_for_op(op, commute=True))

    @preprocess(other=numbers_to_float64)
    @with_doc("Binary Operator: '%s'" % op)
    @with_name(method_name_for_op(op))
    def binary_operator(self, other):
        # Looked up per call rather than once in the factory: the types
        # returned by binop_return_type aren't defined yet when the factory
        # runs inside the class body of Factor.
        result_cls = binop_return_type(op)

        if isinstance(self, NumExprFactor):
            lhs, rhs, merged_inputs = self.build_binary_op(op, other)
            expr = "({left}) {op} ({right})".format(
                left=lhs,
                op=op,
                right=rhs,
            )
            out_dtype = binop_return_dtype(op, self.dtype, other.dtype)
            return result_cls(expr, merged_inputs, dtype=out_dtype)

        if isinstance(other, NumExprFactor):
            # NumericalExpression overrides the operators so that inputs are
            # merged correctly; call its reflected method with ourselves as
            # the argument.
            return reflected_getter(other)(self)

        if isinstance(other, Factor):
            # Same object on both sides: bind it once and refer to it twice.
            if self is other:
                template, binds = "x_0 {op} x_0", (self, )
            else:
                template, binds = "x_0 {op} x_1", (self, other)
            return result_cls(
                template.format(op=op),
                binds,
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        if isinstance(other, Number):
            # Interpret numeric literals as floats.
            return result_cls(
                "x_0 {op} ({constant})".format(op=op, constant=other),
                binds=(self, ),
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        raise BadBinaryOperator(op, self, other)

    return binary_operator
def binary_operator(op):
    """
    Factory for the binary-operator methods of Factor.

    Parameters
    ----------
    op : str
        Operator symbol; it is placed as-is into the generated expression.

    Returns
    -------
    binary_operator : callable
        Method taking ``(self, other)``, suitable for use as e.g.
        ``__add__``.  It builds an instance of ``binop_return_type(op)`` and
        raises ``BadBinaryOperator`` for unsupported right-hand operands.
    """
    # Name of the commuted method is fixed per operator, so resolve it once.
    # The getter is applied to a right-hand NumExprFactor to defer to its
    # reflected implementation.
    commuted_name = method_name_for_op(op, commute=True)
    get_commuted = attrgetter(commuted_name)

    @preprocess(other=numbers_to_float64)
    @with_doc("Binary Operator: '%s'" % op)
    @with_name(method_name_for_op(op))
    def binary_operator(self, other):
        # Must stay inside the method: the types returned by
        # binop_return_type aren't defined when the factory is invoked in the
        # class body of Factor.
        out_type = binop_return_type(op)

        if isinstance(self, NumExprFactor):
            left_expr, right_expr, inputs = self.build_binary_op(op, other)
            combined = "({left}) {op} ({right})".format(
                left=left_expr,
                op=op,
                right=right_expr,
            )
            return out_type(
                combined,
                inputs,
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        if isinstance(other, NumExprFactor):
            # The expression on the right knows how to merge inputs, so let
            # its reflected operator do the work with us as the argument.
            commuted = get_commuted(other)
            return commuted(self)

        if isinstance(other, Factor):
            # Two distinct factors need two bindings; a factor combined with
            # itself needs only one.
            if self is not other:
                return out_type(
                    "x_0 {op} x_1".format(op=op),
                    (self, other),
                    dtype=binop_return_dtype(op, self.dtype, other.dtype),
                )
            return out_type(
                "x_0 {op} x_0".format(op=op),
                (self,),
                dtype=binop_return_dtype(op, self.dtype, other.dtype),
            )

        if not isinstance(other, Number):
            raise BadBinaryOperator(op, self, other)

        # Interpret numeric literals as floats.
        literal_expr = "x_0 {op} ({constant})".format(op=op, constant=other)
        return out_type(
            literal_expr,
            binds=(self,),
            dtype=binop_return_dtype(op, self.dtype, other.dtype),
        )

    return binary_operator
def reflected_binary_operator(op):
    """
    Build the method that implements the reflected form of ``op`` on a Factor.

    The function returned here is intended to be installed under a name such
    as ``__radd__``.

    Parameters
    ----------
    op : str
        Operator symbol.  Must not be a comparison (checked with an
        ``assert`` on ``is_comparison(op)``).

    Returns
    -------
    reflected_binary_operator : callable
        A ``(self, other)`` method returning a ``NumExprFactor`` in which
        ``other`` is the left operand and ``self`` the right one.  It raises
        ``BadBinaryOperator`` when ``self`` is not a NumericalExpression and
        ``other`` is not a Number.
    """
    assert not is_comparison(op)

    @preprocess(other=numbers_to_float64)
    @with_name(method_name_for_op(op, commute=True))
    def reflected_binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            own_expr, their_expr, merged_inputs = self.build_binary_op(
                op, other
            )
            # Reflected call: the other operand goes on the left.
            expr = "({left}) {op} ({right})".format(
                left=their_expr,
                right=own_expr,
                op=op,
            )
            out_dtype = binop_return_dtype(op, other.dtype, self.dtype)
            return NumExprFactor(expr, merged_inputs, dtype=out_dtype)

        # Only have to handle the numeric case because in all other valid
        # cases the corresponding left-binding method will be called.
        if isinstance(other, Number):
            return NumExprFactor(
                "{constant} {op} x_0".format(op=op, constant=other),
                binds=(self, ),
                dtype=binop_return_dtype(op, other.dtype, self.dtype),
            )

        raise BadBinaryOperator(op, other, self)

    return reflected_binary_operator
def binary_operator(op):
    """
    Build the method that implements binary operator ``op`` on a Filter.

    The function returned here is intended to be installed on a Filter
    subclass under a name such as ``__and__`` or ``__or__``.

    Parameters
    ----------
    op : str
        Operator symbol.  It is interpolated verbatim into the expression
        string passed to ``NumExprFilter.create``.

    Returns
    -------
    binary_operator : callable
        A ``(self, other)`` method whose ``__doc__`` names the operator.  It
        raises ``BadBinaryOperator`` when ``other`` is a Term whose dtype is
        not ``bool_dtype``, or is of an unsupported type altogether.
    """
    # Getter for the commuted method of this operator.  It is applied to a
    # NumericalExpression on the right-hand side so that its own
    # implementation handles the combination.
    reflected_getter = attrgetter(method_name_for_op(op, commute=True))

    def binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            lhs, rhs, merged_inputs = self.build_binary_op(op, other)
            expr = "({left}) {op} ({right})".format(
                left=lhs,
                op=op,
                right=rhs,
            )
            return NumExprFilter.create(expr, merged_inputs)

        if isinstance(other, NumericalExpression):
            # NumericalExpression overrides numerical ops to correctly handle
            # merging of inputs, so call its right-binding operator with
            # ourselves as the argument.
            return reflected_getter(other)(self)

        if isinstance(other, Term):
            if other.dtype != bool_dtype:
                raise BadBinaryOperator(op, self, other)
            # Same object on both sides: bind it once and refer to it twice.
            if self is other:
                template, binds = "x_0 {op} x_0", (self,)
            else:
                template, binds = "x_0 {op} x_1", (self, other)
            return NumExprFilter.create(template.format(op=op), binds)

        if isinstance(other, int):
            # Note that this is true for bool as well
            constant_expr = "x_0 {op} {constant}".format(
                op=op,
                constant=int(other),
            )
            return NumExprFilter.create(constant_expr, binds=(self,))

        raise BadBinaryOperator(op, self, other)

    binary_operator.__doc__ = "Binary Operator: '%s'" % op
    return binary_operator
class Filter(CompositeTerm):
    """
    Pipeline API expression producing boolean-valued outputs.

    One method per entry of ``FILTER_BINOPS`` is generated with
    ``binary_operator`` and injected into the class namespace while the class
    body executes.
    """
    dtype = bool_

    # Inside a class body ``locals()`` is the namespace the class is built
    # from, so updating it defines attributes on the class.  Note that
    # ``clsdict`` itself also ends up as a class attribute.
    clsdict = locals()
    # Only the forward operators are installed here; no reflected variants
    # are registered in this block.
    clsdict.update(
        {method_name_for_op(op): binary_operator(op) for op in FILTER_BINOPS})
def binary_operator(op):
    """
    Factory for the binary-operator methods of Filter.

    Parameters
    ----------
    op : str
        Operator symbol; it is placed as-is into the generated expression.

    Returns
    -------
    binary_operator : callable
        Method taking ``(self, other)``, suitable for use as e.g. ``__and__``
        or ``__or__``.  It builds its result with ``NumExprFilter.create``
        and raises ``BadBinaryOperator`` for a non-boolean Term or an
        unsupported operand type.
    """
    # The commuted method name is fixed per operator, so resolve it once.
    # The getter is applied to a right-hand NumericalExpression to defer to
    # its interpretation of the operator.
    commuted_name = method_name_for_op(op, commute=True)
    get_commuted = attrgetter(commuted_name)

    def binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            left_expr, right_expr, inputs = self.build_binary_op(
                op,
                other,
            )
            combined = "({left}) {op} ({right})".format(
                left=left_expr,
                op=op,
                right=right_expr,
            )
            return NumExprFilter.create(combined, inputs)

        if isinstance(other, NumericalExpression):
            # The expression on the right knows how to merge inputs, so let
            # its right-binding operator do the work with us as the argument.
            commuted = get_commuted(other)
            return commuted(self)

        if isinstance(other, Term):
            if other.dtype != bool_dtype:
                raise BadBinaryOperator(op, self, other)
            # Two distinct terms need two bindings; a term combined with
            # itself needs only one.
            if self is not other:
                return NumExprFilter.create(
                    "x_0 {op} x_1".format(op=op),
                    (self, other),
                )
            return NumExprFilter.create(
                "x_0 {op} x_0".format(op=op),
                (self, ),
            )

        if not isinstance(other, int):
            raise BadBinaryOperator(op, self, other)

        # Note that the int check above is true for bool as well
        return NumExprFilter.create(
            "x_0 {op} {constant}".format(op=op, constant=int(other)),
            binds=(self, ),
        )

    binary_operator.__doc__ = "Binary Operator: '%s'" % op
    return binary_operator
def reflected_binary_operator(op):
    """
    Factory for the reflected binary-operator methods of Factor.

    Parameters
    ----------
    op : str
        Operator symbol.  Comparisons are rejected by an ``assert`` on
        ``is_comparison(op)``.

    Returns
    -------
    reflected_binary_operator : callable
        Method taking ``(self, other)``, suitable for use as e.g.
        ``__radd__``.  The result is a ``NumExprFactor`` with ``other`` as
        the left operand; ``BadBinaryOperator`` is raised when no branch
        applies.
    """
    assert not is_comparison(op)

    @preprocess(other=numbers_to_float64)
    @with_name(method_name_for_op(op, commute=True))
    def reflected_binary_operator(self, other):
        if isinstance(self, NumericalExpression):
            right_expr, left_expr, inputs = self.build_binary_op(
                op, other
            )
            # ``self`` is the right-hand operand of a reflected call, so its
            # sub-expression goes on the right.
            combined = "({left}) {op} ({right})".format(
                left=left_expr,
                right=right_expr,
                op=op,
            )
            return NumExprFactor(
                combined,
                inputs,
                dtype=binop_return_dtype(op, other.dtype, self.dtype)
            )

        # Only have to handle the numeric case because in all other valid
        # cases the corresponding left-binding method will be called.
        if not isinstance(other, Number):
            raise BadBinaryOperator(op, other, self)

        literal_expr = "{constant} {op} x_0".format(op=op, constant=other)
        return NumExprFactor(
            literal_expr,
            binds=(self,),
            dtype=binop_return_dtype(op, other.dtype, self.dtype),
        )

    return reflected_binary_operator
class Factor(CompositeTerm):
    """
    Pipeline API expression producing numerically-valued outputs.

    Arithmetic, comparison, unary and math-function methods are generated
    while the class body executes and injected into the class namespace; the
    explicitly written methods build ranking, percentile and NaN-related
    terms on top of those operators.
    """
    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.
    #
    # Inside a class body ``locals()`` is the namespace the class is built
    # from, so updating it defines attributes on the class.  Note that
    # ``clsdict`` itself also ends up as a class attribute.
    clsdict = locals()
    clsdict.update({
        method_name_for_op(op): binary_operator(op)
        # Don't override __eq__ because it breaks comparisons on tuples of
        # Factors.
        for op in MATH_BINOPS.union(COMPARISONS - {'=='})
    })
    # Reflected variants exist only for the math operators.
    clsdict.update({
        method_name_for_op(op, commute=True): reflected_binary_operator(op)
        for op in MATH_BINOPS
    })
    clsdict.update({unary_op_name(op): unary_operator(op) for op in UNARY_OPS})
    clsdict.update({
        funcname: function_application(funcname)
        for funcname in NUMEXPR_MATH_FUNCS
    })

    # Reuse the ``__div__``/``__rdiv__`` entries registered above for true
    # division.
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Equality is exposed as a named method because ``__eq__`` is
    # deliberately left alone (see the comment in the first update above).
    eq = binary_operator('==')

    def _validate(self):
        """
        Run the superclass validation, then require ``self.dtype`` to be one
        of ``FACTOR_DTYPES``.

        Returns
        -------
        The value returned by the superclass ``_validate``.

        Raises
        ------
        UnsupportedDataType
            If ``self.dtype`` is not in ``FACTOR_DTYPES``.
        """
        # Do superclass validation first so that `NotSpecified` dtypes get
        # handled.
        retval = super(Factor, self)._validate()
        if self.dtype not in FACTOR_DTYPES:
            raise UnsupportedDataType(typename=type(self).__name__,
                                      dtype=self.dtype)
        return retval

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, ranks are computed ignoring any asset/date
            pairs for which `mask` produces a value of False.

        Returns
        -------
        ranks : zipline.pipeline.factors.Rank
            A new factor that will compute the ranking of the data produced by
            `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.

        See that function's documentation for a full description of the valid
        inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        scipy.stats.rankdata
        zipline.lib.rank.masked_rankdata_2d
        zipline.pipeline.factors.factor.Rank
        """
        return Rank(self, method=method, ascending=ascending, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, top values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in descending order so that the largest values get the
        # smallest ranks, then keep ranks up to N.
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, bottom values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.Filter
        """
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
            Return True for assets falling above this percentile in the data.
        max_percentile : float [0.0, 100.0]
            Return True for assets falling below this percentile in the data.

        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when calculating
            percentile thresholds.  If mask is supplied, percentile cutoffs
            are computed each day using only assets for which `mask` returns
            True, and assets not passing `mask` will produce False in the
            output of this filter as well.

        Returns
        -------
        out : zipline.pipeline.filters.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.pipeline.filters.filter.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )

    def isnan(self):
        """
        A Filter producing True for all values where this Factor is NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        # NaN is the only value that compares unequal to itself.
        return self != self

    def notnan(self):
        """
        A Filter producing True for values where this Factor is not NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        return ~self.isnan()

    def isfinite(self):
        """
        A Filter producing True for values where this Factor is anything but
        NaN, inf, or -inf.
        """
        # Comparisons against NaN are False, so NaN fails both bounds.
        return (-inf < self) & (self < inf)
class Filter(RestrictedDTypeMixin, ComputableTerm): """ Pipeline expression computing a boolean output. Filters are most commonly useful for describing sets of assets to include or exclude for some particular purpose. Many Pipeline API functions accept a ``mask`` argument, which can be supplied a Filter indicating that only values passing the Filter should be considered when performing the requested computation. For example, :meth:`zipline.pipeline.Factor.top` accepts a mask indicating that ranks should be computed only on assets that passed the specified Filter. The most common way to construct a Filter is via one of the comparison operators (``<``, ``<=``, ``!=``, ``eq``, ``>``, ``>=``) of :class:`~zipline.pipeline.Factor`. For example, a natural way to construct a Filter for stocks with a 10-day VWAP less than $20.0 is to first construct a Factor computing 10-day VWAP and compare it to the scalar value 20.0:: >>> from zipline.pipeline.factors import VWAP >>> vwap_10 = VWAP(window_length=10) >>> vwaps_under_20 = (vwap_10 <= 20) Filters can also be constructed via comparisons between two Factors. For example, to construct a Filter producing True for asset/date pairs where the asset's 10-day VWAP was greater than it's 30-day VWAP:: >>> short_vwap = VWAP(window_length=10) >>> long_vwap = VWAP(window_length=30) >>> higher_short_vwap = (short_vwap > long_vwap) Filters can be combined via the ``&`` (and) and ``|`` (or) operators. ``&``-ing together two filters produces a new Filter that produces True if **both** of the inputs produced True. ``|``-ing together two filters produces a new Filter that produces True if **either** of its inputs produced True. The ``~`` operator can be used to invert a Filter, swapping all True values with Falses and vice-versa. Filters may be set as the ``screen`` attribute of a Pipeline, indicating asset/date pairs for which the filter produces False should be excluded from the Pipeline's output. 
This is useful both for reducing noise in the output of a Pipeline and for reducing memory consumption of Pipeline results. """ # Filters are window-safe by default, since a yes/no decision means the # same thing from all temporal perspectives. window_safe = True ALLOWED_DTYPES = (bool_dtype, ) # Used by RestrictedDTypeMixin dtype = bool_dtype clsdict = locals() clsdict.update( {method_name_for_op(op): binary_operator(op) for op in FILTER_BINOPS}) clsdict.update({ method_name_for_op(op, commute=True): binary_operator(op) for op in FILTER_BINOPS }) __invert__ = unary_operator('~') def _validate(self): # Run superclass validation first so that we handle `dtype not passed` # before this. retval = super(Filter, self)._validate() if self.dtype != bool_dtype: raise UnsupportedDataType(typename=type(self).__name__, dtype=self.dtype) return retval @classlazyval def _downsampled_type(self): return DownsampledMixin.make_downsampled_type(Filter) @classlazyval def _aliased_type(self): return AliasedMixin.make_aliased_type(Filter)
class Factor(RestrictedDTypeMixin, ComputableTerm):
    """
    Pipeline API expression producing a numerical or date-valued output.

    Factors are the most commonly-used Pipeline term, representing the result
    of any computation producing a numerical result.

    Factors can be combined, both with other Factors and with scalar values,
    via any of the builtin mathematical operators (``+``, ``-``, ``*``, etc).
    This makes it easy to write complex expressions that combine multiple
    Factors.  For example, constructing a Factor that computes the average of
    two other Factors is simply::

        >>> f1 = SomeFactor(...)
        >>> f2 = SomeOtherFactor(...)
        >>> average = (f1 + f2) / 2.0

    Factors can also be converted into :class:`zipline.pipeline.Filter` objects
    via comparison operators: (``<``, ``<=``, ``!=``, ``eq``, ``>``, ``>=``).

    There are many natural operators defined on Factors besides the basic
    numerical operators. These include methods identifying missing or
    extreme-valued outputs (isnull, notnull, isnan, notnan), methods for
    normalizing outputs (rank, demean, zscore), and methods for constructing
    Filters based on rank-order properties of results (top, bottom,
    percentile_between).
    """
    ALLOWED_DTYPES = FACTOR_DTYPES  # Used by RestrictedDTypeMixin

    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.
    #
    # Inside a class body ``locals()`` is the namespace the class is built
    # from, so updating it defines attributes on the class.  Note that
    # ``clsdict`` itself also ends up as a class attribute.
    clsdict = locals()
    clsdict.update({
        method_name_for_op(op): binary_operator(op)
        # Don't override __eq__ because it breaks comparisons on tuples of
        # Factors.

        for op in MATH_BINOPS.union(COMPARISONS - {'=='})
    })
    # Reflected variants exist only for the math operators.
    clsdict.update({
        method_name_for_op(op, commute=True): reflected_binary_operator(op)
        for op in MATH_BINOPS
    })
    clsdict.update({unary_op_name(op): unary_operator(op) for op in UNARY_OPS})
    clsdict.update({
        funcname: function_application(funcname)
        for funcname in NUMEXPR_MATH_FUNCS
    })

    # Reuse the ``__div__``/``__rdiv__`` entries registered above for true
    # division.
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Equality is exposed as a named method because ``__eq__`` is
    # deliberately left alone (see the comment in the first update above).
    eq = binary_operator('==')

    @expect_types(
        mask=(Filter, NotSpecifiedType),
        groupby=(Classifier, NotSpecifiedType),
    )
    @float64_only
    def demean(self, mask=NotSpecified, groupby=NotSpecified):
        """
        Construct a Factor that computes ``self`` and subtracts the mean from
        each row of the result.

        If ``mask`` is supplied, ignore values where ``mask`` returns False
        when computing row means, and output NaN anywhere the mask is False.

        If ``groupby`` is supplied, compute by partitioning each row based on
        the values produced by ``groupby``, de-meaning the partitioned arrays,
        and stitching the sub-results back together.

        Parameters
        ----------
        mask : zipline.pipeline.Filter, optional
            A Filter defining values to ignore when computing means.
        groupby : zipline.pipeline.Classifier, optional
            A classifier defining partitions over which to compute means.

        Returns
        -------
        demeaned : GroupedRowTransform
            A term applying ``row - nanmean(row)`` to the output of ``self``.

        Example
        -------
        Let ``f`` be a Factor which would produce the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13    1.0    2.0    3.0    4.0
            2017-03-14    1.5    2.5    3.5    1.0
            2017-03-15    2.0    3.0    4.0    1.5
            2017-03-16    2.5    3.5    1.0    2.0

        Let ``c`` be a Classifier producing the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13      1      1      2      2
            2017-03-14      1      1      2      2
            2017-03-15      1      1      2      2
            2017-03-16      1      1      2      2

        Let ``m`` be a Filter producing the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13  False   True   True   True
            2017-03-14   True  False   True   True
            2017-03-15   True   True  False   True
            2017-03-16   True   True   True  False

        Then ``f.demean()`` will subtract the mean from each row produced by
        ``f``.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13 -1.500 -0.500  0.500  1.500
            2017-03-14 -0.625  0.375  1.375 -1.125
            2017-03-15 -0.625  0.375  1.375 -1.125
            2017-03-16  0.250  1.250 -1.250 -0.250

        ``f.demean(mask=m)`` will subtract the mean from each row, but means
        will be calculated ignoring values on the diagonal, and NaNs will be
        written to the diagonal in the output. Diagonal values are ignored
        because they are the locations where the mask ``m`` produced False.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13    NaN -1.000  0.000  1.000
            2017-03-14 -0.500    NaN  1.500 -1.000
            2017-03-15 -0.166  0.833    NaN -0.666
            2017-03-16  0.166  1.166 -1.333    NaN

        ``f.demean(groupby=c)`` will subtract the group-mean of AAPL/MSFT and
        MCD/BK from their respective entries.  The AAPL/MSFT are grouped
        together because both assets always produce 1 in the output of the
        classifier ``c``.  Similarly, MCD/BK are grouped together because they
        always produce 2.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13 -0.500  0.500 -0.500  0.500
            2017-03-14 -0.500  0.500  1.250 -1.250
            2017-03-15 -0.500  0.500  1.250 -1.250
            2017-03-16 -0.500  0.500 -0.500  0.500

        ``f.demean(mask=m, groupby=c)`` will also subtract the group-mean of
        AAPL/MSFT and MCD/BK, but means will be calculated ignoring values on
        the diagonal, and NaNs will be written to the diagonal in the output.

        ::

                         AAPL   MSFT    MCD     BK
            2017-03-13    NaN  0.000 -0.500  0.500
            2017-03-14  0.000    NaN  1.250 -1.250
            2017-03-15 -0.500  0.500    NaN  0.000
            2017-03-16 -0.500  0.500  0.000    NaN

        Notes
        -----
        Mean is sensitive to the magnitudes of outliers. When working with
        a Factor that can potentially produce large outliers, it is often
        useful to use the ``mask`` parameter to discard values at the extremes
        of the distribution::

            >>> base = MyFactor(...)
            >>> normalized = base.demean(mask=base.percentile_between(1, 99))

        ``demean()`` is only supported on Factors of dtype float64.

        See Also
        --------
        :meth:`pandas.DataFrame.groupby`
        """
        return GroupedRowTransform(
            # nanmean, not mean: NaN entries must not poison the row mean.
            transform=lambda row: row - nanmean(row),
            factor=self,
            mask=mask,
            groupby=groupby,
        )

    @expect_types(
        mask=(Filter, NotSpecifiedType),
        groupby=(Classifier, NotSpecifiedType),
    )
    @float64_only
    def zscore(self, mask=NotSpecified, groupby=NotSpecified):
        """
        Construct a Factor that Z-Scores each day's results.

        The Z-Score of a row is defined as::

            (row - row.mean()) / row.stddev()

        If ``mask`` is supplied, ignore values where ``mask`` returns False
        when computing row means and standard deviations, and output NaN
        anywhere the mask is False.

        If ``groupby`` is supplied, compute by partitioning each row based on
        the values produced by ``groupby``, z-scoring the partitioned arrays,
        and stitching the sub-results back together.

        Parameters
        ----------
        mask : zipline.pipeline.Filter, optional
            A Filter defining values to ignore when Z-Scoring.
        groupby : zipline.pipeline.Classifier, optional
            A classifier defining partitions over which to compute Z-Scores.

        Returns
        -------
        zscored : zipline.pipeline.Factor
            A Factor that z-scores the output of self.

        Notes
        -----
        Mean and standard deviation are sensitive to the magnitudes of
        outliers. When working with a Factor that can potentially produce
        large outliers, it is often useful to use the ``mask`` parameter to
        discard values at the extremes of the distribution::

            >>> base = MyFactor(...)
            >>> normalized = base.zscore(mask=base.percentile_between(1, 99))

        ``zscore()`` is only supported on Factors of dtype float64.

        Example
        -------
        See :meth:`~zipline.pipeline.factors.Factor.demean` for an in-depth
        example of the semantics for ``mask`` and ``groupby``.

        See Also
        --------
        :meth:`pandas.DataFrame.groupby`
        """
        return GroupedRowTransform(
            # NaN-aware mean and standard deviation of the row.
            transform=lambda row: (row - nanmean(row)) / nanstd(row),
            factor=self,
            mask=mask,
            groupby=groupby,
        )

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, ranks are computed ignoring any asset/date
            pairs for which `mask` produces a value of False.

        Returns
        -------
        ranks : zipline.pipeline.factors.Rank
            A new factor that will compute the ranking of the data produced by
            `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.  See that function's documentation for a full
        description of the valid inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        :func:`scipy.stats.rankdata`
        :class:`zipline.pipeline.factors.factor.Rank`
        """
        return Rank(self, method=method, ascending=ascending, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, top values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in descending order so that the largest values get the
        # smallest ranks, then keep ranks up to N.
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when computing ranks.
            If mask is supplied, bottom values are computed ignoring any
            asset/date pairs for which `mask` produces a value of False.

        Returns
        -------
        filter : zipline.pipeline.Filter
        """
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
            Return True for assets falling above this percentile in the data.
        max_percentile : float [0.0, 100.0]
            Return True for assets falling below this percentile in the data.
        mask : zipline.pipeline.Filter, optional
            A Filter representing assets to consider when calculating
            percentile thresholds.  If mask is supplied, percentile cutoffs
            are computed each day using only assets for which ``mask`` returns
            True.  Assets for which ``mask`` produces False will produce False
            in the output of this Factor as well.

        Returns
        -------
        out : zipline.pipeline.filters.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.pipeline.filters.filter.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )

    def isnull(self):
        """
        A Filter producing True for values where this Factor has missing data.

        Equivalent to self.isnan() when ``self.dtype`` is float64.
        Otherwise equivalent to ``self.eq(self.missing_value)``.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        if self.dtype == float64_dtype:
            # Using isnan is more efficient when possible because we can fold
            # the isnan computation with other NumExpr expressions.
            return self.isnan()
        else:
            return NullFilter(self)

    def notnull(self):
        """
        A Filter producing True for values where this Factor has complete data.

        Equivalent to ``~self.isnan()`` when ``self.dtype`` is float64.
        Otherwise equivalent to ``(self != self.missing_value)``.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        return ~self.isnull()

    @if_not_float64_tell_caller_to_use_isnull
    def isnan(self):
        """
        A Filter producing True for all values where this Factor is NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        # NaN is the only value that compares unequal to itself.
        return self != self

    @if_not_float64_tell_caller_to_use_isnull
    def notnan(self):
        """
        A Filter producing True for values where this Factor is not NaN.

        Returns
        -------
        nanfilter : zipline.pipeline.filters.Filter
        """
        return ~self.isnan()

    @if_not_float64_tell_caller_to_use_isnull
    def isfinite(self):
        """
        A Filter producing True for values where this Factor is anything but
        NaN, inf, or -inf.
        """
        # Comparisons against NaN are False, so NaN fails both bounds.
        return (-inf < self) & (self < inf)
class Factor(Term):
    """
    Pipeline API expression producing numerically-valued outputs.

    Arithmetic, comparison, unary and math-function methods are generated
    while the class body executes and injected into the class namespace; the
    explicitly written methods build ranking and percentile terms.
    """
    dtype = float64

    # Dynamically add functions for creating NumExprFactor/NumExprFilter
    # instances.
    #
    # Inside a class body ``locals()`` is the namespace the class is built
    # from, so updating it defines attributes on the class.  Note that
    # ``clsdict`` itself also ends up as a class attribute.
    clsdict = locals()
    clsdict.update({
        method_name_for_op(op): binary_operator(op)
        # Don't override __eq__ because it breaks comparisons on tuples of
        # Factors.
        for op in MATH_BINOPS.union(COMPARISONS - {'=='})
    })
    # Reflected variants exist only for the math operators.
    clsdict.update({
        method_name_for_op(op, commute=True): reflected_binary_operator(op)
        for op in MATH_BINOPS
    })
    # NOTE(review): the key is the constant '__neg__' for every op, so if
    # UNARY_OPS ever holds more than one operator only the last one iterated
    # is kept -- confirm UNARY_OPS contains just the negation operator.
    clsdict.update({'__neg__': unary_operator(op) for op in UNARY_OPS})
    clsdict.update({
        funcname: function_application(funcname)
        for funcname in NUMEXPR_MATH_FUNCS
    })

    # Reuse the ``__div__``/``__rdiv__`` entries registered above for true
    # division.
    __truediv__ = clsdict['__div__']
    __rtruediv__ = clsdict['__rdiv__']

    # Equality is exposed as a named method because ``__eq__`` is
    # deliberately left alone (see the comment in the first update above).
    eq = binary_operator('==')

    def rank(self, method='ordinal', ascending=True, mask=NotSpecified):
        """
        Construct a new Factor representing the sorted rank of each column
        within each row.

        Parameters
        ----------
        method : str, {'ordinal', 'min', 'max', 'dense', 'average'}
            The method used to assign ranks to tied elements. See
            `scipy.stats.rankdata` for a full description of the semantics for
            each ranking method. Default is 'ordinal'.
        ascending : bool, optional
            Whether to return sorted rank in ascending or descending order.
            Default is True.
        mask : zipline.pipeline.Filter, optional
            Passed through unchanged to ``Rank``.  Defaults to
            ``NotSpecified``.

        Returns
        -------
        ranks : zipline.pipeline.factors.Rank
            A new factor that will compute the ranking of the data produced by
            `self`.

        Notes
        -----
        The default value for `method` is different from the default for
        `scipy.stats.rankdata`.  See that function's documentation for a full
        description of the valid inputs to `method`.

        Missing or non-existent data on a given day will cause an asset to be
        given a rank of NaN for that day.

        See Also
        --------
        scipy.stats.rankdata
        zipline.lib.rank
        zipline.pipeline.factors.Rank
        """
        # Descending order is obtained by ranking the negated factor.
        return Rank(self if ascending else -self, method=method, mask=mask)

    def top(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the top N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.Filter
            Filter to apply as a mask before computing top values.

        Returns
        -------
        filter : zipline.pipeline.filters.Filter
        """
        # Rank in descending order so that the largest values get the
        # smallest ranks, then keep ranks up to N.
        return self.rank(ascending=False, mask=mask) <= N

    def bottom(self, N, mask=NotSpecified):
        """
        Construct a Filter matching the bottom N asset values of self each day.

        Parameters
        ----------
        N : int
            Number of assets passing the returned filter each day.
        mask : zipline.pipeline.filters.Filter
            Filter to apply as a mask before computing bottom values.

        Returns
        -------
        filter : zipline.pipeline.Filter
        """
        return self.rank(ascending=True, mask=mask) <= N

    def percentile_between(self,
                           min_percentile,
                           max_percentile,
                           mask=NotSpecified):
        """
        Construct a new Filter representing entries from the output of this
        Factor that fall within the percentile range defined by min_percentile
        and max_percentile.

        Parameters
        ----------
        min_percentile : float [0.0, 100.0]
        max_percentile : float [0.0, 100.0]
        mask : zipline.pipeline.Filter, optional
            Passed through unchanged to ``PercentileFilter``.  Defaults to
            ``NotSpecified``.

        Returns
        -------
        out : zipline.pipeline.filters.PercentileFilter
            A new filter that will compute the specified percentile-range mask.

        See Also
        --------
        zipline.pipeline.filters.PercentileFilter
        """
        return PercentileFilter(
            self,
            min_percentile=min_percentile,
            max_percentile=max_percentile,
            mask=mask,
        )
class Filter(RestrictedDTypeMixin, ComputableTerm):
    """
    Pipeline expression computing a boolean output.

    Filters are most commonly used to describe sets of assets to include or
    exclude for some particular purpose. Many Pipeline API functions accept a
    ``mask`` argument, which can be supplied a Filter indicating that only
    values passing the Filter should be considered when performing the
    requested computation. For example, :meth:`zipline.pipeline.Factor.top`
    accepts a mask indicating that ranks should be computed only on assets
    that passed the specified Filter.

    The most common way to construct a Filter is via one of the comparison
    operators (``<``, ``<=``, ``!=``, ``eq``, ``>``, ``>=``) of
    :class:`~zipline.pipeline.Factor`. For example, a natural way to construct
    a Filter for stocks with a 10-day VWAP of at most $20.0 is to first
    construct a Factor computing 10-day VWAP and compare it to the scalar
    value 20.0::

        >>> from zipline.pipeline.factors import VWAP
        >>> vwap_10 = VWAP(window_length=10)
        >>> vwaps_under_20 = (vwap_10 <= 20)

    Filters can also be constructed via comparisons between two Factors. For
    example, to construct a Filter producing True for asset/date pairs where
    the asset's 10-day VWAP was greater than its 30-day VWAP::

        >>> short_vwap = VWAP(window_length=10)
        >>> long_vwap = VWAP(window_length=30)
        >>> higher_short_vwap = (short_vwap > long_vwap)

    Filters can be combined via the ``&`` (and) and ``|`` (or) operators.

    ``&``-ing together two filters produces a new Filter that produces True if
    **both** of the inputs produced True.

    ``|``-ing together two filters produces a new Filter that produces True if
    **either** of its inputs produced True.

    The ``~`` operator can be used to invert a Filter, swapping all True
    values with Falses and vice-versa.

    Filters may be set as the ``screen`` attribute of a Pipeline, indicating
    asset/date pairs for which the filter produces False should be excluded
    from the Pipeline's output. This is useful both for reducing noise in the
    output of a Pipeline and for reducing memory consumption of Pipeline
    results.
    """
    # A yes/no decision means the same thing from every temporal
    # perspective, so Filters are window-safe by default.
    window_safe = True

    # Consumed by RestrictedDTypeMixin.
    ALLOWED_DTYPES = FILTER_DTYPES
    dtype = bool_dtype

    # Generated binary-operator methods are written straight into the class
    # namespace.
    clsdict = locals()

    # Forward operators, one per entry of FILTER_BINOPS.
    clsdict.update({
        method_name_for_op(op): binary_operator(op)
        for op in FILTER_BINOPS
    })
    # The commuted method names are bound to the same `binary_operator(op)`
    # implementation as the forward ones.
    clsdict.update({
        method_name_for_op(op, commute=True): binary_operator(op)
        for op in FILTER_BINOPS
    })

    __invert__ = unary_operator('~')

    def _validate(self):
        """
        Run the superclass validation, then require a boolean dtype.

        Returns
        -------
        The value returned by the superclass ``_validate``.

        Raises
        ------
        UnsupportedDataType
            If ``self.dtype`` is not ``bool_dtype``.
        """
        # The superclass goes first so that a missing dtype is reported by
        # it before the boolean check below runs.
        parent_result = super(Filter, self)._validate()

        if self.dtype != bool_dtype:
            raise UnsupportedDataType(
                typename=type(self).__name__,
                dtype=self.dtype,
            )

        return parent_result

    @classmethod
    def _principal_computable_term_type(cls):
        """Return ``Filter``, regardless of which subclass this is called on."""
        return Filter

    @expect_types(if_true=ComputableTerm, if_false=ComputableTerm)
    def if_else(self, if_true, if_false):
        """
        Create a term that selects values from one of two choices.

        Parameters
        ----------
        if_true : zipline.pipeline.term.ComputableTerm
            Expression whose values should be used at locations where this
            filter outputs True.
        if_false : zipline.pipeline.term.ComputableTerm
            Expression whose values should be used at locations where this
            filter outputs False.

        Returns
        -------
        merged : zipline.pipeline.term.ComputableTerm
           A term that computes by taking values from either ``if_true`` or
           ``if_false``, depending on the values produced by ``self``.

           The returned term draws from ``if_true`` at locations where
           ``self`` produces True, and it draws from ``if_false`` at
           locations where ``self`` produces False.

        Raises
        ------
        TypeError
            If ``if_true`` and ``if_false`` have different principal
            computable term types, or different dtypes.
        ValueError
            If ``if_true`` and ``if_false`` have different outputs or
            different missing values.

        Examples
        --------

        Let ``f`` be a Factor that produces the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13    1.0    2.0    3.0    4.0
            2017-03-14    5.0    6.0    7.0    8.0

        Let ``g`` be another Factor that produces the following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13   10.0   20.0   30.0   40.0
            2017-03-14   50.0   60.0   70.0   80.0

        Finally, let ``condition`` be a Filter that produces the following
        output::

                         AAPL   MSFT    MCD     BK
            2017-03-13   True  False   True  False
            2017-03-14   True   True  False  False

        Then, the expression ``condition.if_else(f, g)`` produces the
        following output::

                         AAPL   MSFT    MCD     BK
            2017-03-13    1.0   20.0    3.0   40.0
            2017-03-14    5.0    6.0   70.0   80.0

        See Also
        --------
        numpy.where
        Factor.fillna
        """
        # Both branches must agree on term kind, dtype, outputs and missing
        # value before they can be merged; checks run in that order.
        kind_if_true = if_true._principal_computable_term_type()
        kind_if_false = if_false._principal_computable_term_type()

        if kind_if_true is not kind_if_false:
            type_message = (
                "Mismatched types in if_else(): if_true={}, but if_false={}"
            )
            raise TypeError(
                type_message.format(
                    kind_if_true.__name__,
                    kind_if_false.__name__,
                )
            )

        if if_true.dtype != if_false.dtype:
            dtype_message = (
                "Mismatched dtypes in if_else(): "
                "if_true.dtype = {}, if_false.dtype = {}"
            )
            raise TypeError(
                dtype_message.format(if_true.dtype, if_false.dtype)
            )

        if if_true.outputs != if_false.outputs:
            outputs_message = (
                "Mismatched outputs in if_else(): "
                "if_true.outputs = {}, if_false.outputs = {}"
            )
            raise ValueError(
                outputs_message.format(if_true.outputs, if_false.outputs)
            )

        if not same(if_true.missing_value, if_false.missing_value):
            missing_message = (
                "Mismatched missing values in if_else(): "
                "if_true.missing_value = {!r}, if_false.missing_value = {!r}"
            )
            raise ValueError(
                missing_message.format(
                    if_true.missing_value,
                    if_false.missing_value,
                )
            )

        # The result class is derived from the concrete type of `if_true`,
        # combined with IfElseMixin.
        merged_type = type(if_true)._with_mixin(IfElseMixin)
        return merged_type(
            condition=self,
            if_true=if_true,
            if_false=if_false,
        )