class EquitySessionDomain(Domain):
    """A domain built directly from an index of sessions. Mostly useful for
    testing.

    Parameters
    ----------
    sessions : pd.DatetimeIndex
        Sessions to use as output labels for pipelines run on this domain.
    country_code : str
        ISO 3166 country code of equities to be used with this domain.
    data_query_time : datetime.time, optional
        The time of day when data should no longer be considered available
        for a session.
    data_query_date_offset : int, optional
        The number of days to add to the session label before applying the
        ``data_query_time``. This can be used to express that the cutoff
        time for a session falls on a different calendar day from the
        session label.
    """
    @expect_types(
        sessions=pd.DatetimeIndex,
        country_code=str,
        data_query_time=optional(datetime.time),
        data_query_date_offset=int,
        __funcname='EquitySessionDomain',
    )
    def __init__(self,
                 sessions,
                 country_code,
                 data_query_time=None,
                 data_query_date_offset=0):
        self._country_code = country_code
        self._sessions = sessions

        if data_query_time is None:
            data_query_time = datetime.time(0, 0, tzinfo=pytz.timezone('UTC'))

        if data_query_time.tzinfo is None:
            raise ValueError("data_query_time cannot be tz-naive")

        self._data_query_time = data_query_time
        self._data_query_date_offset = data_query_date_offset

    @property
    def country_code(self):
        return self._country_code

    def all_sessions(self):
        return self._sessions

    def data_query_cutoff_for_sessions(self, sessions):
        return days_at_time(
            sessions,
            self._data_query_time,
            self._data_query_time.tzinfo,
            self._data_query_date_offset,
        )
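
# --- Usage sketch (illustrative addition, not from the original source) ---
# Constructing the domain above and computing per-session cutoffs; pd,
# datetime, and pytz are this module's existing imports. A date offset of
# -1 places each cutoff at 8:45 UTC on the day before its session label.
sessions = pd.date_range('2014-01-02', '2014-01-06', freq='B', tz='UTC')
domain = EquitySessionDomain(
    sessions,
    country_code='US',
    data_query_time=datetime.time(8, 45, tzinfo=pytz.timezone('UTC')),
    data_query_date_offset=-1,
)
print(domain.data_query_cutoff_for_sessions(sessions))  # one cutoff per session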
def test_expect_optional_types(self):
    @expect_types(a=optional(int))
    def foo(a=None):
        return a

    assert foo() is None
    assert foo(None) is None
    assert foo(a=None) is None

    assert foo(1) == 1
    assert foo(a=1) == 1

    expected_message = (
        "{qualname}() expected a value of "
        "type int or NoneType for argument 'a', but got str instead."
    ).format(qualname=qualname(foo))
    with pytest.raises(TypeError, match=re.escape(expected_message)):
        foo("1")
def test_expect_optional_types(self):
    @expect_types(a=optional(int))
    def foo(a=None):
        return a

    self.assertIs(foo(), None)
    self.assertIs(foo(None), None)
    self.assertIs(foo(a=None), None)

    self.assertEqual(foo(1), 1)
    self.assertEqual(foo(a=1), 1)

    with self.assertRaises(TypeError) as e:
        foo("1")

    expected_message = (
        "{qualname}() expected a value of "
        "type int or NoneType for argument 'a', but got str instead."
    ).format(qualname=qualname(foo))
    self.assertEqual(e.exception.args[0], expected_message)
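# Illustrative sketch of the validator under test (module path assumed):
# ``optional(int)`` builds an (int, NoneType) check, so None and ints pass
# while anything else raises a descriptive TypeError.
from zipline.utils.input_validation import expect_types, optional

@expect_types(window_length=optional(int))
def make_window(window_length=None):
    # Treat None as "use the default length".
    return 10 if window_length is None else window_length

assert make_window() == 10
assert make_window(window_length=5) == 5
try:
    make_window('5')  # str is neither int nor NoneType
except TypeError as e:
    print(e)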
class LabelArray(ndarray):
    """
    An ndarray subclass for working with arrays of strings.

    Factorizes the input array into integers, but overloads equality on
    strings to check against the factor label.

    Parameters
    ----------
    values : array-like
        Array of values that can be passed to np.asarray with dtype=object.
    missing_value : str
        Scalar value to treat as 'missing' for operations on ``self``.
    categories : list[str], optional
        List of values to use as categories. If not supplied, categories
        will be inferred as the unique set of entries in ``values``.
    sort : bool, optional
        Whether to sort categories. If sort is False and categories is
        supplied, they are left in the order provided. If sort is False and
        categories is None, categories will be constructed in a random order.

    Attributes
    ----------
    categories : ndarray[str]
        An array containing the unique labels of self.
    reverse_categories : dict[str -> int]
        Reverse lookup table for ``categories``. Stores the index in
        ``categories`` at which each unique entry is found.
    missing_value : str or None
        A sentinel missing value with NaN semantics for comparisons.

    Notes
    -----
    Consumers should be cautious when passing instances of LabelArray to
    numpy functions. We attempt to disallow as many meaningless operations
    as possible, but since a LabelArray is just an ndarray of ints with some
    additional metadata, many numpy functions (for example, trigonometric)
    will happily accept a LabelArray and treat its values as though they
    were integers.

    In a future change, we may be able to disallow more numerical operations
    by creating a wrapper dtype which doesn't register an implementation for
    most numpy ufuncs. Until that change is made, consumers of LabelArray
    should assume that it is undefined behavior to pass a LabelArray to any
    numpy ufunc that operates on semantically-numerical data.

    See Also
    --------
    http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
    """
    SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))

    @preprocess(
        values=coerce(list, partial(np.asarray, dtype=object)),
        categories=coerce(np.ndarray, list),
    )
    @expect_types(
        values=np.ndarray,
        missing_value=SUPPORTED_SCALAR_TYPES,
        categories=optional(list),
    )
    @expect_kinds(values=("O", "S", "U"))
    def __new__(cls, values, missing_value, categories=None, sort=True):
        # Numpy's fixed-width string types aren't very efficient. Working
        # with object arrays is faster than bytes or unicode arrays in
        # almost all cases.
        if not is_object(values):
            values = values.astype(object)

        if categories is None:
            codes, categories, reverse_categories = factorize_strings(
                values.ravel(),
                missing_value=missing_value,
                sort=sort,
            )
        else:
            codes, categories, reverse_categories = (
                factorize_strings_known_categories(
                    values.ravel(),
                    categories=categories,
                    missing_value=missing_value,
                    sort=sort,
                )
            )
        categories.setflags(write=False)

        return cls._from_codes_and_metadata(
            codes=codes.reshape(values.shape),
            categories=categories,
            reverse_categories=reverse_categories,
            missing_value=missing_value,
        )

    @classmethod
    def _from_codes_and_metadata(cls,
                                 codes,
                                 categories,
                                 reverse_categories,
                                 missing_value):
        """
        View codes as a LabelArray and set LabelArray metadata on the result.
        """
        ret = codes.view(type=cls, dtype=np.void)
        ret._categories = categories
        ret._reverse_categories = reverse_categories
        ret._missing_value = missing_value
        return ret

    @property
    def categories(self):
        # This is a property because it should be immutable.
        return self._categories

    @property
    def reverse_categories(self):
        # This is a property because it should be immutable.
        return self._reverse_categories

    @property
    def missing_value(self):
        # This is a property because it should be immutable.
        return self._missing_value

    @property
    def missing_value_code(self):
        return self.reverse_categories[self.missing_value]

    def has_label(self, value):
        return value in self.reverse_categories

    def __array_finalize__(self, obj):
        """
        Called by Numpy after array construction.

        There are three cases where this can happen:

        1. Someone tries to directly construct a new array by doing::

            >>> ndarray.__new__(LabelArray, ...)  # doctest: +SKIP

           In this case, obj will be None. We treat this as an error case
           and fail.

        2. Someone (most likely our own __new__) does::

           >>> other_array.view(type=LabelArray)  # doctest: +SKIP

           In this case, `self` will be the new LabelArray instance, and
           ``obj`` will be the array on which ``view`` is being called.

           The caller of ``obj.view`` is responsible for setting category
           metadata on ``self`` after we exit.

        3. Someone creates a new LabelArray by slicing an existing one.

           In this case, ``obj`` will be the original LabelArray. We're
           responsible for copying over the parent array's category metadata.
        """
        if obj is None:
            raise TypeError(
                "Direct construction of LabelArrays is not supported."
            )

        # See docstring for an explanation of when these will or will not be
        # set.
        self._categories = getattr(obj, 'categories', None)
        self._reverse_categories = getattr(obj, 'reverse_categories', None)
        self._missing_value = getattr(obj, 'missing_value', None)

    def as_int_array(self):
        """
        Convert self into a regular ndarray of ints.

        This is an O(1) operation. It does not copy the underlying data.
        """
        return self.view(
            type=ndarray,
            dtype=int_dtype_with_size_in_bytes(self.itemsize),
        )

    def as_string_array(self):
        """
        Convert self back into an array of strings.

        This is an O(N) operation.
        """
        return self.categories[self.as_int_array()]

    def as_categorical(self, name=None):
        """
        Coerce self into a pandas categorical.

        This is only defined on 1D arrays, since that's all pandas supports.
        """
        if len(self.shape) > 1:
            raise ValueError("Can't convert a 2D array to a categorical.")

        with ignore_pandas_nan_categorical_warning():
            return pd.Categorical.from_codes(
                self.as_int_array(),
                # We need to make a copy because pandas >= 0.17 fails if this
                # buffer isn't writeable.
                self.categories.copy(),
                ordered=False,
                name=name,
            )

    def as_categorical_frame(self, index, columns, name=None):
        """
        Coerce self into a pandas DataFrame of Categoricals.
        """
        if len(self.shape) != 2:
            raise ValueError(
                "Can't convert a non-2D LabelArray into a DataFrame."
            )

        expected_shape = (len(index), len(columns))
        if expected_shape != self.shape:
            raise ValueError(
                "Can't construct a DataFrame with provided indices:\n\n"
                "LabelArray shape is {actual}, but index and columns imply "
                "that shape should be {expected}.".format(
                    actual=self.shape,
                    expected=expected_shape,
                )
            )

        return pd.Series(
            index=pd.MultiIndex.from_product([index, columns]),
            data=self.ravel().as_categorical(name=name),
        ).unstack()

    def __setitem__(self, indexer, value):
        self_categories = self.categories

        if isinstance(value, LabelArray):
            value_categories = value.categories
            if compare_arrays(self_categories, value_categories):
                return super(LabelArray, self).__setitem__(indexer, value)
            else:
                raise CategoryMismatch(self_categories, value_categories)
        elif isinstance(value, self.SUPPORTED_SCALAR_TYPES):
            value_code = self.reverse_categories.get(value, -1)
            if value_code < 0:
                raise ValueError("%r is not in LabelArray categories." % value)
            self.as_int_array()[indexer] = value_code
        else:
            raise NotImplementedError(
                "Setting into a LabelArray with a value of "
                "type {type} is not yet supported.".format(
                    type=type(value).__name__,
                ),
            )

    def __setslice__(self, i, j, sequence):
        """
        This method was deprecated in Python 2.0. It predates slice objects,
        but Python 2.7.11 still uses it if you implement it, which ndarray
        does. In newer Pythons, __setitem__ is always called, but we need to
        manually forward in py2.
        """
        self.__setitem__(slice(i, j), sequence)

    def __getitem__(self, indexer):
        result = super(LabelArray, self).__getitem__(indexer)
        if result.ndim:
            # Result is still a LabelArray, so we can just return it.
            return result

        # Result is a scalar value, which will be an instance of np.void.
        # Map it back to one of our category entries.
        index = result.view(int_dtype_with_size_in_bytes(self.itemsize))
        return self.categories[index]

    def is_missing(self):
        """
        Like isnan, but checks for locations where we store missing values.
        """
        return (
            self.as_int_array() == self.reverse_categories[self.missing_value]
        )

    def not_missing(self):
        """
        Like ~isnan, but checks for locations where we store missing values.
        """
        return (
            self.as_int_array() != self.reverse_categories[self.missing_value]
        )

    def _equality_check(op):
        """
        Shared code for __eq__ and __ne__, parameterized on the actual
        comparison operator to use.
        """
        def method(self, other):
            if isinstance(other, LabelArray):
                self_mv = self.missing_value
                other_mv = other.missing_value
                if self_mv != other_mv:
                    raise MissingValueMismatch(self_mv, other_mv)

                self_categories = self.categories
                other_categories = other.categories
                if not compare_arrays(self_categories, other_categories):
                    raise CategoryMismatch(self_categories, other_categories)

                return (
                    op(self.as_int_array(), other.as_int_array())
                    & self.not_missing()
                    & other.not_missing()
                )

            elif isinstance(other, ndarray):
                # Compare to ndarrays as though we were an array of strings.
                # This is fairly expensive, and should generally be avoided.
                return op(self.as_string_array(), other) & self.not_missing()

            elif isinstance(other, self.SUPPORTED_SCALAR_TYPES):
                i = self._reverse_categories.get(other, -1)
                return op(self.as_int_array(), i) & self.not_missing()

            return op(super(LabelArray, self), other)
        return method

    __eq__ = _equality_check(eq)
    __ne__ = _equality_check(ne)
    del _equality_check

    def view(self, dtype=_NotPassed, type=_NotPassed):
        if type is _NotPassed and dtype not in (_NotPassed, self.dtype):
            raise TypeError("Can't view LabelArray as another dtype.")

        # The text signature on ndarray.view makes it look like the default
        # values for dtype and type are `None`, but passing None explicitly
        # has different semantics than not passing an arg at all, so we
        # reconstruct the kwargs dict here to simulate the args not being
        # passed at all.
        kwargs = {}
        if dtype is not _NotPassed:
            kwargs['dtype'] = dtype
        if type is not _NotPassed:
            kwargs['type'] = type
        return super(LabelArray, self).view(**kwargs)

    # In general, we support resizing, slicing, and reshaping methods, but
    # not numeric methods.
    SUPPORTED_NDARRAY_METHODS = frozenset([
        'base', 'compress', 'copy', 'data', 'diagonal', 'dtype', 'flat',
        'flatten', 'item', 'itemset', 'itemsize', 'nbytes', 'ndim', 'ravel',
        'repeat', 'reshape', 'resize', 'setflags', 'shape', 'size',
        'squeeze', 'strides', 'swapaxes', 'take', 'trace', 'transpose',
        'view',
    ])
    PUBLIC_NDARRAY_METHODS = frozenset(
        [s for s in dir(ndarray) if not s.startswith('_')]
    )

    # Generate failing wrappers for all unsupported methods.
    locals().update({
        method: _make_unsupported_method(method)
        for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS
    })

    def __repr__(self):
        repr_lines = repr(self.as_string_array()).splitlines()
        repr_lines[0] = repr_lines[0].replace('array(', 'LabelArray(', 1)
        repr_lines[-1] = repr_lines[-1].rsplit(',', 1)[0] + ')'
        # The extra spaces here account for the difference in length between
        # 'array(' and 'LabelArray('.
        return '\n     '.join(repr_lines)

    def empty_like(self, shape):
        """
        Make an empty LabelArray with the same categories as ``self``,
        filled with ``self.missing_value``.
        """
        return type(self)._from_codes_and_metadata(
            codes=np.full(
                shape,
                self.reverse_categories[self.missing_value],
                dtype=int_dtype_with_size_in_bytes(self.itemsize),
            ),
            categories=self.categories,
            reverse_categories=self.reverse_categories,
            missing_value=self.missing_value,
        )

    def map_predicate(self, f):
        """
        Map a function from str -> bool element-wise over ``self``.

        ``f`` will be applied exactly once to each non-missing unique value
        in ``self``. Missing values will always return False.
        """
        # Functions passed to this are of type str -> bool. Don't ever call
        # them on None, which is the only non-str value we ever store in
        # categories.
        if self.missing_value is None:
            f_to_use = lambda x: False if x is None else f(x)
        else:
            f_to_use = f

        # Call f on each unique value in our categories.
        results = np.vectorize(f_to_use, otypes=[bool_dtype])(self.categories)

        # missing_value should produce False no matter what.
        results[self.reverse_categories[self.missing_value]] = False

        # unpack the results from each unique value into their corresponding
        # locations in our indices.
        return results[self.as_int_array()]

    def startswith(self, prefix):
        """
        Element-wise startswith.

        Parameters
        ----------
        prefix : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self started with ``prefix``.
        """
        return self.map_predicate(lambda elem: elem.startswith(prefix))

    def endswith(self, suffix):
        """
        Elementwise endswith.

        Parameters
        ----------
        suffix : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self ended with ``suffix``.
        """
        return self.map_predicate(lambda elem: elem.endswith(suffix))

    def has_substring(self, substring):
        """
        Elementwise contains.

        Parameters
        ----------
        substring : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self contained ``substring``.
        """
        return self.map_predicate(lambda elem: substring in elem)

    @preprocess(pattern=coerce(from_=(bytes, unicode), to=re.compile))
    def matches(self, pattern):
        """
        Elementwise regex match.

        Parameters
        ----------
        pattern : str or compiled regex

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self was matched by ``pattern``.
        """
        return self.map_predicate(compose(bool, pattern.match))

    # These types all implement an O(N) __contains__, so pre-emptively
    # coerce to `set`.
    @preprocess(container=coerce((list, tuple, np.ndarray), set))
    def element_of(self, container):
        """
        Check if each element of self is an element of ``container``.

        Parameters
        ----------
        container : object
            An object implementing a __contains__ to call on each element of
            ``self``.

        Returns
        -------
        is_contained : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self was an element of ``container``.
        """
        return self.map_predicate(container.__contains__)
class Pipeline(object):
    """
    A Pipeline object represents a collection of named expressions to be
    compiled and executed by a PipelineEngine.

    A Pipeline has two important attributes: 'columns', a dictionary of
    named :class:`~zipline.pipeline.Term` instances, and 'screen', a
    :class:`~zipline.pipeline.Filter` representing criteria for including an
    asset in the results of a Pipeline.

    To compute a pipeline in the context of a TradingAlgorithm, users must
    call ``attach_pipeline`` in their ``initialize`` function to register
    that the pipeline should be computed each trading day. The most recent
    outputs of an attached pipeline can be retrieved by calling
    ``pipeline_output`` from ``handle_data``, ``before_trading_start``, or a
    scheduled function.

    Parameters
    ----------
    columns : dict, optional
        Initial columns.
    screen : zipline.pipeline.Filter, optional
        Initial screen.
    """
    __slots__ = ('_columns', '_screen', '_domain', '__weakref__')

    @expect_types(
        columns=optional(dict),
        screen=optional(Filter),
        domain=Domain,
    )
    def __init__(self, columns=None, screen=None, domain=GENERIC):
        if columns is None:
            columns = {}

        validate_column = self.validate_column
        for column_name, term in columns.items():
            validate_column(column_name, term)
            if not isinstance(term, ComputableTerm):
                raise TypeError(
                    "Column {column_name!r} contains an invalid pipeline "
                    "term ({term}). Did you mean to append '.latest'?".format(
                        column_name=column_name, term=term,
                    )
                )

        self._columns = columns
        self._screen = screen
        self._domain = domain

    @property
    def columns(self):
        """The output columns of this pipeline.

        Returns
        -------
        columns : dict[str, zipline.pipeline.ComputableTerm]
            Map from column name to expression computing that column's
            output.
        """
        return self._columns

    @property
    def screen(self):
        """
        The screen of this pipeline.

        Returns
        -------
        screen : zipline.pipeline.Filter or None
            Term defining the screen for this pipeline. If ``screen`` is a
            filter, rows that do not pass the filter (i.e., rows for which
            the filter computed ``False``) will be dropped from the output
            of this pipeline before returning results.

        Notes
        -----
        Setting a screen on a Pipeline does not change the values produced
        for any rows: it only affects whether a given row is returned.
        Computing a pipeline with a screen is logically equivalent to
        computing the pipeline without the screen and then, as a
        post-processing step, filtering out any rows for which the screen
        computed ``False``.
        """
        return self._screen

    @expect_types(term=Term, name=str)
    def add(self, term, name, overwrite=False):
        """Add a column.

        The results of computing ``term`` will show up as a column in the
        DataFrame produced by running this pipeline.

        Parameters
        ----------
        column : zipline.pipeline.Term
            A Filter, Factor, or Classifier to add to the pipeline.
        name : str
            Name of the column to add.
        overwrite : bool
            Whether to overwrite the existing entry if we already have a
            column named `name`.
        """
        self.validate_column(name, term)

        columns = self.columns
        if name in columns:
            if overwrite:
                self.remove(name)
            else:
                raise KeyError("Column '{}' already exists.".format(name))

        if not isinstance(term, ComputableTerm):
            raise TypeError(
                "{term} is not a valid pipeline column. Did you mean to "
                "append '.latest'?".format(term=term)
            )

        self._columns[name] = term

    @expect_types(name=str)
    def remove(self, name):
        """Remove a column.

        Parameters
        ----------
        name : str
            The name of the column to remove.

        Raises
        ------
        KeyError
            If `name` is not in self.columns.

        Returns
        -------
        removed : zipline.pipeline.Term
            The removed term.
        """
        return self.columns.pop(name)

    @expect_types(screen=Filter, overwrite=(bool, int))
    def set_screen(self, screen, overwrite=False):
        """Set a screen on this Pipeline.

        Parameters
        ----------
        filter : zipline.pipeline.Filter
            The filter to apply as a screen.
        overwrite : bool
            Whether to overwrite any existing screen. If overwrite is False
            and self.screen is not None, we raise an error.
        """
        if self._screen is not None and not overwrite:
            raise ValueError(
                "set_screen() called with overwrite=False and screen already "
                "set.\n"
                "If you want to apply multiple filters as a screen use "
                "set_screen(filter1 & filter2 & ...).\n"
                "If you want to replace the previous screen with a new one, "
                "use set_screen(new_filter, overwrite=True)."
            )
        self._screen = screen

    def to_execution_plan(self, domain, default_screen, start_date, end_date):
        """
        Compile into an ExecutionPlan.

        Parameters
        ----------
        domain : zipline.pipeline.domain.Domain
            Domain on which the pipeline will be executed.
        default_screen : zipline.pipeline.Term
            Term to use as a screen if self.screen is None.
        start_date : pd.Timestamp
            The first date of requested output.
        end_date : pd.Timestamp
            The last date of requested output.

        Returns
        -------
        graph : zipline.pipeline.graph.ExecutionPlan
            Graph encoding term dependencies, including metadata about extra
            row requirements.
        """
        if self._domain is not GENERIC and self._domain is not domain:
            raise AssertionError(
                "Attempted to compile Pipeline with domain {} to execution "
                "plan with different domain {}.".format(self._domain, domain)
            )

        return ExecutionPlan(
            domain=domain,
            terms=self._prepare_graph_terms(default_screen),
            start_date=start_date,
            end_date=end_date,
        )

    def to_simple_graph(self, default_screen):
        """
        Compile into a simple TermGraph with no extra row metadata.

        Parameters
        ----------
        default_screen : zipline.pipeline.Term
            Term to use as a screen if self.screen is None.

        Returns
        -------
        graph : zipline.pipeline.graph.TermGraph
            Graph encoding term dependencies.
        """
        return TermGraph(self._prepare_graph_terms(default_screen))

    def _prepare_graph_terms(self, default_screen):
        """Helper for to_graph and to_execution_plan."""
        columns = self.columns.copy()
        screen = self.screen
        if screen is None:
            screen = default_screen
        columns[SCREEN_NAME] = screen
        return columns

    @expect_element(format=('svg', 'png', 'jpeg'))
    def show_graph(self, format='svg'):
        """
        Render this Pipeline as a DAG.

        Parameters
        ----------
        format : {'svg', 'png', 'jpeg'}
            Image format to render with. Default is 'svg'.
        """
        g = self.to_simple_graph(AssetExists())
        if format == 'svg':
            return g.svg
        elif format == 'png':
            return g.png
        elif format == 'jpeg':
            return g.jpeg
        else:
            # We should never get here because of the expect_element
            # decorator above.
            raise AssertionError("Unknown graph format %r." % format)

    @staticmethod
    @expect_types(term=Term, column_name=six.string_types)
    def validate_column(column_name, term):
        if term.ndim == 1:
            raise UnsupportedPipelineOutput(column_name=column_name,
                                            term=term)

    @property
    def _output_terms(self):
        """
        A list of terms that are outputs of this pipeline.

        Includes all terms registered as data outputs of the pipeline, plus
        the screen, if present.
        """
        terms = list(six.itervalues(self._columns))
        screen = self.screen
        if screen is not None:
            terms.append(screen)
        return terms

    @expect_types(default=Domain)
    def domain(self, default):
        """
        Get the domain for this pipeline.

        - If an explicit domain was provided at construction time, use it.
        - Otherwise, infer a domain from the registered columns.
        - If no domain can be inferred, return ``default``.

        Parameters
        ----------
        default : zipline.pipeline.domain.Domain
            Domain to use if no domain can be inferred from this pipeline by
            itself.

        Returns
        -------
        domain : zipline.pipeline.domain.Domain
            The domain for the pipeline.

        Raises
        ------
        AmbiguousDomain
        ValueError
            If the terms in ``self`` conflict with self._domain.
        """
        # Always compute our inferred domain to ensure that it's compatible
        # with our explicit domain.
        inferred = infer_domain(self._output_terms)

        if inferred is GENERIC and self._domain is GENERIC:
            # Both generic. Fall back to default.
            return default
        elif inferred is GENERIC and self._domain is not GENERIC:
            # Use the non-generic domain.
            return self._domain
        elif inferred is not GENERIC and self._domain is GENERIC:
            # Use the non-generic domain.
            return inferred
        else:
            # Both non-generic. They have to match.
            if inferred is not self._domain:
                raise ValueError(
                    "Conflicting domains in Pipeline. Inferred {}, but {} "
                    "was passed at construction.".format(
                        inferred, self._domain,
                    )
                )
            return inferred
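
# --- Usage sketch (illustrative, not from the original source) ---
# Resolving a pipeline's domain with the class above. USEquityPricing and
# US_EQUITIES are the standard zipline names, assumed importable as shown.
from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.domain import US_EQUITIES

pipe = Pipeline(
    columns={'close': USEquityPricing.close.latest},
    screen=USEquityPricing.close.latest > 5.0,
    domain=US_EQUITIES,
)
# The explicit domain and the one inferred from USEquityPricing agree, so
# the default is never consulted.
print(pipe.domain(default=GENERIC))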
class Pipeline(object):
    """
    A Pipeline object represents a collection of named expressions to be
    compiled and executed by a PipelineEngine.

    A Pipeline has two important attributes: 'columns', a dictionary of
    named `Term` instances, and 'screen', a Filter representing criteria for
    including an asset in the results of a Pipeline.

    To compute a pipeline in the context of a TradingAlgorithm, users must
    call ``attach_pipeline`` in their ``initialize`` function to register
    that the pipeline should be computed each trading day. The outputs of a
    pipeline on a given day can be accessed by calling ``pipeline_output``
    in ``handle_data`` or ``before_trading_start``.

    Parameters
    ----------
    columns : dict, optional
        Initial columns.
    screen : zipline.pipeline.term.Filter, optional
        Initial screen.
    """
    __slots__ = ('_columns', '_screen', '__weakref__')

    @expect_types(
        columns=optional(dict),
        screen=optional(Filter),
    )
    def __init__(self, columns=None, screen=None):
        if columns is None:
            columns = {}
        self._columns = columns
        self._screen = screen

    @property
    def columns(self):
        """
        The columns registered with this pipeline.
        """
        return self._columns

    @property
    def screen(self):
        """
        The screen applied to the rows of this pipeline.
        """
        return self._screen

    @expect_types(term=Term, name=str)
    def add(self, term, name, overwrite=False):
        """
        Add a column.

        The results of computing `term` will show up as a column in the
        DataFrame produced by running this pipeline.

        Parameters
        ----------
        column : zipline.pipeline.Term
            A Filter, Factor, or Classifier to add to the pipeline.
        name : str
            Name of the column to add.
        overwrite : bool
            Whether to overwrite the existing entry if we already have a
            column named `name`.
        """
        columns = self.columns
        if name in columns:
            if overwrite:
                self.remove(name)
            else:
                raise KeyError("Column '{}' already exists.".format(name))

        self._columns[name] = term

    @expect_types(name=str)
    def remove(self, name):
        """
        Remove a column.

        Parameters
        ----------
        name : str
            The name of the column to remove.

        Raises
        ------
        KeyError
            If `name` is not in self.columns.

        Returns
        -------
        removed : zipline.pipeline.term.Term
            The removed term.
        """
        return self.columns.pop(name)

    @expect_types(screen=Filter, overwrite=(bool, int))
    def set_screen(self, screen, overwrite=False):
        """
        Set a screen on this Pipeline.

        Parameters
        ----------
        filter : zipline.pipeline.Filter
            The filter to apply as a screen.
        overwrite : bool
            Whether to overwrite any existing screen. If overwrite is False
            and self.screen is not None, we raise an error.
        """
        if self._screen is not None and not overwrite:
            raise ValueError(
                "set_screen() called with overwrite=False and screen already "
                "set.\n"
                "If you want to apply multiple filters as a screen use "
                "set_screen(filter1 & filter2 & ...).\n"
                "If you want to replace the previous screen with a new one, "
                "use set_screen(new_filter, overwrite=True)."
            )
        self._screen = screen

    def to_graph(self, screen_name, default_screen):
        """
        Compile into a TermGraph.

        Parameters
        ----------
        screen_name : str
            Name to supply for self.screen.
        default_screen : zipline.pipeline.term.Term
            Term to use as a screen if self.screen is None.
        """
        columns = self.columns.copy()
        screen = self.screen
        if screen is None:
            screen = default_screen
        columns[screen_name] = screen
        return TermGraph(columns)

    def show_graph(self, format='svg'):
        """
        Render this Pipeline as a DAG.

        Parameters
        ----------
        format : {'svg', 'png', 'jpeg'}
            Image format to render with. Default is 'svg'.
        """
        g = self.to_graph('', AssetExists())
        if format == 'svg':
            return g.svg
        elif format == 'png':
            return g.png
        elif format == 'jpeg':
            return g.jpeg
        else:
            raise ValueError("Unknown graph format %r." % format)
class LabelArray(ndarray):
    """
    An ndarray subclass for working with arrays of strings.

    Factorizes the input array into integers, but overloads equality on
    strings to check against the factor label.

    Parameters
    ----------
    values : array-like
        Array of values that can be passed to np.asarray with dtype=object.
    missing_value : str
        Scalar value to treat as 'missing' for operations on ``self``.
    categories : list[str], optional
        List of values to use as categories. If not supplied, categories
        will be inferred as the unique set of entries in ``values``.
    sort : bool, optional
        Whether to sort categories. If sort is False and categories is
        supplied, they are left in the order provided. If sort is False and
        categories is None, categories will be constructed in a random order.

    Attributes
    ----------
    categories : ndarray[str]
        An array containing the unique labels of self.
    reverse_categories : dict[str -> int]
        Reverse lookup table for ``categories``. Stores the index in
        ``categories`` at which each unique entry is found.
    missing_value : str or None
        A sentinel missing value with NaN semantics for comparisons.

    Notes
    -----
    Consumers should be cautious when passing instances of LabelArray to
    numpy functions. We attempt to disallow as many meaningless operations
    as possible, but since a LabelArray is just an ndarray of ints with some
    additional metadata, many numpy functions (for example, trigonometric)
    will happily accept a LabelArray and treat its values as though they
    were integers.

    In a future change, we may be able to disallow more numerical operations
    by creating a wrapper dtype which doesn't register an implementation for
    most numpy ufuncs. Until that change is made, consumers of LabelArray
    should assume that it is undefined behavior to pass a LabelArray to any
    numpy ufunc that operates on semantically-numerical data.

    See Also
    --------
    https://docs.scipy.org/doc/numpy-1.11.0/user/basics.subclassing.html
    """
    SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
    SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)

    @preprocess(
        values=coerce(list, partial(np.asarray, dtype=object)),
        # Coerce ``list`` to ``list`` to make a copy. Code internally may
        # call ``categories.insert(0, missing_value)`` which will mutate
        # this list in place.
        categories=coerce((list, np.ndarray, set), list),
    )
    @expect_types(
        values=np.ndarray,
        missing_value=SUPPORTED_SCALAR_TYPES,
        categories=optional(list),
    )
    @expect_kinds(values=("O", "S", "U"))
    def __new__(cls, values, missing_value, categories=None, sort=True):
        # Numpy's fixed-width string types aren't very efficient. Working
        # with object arrays is faster than bytes or unicode arrays in
        # almost all cases.
        if not is_object(values):
            values = values.astype(object)

        if values.flags.f_contiguous:
            ravel_order = "F"
        else:
            ravel_order = "C"

        if categories is None:
            codes, categories, reverse_categories = factorize_strings(
                values.ravel(ravel_order),
                missing_value=missing_value,
                sort=sort,
            )
        else:
            (
                codes,
                categories,
                reverse_categories,
            ) = factorize_strings_known_categories(
                values.ravel(ravel_order),
                categories=categories,
                missing_value=missing_value,
                sort=sort,
            )
        categories.setflags(write=False)

        return cls.from_codes_and_metadata(
            codes=codes.reshape(values.shape, order=ravel_order),
            categories=categories,
            reverse_categories=reverse_categories,
            missing_value=missing_value,
        )

    @classmethod
    def from_codes_and_metadata(cls,
                                codes,
                                categories,
                                reverse_categories,
                                missing_value):
        """
        Rehydrate a LabelArray from the codes and metadata.

        Parameters
        ----------
        codes : np.ndarray[integral]
            The codes for the label array.
        categories : np.ndarray[object]
            The unique string categories.
        reverse_categories : dict[str, int]
            The mapping from category to its code-index.
        missing_value : any
            The value used to represent missing data.
        """
        ret = codes.view(type=cls, dtype=np.void)
        ret._categories = categories
        ret._reverse_categories = reverse_categories
        ret._missing_value = missing_value
        return ret

    @classmethod
    def from_categorical(cls, categorical, missing_value=None):
        """
        Create a LabelArray from a pandas categorical.

        Parameters
        ----------
        categorical : pd.Categorical
            The categorical object to convert.
        missing_value : bytes, unicode, or None, optional
            The missing value to use for this LabelArray.

        Returns
        -------
        la : LabelArray
            The LabelArray representation of this categorical.
        """
        return LabelArray(
            categorical,
            missing_value,
            categorical.categories,
        )

    @property
    def categories(self):
        # This is a property because it should be immutable.
        return self._categories

    @property
    def reverse_categories(self):
        # This is a property because it should be immutable.
        return self._reverse_categories

    @property
    def missing_value(self):
        # This is a property because it should be immutable.
        return self._missing_value

    @property
    def missing_value_code(self):
        return self.reverse_categories[self.missing_value]

    def has_label(self, value):
        return value in self.reverse_categories

    def __array_finalize__(self, obj):
        """
        Called by Numpy after array construction.

        There are three cases where this can happen:

        1. Someone tries to directly construct a new array by doing::

            >>> ndarray.__new__(LabelArray, ...)  # doctest: +SKIP

           In this case, obj will be None. We treat this as an error case
           and fail.

        2. Someone (most likely our own __new__) does::

           >>> other_array.view(type=LabelArray)  # doctest: +SKIP

           In this case, `self` will be the new LabelArray instance, and
           ``obj`` will be the array on which ``view`` is being called.

           The caller of ``obj.view`` is responsible for setting category
           metadata on ``self`` after we exit.

        3. Someone creates a new LabelArray by slicing an existing one.

           In this case, ``obj`` will be the original LabelArray. We're
           responsible for copying over the parent array's category metadata.
        """
        if obj is None:
            raise TypeError(
                "Direct construction of LabelArrays is not supported."
            )

        # See docstring for an explanation of when these will or will not be
        # set.
        self._categories = getattr(obj, "categories", None)
        self._reverse_categories = getattr(obj, "reverse_categories", None)
        self._missing_value = getattr(obj, "missing_value", None)

    def as_int_array(self):
        """
        Convert self into a regular ndarray of ints.

        This is an O(1) operation. It does not copy the underlying data.
        """
        return self.view(
            type=ndarray,
            dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
        )

    def as_string_array(self):
        """
        Convert self back into an array of strings.

        This is an O(N) operation.
        """
        return self.categories[self.as_int_array()]

    def as_categorical(self):
        """
        Coerce self into a pandas categorical.

        This is only defined on 1D arrays, since that's all pandas supports.
        """
        if len(self.shape) > 1:
            raise ValueError("Can't convert a 2D array to a categorical.")

        with ignore_pandas_nan_categorical_warning():
            return pd.Categorical.from_codes(
                self.as_int_array(),
                # We need to make a copy because pandas >= 0.17 fails if this
                # buffer isn't writeable.
                self.categories.copy(),
                ordered=False,
            )

    def as_categorical_frame(self, index, columns, name=None):
        """
        Coerce self into a pandas DataFrame of Categoricals.
        """
        if len(self.shape) != 2:
            raise ValueError(
                "Can't convert a non-2D LabelArray into a DataFrame."
            )

        expected_shape = (len(index), len(columns))
        if expected_shape != self.shape:
            raise ValueError(
                "Can't construct a DataFrame with provided indices:\n\n"
                "LabelArray shape is {actual}, but index and columns imply "
                "that shape should be {expected}.".format(
                    actual=self.shape,
                    expected=expected_shape,
                )
            )

        return pd.Series(
            index=pd.MultiIndex.from_product([index, columns]),
            data=self.ravel().as_categorical(),
            name=name,
        ).unstack()

    def __setitem__(self, indexer, value):
        self_categories = self.categories

        if isinstance(value, self.SUPPORTED_SCALAR_TYPES):
            value_code = self.reverse_categories.get(value, None)
            if value_code is None:
                raise ValueError("%r is not in LabelArray categories." % value)
            self.as_int_array()[indexer] = value_code
        elif isinstance(value, LabelArray):
            value_categories = value.categories
            if compare_arrays(self_categories, value_categories):
                return super(LabelArray, self).__setitem__(indexer, value)
            elif (self.missing_value == value.missing_value
                    and set(value.categories) <= set(self.categories)):
                rhs = LabelArray.from_codes_and_metadata(
                    *factorize_strings_known_categories(
                        value.as_string_array().ravel(),
                        list(self.categories),
                        self.missing_value,
                        False,
                    ),
                    missing_value=self.missing_value,
                ).reshape(value.shape)
                super(LabelArray, self).__setitem__(indexer, rhs)
            else:
                raise CategoryMismatch(self_categories, value_categories)
        else:
            raise NotImplementedError(
                "Setting into a LabelArray with a value of "
                "type {type} is not yet supported.".format(
                    type=type(value).__name__,
                ),
            )

    def set_scalar(self, indexer, value):
        """
        Set scalar value into the array.

        Parameters
        ----------
        indexer : any
            The indexer to set the value at.
        value : str
            The value to assign at the given locations.

        Raises
        ------
        ValueError
            Raised when ``value`` is not a valid element of this label array.
        """
        try:
            value_code = self.reverse_categories[value]
        except KeyError:
            raise ValueError("%r is not in LabelArray categories." % value)

        self.as_int_array()[indexer] = value_code

    def __getitem__(self, indexer):
        result = super(LabelArray, self).__getitem__(indexer)
        if result.ndim:
            # Result is still a LabelArray, so we can just return it.
            return result

        # Result is a scalar value, which will be an instance of np.void.
        # Map it back to one of our category entries.
        index = result.view(
            unsigned_int_dtype_with_size_in_bytes(self.itemsize),
        )
        return self.categories[index]

    def is_missing(self):
        """
        Like isnan, but checks for locations where we store missing values.
        """
        return (
            self.as_int_array() == self.reverse_categories[self.missing_value]
        )

    def not_missing(self):
        """
        Like ~isnan, but checks for locations where we store missing values.
        """
        return (
            self.as_int_array() != self.reverse_categories[self.missing_value]
        )

    def _equality_check(op):
        """
        Shared code for __eq__ and __ne__, parameterized on the actual
        comparison operator to use.
        """
        def method(self, other):
            if isinstance(other, LabelArray):
                self_mv = self.missing_value
                other_mv = other.missing_value
                if self_mv != other_mv:
                    raise MissingValueMismatch(self_mv, other_mv)

                self_categories = self.categories
                other_categories = other.categories
                if not compare_arrays(self_categories, other_categories):
                    raise CategoryMismatch(self_categories, other_categories)

                return (
                    op(self.as_int_array(), other.as_int_array())
                    & self.not_missing()
                    & other.not_missing()
                )

            elif isinstance(other, ndarray):
                # Compare to ndarrays as though we were an array of strings.
                # This is fairly expensive, and should generally be avoided.
                return op(self.as_string_array(), other) & self.not_missing()

            elif isinstance(other, self.SUPPORTED_SCALAR_TYPES):
                i = self._reverse_categories.get(other, -1)
                return op(self.as_int_array(), i) & self.not_missing()

            return op(super(LabelArray, self), other)
        return method

    __eq__ = _equality_check(eq)
    __ne__ = _equality_check(ne)
    del _equality_check

    def view(self, dtype=_NotPassed, type=_NotPassed):
        if type is _NotPassed and dtype not in (_NotPassed, self.dtype):
            raise TypeError("Can't view LabelArray as another dtype.")

        # The text signature on ndarray.view makes it look like the default
        # values for dtype and type are `None`, but passing None explicitly
        # has different semantics than not passing an arg at all, so we
        # reconstruct the kwargs dict here to simulate the args not being
        # passed at all.
        kwargs = {}
        if dtype is not _NotPassed:
            kwargs["dtype"] = dtype
        if type is not _NotPassed:
            kwargs["type"] = type
        return super(LabelArray, self).view(**kwargs)

    def astype(self,
               dtype,
               order="K",
               casting="unsafe",
               subok=True,
               copy=True):
        if dtype == self.dtype:
            if not subok:
                array = self.view(type=np.ndarray)
            else:
                array = self

            if copy:
                return array.copy()
            return array

        if dtype == object_dtype:
            return self.as_string_array()

        if dtype.kind == "S":
            return self.as_string_array().astype(
                dtype,
                order=order,
                casting=casting,
                subok=subok,
                copy=copy,
            )

        raise TypeError(
            "%s can only be converted into object, string, or void,"
            " got: %r" % (
                type(self).__name__,
                dtype,
            ),
        )

    # In general, we support resizing, slicing, and reshaping methods, but
    # not numeric methods.
    SUPPORTED_NDARRAY_METHODS = frozenset([
        "astype", "base", "compress", "copy", "data", "diagonal", "dtype",
        "flat", "flatten", "item", "itemset", "itemsize", "nbytes", "ndim",
        "ravel", "repeat", "reshape", "resize", "setflags", "shape", "size",
        "squeeze", "strides", "swapaxes", "take", "trace", "transpose",
        "view",
    ])
    PUBLIC_NDARRAY_METHODS = frozenset(
        [s for s in dir(ndarray) if not s.startswith("_")]
    )

    # Generate failing wrappers for all unsupported methods.
    locals().update({
        method: _make_unsupported_method(method)
        for method in PUBLIC_NDARRAY_METHODS - SUPPORTED_NDARRAY_METHODS
    })

    def __repr__(self):
        repr_lines = repr(self.as_string_array()).splitlines()
        repr_lines[0] = repr_lines[0].replace("array(", "LabelArray(", 1)
        repr_lines[-1] = repr_lines[-1].rsplit(",", 1)[0] + ")"
        # The extra spaces here account for the difference in length between
        # 'array(' and 'LabelArray('.
        return "\n     ".join(repr_lines)

    def empty_like(self, shape):
        """
        Make an empty LabelArray with the same categories as ``self``,
        filled with ``self.missing_value``.
        """
        return type(self).from_codes_and_metadata(
            codes=np.full(
                shape,
                self.reverse_categories[self.missing_value],
                dtype=unsigned_int_dtype_with_size_in_bytes(self.itemsize),
            ),
            categories=self.categories,
            reverse_categories=self.reverse_categories,
            missing_value=self.missing_value,
        )

    def map_predicate(self, f):
        """
        Map a function from str -> bool element-wise over ``self``.

        ``f`` will be applied exactly once to each non-missing unique value
        in ``self``. Missing values will always return False.
        """
        # Functions passed to this are of type str -> bool. Don't ever call
        # them on None, which is the only non-str value we ever store in
        # categories.
        if self.missing_value is None:
            def f_to_use(x):
                return False if x is None else f(x)
        else:
            f_to_use = f

        # Call f on each unique value in our categories.
        results = np.vectorize(f_to_use, otypes=[bool_dtype])(self.categories)

        # missing_value should produce False no matter what.
        results[self.reverse_categories[self.missing_value]] = False

        # unpack the results from each unique value into their corresponding
        # locations in our indices.
        return results[self.as_int_array()]

    def map(self, f):
        """
        Map a function from str -> str element-wise over ``self``.

        ``f`` will be applied exactly once to each non-missing unique value
        in ``self``. Missing values will always map to
        ``self.missing_value``.
        """
        # f() should only return None if None is our missing value.
        if self.missing_value is None:
            allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
        else:
            allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES

        def f_to_use(x,
                     missing_value=self.missing_value,
                     otypes=allowed_outtypes):
            # Don't call f on the missing value; those locations don't exist
            # semantically. We return _sortable_sentinel rather than None
            # because the np.unique call below sorts the categories array,
            # which raises an error on Python 3 because None and str aren't
            # comparable.
            if x == missing_value:
                return _sortable_sentinel

            ret = f(x)

            if not isinstance(ret, otypes):
                raise TypeError(
                    "LabelArray.map expected function {f} to return a string"
                    " or None, but got {type} instead.\n"
                    "Value was {value}.".format(
                        f=f.__name__,
                        type=type(ret).__name__,
                        value=ret,
                    )
                )

            if ret == missing_value:
                return _sortable_sentinel

            return ret

        new_categories_with_duplicates = np.vectorize(
            f_to_use,
            otypes=[object],
        )(self.categories)

        # If f() maps multiple inputs to the same output, then we can end up
        # with the same code duplicated multiple times. Compress the
        # categories by running them through np.unique, and then use the
        # reverse lookup table to compress codes as well.
        new_categories, bloated_inverse_index = np.unique(
            new_categories_with_duplicates,
            return_inverse=True,
        )

        if new_categories[0] is _sortable_sentinel:
            # f_to_use returns _sortable_sentinel for locations that should
            # be missing values in our output. Since np.unique returns the
            # uniques in sorted order, and since _sortable_sentinel sorts
            # before any string, we only need to check the first array entry.
            new_categories[0] = self.missing_value

        # `reverse_index` will always be a 64 bit integer even if we can
        # hold a smaller array.
        reverse_index = bloated_inverse_index.astype(
            smallest_uint_that_can_hold(len(new_categories))
        )
        new_codes = np.take(reverse_index, self.as_int_array())

        return self.from_codes_and_metadata(
            new_codes,
            new_categories,
            dict(zip(new_categories, range(len(new_categories)))),
            missing_value=self.missing_value,
        )

    def startswith(self, prefix):
        """
        Element-wise startswith.

        Parameters
        ----------
        prefix : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self started with ``prefix``.
        """
        return self.map_predicate(lambda elem: elem.startswith(prefix))

    def endswith(self, suffix):
        """
        Elementwise endswith.

        Parameters
        ----------
        suffix : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self ended with ``suffix``.
        """
        return self.map_predicate(lambda elem: elem.endswith(suffix))

    def has_substring(self, substring):
        """
        Elementwise contains.

        Parameters
        ----------
        substring : str

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self contained ``substring``.
        """
        return self.map_predicate(lambda elem: substring in elem)

    @preprocess(pattern=coerce(from_=(bytes, unicode), to=re.compile))
    def matches(self, pattern):
        """
        Elementwise regex match.

        Parameters
        ----------
        pattern : str or compiled regex

        Returns
        -------
        matches : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self was matched by ``pattern``.
        """
        return self.map_predicate(compose(bool, pattern.match))

    # These types all implement an O(N) __contains__, so pre-emptively
    # coerce to `set`.
    @preprocess(container=coerce((list, tuple, np.ndarray), set))
    def element_of(self, container):
        """
        Check if each element of self is an element of ``container``.

        Parameters
        ----------
        container : object
            An object implementing a __contains__ to call on each element of
            ``self``.

        Returns
        -------
        is_contained : np.ndarray[bool]
            An array with the same shape as self indicating whether each
            element of self was an element of ``container``.
        """
        return self.map_predicate(container.__contains__)
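
# --- Usage sketch of ``map`` (illustrative, not from the original source) ---
# ``map`` is new in this revision relative to the earlier LabelArray above:
# f runs once per unique non-missing category, and outputs are re-factorized
# through np.unique so duplicate results collapse onto a single code.
arr = LabelArray(
    np.array(['aapl', 'msft', None, 'aapl'], dtype=object),
    missing_value=None,
)
upper = arr.map(str.upper)      # calls str.upper twice, not four times
print(upper.as_string_array())  # ['AAPL' 'MSFT' None 'AAPL']
print(arr.map(lambda s: 'tech').categories)  # many-to-one outputs share a code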
class Pipeline(object):
    """
    A Pipeline object represents a collection of named expressions to be
    compiled and executed by a PipelineEngine.

    A Pipeline has two important attributes: 'columns', a dictionary of
    named `Term` instances, and 'screen', a Filter representing criteria for
    including an asset in the results of a Pipeline.

    To compute a pipeline in the context of a TradingAlgorithm, users must
    call ``attach_pipeline`` in their ``initialize`` function to register
    that the pipeline should be computed each trading day. The outputs of a
    pipeline on a given day can be accessed by calling ``pipeline_output``
    in ``handle_data`` or ``before_trading_start``.

    Parameters
    ----------
    columns : dict, optional
        Initial columns.
    screen : zipline.pipeline.term.Filter, optional
        Initial screen.
    """
    __slots__ = ('_columns', '_screen', '__weakref__')

    @expect_types(
        columns=optional(dict),
        screen=optional(Filter),
    )
    def __init__(self, columns=None, screen=None):
        if columns is None:
            columns = {}

        validate_column = self.validate_column
        for column_name, term in columns.items():
            validate_column(column_name, term)
            if not isinstance(term, ComputableTerm):
                raise TypeError(
                    "Column {column_name!r} contains an invalid pipeline "
                    "term ({term}). Did you mean to append '.latest'?".format(
                        column_name=column_name, term=term,
                    )
                )

        self._columns = columns
        self._screen = screen

    @property
    def columns(self):
        """
        The columns registered with this pipeline.
        """
        return self._columns

    @property
    def screen(self):
        """
        The screen applied to the rows of this pipeline.
        """
        return self._screen

    @expect_types(term=Term, name=str)
    def add(self, term, name, overwrite=False):
        """
        Add a column.

        The results of computing `term` will show up as a column in the
        DataFrame produced by running this pipeline.

        Parameters
        ----------
        column : zipline.pipeline.Term
            A Filter, Factor, or Classifier to add to the pipeline.
        name : str
            Name of the column to add.
        overwrite : bool
            Whether to overwrite the existing entry if we already have a
            column named `name`.
        """
        self.validate_column(name, term)

        columns = self.columns
        if name in columns:
            if overwrite:
                self.remove(name)
            else:
                raise KeyError("Column '{}' already exists.".format(name))

        if not isinstance(term, ComputableTerm):
            raise TypeError(
                "{term} is not a valid pipeline column. Did you mean to "
                "append '.latest'?".format(term=term)
            )

        self._columns[name] = term

    @expect_types(name=str)
    def remove(self, name):
        """
        Remove a column.

        Parameters
        ----------
        name : str
            The name of the column to remove.

        Raises
        ------
        KeyError
            If `name` is not in self.columns.

        Returns
        -------
        removed : zipline.pipeline.term.Term
            The removed term.
        """
        return self.columns.pop(name)

    @expect_types(screen=Filter, overwrite=(bool, int))
    def set_screen(self, screen, overwrite=False):
        """
        Set a screen on this Pipeline.

        Parameters
        ----------
        filter : zipline.pipeline.Filter
            The filter to apply as a screen.
        overwrite : bool
            Whether to overwrite any existing screen. If overwrite is False
            and self.screen is not None, we raise an error.
        """
        if self._screen is not None and not overwrite:
            raise ValueError(
                "set_screen() called with overwrite=False and screen already "
                "set.\n"
                "If you want to apply multiple filters as a screen use "
                "set_screen(filter1 & filter2 & ...).\n"
                "If you want to replace the previous screen with a new one, "
                "use set_screen(new_filter, overwrite=True)."
            )
        self._screen = screen

    def to_execution_plan(self,
                          screen_name,
                          default_screen,
                          all_dates,
                          start_date,
                          end_date):
        """
        Compile into an ExecutionPlan.

        Parameters
        ----------
        screen_name : str
            Name to supply for self.screen.
        default_screen : zipline.pipeline.term.Term
            Term to use as a screen if self.screen is None.
        all_dates : pd.DatetimeIndex
            A calendar of dates to use to calculate starts and ends for each
            term.
        start_date : pd.Timestamp
            The first date of requested output.
        end_date : pd.Timestamp
            The last date of requested output.
        """
        return ExecutionPlan(
            self._prepare_graph_terms(screen_name, default_screen),
            all_dates,
            start_date,
            end_date,
        )

    def to_simple_graph(self, screen_name, default_screen):
        """
        Compile into a simple TermGraph with no extra row metadata.

        Parameters
        ----------
        screen_name : str
            Name to supply for self.screen.
        default_screen : zipline.pipeline.term.Term
            Term to use as a screen if self.screen is None.
        """
        return TermGraph(
            self._prepare_graph_terms(screen_name, default_screen)
        )

    def _prepare_graph_terms(self, screen_name, default_screen):
        """Helper for to_graph and to_execution_plan."""
        columns = self.columns.copy()
        screen = self.screen
        if screen is None:
            screen = default_screen
        columns[screen_name] = screen
        return columns

    @expect_element(format=('svg', 'png', 'jpeg'))
    def show_graph(self, format='svg'):
        """
        Render this Pipeline as a DAG.

        Parameters
        ----------
        format : {'svg', 'png', 'jpeg'}
            Image format to render with. Default is 'svg'.
        """
        g = self.to_simple_graph('', AssetExists())
        if format == 'svg':
            return g.svg
        elif format == 'png':
            return g.png
        elif format == 'jpeg':
            return g.jpeg
        else:
            # We should never get here because of the expect_element
            # decorator above.
            raise AssertionError("Unknown graph format %r." % format)

    @staticmethod
    @expect_types(term=Term, column_name=str)
    def validate_column(column_name, term):
        if term.ndim == 1:
            raise UnsupportedPipelineOutput(column_name=column_name,
                                            term=term)