def resize_from_observed(
    observed, ndim_implied: int
) -> Tuple[int, StrongSize, Union[np.ndarray, Variable]]:
    """Work out which leading dimensions of the observations call for a resize.

    Parameters
    ----------
    observed : scalar, array-like
        The value of the `observed` kwarg to the RV creation. Anything that
        has no ``shape`` attribute is first passed through `pandas_to_array`.
    ndim_implied : int
        Number of RV dimensions that were implied from its inputs alone.

    Returns
    -------
    ndim_resize : int
        ``observed.ndim - ndim_implied``, i.e. the number of dimensions that
        should be added through resizing.
    resize_shape : tuple
        Lengths of the first ``ndim_resize`` dimensions of the observations
        (empty when ``ndim_resize`` is not positive).
    observed : scalar, array-like
        The input itself when it already had a ``shape`` attribute, otherwise
        the result of `pandas_to_array`.
    """
    data = observed if hasattr(observed, "shape") else pandas_to_array(observed)
    n_extra = data.ndim - ndim_implied
    # Index entry by entry (rather than slicing) so that symbolic shapes work
    # and a non-positive `n_extra` yields an empty tuple.
    leading_lengths = tuple(data.shape[axis] for axis in range(n_extra))
    return n_extra, leading_lengths, data
def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
    """Build a shared variable from `value` and register it with the current model.

    Parameters
    ----------
    name : str
        Name of the variable; it is passed through ``model.name_for``.
    value : list, array-like or pandas object
        The data. Lists are turned into numpy arrays, and the result is passed
        through `pandas_to_array` before being wrapped in a shared variable.
    dims : str or sequence of str, optional
        Dimension names; a single string is treated as a 1-tuple.
    export_index_as_coords : bool, default False
        If True, the coordinates returned by ``set_coords`` are added to the
        model. Otherwise only the lengths of not yet known `dims` are
        registered.

    Returns
    -------
    The shared variable that was registered with the model.

    Raises
    ------
    TypeError
        If there is no model on the context stack.
    pm.exceptions.ShapeError
        If the number of `dims` differs from the number of data dimensions.
    """
    if isinstance(value, list):
        value = np.array(value)

    # A data container can only live inside a model context.
    try:
        model = pm.Model.get_context()
    except TypeError:
        raise TypeError(
            "No model on context stack, which is needed to instantiate a data container. "
            "Add variable inside a 'with model:' block.")
    name = model.name_for(name)

    # `pandas_to_array` turns `value` into something digestible for pymc.
    converted = pandas_to_array(value)
    shared_object = aesara.shared(converted, name)

    if isinstance(dims, str):
        dims = (dims, )
    if dims is not None and len(dims) != shared_object.ndim:
        raise pm.exceptions.ShapeError(
            "Length of `dims` must match the dimensions of the dataset.",
            actual=len(dims),
            expected=shared_object.ndim,
        )

    coords = self.set_coords(model, value, dims)

    if export_index_as_coords:
        model.add_coords(coords)
    elif dims:
        # Only dimensions the model does not know yet get a length registered.
        for axis, dim_name in enumerate(dims):
            if dim_name in model.dim_lengths:
                continue
            model.add_coord(dim_name, values=None, length=shared_object.shape[axis])

    # XXX: This needs to be refactored. Drawing the node for this variable in
    # the graphviz Digraph needs its shape; evaluating it into
    # `shared_object.dshape` and raising a ShapeError when it does not match
    # `model.shape_from_dims(dims)` is currently disabled.

    model.add_random_variable(shared_object, dims=dims)

    return shared_object
def test_pandas_to_array_pandas_index():
    """A `pd.Index` must come out of `pandas_to_array` as an equal numpy array."""
    index = pd.Index([1, 2, 3])
    converted = pandas_to_array(index)
    np.testing.assert_array_equal(converted, np.array([1, 2, 3]))
def Data(
    name: str,
    value,
    *,
    dims: Optional[Sequence[str]] = None,
    export_index_as_coords=False,
    mutable: Optional[bool] = None,
    **kwargs,
) -> Union[SharedVariable, TensorConstant]:
    """Data container that registers a data variable with the model.

    Depending on the ``mutable`` setting, the variable is registered either as
    a ``SharedVariable``, enabling it to be altered in value and shape, but NOT
    in dimensionality using ``pm.set_data()``, or as a ``TensorConstant``.

    Parameters
    ----------
    name: str
        The name for this variable
    value: {List, np.ndarray, pd.Series, pd.Dataframe}
        A value to associate with this variable
    mutable : bool, optional
        Switches between creating a ``SharedVariable`` (``mutable=True``)
        vs. creating a ``TensorConstant`` (``mutable=False``).
        When left unspecified, it resolves to ``True`` on PyMC 4.0.x (and a
        ``FutureWarning`` is emitted) and to ``False`` on any other version.
        Consider using ``pm.ConstantData`` or ``pm.MutableData`` as less
        verbose alternatives to ``pm.Data(..., mutable=...)``.
    dims: {str, tuple of str}, optional, default=None
        Dimension names of the random variables (as opposed to the shapes of these
        random variables). Use this when `value` is a pandas Series or DataFrame. The
        `dims` will then be the name of the Series / DataFrame's columns. See ArviZ
        documentation for more information about dimensions and coordinates:
        :ref:`arviz:quickstart`.
    export_index_as_coords: bool, optional, default=False
        If True, the `Data` container will try to infer what the coordinates should be
        if there is an index in `value`.
    **kwargs: dict, optional
        Extra arguments passed to :func:`aesara.shared` (``mutable=True``) or
        to ``at.as_tensor_variable`` (``mutable=False``).

    Returns
    -------
    SharedVariable or TensorConstant
        The variable that was registered with the model.

    Raises
    ------
    TypeError
        If there is no model on the context stack.
    pm.exceptions.ShapeError
        If the number of `dims` does not match the number of dimensions of
        the data.

    Examples
    --------
    >>> import pymc as pm
    >>> import numpy as np
    >>> # We generate 10 datasets
    >>> true_mu = [np.random.randn() for _ in range(10)]
    >>> observed_data = [mu + np.random.randn(20) for mu in true_mu]

    >>> with pm.Model() as model:
    ...     data = pm.MutableData('data', observed_data[0])
    ...     mu = pm.Normal('mu', 0, 10)
    ...     pm.Normal('y', mu=mu, sigma=1, observed=data)

    >>> # Generate one trace for each dataset
    >>> idatas = []
    >>> for data_vals in observed_data:
    ...     with model:
    ...         # Switch out the observed dataset
    ...         model.set_data('data', data_vals)
    ...         idatas.append(pm.sample())

    To set the value of the data container variable, check out
    :func:`pymc.model.set_data()`.

    For more information, take a look at this example notebook
    https://docs.pymc.io/notebooks/data_container.html
    """
    if isinstance(value, list):
        value = np.array(value)

    # Add data container to the named variables of the model.
    try:
        model = pm.Model.get_context()
    except TypeError as err:
        # Chain explicitly so the original lookup failure stays attached as the cause.
        raise TypeError(
            "No model on context stack, which is needed to instantiate a data container. "
            "Add variable inside a 'with model:' block."
        ) from err
    name = model.name_for(name)

    # `pandas_to_array` takes care of parameter `value` and
    # transforms it to something digestible for Aesara.
    arr = pandas_to_array(value)

    if mutable is None:
        # Unspecified `mutable` only resolves to True on the 4.0.x series.
        current = version.Version(pm.__version__)
        mutable = current.major == 4 and current.minor < 1
        if mutable:
            warnings.warn(
                "The `mutable` kwarg was not specified. Currently it defaults to `pm.Data(mutable=True)`,"
                " which is equivalent to using `pm.MutableData()`."
                " In v4.1.0 the default will change to `pm.Data(mutable=False)`, equivalent to `pm.ConstantData`."
                " Set `pm.Data(..., mutable=False/True)`, or use `pm.ConstantData`/`pm.MutableData`.",
                FutureWarning,
                # Attribute the warning to the caller of `Data`, not to this function.
                stacklevel=2,
            )
    if mutable:
        x = aesara.shared(arr, name, **kwargs)
    else:
        x = at.as_tensor_variable(arr, name, **kwargs)

    if isinstance(dims, str):
        dims = (dims,)
    if not (dims is None or len(dims) == x.ndim):
        raise pm.exceptions.ShapeError(
            "Length of `dims` must match the dimensions of the dataset.",
            actual=len(dims),
            expected=x.ndim,
        )

    coords = determine_coords(model, value, dims)

    if export_index_as_coords:
        model.add_coords(coords)
    elif dims:
        # Register new dimension lengths
        for d, dname in enumerate(dims):
            if dname not in model.dim_lengths:
                model.add_coord(dname, values=None, length=x.shape[d])

    model.add_random_variable(x, dims=dims)

    return x