# Imports assumed for these snippets (standard Blocks / Theano layout);
# `carry_hidden_state` and `has_indices` are project-specific helpers that
# are not part of Blocks and must come from the surrounding project.
import logging
from collections import OrderedDict

import numpy
import theano

from blocks.monitoring.aggregation import MonitoredQuantity
from blocks.monitoring.evaluators import (AggregationBuffer,
                                          MonitoredQuantityBuffer)
from blocks.utils import dict_subset, reraise_as

logger = logging.getLogger(__name__)


class DatasetEvaluator(object):
    """A DatasetEvaluator evaluates many Theano variables or other quantities.
    The DatasetEvaluator provides a do-it-all method, :meth:`evaluate`,
    which computes values of ``variables`` on a dataset.
    Alternatively, methods :meth:`initialize_aggregators`,
    :meth:`process_batch`, :meth:`get_aggregated_values` can be used with a
    custom loop over data.
    The values computed on subsets of the given dataset are aggregated
    using the :class:`AggregationScheme`s provided in the
    `aggregation_scheme` tags. If no tag is given, the value is **averaged
    over minibatches**. However, care is taken to ensure that variables
    which do not depend on data are not unnecessarily recomputed.
    Parameters
    ----------
    variables : list of :class:`~tensor.TensorVariable` and
        :class:`MonitoredQuantity`
        The variable names are used as record names in the logs. Hence, all
        the names must be different.
        Each variable can be tagged with an :class:`AggregationScheme` that
        specifies how the value can be computed for a data set by
        aggregating minibatches.
    updates : list of tuples or :class:`~collections.OrderedDict` or None
        :class:`~tensor.TensorSharedVariable` updates to be performed
        during evaluation. This parameter is only for Theano variables.
        Be careful not to update any model parameters as this is not
        intended to alter your model in any meaningfullway. A typical
        use case of this option arises when the theano function used
        for evaluation contains a call to:function:`~theano.scan` which
        might have returned shared variable updates.
    """

    def __init__(self, variables, mini_batch_size, state_updates,
                 dataset, updates=None):
        theano_variables = []
        monitored_quantities = []
        for variable in variables:
            if isinstance(variable, MonitoredQuantity):
                monitored_quantities.append(variable)
            else:
                theano_variables.append(variable)
        self.theano_variables = theano_variables
        self.monitored_quantities = monitored_quantities
        variable_names = [v.name for v in variables]
        if len(set(variable_names)) < len(variables):
            raise ValueError("variables should have different names")
        self.theano_buffer = AggregationBuffer(theano_variables)
        self.monitored_quantities_buffer = MonitoredQuantityBuffer(
            monitored_quantities)
        self.dataset = dataset
        self.updates = updates
        self.mini_batch_size = mini_batch_size
        self._compile(state_updates)

    def _compile(self, state_updates):
        """Compiles Theano functions.
        .. todo::
            The current compilation method does not account for updates
            attached to `ComputationGraph` elements. Compiling should
            be out-sourced to `ComputationGraph` to deal with it.
        """
        inputs = []
        # Start from an empty OrderedDict so the hidden-state updates can
        # always be merged in below, even when there are no accumulation
        # updates (with `updates = None` this raised an AttributeError).
        updates = OrderedDict()

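        # `carry_hidden_state` is a project-specific helper (not part of
        # Blocks); it is assumed to return `givens` substitutions that feed
        # persistent shared buffers in place of the recurrent-state inputs,
        # plus `f_updates` that write the new state back after each batch.
        # When the dataset has no indices, `reset=True` asks it to clear
        # the carried state between batches. Example #2 below inlines the
        # non-resetting version of this logic.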
        givens, f_updates = carry_hidden_state(state_updates,
                                               self.mini_batch_size,
                                               reset=not(has_indices(self.dataset)))

        if self.theano_buffer.accumulation_updates:
            updates.update(self.theano_buffer.accumulation_updates)
            if self.updates:
                updates.update(self.updates)
            inputs += self.theano_buffer.inputs
        inputs += self.monitored_quantities_buffer.inputs
        outputs = self.monitored_quantities_buffer.requires

        if inputs:
            self.unique_inputs = list(set(inputs))
            # Merge in the hidden-state carry-over updates computed above.
            updates.update(f_updates)
            self._accumulate_fun = theano.function(self.unique_inputs,
                                                   outputs,
                                                   givens=givens,
                                                   updates=updates)
        else:
            self._accumulate_fun = None

    def initialize_aggregators(self):
        self.theano_buffer.initialize_aggregators()
        self.monitored_quantities_buffer.initialize()

    def process_batch(self, batch):
        try:
            input_names = [v.name for v in self.unique_inputs]
            batch = dict_subset(batch, input_names)
        except KeyError:
            reraise_as(
                "Not all data sources required for monitoring were"
                " provided. The list of required data sources:"
                " {}.".format(input_names))
        if self._accumulate_fun is not None:
            numerical_values = self._accumulate_fun(**batch)
            self.monitored_quantities_buffer.accumulate_quantities(
                numerical_values)

    def get_aggregated_values(self):
        values = self.theano_buffer.get_aggregated_values()
        values.update(
            self.monitored_quantities_buffer.get_aggregated_values())
        return values

    def evaluate(self, data_stream):
        """Compute the variables over a data stream.
        Parameters
        ----------
        data_stream : instance of :class:`.DataStream`
            The data stream. Only the first epoch of data is used.
        Returns
        -------
        A mapping from record names to the values computed on the provided
        dataset.
        """
        self.initialize_aggregators()
        if self._accumulate_fun is not None:
            for batch in data_stream.get_epoch_iterator(as_dict=True):
                self.process_batch(batch)
        else:
            logger.debug(
                'Only data-independent variables were given; '
                'will not iterate over the data!')

        return self.get_aggregated_values()
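

# Usage sketch (hypothetical names, for illustration): `cost` is a named
# Theano variable, `states` the list of recurrent-state update pairs, and
# `stream` a Fuel data stream. The do-it-all path and the manual loop are
# equivalent:
#
#     evaluator = DatasetEvaluator([cost], mini_batch_size=32,
#                                  state_updates=states, dataset=train_set)
#     values = evaluator.evaluate(stream)
#
#     evaluator.initialize_aggregators()
#     for batch in stream.get_epoch_iterator(as_dict=True):
#         evaluator.process_batch(batch)
#     values = evaluator.get_aggregated_values()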


# Example #2: a variant of the same evaluator that inlines the hidden-state
# bookkeeping instead of delegating to `carry_hidden_state`, and that takes
# no `dataset` argument (the hidden state is always carried over, never
# reset). Assumes the same imports as Example #1.
class DatasetEvaluator(object):
    """A DatasetEvaluator evaluates many Theano variables or other quantities.
    The DatasetEvaluator provides a do-it-all method, :meth:`evaluate`,
    which computes values of ``variables`` on a dataset.
    Alternatively, methods :meth:`initialize_aggregators`,
    :meth:`process_batch`, :meth:`get_aggregated_values` can be used with a
    custom loop over data.
    The values computed on subsets of the given dataset are aggregated
    using the :class:`AggregationScheme`s provided in the
    `aggregation_scheme` tags. If no tag is given, the value is **averaged
    over minibatches**. However, care is taken to ensure that variables
    which do not depend on data are not unnecessarily recomputed.
    Parameters
    ----------
    variables : list of :class:`~tensor.TensorVariable` and
        :class:`MonitoredQuantity`
        The variable names are used as record names in the logs. Hence, all
        the names must be different.
        Each variable can be tagged with an :class:`AggregationScheme` that
        specifies how the value can be computed for a data set by
        aggregating minibatches.
    updates : list of tuples or :class:`~collections.OrderedDict` or None
        :class:`~tensor.TensorSharedVariable` updates to be performed
        during evaluation. This parameter is only for Theano variables.
        Be careful not to update any model parameters as this is not
        intended to alter your model in any meaningfullway. A typical
        use case of this option arises when the theano function used
        for evaluation contains a call to:function:`~theano.scan` which
        might have returned shared variable updates.
    """
    def __init__(self,
                 variables,
                 mini_batch_size,
                 state_updates,
                 updates=None):
        theano_variables = []
        monitored_quantities = []
        for variable in variables:
            if isinstance(variable, MonitoredQuantity):
                monitored_quantities.append(variable)
            else:
                theano_variables.append(variable)
        self.theano_variables = theano_variables
        self.monitored_quantities = monitored_quantities
        variable_names = [v.name for v in variables]
        if len(set(variable_names)) < len(variables):
            raise ValueError("variables should have different names")
        self.theano_buffer = AggregationBuffer(theano_variables)
        self.monitored_quantities_buffer = MonitoredQuantityBuffer(
            monitored_quantities)
        self.updates = updates
        self.mini_batch_size = mini_batch_size
        self._compile(state_updates)

    def _compile(self, state_updates):
        """Compiles Theano functions.
        .. todo::
            The current compilation method does not account for updates
            attached to `ComputationGraph` elements. Compiling should
            be out-sourced to `ComputationGraph` to deal with it.
        """
        inputs = []
        # As in Example #1: start from an empty OrderedDict so the
        # hidden-state updates can always be merged in below.
        updates = OrderedDict()

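        # Hidden-state carry-over, inlined: for each (state variable,
        # update) pair allocate a float32 shared buffer of shape
        # (mini_batch_size, state_dim), substitute it for the state input
        # via `givens`, and write the updated state back into it after
        # every batch via `f_updates`. `v.shape[1].eval()` evaluates the
        # symbolic state dimension once, at construction time.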
        state_vars = [
            theano.shared(
                numpy.zeros((self.mini_batch_size, v.shape[1].eval()),
                            dtype=numpy.float32), v.name + '-gen')
            for v, _ in state_updates
        ]
        givens = [(v, x) for (v, _), x in zip(state_updates, state_vars)]
        f_updates = [(x, upd)
                     for x, (_, upd) in zip(state_vars, state_updates)]

        if self.theano_buffer.accumulation_updates:
            updates.update(self.theano_buffer.accumulation_updates)
            if self.updates:
                updates.update(self.updates)
            inputs += self.theano_buffer.inputs
        inputs += self.monitored_quantities_buffer.inputs
        outputs = self.monitored_quantities_buffer.requires

        if inputs:
            self.unique_inputs = list(set(inputs))
            # Merge in the hidden-state carry-over updates computed above.
            updates.update(f_updates)
            self._accumulate_fun = theano.function(self.unique_inputs,
                                                   outputs,
                                                   givens=givens,
                                                   updates=updates)
        else:
            self._accumulate_fun = None

    def initialize_aggregators(self):
        self.theano_buffer.initialize_aggregators()
        self.monitored_quantities_buffer.initialize()

    def process_batch(self, batch):
        try:
            input_names = [v.name for v in self.unique_inputs]
            batch = dict_subset(batch, input_names)
        except KeyError:
            reraise_as("Not all data sources required for monitoring were"
                       " provided. The list of required data sources:"
                       " {}.".format(input_names))
        if self._accumulate_fun is not None:
            numerical_values = self._accumulate_fun(**batch)
            self.monitored_quantities_buffer.accumulate_quantities(
                numerical_values)

    def get_aggregated_values(self):
        values = self.theano_buffer.get_aggregated_values()
        values.update(self.monitored_quantities_buffer.get_aggregated_values())
        return values

    def evaluate(self, data_stream):
        """Compute the variables over a data stream.
        Parameters
        ----------
        data_stream : instance of :class:`.DataStream`
            The data stream. Only the first epoch of data is used.
        Returns
        -------
        A mapping from record names to the values computed on the provided
        dataset.
        """
        self.initialize_aggregators()
        if self._accumulate_fun is not None:
            for batch in data_stream.get_epoch_iterator(as_dict=True):
                self.process_batch(batch)
        else:
            logger.debug('Only data-independent variables were given; '
                         'will not iterate over the data!')

        return self.get_aggregated_values()