Example #1
    def _standardize_user_data(self, s, a, s_next, r, absorbing, theta,
                               check_batch_dim=False):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            absorbing (numpy.array): the samples of the absorbing flag (nsamples, )
            theta (numpy.array): the samples of the Q-function parameters (1, n_params)
            check_batch_dim (bool): default False

        Returns:
            The standardized values (s, a, s_next, r, absorbing, theta)

        """
        s = standardize_input_data(
            s, ['s'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            exception_prefix='state')
        a = standardize_input_data(
            a, ['a'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='action')
        # r = standardize_input_data(r, ['r'], [(None, 1)],
        #                            check_batch_dim=False, exception_prefix='reward')
        s_next = standardize_input_data(
            s_next, ['s_next'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            exception_prefix='state_next')
        theta = standardize_input_data(theta, ['theta'],
                                       (None, self.bellman_model.n_inputs()),
                                       exception_prefix='theta')
        check_array_lengths(s, a, s_next)
        return s, a, s_next, r, absorbing, theta
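All of these examples call the Keras 1.x helper standardize_input_data (defined in keras.engine.training): it wraps a bare numpy array into a list, checks it against the expected shapes, and raises an exception prefixed with exception_prefix when a dimension does not match. A minimal sketch of such a call, assuming that Keras 1.x helper is importable and using illustrative shapes that are not taken from the source project:

import numpy as np
from keras.engine.training import standardize_input_data

state_dim = 3
s = np.random.rand(100, state_dim)  # 100 samples of a 3-dimensional state

# the helper returns a list holding the validated array(s)
s_std = standardize_input_data(s, ['s'], [(None, state_dim)],
                               check_batch_dim=False,
                               exception_prefix='state')
assert isinstance(s_std, list) and s_std[0].shape == (100, state_dim)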
Example #2
    def _standardize_user_data(self, s, a, s_next, r, check_batch_dim=False):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            check_batch_dim (bool): default False

        Returns:
            The standardized values (s, a, s_next, r)

        """
        s = standardize_input_data(
            s, ['s'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            check_batch_dim=check_batch_dim,
            exception_prefix='state')
        a = standardize_input_data(
            a, ['a'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            check_batch_dim=check_batch_dim,
            exception_prefix='action')
        # r = standardize_input_data(r, ['r'], [(None, 1)],
        #                            check_batch_dim=False, exception_prefix='reward')
        s_next = standardize_input_data(
            s_next, ['s_next'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            check_batch_dim=check_batch_dim,
            exception_prefix='state_next')
        check_array_lengths(s, a, s_next)
        return s, a, s_next, r
Example #3
    def _standardize_user_data(self, s, a, s_next, r, check_batch_dim=False):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            check_batch_dim (bool): default False

        Returns:
            The standardized values (s, a, s_next, r)

        """
        s = standardize_input_data(s, ['s'], [(None, self.state_dim)] if self.state_dim is not None else None,
                                   check_batch_dim=check_batch_dim, exception_prefix='state')
        a = standardize_input_data(a, ['a'], [(None, self.action_dim)] if self.action_dim is not None else None,
                                   check_batch_dim=check_batch_dim, exception_prefix='action')
        # r = standardize_input_data(r, ['r'], [(None, 1)],
        #                            check_batch_dim=False, exception_prefix='reward')
        s_next = standardize_input_data(s_next, ['s_next'],
                                        [(None, self.state_dim)] if self.state_dim is not None else None,
                                        check_batch_dim=check_batch_dim, exception_prefix='state_next')
        check_array_lengths(s, a, s_next)
        return s, a, s_next, r
Example #4
def _standardize_user_data(model,
                           x,
                           y,
                           sample_weight=None,
                           class_weight=None,
                           check_batch_dim=True,
                           batch_size=None):
    if not hasattr(model, 'optimizer'):
        raise Exception('You must compile a model before training/testing.'
                        ' Use `model.compile(optimizer, loss)`.')

    output_shapes = []
    for output_shape, loss_fn in zip(model.internal_output_shapes,
                                     model.loss_functions):
        if loss_fn.__name__ == 'sparse_categorical_crossentropy':
            output_shapes.append(output_shape[:-1] + (1, ))
        elif getattr(objectives, loss_fn.__name__, None) is None:
            output_shapes.append(None)
        else:
            output_shapes.append(output_shape)
    x = standardize_input_data(x,
                               model.input_names,
                               model.internal_input_shapes,
                               check_batch_dim=False,
                               exception_prefix='model input')
    y = standardize_input_data(y,
                               model.output_names,
                               output_shapes,
                               check_batch_dim=False,
                               exception_prefix='model target')
    sample_weights = standardize_sample_weights(sample_weight,
                                                model.output_names)
    class_weights = standardize_class_weights(class_weight, model.output_names)
    sample_weights = [
        standardize_weights(ref, sw, cw, mode) for (ref, sw, cw, mode) in zip(
            y, sample_weights, class_weights, model.sample_weight_modes)
    ]
    # We only need to comment out the check_array_lengths(x, y, sample_weights)
    # call below to let the model compile and train.
    # check_array_lengths(x, y, sample_weights)

    check_loss_and_target_compatibility(y, model.loss_functions,
                                        model.internal_output_shapes)
    if model.stateful and batch_size:
        if x[0].shape[0] % batch_size != 0:
            raise Exception('In a stateful network, '
                            'you should only pass inputs with '
                            'a number of samples that can be '
                            'divided by the batch size. Found: ' +
                            str(x[0].shape[0]) + ' samples')
    return x, y, sample_weights
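The check_array_lengths call that this example disables normally verifies that the inputs, targets and sample weights all contain the same number of samples. A rough standalone sketch of that check, written here as a hypothetical helper rather than the actual Keras implementation:

def check_array_lengths_sketch(inputs, targets, weights):
    # every array must agree on its first (sample) dimension
    lengths = set(arr.shape[0] for arr in inputs)
    lengths |= set(arr.shape[0] for arr in targets)
    lengths |= set(arr.shape[0] for arr in weights)
    if len(lengths) > 1:
        raise ValueError('All input, target and sample-weight arrays must '
                         'contain the same number of samples, found ' +
                         str(lengths))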
Example #5
    def finetune(self, X, Y, batch_size=128, gp_n_iter=10, verbose=1):
        '''Finetune the output GP layers assuming the network is pre-trained.

        Arguments:
        ----------
            X : np.ndarray or list of np.ndarrays
            Y : np.ndarray or list of np.ndarrays
            batch_size : uint (default: 128)
                Batch size used for data streaming through the network.
            gp_n_iter : uint (default: 10)
                Number of iterations for GP training.
            verbose : uint (default: 1)
                Verbosity mode, 0 or 1.
        '''
        # Validate user data
        X = standardize_input_data(X, self.input_names,
                                   self.internal_input_shapes,
                                   check_batch_dim=False)

        H = self.transform(X, batch_size=batch_size)

        if verbose:
            print("Finetuning output GPs...")

        for gp, h, y in zip(self.gp_output_layers, H, Y):
            # Update GP data (and grid if necessary)
            gp.backend.update_data('tr', h, y)
            if gp.update_grid:
                gp.backend.update_grid('tr')

            # Train GP
            gp.hyp = gp.backend.train(gp_n_iter, verbose=verbose)

        if verbose:
            print("Done.")
Example #6
def _standardize_user_data(model, x, y,
                           sample_weight=None, class_weight=None,
                           check_batch_dim=True, batch_size=None):
    if not hasattr(model, 'optimizer'):
        raise Exception('You must compile a model before training/testing.'
                        ' Use `model.compile(optimizer, loss)`.')

    output_shapes = []
    for output_shape, loss_fn in zip(model.internal_output_shapes, model.loss_functions):
        if loss_fn.__name__ == 'sparse_categorical_crossentropy':
            output_shapes.append(output_shape[:-1] + (1,))
        elif getattr(objectives, loss_fn.__name__, None) is None:
            output_shapes.append(None)
        else:
            output_shapes.append(output_shape)
    x = standardize_input_data(x, model.input_names,
                               model.internal_input_shapes,
                               check_batch_dim=False,
                               exception_prefix='model input')
    y = standardize_input_data(y, model.output_names,
                               output_shapes,
                               check_batch_dim=False,
                               exception_prefix='model target')
    sample_weights = standardize_sample_weights(sample_weight,
                                                model.output_names)
    class_weights = standardize_class_weights(class_weight,
                                              model.output_names)
    sample_weights = [standardize_weights(ref, sw, cw, mode)
                      for (ref, sw, cw, mode)
                      in zip(y, sample_weights, class_weights, model.sample_weight_modes)]

    # We only need to comment out the check_array_lengths(x, y, sample_weights)
    # call below to let the model compile and train.
    # check_array_lengths(x, y, sample_weights)

    check_loss_and_target_compatibility(y, model.loss_functions, model.internal_output_shapes)
    if model.stateful and batch_size:
        if x[0].shape[0] % batch_size != 0:
            raise Exception('In a stateful network, '
                            'you should only pass inputs with '
                            'a number of samples that can be '
                            'divided by the batch size. Found: ' +
                            str(x[0].shape[0]) + ' samples')
    return x, y, sample_weights
Example #7
    def _standardize_user_data(self,
                               s,
                               a,
                               s_next,
                               r,
                               absorbing,
                               theta,
                               check_batch_dim=False):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            absorbing (numpy.array): the samples of the absorbing flag (nsamples, )
            theta (numpy.array): the samples of the Q-function parameters (1, n_params)
            check_batch_dim (bool): default False

        Returns:
            The standardized values (s, a, s_next, r, absorbing, theta)

        """
        s = standardize_input_data(
            s, ['s'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            exception_prefix='state')
        a = standardize_input_data(
            a, ['a'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='action')
        # r = standardize_input_data(r, ['r'], [(None, 1)],
        #                            check_batch_dim=False, exception_prefix='reward')
        s_next = standardize_input_data(
            s_next, ['s_next'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            exception_prefix='state_next')
        theta = standardize_input_data(theta, ['theta'],
                                       (None, self.bellman_model.n_inputs()),
                                       exception_prefix='theta')
        check_array_lengths(s, a, s_next)
        return s, a, s_next, r, absorbing, theta
Example #8
    def __init__(
        self,
        estimator,
        gamma,
        discrete_actions,
        optimizer="adam",
        state_dim=None,
        action_dim=None,
        norm_value=2,
        update_theta_every=1,
        horizon=10,
        verbose=0,
    ):
        super(GenGradFQI, self).__init__(estimator, state_dim, action_dim, discrete_actions, gamma, horizon, verbose)
        # save MDP information
        self.norm_value = norm_value
        self.update_theta_every = update_theta_every if update_theta_every > 0 else -1

        # create theano variables
        self.T_Y = T.dvector()

        # define bellman operator (check that BOP has only one output)
        assert isinstance(estimator.inputs, list)
        assert len(estimator.inputs) == 1
        assert isinstance(estimator.outputs, list)
        assert len(estimator.outputs) == 1

        # construct (theano) Bellman error
        v = self._estimator.outputs[0] - self.T_Y
        if self.norm_value == np.inf:
            err = T.max(v ** 2)
        else:
            err = T.mean(v ** self.norm_value) ** (1.0 / self.norm_value)
        self.fqi_loss = err

        # define function to be used for train and drawing actions
        self.train_function = None

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions,
            ["discrete_actions"],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix="discrete_actions",
        )
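The loss constructed above is a p-norm of the Bellman residual: with norm_value=2 it is the root of the mean squared difference between the estimator output and the target T_Y, and with norm_value=np.inf it degenerates to the largest squared residual. A small numpy sketch replicating that expression outside of Theano (the function name is illustrative, not from the source):

import numpy as np

def bellman_residual_loss(q_pred, y, norm_value=2):
    # mirrors err = T.mean(v ** norm_value) ** (1. / norm_value) above
    v = q_pred - y
    if norm_value == np.inf:
        return np.max(v ** 2)
    return np.mean(v ** norm_value) ** (1.0 / norm_value)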
Example #9
    def __init__(self,
                 q_model,
                 gamma,
                 discrete_actions,
                 optimizer,
                 state_dim=None,
                 action_dim=None,
                 incremental=True):
        # save MDP information
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.incremental = incremental
        self.gamma = gamma

        # create theano variables
        T_s = T.matrix()
        T_a = T.matrix()
        T_s_next = T.matrix()
        T_r = T.vector()
        # T_r = T.dmatrix()
        T_discrete_actions = T.matrix()

        # store models of bellman apx and Q-function
        self.q_model = q_model

        # construct (theano) Bellman error
        self.T_bellman_err = self.bellman_error(T_s, T_a, T_s_next, T_r,
                                                self.gamma, T_discrete_actions)

        # define function to be used for train and drawing actions
        self.train_function = None
        self.draw_action_function = None

        self.T_s = T_s
        self.T_a = T_a
        self.T_s_next = T_s_next
        self.T_r = T_r
        self.T_discrete_actions = T_discrete_actions

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions, ['discrete_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            check_batch_dim=False,
            exception_prefix='discrete_actions')
Example #10
    def __init__(self,
                 estimator,
                 gamma,
                 discrete_actions,
                 optimizer="adam",
                 state_dim=None,
                 action_dim=None,
                 norm_value=2,
                 update_theta_every=1,
                 horizon=10,
                 verbose=0):
        super(GenGradFQI,
              self).__init__(estimator, state_dim, action_dim,
                             discrete_actions, gamma, horizon, verbose)
        # save MDP information
        self.norm_value = norm_value
        self.update_theta_every = update_theta_every if update_theta_every > 0 else -1

        # create theano variables
        self.T_Y = T.dvector()

        # define bellman operator (check that BOP has only one output)
        assert isinstance(estimator.inputs, list)
        assert len(estimator.inputs) == 1
        assert isinstance(estimator.outputs, list)
        assert len(estimator.outputs) == 1

        # construct (theano) Bellman error
        v = self._estimator.outputs[0] - self.T_Y
        if self.norm_value == np.inf:
            err = T.max(v**2)
        else:
            err = T.mean(v**self.norm_value)**(1. / self.norm_value)
        self.fqi_loss = err

        # define function to be used for train and drawing actions
        self.train_function = None

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions, ['discrete_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='discrete_actions')
Example #11
    def draw_action(self, state, done, flag):
        """
        Samples the action to be executed.
        Args:
            state (numpy.array): the state to be evaluated (1, state_dim) or (state_dim,)
            done: ??
            flag: ??

        Returns:
            The action to be executed in the state
        """
        state = state.astype(theano.config.floatX)
        self._make_draw_action_function()
        state = standardize_input_data(state, ['state'],
                                       [(None, self.state_dim)] if self.state_dim is not None else None,
                                       check_batch_dim=False, exception_prefix='draw_state')
        # we take index zero since they are lists of numpy matrices
        return self.draw_action_function(state[0], self.discrete_actions[0])
Example #12
    def predict(self, X, X_tr=None, Y_tr=None,
                batch_size=128, return_var=False, verbose=0):
        '''Generate output predictions for the input samples batch by batch.

        Arguments:
        ----------
            X : np.ndarray or list of np.ndarrays
            batch_size : uint (default: 128)
            return_var : bool (default: False)
                Whether predictive variance is returned.
            verbose : uint (default: 0)
                Verbosity mode, 0 or 1.

        Returns:
        --------
            preds : a list or a tuple of lists
                Lists of output predictions and variance estimates.
        '''
        # Update GP data if provided (and grid if necessary)
        if X_tr is not None and Y_tr is not None:
            X_tr, Y_tr, _ = self._standardize_user_data(X_tr, Y_tr,
                                                        check_batch_dim=False,
                                                        batch_size=batch_size)
            H_tr = self.transform(X_tr, batch_size=batch_size)
            for gp, h, y in zip(self.gp_output_layers, H_tr, Y_tr):
                gp.backend.update_data('tr', h, y)
                if gp.update_grid:
                    gp.backend.update_grid('tr')

        # Validate user data
        X = standardize_input_data(X, self.input_names,
                                   self.internal_input_shapes,
                                   check_batch_dim=False)

        H = self.transform(X, batch_size=batch_size)

        preds = []
        for gp, h in zip(self.gp_output_layers, H):
            preds.append(gp.backend.predict(h, return_var=return_var))

        if return_var:
            preds = map(list, zip(*preds))

        return preds
Example #13
    def draw_action(self, state, done, flag):
        """
        Samples the action to be executed.
        Args:
            state (numpy.array): the state to be evaluated (1, state_dim) or (state_dim,)
            done: ??
            flag: ??

        Returns:
            The action to be executed in the state
        """
        self._make_draw_action_function()
        state = standardize_input_data(
            state, ['state'],
            [(None, self.state_dim)] if self.state_dim is not None else None,
            exception_prefix='draw_state')
        return self.draw_action_function(
            state[0], self.learned_theta_value, self.discrete_actions[0]
        )  # we take index zero since they are lists of numpy matrices
Example #14
    def __init__(self, q_model, gamma,
                 discrete_actions,
                 optimizer,
                 state_dim=None, action_dim=None, incremental=True):
        # save MDP information
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.incremental = incremental
        self.gamma = gamma

        # create theano variables
        T_s = T.matrix()
        T_a = T.matrix()
        T_s_next = T.matrix()
        T_r = T.vector()
        # T_r = T.dmatrix()
        T_discrete_actions = T.matrix()

        # store models of bellman apx and Q-function
        self.q_model = q_model

        # construct (theano) Bellman error
        self.T_bellman_err = self.bellman_error(T_s, T_a, T_s_next, T_r, self.gamma, T_discrete_actions)

        # define function to be used for train and drawing actions
        self.train_function = None
        self.draw_action_function = None

        self.T_s = T_s
        self.T_a = T_a
        self.T_s_next = T_s_next
        self.T_r = T_r
        self.T_discrete_actions = T_discrete_actions

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions, ['discrete_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            check_batch_dim=False, exception_prefix='discrete_actions')
Example #15
    def fit(self,
            s,
            a,
            s_next,
            r,
            absorbing,
            theta,
            batch_size=32,
            nb_epoch=10,
            shuffle=True,
            theta_metrics={}):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            absorbing (numpy.array): the samples of the absorbing flag (nsamples, )
            theta (numpy.array): the samples of the Q-function parameters (1, n_params)
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta),
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A dictionary storing the training information (theta, rho and any theta_metrics)
        """
        s, a, s_next, r, absorbing, theta = self._standardize_user_data(
            s, a, s_next, r, absorbing, theta, check_batch_dim=False)

        all_actions = standardize_input_data(
            self.discrete_actions, ['all_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='discrete_actions')

        n_updates = 0
        history = {"theta": [], 'rho': []}
        for k in theta_metrics.keys():
            history.update({k: []})

        ins = s + a + s_next + [r, absorbing]
        self._make_train_function()
        f = self.train_function

        nb_train_sample = ins[0].shape[0]
        index_array = np.arange(nb_train_sample)

        # append evolution of theta for independent case
        for _ in range(len(self.theta_list) - 1):
            if self.incremental:
                tmp = theta[-1] + self.bellman_model.predict(theta[-1])
            else:
                tmp = self.bellman_model.predict(theta[-1])
            theta += [tmp]

        term_condition = self.term_condition
        stop = False
        old_theta = theta

        for epoch in range(nb_epoch):
            if stop:
                break

            if shuffle == 'batch':
                index_array = batch_shuffle(index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)
            batches = make_batches(nb_train_sample, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):

                history["theta"].append(theta[0])
                if hasattr(self.bellman_model, '_model'):
                    history["rho"].append(
                        self.bellman_model._model.get_weights())
                else:
                    history["rho"].append(self.bellman_model.get_weights())
                for k, v in iteritems(theta_metrics):
                    history[k].append(v(theta))

                batch_ids = index_array[batch_start:batch_end]
                try:
                    if type(ins[-1]) is float:
                        # do not slice the training phase flag
                        ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                    else:
                        ins_batch = slice_X(ins, batch_ids)
                except TypeError:
                    raise Exception('TypeError while preparing batch. '
                                    'If using HDF5 input data, '
                                    'pass shuffle="batch".')
                inp = ins_batch + theta + all_actions
                outs = f(*inp)
                n_updates += 1

                if self.update_theta_every > 0 and n_updates % self.update_theta_every == 0:
                    tmp = self.apply_bo(theta[0],
                                        n_times=self.steps_per_theta_update)
                    theta = [tmp]
                    for _ in range(len(self.theta_list) - 1):
                        if self.incremental:
                            tmp = tmp + self.bellman_model.predict(tmp)
                        else:
                            tmp = self.bellman_model.predict(tmp)
                        theta += [tmp]

                    if term_condition is not None:
                        stop = term_condition(old_theta, theta)
                        if stop:
                            break
                        old_theta = theta

        # finally apply the bellman operator K-times to get the final point
        self.learned_theta_value = self.apply_bo(theta[0], n_times=100)
        if self.verbose > 1:
            print('learned theta: {}'.format(self.learned_theta_value))

        self.history = history
        return history
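Before the epoch loop, fit extends the list of parameter vectors by repeatedly applying the learned operator bellman_model.predict: in incremental mode the prediction is added to the previous theta, otherwise it replaces it. A minimal sketch of that update rule, assuming only a predict(theta) mapping (the helper name is illustrative, not from the source):

def evolve_theta(theta0, predict, n_steps, incremental=True):
    # reproduces the theta-evolution loop used in fit above
    thetas = [theta0]
    for _ in range(n_steps):
        step = predict(thetas[-1])
        thetas.append(thetas[-1] + step if incremental else step)
    return thetas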
Example #16
    def fit(self, sast, r, batch_size=32, nb_epoch=10, shuffle=True, theta_metrics={}):
        """

        Args:
            sast (numpy.array): the samples of state, action, next state and absorbing flag,
                stacked column-wise (nsamples, 2 * state_dim + action_dim + 1)
            r (numpy.array): the samples of the reward (nsamples, )
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta),
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A dictionary storing the training information (theta and any theta_metrics)
        """
        sast = standardize_input_data(
            sast, ["sast"], (None, 2 * self.state_dim + self.action_dim + 1), exception_prefix="sast"
        )[0]

        next_states_idx = self.state_dim + self.action_dim
        sa = sast[:, :next_states_idx]
        s_next = sast[:, next_states_idx:-1]
        absorbing = sast[:, -1]

        n_updates = 0

        maxq, maxa = self.maxQA(s_next, absorbing)

        if hasattr(self._estimator, "adapt"):
            # update estimator structure
            self._estimator.adapt(iteration=self._iteration)

        # y = np.reshape(r + self.gamma * maxq, (-1, 1))
        y = r + self.gamma * maxq

        ins = [sa, y]
        self._make_train_function()
        f = self.train_function

        nb_train_sample = sa.shape[0]
        index_array = np.arange(nb_train_sample)
        history = {"theta": []}
        for k in theta_metrics.keys():
            history.update({k: []})

        for epoch in range(nb_epoch):
            if shuffle == "batch":
                index_array = batch_shuffle(index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)

            batches = make_batches(nb_train_sample, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):

                if hasattr(self._estimator, "_model"):
                    ltheta = self._estimator._model.get_weights()
                    history["theta"].append(ltheta)
                else:
                    ltheta = self._estimator.get_weights()
                    history["theta"].append(ltheta)
                for k, v in iteritems(theta_metrics):
                    history[k].append(v(ltheta))

                batch_ids = index_array[batch_start:batch_end]
                try:
                    if type(ins[-1]) is float:
                        # do not slice the training phase flag
                        ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                    else:
                        ins_batch = slice_X(ins, batch_ids)
                except TypeError:
                    raise Exception(
                        "TypeError while preparing batch. " "If using HDF5 input data, " 'pass shuffle="batch".'
                    )

                outs = f(*ins_batch)
                n_updates += 1

                if self.update_theta_every > 0 and n_updates % self.update_theta_every == 0:
                    maxq, maxa = self.maxQA(s_next, absorbing)

                    if hasattr(self._estimator, "adapt"):
                        # update estimator structure
                        self._estimator.adapt(iteration=self._iteration)

                    # y = np.reshape(r + self.gamma * maxq, (-1, 1))
                    y = r + self.gamma * maxq
                    ins = [ins[0], y]

        if self._verbose > 1:
            print("learned theta: {}".format(self._estimator.get_weights()))

        return history
Example #17
    def fit(self,
            sast,
            r,
            batch_size=32,
            nb_epoch=10,
            shuffle=True,
            theta_metrics={}):
        """

        Args:
            sast (numpy.array): the samples of state, action, next state and absorbing flag,
                stacked column-wise (nsamples, 2 * state_dim + action_dim + 1)
            r (numpy.array): the samples of the reward (nsamples, )
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta),
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A dictionary storing the training information (theta and any theta_metrics)
        """
        sast = standardize_input_data(
            sast, ['sast'], (None, 2 * self.state_dim + self.action_dim + 1),
            exception_prefix='sast')[0]

        next_states_idx = self.state_dim + self.action_dim
        sa = sast[:, :next_states_idx]
        s_next = sast[:, next_states_idx:-1]
        absorbing = sast[:, -1]

        n_updates = 0

        maxq, maxa = self.maxQA(s_next, absorbing)

        if hasattr(self._estimator, 'adapt'):
            # update estimator structure
            self._estimator.adapt(iteration=self._iteration)

        # y = np.reshape(r + self.gamma * maxq, (-1, 1))
        y = r + self.gamma * maxq

        ins = [sa, y]
        self._make_train_function()
        f = self.train_function

        nb_train_sample = sa.shape[0]
        index_array = np.arange(nb_train_sample)
        history = {"theta": []}
        for k in theta_metrics.keys():
            history.update({k: []})

        for epoch in range(nb_epoch):
            if shuffle == 'batch':
                index_array = batch_shuffle(index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)

            batches = make_batches(nb_train_sample, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):

                if hasattr(self._estimator, '_model'):
                    ltheta = self._estimator._model.get_weights()
                    history["theta"].append(ltheta)
                else:
                    ltheta = self._estimator.get_weights()
                    history["theta"].append(ltheta)
                for k, v in iteritems(theta_metrics):
                    history[k].append(v(ltheta))

                batch_ids = index_array[batch_start:batch_end]
                try:
                    if type(ins[-1]) is float:
                        # do not slice the training phase flag
                        ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                    else:
                        ins_batch = slice_X(ins, batch_ids)
                except TypeError:
                    raise Exception('TypeError while preparing batch. '
                                    'If using HDF5 input data, '
                                    'pass shuffle="batch".')

                outs = f(*ins_batch)
                n_updates += 1

                if self.update_theta_every > 0 \
                        and n_updates % self.update_theta_every == 0:
                    maxq, maxa = self.maxQA(s_next, absorbing)

                    if hasattr(self._estimator, 'adapt'):
                        # update estimator structure
                        self._estimator.adapt(iteration=self._iteration)

                    # y = np.reshape(r + self.gamma * maxq, (-1, 1))
                    y = r + self.gamma * maxq
                    ins = [ins[0], y]

        if self._verbose > 1:
            print('learned theta: {}'.format(self._estimator.get_weights()))

        return history
Example #18
    def fit(self,
            s,
            a,
            s_next,
            r,
            batch_size=32,
            nb_epoch=10,
            verbose=1,
            callbacks=[],
            validation_split=0.,
            validation_data=None,
            shuffle=True,
            theta_metrics={}):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the sample of the reward (nsamples, )
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            verbose (int): 0 or 1. Verbosity mode. 0 = silent, 1 = verbose.
            callbacks (list): list of callbacks to be called during training.
                See [Keras Callbacks](https://keras.io/callbacks/).
            validation_split (float): float between 0 and 1:
                fraction of the training data to be used as validation data.
                The model will set apart this fraction of the training data,
                will not train on it, and will evaluate the loss and any model metrics
                on this data at the end of each epoch.
            validation_data (tuple): data on which to evaluate the loss and any model metrics
                at the end of each epoch. The model will not be trained on this data.
                This could be a tuple (val_s, val_a, val_s_next, val_r) or a tuple
                (val_s, val_a, val_s_next, val_r, val_theta).
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta)
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A PBOHistory instance storing train information
        """
        s, a, s_next, r = self._standardize_user_data(s,
                                                      a,
                                                      s_next,
                                                      r,
                                                      check_batch_dim=False)

        all_actions = standardize_input_data(
            self.discrete_actions, ['all_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            check_batch_dim=False,
            exception_prefix='discrete_actions')

        # # prepare validation data
        # if validation_data:
        #     do_validation = True
        #     if len(validation_data) == 4:
        #         val_s, val_a, val_s_next, val_r = validation_data
        #     elif len(validation_data) == 5:
        #         val_s, val_a, val_s_next, val_r, val_theta = validation_data
        #     else:
        #         raise
        #
        #     val_s, val_a, val_s_next, val_r, val_theta = self._standardize_user_data(
        #         val_s, val_a, val_s_next, val_r, val_theta,
        #         check_batch_dim=False,
        #         batch_size=batch_size
        #     )
        #     self._make_test_function()
        #     val_f = self.test_function
        #     val_ins = val_s + val_a + val_s_next + [val_r]
        #
        # elif validation_split and 0. < validation_split < 1.:
        #     do_validation = True
        #     split_at = int(len(x[0]) * (1. - validation_split))
        #     x, val_x = (slice_X(x, 0, split_at), slice_X(x, split_at))
        #     y, val_y = (slice_X(y, 0, split_at), slice_X(y, split_at))
        #     sample_weights, val_sample_weights = (
        #         slice_X(sample_weights, 0, split_at), slice_X(sample_weights, split_at))
        #     self._make_test_function()
        #     val_f = self.test_function
        #     if self.uses_learning_phase and type(K.learning_phase()) is not int:
        #         val_ins = val_x + val_y + val_sample_weights + [0.]
        #     else:
        #         val_ins = val_x + val_y + val_sample_weights
        # else:
        #     do_validation = False
        #     val_f = None
        #     val_ins = None

        do_validation = False
        val_f = None
        val_ins = None

        ins = s + a + s_next + [r]
        self._make_train_function()
        f = self.train_function

        # prepare display labels
        out_labels = ['bellman_error']

        if do_validation:
            callback_metrics = copy.copy(out_labels) + [
                'val_' + n for n in out_labels
            ]
        else:
            callback_metrics = copy.copy(out_labels)

        return self._fit_loop(f,
                              ins,
                              all_actions,
                              out_labels=out_labels,
                              batch_size=batch_size,
                              nb_epoch=nb_epoch,
                              verbose=verbose,
                              callbacks=callbacks,
                              val_f=val_f,
                              val_ins=val_ins,
                              shuffle=shuffle,
                              callback_metrics=callback_metrics,
                              theta_metrics=theta_metrics)
Example #19
    def __init__(self, bellman_model, q_model, steps_ahead,
                 gamma, discrete_actions,
                 optimizer,
                 state_dim=None, action_dim=None, incremental=True,
                 norm_value=np.inf, update_theta_every=1,
                 steps_per_theta_update=None,
                 independent=False,
                 verbose=0, term_condition=None):
        # save MDP information
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.incremental = incremental
        self.gamma = gamma
        self.norm_value = norm_value
        self.update_theta_every = update_theta_every if update_theta_every > 0 else -1
        self.verbose = verbose
        self.independent = independent
        self.steps_per_theta_update = steps_ahead if steps_per_theta_update is None else max(
            1, steps_per_theta_update)

        # create theano variables
        T_s = T.dmatrix()
        T_a = T.dmatrix()
        T_s_next = T.dmatrix()
        T_r = T.dvector()
        T_absorbing = T.dvector()
        # T_r = T.dmatrix()
        T_discrete_actions = T.dmatrix()

        # store models of bellman apx and Q-function
        self.bellman_model = bellman_model
        self.q_model = q_model
        self.steps_ahead = steps_ahead

        # define bellman operator (check that BOP has only one output)
        assert isinstance(bellman_model.inputs, list)
        assert len(bellman_model.inputs) == 1
        assert isinstance(bellman_model.outputs, list)
        assert len(bellman_model.outputs) == 1

        # construct (theano) Bellman error
        self.theta_list = [bellman_model.inputs[0]]
        if not independent:
            self.T_bellman_err, _ = self.k_step_bellman_error(
                T_s, T_a, T_s_next, T_r, T_absorbing,
                self.theta_list[0], gamma, T_discrete_actions, steps_ahead)
            assert len(self.theta_list) == 1
        else:
            self.theta_list += [
                T.fmatrix(str(ll)) for ll in range(steps_ahead - 1)
            ]  # theta_0, theta_1, ..., theta_steps
            T_bellman_err = None
            for theta in self.theta_list:
                if T_bellman_err is None:
                    T_bellman_err = self.bellman_error(
                        T_s, T_a, T_s_next, T_r, theta,
                        gamma, T_discrete_actions)[0]
                else:
                    T_bellman_err = T_bellman_err + \
                                    self.bellman_error(
                                        T_s, T_a, T_s_next, T_r, theta,
                                        gamma, T_discrete_actions)[0]
            self.T_bellman_err = T_bellman_err
            assert len(self.theta_list) == steps_ahead

        # define function to be used for train and drawing actions
        self.train_function = None
        self.draw_action_function = None

        self.T_s = T_s
        self.T_a = T_a
        self.T_s_next = T_s_next
        self.T_r = T_r
        self.T_discrete_actions = T_discrete_actions
        self.T_absorbing = T_absorbing

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions, ['discrete_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='discrete_actions')

        if isinstance(term_condition, str):
            self.term_condition = DEFAULT_TERM[term_condition]
        else:
            self.term_condition = term_condition
Example #20
    def fit(self, s, a, s_next, r, absorbing, theta,
            batch_size=32, nb_epoch=10, shuffle=True,
            theta_metrics={}):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the samples of the reward (nsamples, )
            absorbing (numpy.array): the samples of the absorbing flag (nsamples, )
            theta (numpy.array): the samples of the Q-function parameters (1, n_params)
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta),
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A dictionary storing the training information (theta, rho and any theta_metrics)
        """
        s, a, s_next, r, absorbing, theta = self._standardize_user_data(
            s, a, s_next, r, absorbing, theta,
            check_batch_dim=False
        )

        all_actions = standardize_input_data(
            self.discrete_actions, ['all_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='discrete_actions')

        n_updates = 0
        history = {"theta": [], 'rho': []}
        for k in theta_metrics.keys():
            history.update({k: []})

        ins = s + a + s_next + [r, absorbing]
        self._make_train_function()
        f = self.train_function

        nb_train_sample = ins[0].shape[0]
        index_array = np.arange(nb_train_sample)

        # append evolution of theta for independent case
        for _ in range(len(self.theta_list) - 1):
            if self.incremental:
                tmp = theta[-1] + self.bellman_model.predict(theta[-1])
            else:
                tmp = self.bellman_model.predict(theta[-1])
            theta += [tmp]

        term_condition = self.term_condition
        stop = False
        old_theta = theta

        for epoch in range(nb_epoch):
            if stop:
                break

            if shuffle == 'batch':
                index_array = batch_shuffle(index_array, batch_size)
            elif shuffle:
                np.random.shuffle(index_array)
            batches = make_batches(nb_train_sample, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):

                history["theta"].append(theta[0])
                if hasattr(self.bellman_model, '_model'):
                    history["rho"].append(
                        self.bellman_model._model.get_weights())
                else:
                    history["rho"].append(self.bellman_model.get_weights())
                for k, v in iteritems(theta_metrics):
                    history[k].append(v(theta))

                batch_ids = index_array[batch_start:batch_end]
                try:
                    if type(ins[-1]) is float:
                        # do not slice the training phase flag
                        ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                    else:
                        ins_batch = slice_X(ins, batch_ids)
                except TypeError:
                    raise Exception('TypeError while preparing batch. '
                                    'If using HDF5 input data, '
                                    'pass shuffle="batch".')
                inp = ins_batch + theta + all_actions
                outs = f(*inp)
                n_updates += 1

                if self.update_theta_every > 0 and n_updates % self.update_theta_every == 0:
                    tmp = self.apply_bo(theta[0],
                                        n_times=self.steps_per_theta_update)
                    theta = [tmp]
                    for _ in range(len(self.theta_list) - 1):
                        if self.incremental:
                            tmp = tmp + self.bellman_model.predict(tmp)
                        else:
                            tmp = self.bellman_model.predict(tmp)
                        theta += [tmp]

                    if term_condition is not None:
                        stop = term_condition(old_theta, theta)
                        if stop:
                            break
                        old_theta = theta

        # finally apply the bellman operator K-times to get the final point
        self.learned_theta_value = self.apply_bo(theta[0], n_times=100)
        if self.verbose > 1:
            print('learned theta: {}'.format(self.learned_theta_value))

        self.history = history
        return history
Example #21
    def __init__(self,
                 bellman_model,
                 q_model,
                 steps_ahead,
                 gamma,
                 discrete_actions,
                 optimizer,
                 state_dim=None,
                 action_dim=None,
                 incremental=True,
                 norm_value=np.inf,
                 update_theta_every=1,
                 steps_per_theta_update=None,
                 independent=False,
                 verbose=0,
                 term_condition=None):
        # save MDP information
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.incremental = incremental
        self.gamma = gamma
        self.norm_value = norm_value
        self.update_theta_every = update_theta_every if update_theta_every > 0 else -1
        self.verbose = verbose
        self.independent = independent
        self.steps_per_theta_update = steps_ahead if steps_per_theta_update is None else max(
            1, steps_per_theta_update)

        # create theano variables
        T_s = T.dmatrix()
        T_a = T.dmatrix()
        T_s_next = T.dmatrix()
        T_r = T.dvector()
        T_absorbing = T.dvector()
        # T_r = T.dmatrix()
        T_discrete_actions = T.dmatrix()

        # store models of bellman apx and Q-function
        self.bellman_model = bellman_model
        self.q_model = q_model
        self.steps_ahead = steps_ahead

        # define bellman operator (check that BOP has only one output)
        assert isinstance(bellman_model.inputs, list)
        assert len(bellman_model.inputs) == 1
        assert isinstance(bellman_model.outputs, list)
        assert len(bellman_model.outputs) == 1

        # construct (theano) Bellman error
        self.theta_list = [bellman_model.inputs[0]]
        if not independent:
            self.T_bellman_err, _ = self.k_step_bellman_error(
                T_s, T_a, T_s_next, T_r, T_absorbing, self.theta_list[0],
                gamma, T_discrete_actions, steps_ahead)
            assert len(self.theta_list) == 1
        else:
            self.theta_list += [
                T.fmatrix(str(ll)) for ll in range(steps_ahead - 1)
            ]  # theta_0, theta_1, ..., theta_steps
            T_bellman_err = None
            for theta in self.theta_list:
                if T_bellman_err is None:
                    T_bellman_err = self.bellman_error(T_s, T_a, T_s_next, T_r,
                                                       theta, gamma,
                                                       T_discrete_actions)[0]
                else:
                    T_bellman_err = T_bellman_err + \
                                    self.bellman_error(
                                        T_s, T_a, T_s_next, T_r, theta,
                                        gamma, T_discrete_actions)[0]
            self.T_bellman_err = T_bellman_err
            assert len(self.theta_list) == steps_ahead

        # define function to be used for train and drawing actions
        self.train_function = None
        self.draw_action_function = None

        self.T_s = T_s
        self.T_a = T_a
        self.T_s_next = T_s_next
        self.T_r = T_r
        self.T_discrete_actions = T_discrete_actions
        self.T_absorbing = T_absorbing

        # get keras optimizer
        self.optimizer = optimizers.get(optimizer)

        # validate input data (the output is a list storing the validated input)
        self.discrete_actions = standardize_input_data(
            discrete_actions, ['discrete_actions'],
            [(None, self.action_dim)] if self.action_dim is not None else None,
            exception_prefix='discrete_actions')

        if isinstance(term_condition, str):
            self.term_condition = DEFAULT_TERM[term_condition]
        else:
            self.term_condition = term_condition
Example #22
    def fit(self, s, a, s_next, r,
            batch_size=32, nb_epoch=10, verbose=1, callbacks=[],
            validation_split=0., validation_data=None, shuffle=True, theta_metrics={}):
        """

        Args:
            s (numpy.array): the samples of the state (nsamples, state_dim)
            a (numpy.array): the samples of the action (nsamples, action_dim)
            s_next (numpy.array): the samples of the next (reached) state (nsamples, state_dim)
            r (numpy.array): the sample of the reward (nsamples, )
            batch_size (int): dimension of the batch used for a single step of the gradient
            nb_epoch (int): number of epochs
            verbose (int): 0 or 1. Verbosity mode. 0 = silent, 1 = verbose.
            callbacks (list): list of callbacks to be called during training.
                See [Keras Callbacks](https://keras.io/callbacks/).
            validation_split (float): float between 0 and 1:
                fraction of the training data to be used as validation data.
                The model will set apart this fraction of the training data,
                will not train on it, and will evaluate the loss and any model metrics
                on this data at the end of each epoch.
            validation_data (tuple): data on which to evaluate the loss and any model metrics
                at the end of each epoch. The model will not be trained on this data.
                This could be a tuple (val_s, val_a, val_s_next, val_r) or a tuple
                (val_s, val_a, val_s_next, val_r, val_theta).
            shuffle (boolean): whether to shuffle the training data before each epoch.
            theta_metrics (dict): dictionary storing the pairs (name: callable object).
                The callable object/function is used to evaluate the Q-function parameters
                at each iteration. The signature of the callable is simple: f(theta)
                e.g.: theta_metrics={'k': lambda theta: evaluate(theta)}

        Returns:
            A PBOHistory instance storing train information
        """
        s, a, s_next, r = self._standardize_user_data(
            s, a, s_next, r,
            check_batch_dim=False
        )

        all_actions = standardize_input_data(self.discrete_actions, ['all_actions'],
                                             [(None, self.action_dim)] if self.action_dim is not None else None,
                                             check_batch_dim=False, exception_prefix='discrete_actions')

        # # prepare validation data
        # if validation_data:
        #     do_validation = True
        #     if len(validation_data) == 4:
        #         val_s, val_a, val_s_next, val_r = validation_data
        #     elif len(validation_data) == 5:
        #         val_s, val_a, val_s_next, val_r, val_theta = validation_data
        #     else:
        #         raise
        #
        #     val_s, val_a, val_s_next, val_r, val_theta = self._standardize_user_data(
        #         val_s, val_a, val_s_next, val_r, val_theta,
        #         check_batch_dim=False,
        #         batch_size=batch_size
        #     )
        #     self._make_test_function()
        #     val_f = self.test_function
        #     val_ins = val_s + val_a + val_s_next + [val_r]
        #
        # elif validation_split and 0. < validation_split < 1.:
        #     do_validation = True
        #     split_at = int(len(x[0]) * (1. - validation_split))
        #     x, val_x = (slice_X(x, 0, split_at), slice_X(x, split_at))
        #     y, val_y = (slice_X(y, 0, split_at), slice_X(y, split_at))
        #     sample_weights, val_sample_weights = (
        #         slice_X(sample_weights, 0, split_at), slice_X(sample_weights, split_at))
        #     self._make_test_function()
        #     val_f = self.test_function
        #     if self.uses_learning_phase and type(K.learning_phase()) is not int:
        #         val_ins = val_x + val_y + val_sample_weights + [0.]
        #     else:
        #         val_ins = val_x + val_y + val_sample_weights
        # else:
        #     do_validation = False
        #     val_f = None
        #     val_ins = None

        do_validation = False
        val_f = None
        val_ins = None

        ins = s + a + s_next + [r]
        self._make_train_function()
        f = self.train_function

        # prepare display labels
        out_labels = ['bellman_error']

        if do_validation:
            callback_metrics = copy.copy(out_labels) + ['val_' + n for n in out_labels]
        else:
            callback_metrics = copy.copy(out_labels)

        return self._fit_loop(f, ins, all_actions,
                              out_labels=out_labels,
                              batch_size=batch_size, nb_epoch=nb_epoch,
                              verbose=verbose, callbacks=callbacks,
                              val_f=val_f, val_ins=val_ins, shuffle=shuffle,
                              callback_metrics=callback_metrics,
                              theta_metrics=theta_metrics)