def __init__(
        self,
        name,
        env,
        hidden_sizes=(512,),
        cell_type='lstm',
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        backprop_steps=50,
    ):

        Serializable.quick_init(self, locals())
        self.recurrent = True

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency
        self.backprop_steps = backprop_steps

        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # Determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        """ computation graph for training and simple inference """
        with tf.variable_scope(name):
            # Placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, None, obs_space_dims),
                                         name='obs_ph')
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, None, action_space_dims),
                                         name='act_ph')
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, None, obs_space_dims),
                                           name='delta_ph')

            # Concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=2)

            # Create RNN
            self.obs_next_pred = []
            with tf.variable_scope('rnn_model'):
                rnn = RNN(
                    name,
                    output_dim=self.obs_space_dims,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    input_var=self.nn_input,
                    input_dim=self.obs_space_dims + self.action_space_dims,
                    cell_type=cell_type,
                )

            self.delta_pred = rnn.output_var
            self.hidden_state_ph = rnn.state_var
            self.next_hidden_state_var = rnn.next_state_var
            self.cell = rnn.cell
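            # Zero hidden state used to start a fresh rollout (batch size 1)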
            self._zero_state = self.cell.zero_state(1, tf.float32)

            # Mean squared error on the predicted state deltas
            self.loss = tf.reduce_mean(
                tf.square(self.delta_pred - self.delta_ph))
            params = list(rnn.get_params().values())

            # For truncated BPTT, gradients are evaluated chunk-wise (see
            # `backprop_steps`), accumulated outside the graph, and fed back
            # in through these placeholders before being applied
            self._gradients_ph = [
                tf.placeholder(shape=param.shape, dtype=tf.float32)
                for param in params
            ]
            self._gradients_vars = tf.gradients(self.loss, params)
            applied_gradients = list(zip(self._gradients_ph, params))
            self.train_op = optimizer(
                self.learning_rate).apply_gradients(applied_gradients)

            # Compiled function: one-step delta prediction that also returns
            # the next hidden state
            self.f_delta_pred = tensor_utils.compile_function(
                [self.obs_ph, self.act_ph, self.hidden_state_ph],
                [self.delta_pred, self.next_hidden_state_var])

        self._networks = [rnn]
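
# The placeholder-gradient machinery above supports truncated BPTT: gradients
# are evaluated chunk-wise over `backprop_steps` time steps, summed outside
# the graph, and applied in a single optimizer step. The function below is a
# minimal sketch of such a training step, not this repo's actual fit() code;
# `sess` and the numpy batches are assumptions of the sketch.
def _train_step_sketch(self, sess, obs, act, delta):
    import numpy as np

    # The graph's zero state is built for batch size 1, so this sketch
    # assumes single-trajectory batches: obs/act/delta of shape (1, T, dim)
    grads = [np.zeros(ph.shape.as_list()) for ph in self._gradients_ph]
    hidden = sess.run(self._zero_state)
    for t in range(0, obs.shape[1], self.backprop_steps):
        feed_dict = {self.obs_ph: obs[:, t:t + self.backprop_steps],
                     self.act_ph: act[:, t:t + self.backprop_steps],
                     self.delta_ph: delta[:, t:t + self.backprop_steps],
                     self.hidden_state_ph: hidden}
        chunk_grads, hidden = sess.run(
            [self._gradients_vars, self.next_hidden_state_var], feed_dict)
        grads = [g + cg for g, cg in zip(grads, chunk_grads)]
    # Apply the accumulated gradients in one optimizer step
    sess.run(self.train_op, feed_dict=dict(zip(self._gradients_ph, grads)))
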
    def __init__(self,
                 name,
                 env,
                 hidden_sizes=(512, 512),
                 meta_batch_size=10,
                 hidden_nonlinearity='relu',
                 output_nonlinearity=None,
                 batch_size=500,
                 learning_rate=0.001,
                 inner_learning_rate=0.1,
                 normalize_input=True,
                 optimizer=tf.train.AdamOptimizer,
                 valid_split_ratio=0.2,
                 rolling_average_persitency=0.99,
                 ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None
        self.meta_batch_size = meta_batch_size

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.inner_learning_rate = inner_learning_rate
        self.name = name
        self._dataset_train = None
        self._dataset_test = None
        self._prev_params = None
        self._adapted_param_values = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]

        # Map activation names to TF ops via the class-level `_activations`
        # dict (assumed to contain at least {None: None, 'relu': tf.nn.relu})
        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]

        """ ------------------ Pre-Update Graph + Adaptation ----------------------- """
        with tf.variable_scope(name):
            # Placeholders
            self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

            # Concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # Create MLP
            mlp = MLP(name,
                      output_dim=obs_space_dims,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      output_nonlinearity=output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=obs_space_dims+action_space_dims)

            self.delta_pred = mlp.output_var  # shape: (batch_size, obs_space_dims)

            self.loss = tf.reduce_mean(tf.square(self.delta_ph - self.delta_pred))
            self.optimizer = optimizer(self.learning_rate)
            # Symbolic single gradient step used for test-time adaptation
            self.adaptation_sym = tf.train.GradientDescentOptimizer(self.inner_learning_rate).minimize(self.loss)

            # Compiled function for pre-update delta prediction
            self.f_delta_pred = tensor_utils.compile_function([self.obs_ph, self.act_ph], self.delta_pred)

        """ --------------------------- Meta-training Graph ---------------------------------- """
        # Split the joint batch into per-task batches
        nn_input_per_task = tf.split(self.nn_input, self.meta_batch_size, axis=0)
        delta_per_task = tf.split(self.delta_ph, self.meta_batch_size, axis=0)

        # Each task batch is halved: the first half drives the inner
        # (adaptation) step, the second evaluates the post-update model
        pre_input_per_task, post_input_per_task = zip(*[tf.split(nn_input, 2, axis=0) for nn_input in nn_input_per_task])
        pre_delta_per_task, post_delta_per_task = zip(*[tf.split(delta, 2, axis=0) for delta in delta_per_task])

        pre_losses = []
        post_losses = []
        self._adapted_params = []

        for idx in range(self.meta_batch_size):
            with tf.variable_scope(name + '/pre_model_%d' % idx, reuse=tf.AUTO_REUSE):
                pre_mlp = MLP(name,
                              output_dim=obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=pre_input_per_task[idx],
                              input_dim=obs_space_dims + action_space_dims,
                              params=mlp.get_params())

                pre_delta_pred = pre_mlp.output_var
                pre_loss = tf.reduce_mean(tf.square(pre_delta_per_task[idx] - pre_delta_pred))
                adapted_params = self._adapt_sym(pre_loss, pre_mlp.get_params())
                self._adapted_params.append(adapted_params)

            with tf.variable_scope(name + '/post_model_%d' % idx, reuse=tf.AUTO_REUSE):
                post_mlp = MLP(name,
                               output_dim=obs_space_dims,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=hidden_nonlinearity,
                               output_nonlinearity=output_nonlinearity,
                               input_var=post_input_per_task[idx],
                               params=adapted_params,
                               input_dim=obs_space_dims + action_space_dims)
                post_delta_pred = post_mlp.output_var

                post_loss = tf.reduce_mean(tf.square(post_delta_per_task[idx] - post_delta_pred))

                pre_losses.append(pre_loss)
                post_losses.append(post_loss)

        # Aggregate across tasks (outside the per-task loop) and meta-optimize
        # the post-update loss
        self.pre_loss = tf.reduce_mean(pre_losses)
        self.post_loss = tf.reduce_mean(post_losses)
        self.train_op = self.optimizer.minimize(self.post_loss)

        """ --------------------------- Post-update Inference Graph --------------------------- """
        with tf.variable_scope(name + '_ph_graph'):
            self.post_update_delta = []
            self.network_phs_meta_batch = []

            nn_input_per_task = tf.split(self.nn_input, self.meta_batch_size, axis=0)
            for idx in range(self.meta_batch_size):
                with tf.variable_scope('task_%i' % idx):
                    network_phs = self._create_placeholders_for_vars(mlp.get_params())
                    self.network_phs_meta_batch.append(network_phs)

                    mlp_meta_batch = MLP(name,
                                         output_dim=obs_space_dims,
                                         hidden_sizes=hidden_sizes,
                                         hidden_nonlinearity=hidden_nonlinearity,
                                         output_nonlinearity=output_nonlinearity,
                                         params=network_phs,
                                         input_var=nn_input_per_task[idx],
                                         input_dim=obs_space_dims + action_space_dims,
                                         )

                    self.post_update_delta.append(mlp_meta_batch.output_var)

        self._networks = [mlp]
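
# `_adapt_sym` (used to build the adapted parameters above) and
# `_create_placeholders_for_vars` (used in the post-update inference graph)
# are not part of this excerpt. The sketches below are plausible MAML-style
# implementations consistent with how they are called; the `_sketch` suffixes
# mark them as assumptions, and both take `params` as an OrderedDict mapping
# parameter names to tensors.
def _adapt_sym_sketch(self, loss, params):
    from collections import OrderedDict

    # One symbolic gradient-descent step; returns adapted parameter tensors
    # (not assign ops) so they can parameterize the post-update MLP
    gradients = dict(zip(params.keys(), tf.gradients(loss, list(params.values()))))
    return OrderedDict((name, var - self.inner_learning_rate * gradients[name])
                       for name, var in params.items())

def _create_placeholders_for_vars_sketch(self, params):
    from collections import OrderedDict

    # One placeholder per parameter, so adapted parameter values can be fed
    # at inference time without mutating the graph's variables
    return OrderedDict((name, tf.placeholder(tf.float32, shape=var.shape,
                                             name=name.split(':')[0] + '_ph'))
                       for name, var in params.items())
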
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='relu',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]

        # Map activation names to TF ops via the class-level `_activations`
        # dict (assumed to contain at least {None: None, 'relu': tf.nn.relu})
        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]

        with tf.variable_scope(name):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            with tf.variable_scope('ff_model'):
                mlp = MLP(name,
                          output_dim=obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=self.nn_input,
                          input_dim=obs_space_dims + action_space_dims)

            self.delta_pred = mlp.output_var  # shape: (batch_size, obs_space_dims)

            self.loss = tf.reduce_mean(
                tf.square(self.delta_ph - self.delta_pred))
            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # compiled function for delta prediction
            self.f_delta_pred = tensor_utils.compile_function(
                [self.obs_ph, self.act_ph], self.delta_pred)

        self._networks = [mlp]
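
# Hedged usage sketch for the feedforward model above. The wrapping class
# name (`MLPDynamicsModel`), the gym-style `env`, and the numpy batches
# `obs`/`act`/`next_obs` are assumptions; only obs_ph/act_ph/delta_ph,
# train_op, and f_delta_pred come from this excerpt.
#
#     model = MLPDynamicsModel('dyn_model', env)
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         # one gradient step on (s, a, s' - s) samples
#         sess.run(model.train_op, feed_dict={model.obs_ph: obs,
#                                             model.act_ph: act,
#                                             model.delta_ph: next_obs - obs})
#         # one-step prediction: s' ≈ s + Δs
#         next_obs_pred = obs + model.f_delta_pred(obs, act)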