Example #1
    def __init__(self):
        self._client_fwd = ForwardModel(fwd_model_uri)
        self._client = GameState(uri)

        self._client.set_game_tick_callback(self._on_game_tick)
        self._client_fwd.set_next_state_callback(self._on_next_game_state)
        self.connect()
Example #2
    def __init__(self, name):
        self.name = name

        self.calibrated = rospy.get_param("~" + self.name + "/calibrated")
        self.signEncoder = rospy.get_param("~" + self.name + "/signEncoder")
        self.signJoint = rospy.get_param("~" + self.name + "/signJoint")
        self.name = rospy.get_param("~" + self.name + "/name")
        self.nameEncoder = rospy.get_param("~" + self.name + "/nameEncoder")
        minAngle = rospy.get_param("~" + self.name + "/minAngle")
        maxAngle = rospy.get_param("~" + self.name + "/maxAngle")
        self.pGain = rospy.get_param("~" + self.name + "/gains/P")
        self.iGain = rospy.get_param("~" + self.name + "/gains/I")
        self.vGain = rospy.get_param("~" + self.name + "/gains/D")
        self.maxAbsForwardError = rospy.get_param("~" + self.name +
                                                  "/maxAbsForwardError")

        # This attribute stores the resting pose angle of the antagonist-pair
        # motor joints, taken from the joint YAML files in the gummi_base and
        # gummi_ee packages.
        self.restingPoseAngle = rospy.get_param("~" + self.name +
                                                "/restingPoseAngle")

        self.range = maxAngle - minAngle
        self.angle = JointAngle(self.nameEncoder, self.signEncoder, minAngle,
                                maxAngle, True)

        self.eqModel = EquilibriumModel(self.name)
        self.inverseModel = InverseModel(self.name)
        self.inverseModelCollision = InverseModel(self.name)
        self.forwardModel = ForwardModel(self.name)

        if self.calibrated == 1:
            self.inverseModel.loadCalibration()
            self.inverseModelCollision.loadCalibration()
            self.forwardModel.loadCalibration()

        self.cocontractionReflex = Reflex(2.0, 0.0045, 0.0)
        self.feedbackReflex = Reflex(1.0, 0.0075, 0.0)
        self.collisionReflex = Reflex(1.0, 0.0075, 0.0)

        self.initPublishers()
        self.initVariables()
        self.disableEncoderTorque()

        jointRange = self.angle.getMax() - self.angle.getMin()
        self.eqModel.calculateEqVelCalibration(jointRange)
Example #3
    def __init__(self, name):
        self.name = name

        self.calibrated = rospy.get_param("~" + self.name + "/calibrated")
        self.signEncoder = rospy.get_param("~" + self.name + "/signEncoder")
        self.signJoint = rospy.get_param("~" + self.name + "/signJoint")
        self.name = rospy.get_param("~" + self.name + "/name")
        self.nameEncoder = rospy.get_param("~" + self.name + "/nameEncoder")
        minAngle = rospy.get_param("~" + self.name + "/minAngle")
        maxAngle = rospy.get_param("~" + self.name + "/maxAngle")
        self.pGain = rospy.get_param("~" + self.name + "/gains/P")
        self.iGain = rospy.get_param("~" + self.name + "/gains/I")
        self.vGain = rospy.get_param("~" + self.name + "/gains/D")
        self.maxAbsForwardError = rospy.get_param("~" + self.name +
                                                  "/maxAbsForwardError")

        self.range = maxAngle - minAngle
        self.angle = JointAngle(self.nameEncoder, self.signEncoder, minAngle,
                                maxAngle, True)

        self.eqModel = EquilibriumModel(self.name)
        self.inverseModel = InverseModel(self.name)
        self.inverseModelCollision = InverseModel(self.name)
        self.forwardModel = ForwardModel(self.name)

        if self.calibrated == 1:
            self.inverseModel.loadCalibration()
            self.inverseModelCollision.loadCalibration()
            self.forwardModel.loadCalibration()

        self.cocontractionReflex = Reflex(2.0, 0.0015, 0.0)
        self.feedbackReflex = Reflex(1.0, 0.0075, 0.0)
        self.collisionReflex = Reflex(1.0, 0.0075, 0.0)

        self.initPublishers()
        self.initVariables()
        self.disableEncoderTorque()

        jointRange = self.angle.getMax() - self.angle.getMin()
        self.eqModel.calculateEqVelCalibration(jointRange)
Example #4
class Antagonist:
    def __init__(self, name):
        self.name = name

        self.calibrated = rospy.get_param("~" + self.name + "/calibrated")
        self.signEncoder = rospy.get_param("~" + self.name + "/signEncoder")
        self.signJoint = rospy.get_param("~" + self.name + "/signJoint")
        self.name = rospy.get_param("~" + self.name + "/name")
        self.nameEncoder = rospy.get_param("~" + self.name + "/nameEncoder")
        minAngle = rospy.get_param("~" + self.name + "/minAngle")
        maxAngle = rospy.get_param("~" + self.name + "/maxAngle")
        self.pGain = rospy.get_param("~" + self.name + "/gains/P")
        self.iGain = rospy.get_param("~" + self.name + "/gains/I")
        self.vGain = rospy.get_param("~" + self.name + "/gains/D")
        self.maxAbsForwardError = rospy.get_param("~" + self.name +
                                                  "/maxAbsForwardError")

        self.range = maxAngle - minAngle
        self.angle = JointAngle(self.nameEncoder, self.signEncoder, minAngle,
                                maxAngle, True)

        self.eqModel = EquilibriumModel(self.name)
        self.inverseModel = InverseModel(self.name)
        self.inverseModelCollision = InverseModel(self.name)
        self.forwardModel = ForwardModel(self.name)

        if self.calibrated == 1:
            self.inverseModel.loadCalibration()
            self.inverseModelCollision.loadCalibration()
            self.forwardModel.loadCalibration()

        self.cocontractionReflex = Reflex(2.0, 0.0015, 0.0)
        self.feedbackReflex = Reflex(1.0, 0.0075, 0.0)
        self.collisionReflex = Reflex(1.0, 0.0075, 0.0)

        self.initPublishers()
        self.initVariables()
        self.disableEncoderTorque()

        jointRange = self.angle.getMax() - self.angle.getMin()
        self.eqModel.calculateEqVelCalibration(jointRange)

    def initVariables(self):
        self.errors = deque()
        self.velocity = False
        self.closedLoop = False
        self.feedForward = False
        self.collisionResponse = False
        self.errorLast = 0.0
        self.ballistic = 0.0
        self.deltaAngleBallistic = 0.0
        self.deltaEqFeedback = 0.0
        self.lastForwardError = 0.0
        self.forwardError = 0.0

        self.ballisticRatio = 0.85
        self.feedbackRatio = 0.5

    def disableEncoderTorque(self):
        service_name = self.nameEncoder + "_controller/torque_enable"
        rospy.wait_for_service(service_name)
        try:
            te = rospy.ServiceProxy(service_name, TorqueEnable)
            te(torque_enable=False)
        except rospy.ServiceException as e:
            print("Service call failed: %s" % e)
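These constructors read private ROS parameters under ~<name>/... before building the equilibrium, inverse and forward models. A hypothetical usage sketch; the node name and the joint name "shoulder_yaw" below are illustrative only and not taken from the gummi packages:

import rospy

if __name__ == "__main__":
    rospy.init_node("antagonist_example")
    # Assumes ~shoulder_yaw/calibrated, ~shoulder_yaw/signEncoder,
    # ~shoulder_yaw/gains/P, etc. have been loaded onto the parameter
    # server, e.g. from a joint YAML file as the comments above mention.
    joint = Antagonist("shoulder_yaw")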
Example #5
    def __init__(self, environment):

        self.env = environment

        # Create placeholders for all the inputs
        self.states_ = tf.placeholder(
            "float", shape=(None, ) + self.env.state_size,
            name='states_')  # Batch x State, previous state
        self.states = tf.placeholder(
            "float", shape=(None, ) + self.env.state_size,
            name='states')  # Batch x State, current_state
        self.actions = tf.placeholder("float",
                                      shape=(None, self.env.action_size),
                                      name='action')  # Batch x Action
        self.label = tf.placeholder("float", shape=(None, 1), name='label')
        self.gamma = tf.placeholder("float", shape=(), name='gamma')
        self.temp = tf.placeholder("float", shape=(), name='temperature')
        self.noise = tf.placeholder("float", shape=(), name='noise_flag')
        self.do_keep_prob = tf.placeholder("float",
                                           shape=(),
                                           name='do_keep_prob')
        if self.env.use_airl:
            self.done_ph = tf.placeholder(name="dones",
                                          shape=(None, ),
                                          dtype=tf.float32)

        # Create MGAIL blocks
        self.forward_model = ForwardModel(
            state_size=self.env.state_size[0]
            if self.env.obs_mode == 'state' else self.env.encoder_feat_size,
            action_size=self.env.action_size,
            encoding_size=self.env.fm_size,
            lr=self.env.fm_lr,
            forward_model_type=self.env.forward_model_type,
            obs_mode=self.env.obs_mode,
            use_scale_dot_product=self.env.use_scale_dot_product,
            use_skip_connection=self.env.use_skip_connection,
            use_dropout=self.env.use_dropout)

        if self.env.obs_mode == 'pixel':
            if self.env.state_only:
                feat_in_dim = 1024  # self.env.encoder_feat_size[0]
                policy_input_feat = 1024
            else:
                feat_in_dim = 1024 + self.env.action_size  # self.env.encoder_feat_size[0]
                policy_input_feat = 1024
        else:
            if self.env.state_only:
                feat_in_dim = self.env.state_size[0]
                policy_input_feat = self.env.state_size[0]
            else:
                feat_in_dim = self.env.state_size[0] + self.env.action_size
                policy_input_feat = self.env.state_size[0]

        self.discriminator = Discriminator(
            in_dim=feat_in_dim,
            out_dim=self.env.disc_out_dim,
            size=self.env.d_size,
            lr=self.env.d_lr,
            do_keep_prob=self.do_keep_prob,
            weight_decay=self.env.weight_decay,
            use_airl=self.env.use_airl,
            phi_hidden_size=self.env.phi_size,
            state_only=self.env.state_only,
        )

        self.policy = Policy(in_dim=policy_input_feat,
                             out_dim=self.env.action_size,
                             size=self.env.p_size,
                             lr=self.env.p_lr,
                             do_keep_prob=self.do_keep_prob,
                             n_accum_steps=self.env.policy_accum_steps,
                             weight_decay=self.env.weight_decay)

        # Create experience buffers
        self.er_agent = ER(
            memory_size=self.env.er_agent_size,
            state_dim=self.env.state_size,
            action_dim=self.env.action_size,
            reward_dim=1,  # stub connection
            qpos_dim=self.env.qpos_size,
            qvel_dim=self.env.qvel_size,
            batch_size=self.env.batch_size,
            history_length=1)

        self.er_expert = common.load_er(fname=os.path.join(
            self.env.run_dir, self.env.expert_data),
                                        batch_size=self.env.batch_size,
                                        history_length=1,
                                        traj_length=2)

        self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

        if self.env.obs_mode == 'pixel':
            current_states = ops.preprocess(self.states, bits=8)
            current_states_feat = ops.encoder(current_states,
                                              reuse=tf.AUTO_REUSE)
            prev_states = ops.preprocess(self.states_, bits=8)
            prev_states_feat = ops.encoder(prev_states, reuse=tf.AUTO_REUSE)
        else:
            # Normalize the inputs
            prev_states = common.normalize(self.states_,
                                           self.er_expert.states_mean,
                                           self.er_expert.states_std)
            current_states = common.normalize(self.states,
                                              self.er_expert.states_mean,
                                              self.er_expert.states_std)
            prev_states_feat = prev_states
            current_states_feat = current_states

        if self.env.continuous_actions:
            actions = common.normalize(self.actions,
                                       self.er_expert.actions_mean,
                                       self.er_expert.actions_std)
        else:
            actions = self.actions

        # 1. Forward Model
        initial_gru_state = np.ones((1, self.forward_model.encoding_size))
        forward_model_prediction, _, divergence_loss = self.forward_model.forward(
            [prev_states_feat, actions, initial_gru_state])
        if self.env.obs_mode == 'pixel':
            forward_model_prediction = ops.decoder(
                forward_model_prediction,
                data_shape=self.env.state_size,
                reuse=tf.AUTO_REUSE)
            self.forward_model_prediction = ops.postprocess(
                forward_model_prediction, bits=8, dtype=tf.uint8)
        else:
            self.forward_model_prediction = forward_model_prediction
        forward_model_loss = tf.reduce_mean(
            tf.square(current_states - forward_model_prediction)
        ) + self.env.forward_model_lambda * tf.reduce_mean(divergence_loss)
        self.forward_model.train(objective=forward_model_loss)

        if self.env.use_airl:
            # 1.1 action log prob
            logits = self.policy.forward(current_states_feat)
            if self.env.continuous_actions:
                mean, logstd = logits, tf.log(tf.ones_like(logits))
                std = tf.exp(logstd)

                n_elts = tf.cast(tf.reduce_prod(mean.shape[1:]),
                                 tf.float32)  # first dimension is batch size
                log_normalizer = n_elts / 2. * (np.log(2 * np.pi).astype(
                    np.float32)) + 1 / 2 * tf.reduce_sum(logstd, axis=1)
                # Diagonal Gaussian action probability, for every action
                action_logprob = -tf.reduce_sum(tf.square(actions - mean) /
                                                (2 * std),
                                                axis=1) - log_normalizer
            else:
                # Override since the implementation of tfp.RelaxedOneHotCategorical
                # yields positive values.
                if actions.shape[1:] != logits.shape[1:]:
                    actions = tf.cast(actions, tf.int8)
                    values = tf.one_hot(actions,
                                        logits.shape.as_list()[-1],
                                        dtype=tf.float32)
                    assert values.shape == logits.shape, (values.shape,
                                                          logits.shape)
                else:
                    values = actions

                # [0]'s implementation (see line below) seems to be an approximation
                # to the actual Gumbel Softmax density.
                # TODO: to confirm 'action' or 'value'
                action_logprob = -tf.reduce_sum(
                    -values * tf.nn.log_softmax(logits, axis=-1), axis=-1)
                # prob = logit[np.arange(self.action_test.shape[0]), self.action_test]
                # action_logprob = tf.log(prob)
            # 2. Discriminator
            self.discriminator.airl_entropy_weight = self.env.airl_entropy_weight
            # labels = tf.concat([1 - self.label, self.label], 1)
            # labels = 1 - self.label  # 0 for expert, 1 for policy
            labels = self.label  # 1 for expert, 0 for policy
            d, self.disc_shaped_reward_output, self.disc_reward = self.discriminator.forward(
                state=current_states_feat,
                action=actions,
                prev_state=prev_states_feat,
                done_inp=self.done_ph,
                log_policy_act_prob=action_logprob,
            )

            # 2.1 0-1 accuracy
            correct_predictions = tf.equal(tf.argmax(d, 1),
                                           tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(
                tf.cast(correct_predictions, "float"))
            # 2.2 prediction
            d_cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels,
                logits=d,
                name="disc_loss",
            )
            # Construct generator reward:
            # \[\hat{r}(s,a) = \log(D_{\theta}(s,a)) - \log(1 - D_{\theta}(s,a)).\]
            # This simplifies to:
            # \[\hat{r}(s,a) = f_{\theta}(s,a) - \log \pi(a \mid s).\]
            # This is just an entropy-regularized objective
            # ent_bonus = -self.env.airl_entropy_weight * self.discriminator.log_policy_act_prob_ph
            # policy_train_reward = self.discriminator.reward_net.reward_output_train + ent_bonus
        else:
            # 2. Discriminator
            labels = tf.concat([1 - self.label, self.label], 1)
            d, _, _ = self.discriminator.forward(state=current_states_feat,
                                                 action=actions)

            # 2.1 0-1 accuracy
            correct_predictions = tf.equal(tf.argmax(d, 1),
                                           tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(
                tf.cast(correct_predictions, "float"))
            # 2.2 prediction
            d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=d, labels=labels)
        # cost sensitive weighting (weight true=expert, predict=agent mistakes)
        d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\
                                                           tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
        discriminator_loss = tf.reduce_mean(d_loss_weighted)
        self.discriminator.train(objective=discriminator_loss)

        # 3. Collect experience
        mu = self.policy.forward(current_states_feat)
        if self.env.continuous_actions:
            a = common.denormalize(mu, self.er_expert.actions_mean,
                                   self.er_expert.actions_std)
            eta = tf.random_normal(shape=tf.shape(a), stddev=self.env.sigma)
            self.action_test = tf.squeeze(a + self.noise * eta)
        else:
            a = common.gumbel_softmax(logits=mu, temperature=self.temp)
            self.action_test = tf.argmax(a, dimension=1)

        # 4.3 AL
        def policy_loop(current_state_policy_update, t, total_cost,
                        total_trans_err, env_term_sig, prev_state):
            if self.env.obs_mode == 'pixel':
                current_state_feat_policy_update = ops.encoder(
                    current_state_policy_update, reuse=True)
                prev_state_feat_policy_update = ops.encoder(prev_state,
                                                            reuse=True)
            else:
                current_state_feat_policy_update = current_state_policy_update
                prev_state_feat_policy_update = prev_state
            mu = self.policy.forward(current_state_feat_policy_update,
                                     reuse=True)

            if self.env.continuous_actions:
                eta = self.env.sigma * tf.random_normal(shape=tf.shape(mu))
                action = mu + eta

                if self.env.use_airl:
                    mean, logstd = mu, tf.log(
                        tf.ones_like(mu) * self.env.sigma)
                    std = tf.exp(logstd)

                    n_elts = tf.cast(
                        tf.reduce_prod(mean.shape[1:]),
                        tf.float32)  # first dimension is batch size
                    log_normalizer = n_elts / 2. * (np.log(2 * np.pi).astype(
                        np.float32)) + 1 / 2 * tf.reduce_sum(logstd, axis=1)
                    # Diagonal Gaussian action probability, for every action
                    action_logprob = -tf.reduce_sum(tf.square(action - mean) /
                                                    (2 * std),
                                                    axis=1) - log_normalizer
            else:
                action = common.gumbel_softmax_sample(logits=mu,
                                                      temperature=self.temp)

                if self.env.use_airl:
                    # Override since the implementation of tfp.RelaxedOneHotCategorical
                    # yields positive values.
                    if action.shape[1:] != logits.shape[1:]:
                        actions = tf.cast(action, tf.int8)
                        values = tf.one_hot(actions,
                                            logits.shape.as_list()[-1],
                                            dtype=tf.float32)
                        assert values.shape == logits.shape, (values.shape,
                                                              logits.shape)
                    else:
                        values = action

                    # [0]'s implementation (see line below) seems to be an approximation
                    # to the actual Gumbel Softmax density.
                    # TODO: to confirm 'action' or 'value'
                    action_logprob = -tf.reduce_sum(
                        -values * tf.nn.log_softmax(logits, axis=-1), axis=-1)

            # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
            if self.env.use_airl:
                d, shaped_reward_output, reward = self.discriminator.forward(
                    state=current_state_feat_policy_update,
                    action=action,
                    prev_state=prev_state_feat_policy_update,
                    done_inp=tf.cast(env_term_sig, tf.float32),
                    log_policy_act_prob=action_logprob,
                    reuse=True)
                if self.env.alg in ['mairlTransfer', 'mairlImit4Transfer']:
                    reward_for_updating_policy = reward
                else:  # 'mairlImit'
                    reward_for_updating_policy = shaped_reward_output
                if self.env.train_mode and not self.env.alg in [
                        'mairlTransfer', 'mairlImit4Transfer'
                ]:
                    ent_bonus = -self.env.airl_entropy_weight * tf.stop_gradient(
                        action_logprob)
                    policy_reward = reward_for_updating_policy + ent_bonus
                else:
                    policy_reward = reward_for_updating_policy
                cost = tf.reduce_mean(-policy_reward) * self.env.policy_al_w
            else:
                d, _, _ = self.discriminator.forward(
                    state=current_state_feat_policy_update,
                    action=action,
                    reuse=True)
                cost = self.al_loss(d)

            # add step cost
            total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

            # get action
            if self.env.continuous_actions:
                a_sim = common.denormalize(action, self.er_expert.actions_mean,
                                           self.er_expert.actions_std)
            else:
                a_sim = tf.argmax(action, dimension=1)

            # get next state
            state_env, _, env_term_sig, = self.env.step(a_sim,
                                                        mode='tensorflow')[:3]
            state_e = common.normalize(state_env, self.er_expert.states_mean,
                                       self.er_expert.states_std)
            state_e = tf.stop_gradient(state_e)

            state_a, _, divergence_loss_a = self.forward_model.forward(
                [current_state_feat_policy_update, action, initial_gru_state],
                reuse=True)
            if self.env.obs_mode == 'pixel':
                state_a = ops.decoder(state_a,
                                      data_shape=self.env.state_size,
                                      reuse=True)
            if True:  # self.env.alg in ['mgail']:
                state, nu = common.re_parametrization(state_e=state_e,
                                                      state_a=state_a)
            else:
                _, nu = common.re_parametrization(state_e=state_e,
                                                  state_a=state_a)
                state = state_a

            total_trans_err += tf.reduce_mean(abs(nu))
            t += 1

            if self.env.obs_mode == 'pixel':
                state = tf.slice(state, [0, 0, 0, 0], [1, -1, -1, -1])
            return state, t, total_cost, total_trans_err, env_term_sig, current_state_policy_update

        def policy_stop_condition(current_state_policy_update, t, cost,
                                  trans_err, env_term_sig, prev_state):
            cond = tf.logical_not(
                env_term_sig)  # not done: env_term_sig = False
            cond = tf.logical_and(cond, t < self.env.n_steps_train)
            cond = tf.logical_and(cond,
                                  trans_err < self.env.total_trans_err_allowed)
            return cond

        if self.env.obs_mode == 'pixel':
            state_0 = tf.slice(current_states, [0, 0, 0, 0], [1, -1, -1, -1])
        else:
            state_0 = tf.slice(current_states, [0, 0], [1, -1])
        # prev_state_0 = tf.slice(states_, [0, 0], [1, -1])
        loop_outputs = tf.while_loop(policy_stop_condition, policy_loop,
                                     [state_0, 0., 0., 0., False, state_0])
        self.policy.train(objective=loop_outputs[2])
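The rollout above is driven by tf.while_loop over the tuple (state, t, total_cost, total_trans_err, env_term_sig, prev_state). A stripped-down sketch of the same control pattern, with a toy body and stop condition standing in for the actual policy, discriminator and forward-model calls:

import tensorflow as tf

def toy_loop(state, t, total_cost):
    # Stand-in for one policy / forward-model step and its per-step cost.
    next_state = 0.9 * state
    cost = tf.reduce_mean(tf.square(next_state))
    return next_state, t + 1.0, total_cost + cost

def toy_stop_condition(state, t, total_cost):
    # Unroll for at most 10 steps.
    return t < 10.0

state_0 = tf.ones((1, 4))
final_state, steps, rollout_cost = tf.while_loop(
    toy_stop_condition, toy_loop, [state_0, 0., 0.])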
Example #6
class Agent():
    def __init__(self):
        self._client_fwd = ForwardModel(fwd_model_uri)
        self._client = GameState(uri)

        self._client.set_game_tick_callback(self._on_game_tick)
        self._client_fwd.set_next_state_callback(self._on_next_game_state)
        self.connect()

    def connect(self):
        loop = asyncio.get_event_loop()

        client_connection = loop.run_until_complete(self._client.connect())
        client_fwd_connection = None

        client_fwd_connection = loop.run_until_complete(
            self._client_fwd.connect())

        loop = asyncio.get_event_loop()
        loop.create_task(self._client._handle_messages(client_connection))
        loop.create_task(
            self._client_fwd._handle_messages(client_fwd_connection))
        loop.run_forever()

    def _get_bomb_to_detonate(self, game_state) -> [int, int] or None:
        agent_number = game_state.get("connection").get("agent_number")
        entities = self._client._state.get("entities")
        bombs = list(
            filter(
                lambda entity: entity.get("owner") == agent_number and entity.
                get("type") == "b", entities))
        bomb = next(iter(bombs or []), None)
        if bomb is not None:
            return [bomb.get("x"), bomb.get("y")]
        else:
            return None

    async def _on_game_tick(self, tick_number, game_state):
        random_action = self.generate_random_action()
        if random_action in ["up", "left", "right", "down"]:
            await self._client.send_move(random_action)
        elif random_action == "bomb":
            await self._client.send_bomb()
        elif random_action == "detonate":
            bomb_coordinates = self._get_bomb_to_detonate(game_state)
            if bomb_coordinates is not None:
                x, y = bomb_coordinates
                await self._client.send_detonate(x, y)
        else:
            print(f"Unhandled action: {random_action}")

    def generate_random_action(self):
        actions_length = len(actions)
        return actions[random.randint(0, actions_length - 1)]

    async def _on_next_game_state(self, state):
        # print(state)
        pass

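The bomb lookup in _get_bomb_to_detonate wraps a lambda in filter; an equivalent rewrite with a generator expression (same field names as the snippet above, shown only as an alternative, not the repository's actual code) reads more directly:

    def _get_bomb_to_detonate(self, game_state):
        agent_number = game_state.get("connection").get("agent_number")
        entities = self._client._state.get("entities")
        # First bomb owned by this agent, or None when the agent has none.
        bomb = next((entity for entity in entities
                     if entity.get("owner") == agent_number
                     and entity.get("type") == "b"), None)
        if bomb is not None:
            return [bomb.get("x"), bomb.get("y")]
        return None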
Example #7
    def __init__(self, environment, use_irl=False):
        self.use_irl = use_irl
        self.env = environment

        # Create placeholders for all the inputs
        self.states_ = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states_')  # Batch x State
        self.states = tf.compat.v1.placeholder("float", shape=(None, self.env.state_size), name='states')  # Batch x State
        self.actions = tf.compat.v1.placeholder("float", shape=(None, self.env.action_size), name='action')  # Batch x Action
        self.label = tf.compat.v1.placeholder("float", shape=(None, 1), name='label')
        self.gamma = tf.compat.v1.placeholder("float", shape=(), name='gamma')
        self.temp = tf.compat.v1.placeholder("float", shape=(), name='temperature')
        self.noise = tf.compat.v1.placeholder("float", shape=(), name='noise_flag')
        self.do_keep_prob = tf.compat.v1.placeholder("float", shape=(), name='do_keep_prob')
        self.lprobs = tf.compat.v1.placeholder('float', shape=(None, 1), name='log_probs')

        # Create MGAIL blocks
        self.forward_model = ForwardModel(state_size=self.env.state_size,
                                          action_size=self.env.action_size,
                                          encoding_size=self.env.fm_size,
                                          lr=self.env.fm_lr)
        
        # MODIFYING THE NEW DISCRIMINATOR:
        if self.use_irl:
            self.discriminator = DiscriminatorIRL(in_dim=self.env.state_size + self.env.action_size,
                                            out_dim=1,
                                            size=self.env.d_size,
                                            lr=self.env.d_lr,
                                            do_keep_prob=self.do_keep_prob,
                                            weight_decay=self.env.weight_decay,
                                            state_only=True,
                                            gamma=self.gamma,
                                            state_size = self.env.state_size,
                                            action_size = self.env.action_size)
        # END MODIFYING THE NEW DISCRIMINATOR
        else:
            self.discriminator = Discriminator(in_dim=self.env.state_size + self.env.action_size,
                                            out_dim=2,
                                            size=self.env.d_size,
                                            lr=self.env.d_lr,
                                            do_keep_prob=self.do_keep_prob,
                                            weight_decay=self.env.weight_decay)

        self.policy = Policy(in_dim=self.env.state_size,
                              out_dim=self.env.action_size,
                              size=self.env.p_size,
                              lr=self.env.p_lr,
                              do_keep_prob=self.do_keep_prob,
                              n_accum_steps=self.env.policy_accum_steps,
                              weight_decay=self.env.weight_decay)

        # Create experience buffers
        self.er_agent = ER(memory_size=self.env.er_agent_size,
                           state_dim=self.env.state_size,
                           action_dim=self.env.action_size,
                           batch_size=self.env.batch_size,
                           history_length=1)

        self.er_expert = common.load_d4rl_er(h5path=os.path.join(self.env.run_dir, self.env.expert_data),
                                        batch_size=self.env.batch_size,
                                        history_length=1,
                                        traj_length=2)

        self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

        # Normalize the inputs
        states_ = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std)
        states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std)
        if self.env.continuous_actions:
            actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            actions = self.actions

        # 1. Forward Model
        initial_gru_state = np.ones((1, self.forward_model.encoding_size))
        forward_model_prediction, _ = self.forward_model.forward([states_, actions, initial_gru_state])
        forward_model_loss = tf.reduce_mean(tf.square(states-forward_model_prediction))
        self.forward_model.train(objective=forward_model_loss)

        # 2. Discriminator
        labels = tf.concat([1 - self.label, self.label], 1)
        lprobs = self.lprobs
        
        # MODIFIED DISCRIMINATOR SECTION
        if self.use_irl:
            self.discrim_output, log_p_tau, log_q_tau, log_pq = self.discriminator.forward(states_, actions, states, lprobs)


            correct_predictions = tf.equal(tf.cast(tf.round(self.discrim_output), tf.int64), tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))

            d_cross_entropy = self.label*(log_p_tau-log_pq) + (1-self.label)*(log_q_tau-log_pq)

            d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\
                                                            tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
            
            discriminator_loss = -tf.reduce_mean(d_loss_weighted)
            self.discriminator.train(objective=discriminator_loss)
        # END MODIFIED DISCRIMINATOR SECTION


        else:
            d = self.discriminator.forward(states, actions)
            # 2.1 0-1 accuracy
            correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
            self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
            # 2.2 prediction
            d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels)
            # cost sensitive weighting (weight true=expert, predict=agent mistakes)
            d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\
                                                            tf.multiply(tf.compat.v1.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
        
            discriminator_loss = tf.reduce_mean(d_loss_weighted)
            self.discriminator.train(objective=discriminator_loss)

        # 3. Collect experience
        mu = self.policy.forward(states)
        if self.env.continuous_actions:
            a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std)
            eta = tf.random.normal(shape=tf.shape(a), stddev=self.env.sigma)
            self.action_test = a + self.noise * eta
            # self.action_means = mu
            N = tf.shape(self.action_test)[0]
            expanded_sigma= tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0)
            self.action_probs_test = common.compute_action_probs_tf(self.action_test, mu, expanded_sigma)
        else:
            a = common.gumbel_softmax(logits=mu, temperature=self.temp)
            self.action_test = tf.compat.v1.argmax(a, dimension=1)
            self.action_means = tf.squeeze(mu)

        # 4.3 AL
        def policy_loop(state_, t, total_cost, total_trans_err, _):
            mu = self.policy.forward(state_, reuse=True)

            if self.env.continuous_actions:
                eta = self.env.sigma * tf.random.normal(shape=tf.shape(mu))
                action = mu + eta
                N = tf.shape(action)[0]
                expanded_sigma= tf.repeat(tf.expand_dims(tf.cast(self.env.sigma, dtype=tf.float32), 0), N, axis=0)
                a_prob = common.compute_action_probs_tf(action, mu, expanded_sigma)
            else:
                action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp)
                a_prob = 0.5

            # get action
            if self.env.continuous_actions:
                a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std)
            else:
                a_sim = tf.compat.v1.argmax(action, dimension=1)

            # get next state
            state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3]
            state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std)
            state_e = tf.stop_gradient(state_e)

            state_a, _ = self.forward_model.forward([state_, action, initial_gru_state], reuse=True)

            state, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
            total_trans_err += tf.reduce_mean(abs(nu))
            t += 1

            # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])

            # MODIFIED DISCRIMINATOR SECTION:
            if self.use_irl:
                self.discrim_output, log_p_tau, log_q_tau, log_pq = self.discriminator.forward(state_, action, state, a_prob, reuse=True)
                cost = self.al_loss(log_p=log_p_tau, log_q=log_q_tau, log_pq=log_pq)
            else:
                d = self.discriminator.forward(state_, action, reuse=True)
                cost = self.al_loss(d=d)

            # END MODIFIED DISCRIMINATOR SECTION

            # add step cost
            total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

            return state, t, total_cost, total_trans_err, env_term_sig

        def policy_stop_condition(state_, t, cost, trans_err, env_term_sig):
            cond = tf.logical_not(env_term_sig)
            cond = tf.logical_and(cond, t < self.env.n_steps_train)
            cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed)
            return cond

        state_0 = tf.slice(states, [0, 0], [1, -1])
        loop_outputs = tf.while_loop(policy_stop_condition, policy_loop, [state_0, 0., 0., 0., False])
        self.policy.train(objective=loop_outputs[2])
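For discrete actions these examples sample through common.gumbel_softmax and common.gumbel_softmax_sample, whose code is not shown. A standard Gumbel-Softmax sampler, offered as an assumption about what such a helper typically does rather than as the actual implementation:

import tensorflow as tf

def gumbel_softmax_sample(logits, temperature, eps=1e-20):
    # Draw Gumbel(0, 1) noise with the same shape as the logits.
    u = tf.random.uniform(tf.shape(logits), minval=0.0, maxval=1.0)
    gumbel = -tf.math.log(-tf.math.log(u + eps) + eps)
    # A low temperature pushes the softmax towards a one-hot sample.
    return tf.nn.softmax((logits + gumbel) / temperature)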
Example #8
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from push_env import PushingEnv

from forward_model import ForwardModel
from inverse_model import InverseModel

if __name__ == "__main__":
    # train inverse model
    inverse_model = InverseModel()
    num_epochs = 30
    train_losses, valid_losses = inverse_model.train(num_epochs=num_epochs)

    # train forward model
    forward_model = ForwardModel()
    num_epochs = 30
    train_losses, valid_losses = forward_model.train(num_epochs=num_epochs)

    env = PushingEnv(ifRender=False)
    num_trials = 10

    # two pushes, inverse model
    errors = np.zeros(num_trials)
    # save one push
    errors[0] = env.plan_inverse_model_extrapolate(
        inverse_model, img_save_name="inverse_twopush", seed=0)
    print("test loss:", errors[0])
    # try 10 random seeds
    for seed in range(1, 10):
        errors[seed] = env.plan_inverse_model_extrapolate(inverse_model,
Example #9
def main():

    parser = argparse.ArgumentParser()
    # general & dataset & training settings
    parser.add_argument('--k_max', type=int, default=5,
                        help='Max reconstruction iterations')
    parser.add_argument('--save_figs', type = lambda x:bool(strtobool(x)), default=True,
                        help='save pics in reconstruction')
    parser.add_argument('--img_mode', type=str, default='SimpleCT',
                        help=' image-modality reconstruction: SimpleCT')
    parser.add_argument('--train_size', type=int, default=4000,
                        help='dataset size')
    parser.add_argument('--pseudo_inverse_init', type = lambda x:bool(strtobool(x)), default=True,
                        help='initialise with pseudoinverse')
    parser.add_argument('--brain', type = lambda x:bool(strtobool(x)), default=False,
                        help='test set of brain images')
    parser.add_argument('--epochs', type=int, default=150,
                        help='number of epochs to train')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='input batch size for training')
    parser.add_argument('--initial_lr', type=float, default=1e-3,
                        help='initial_lr')
    parser.add_argument('--val_batch_size', type=int, default=128,
                        help='input batch size for validation')

    # forward models setting
    parser.add_argument('--size', type=int, default=128,
                        help='image size')
    parser.add_argument('--beam_num_angle', type=int, default=30,
                        help='number of angles / projections')
    # options
    parser.add_argument('--no_cuda', type = lambda x:bool(strtobool(x)), default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=222,
                        help='random seed')

    args = parser.parse_args()
    layer_utils.set_gpu_mode(True)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    if args.img_mode is not None:
        forward_model = ForwardModel()
        half_size = args.size / 2
        space =  odl.uniform_discr([-half_size, -half_size],
                                   [half_size, half_size],
                                   [args.size, args.size], dtype='float32')
        forward_model.space = space
        geometry = odl.tomo.parallel_beam_geometry(space, num_angles=args.beam_num_angle)
        forward_model.geometry = geometry
        operator = odl.tomo.RayTransform(space, geometry)
        opnorm = odl.power_method_opnorm(operator)
        forward_model.operator = odl_torch.OperatorModule( (1 / opnorm) * operator )
        forward_model.adjoint = odl_torch.OperatorModule(operator.adjoint)
        pseudoinverse = odl.tomo.fbp_op(operator)
        pseudoinverse = odl_torch.OperatorModule( pseudoinverse * opnorm )
        forward_model.pseudoinverse = pseudoinverse

        geometry_specs = 'full_view_sparse_' + str(args.beam_num_angle)
        dataset_name = 'dataset' + '_' + args.img_mode + '_' + str(args.size) \
        + '_' + str(args.train_size) + '_' + geometry_specs + '_' \
        + 'brain' + '_' + str(args.brain)


    if args.img_mode == SimpleCT.__name__:
        img_mode = SimpleCT(forward_model)
        data_constructor = DatasetConstructor(img_mode, train_size=args.train_size, brain=args.brain, dataset_name=dataset_name)
        data = data_constructor.data()
    else:
        raise NotImplementedError
    dataset = DataSet(data, img_mode, args.pseudo_inverse_init)

    optim_parms = {'epochs':args.epochs, 'initial_lr':  args.initial_lr, 'batch_size': args.batch_size}
    from hybrid_model import HybridModel as NeuralLearner

    # results directory
    path = os.path.dirname(__file__)
    dir_path = os.path.join(path, 'results', args.img_mode, 'MFVI', str(args.train_size), geometry_specs, str(args.seed))
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)

    # all config
    print('===========================\n', flush=True)
    for key, val in vars(args).items():
        print('{}: {}'.format(key, val), flush=True)
    print('===========================\n', flush=True)

    blocks_history = {'model': [], 'optimizer': []}
    arch_args = {'arch': {'up':  [ [1, 16, 3, 1, 1],  [16, 32, 3, 1, 1]],
                          'low': [ [1, 16, 3, 1, 1],  [16, 32, 3, 1, 1]],
                          'cm':  [ [64, 32, 3, 1, 1], [32, 16, 3, 1, 1]] }}

    # saving training procedures
    filename = 'train_phase'
    filepath = os.path.join(dir_path, filename)
    vis = TrainVisualiser(filepath)

    start_time = time.time()
    # looping through architecture-blocs
    for idx in range(1, args.k_max + 1):

        print('============== training block number: {} ============= \n'.format(idx), flush=True)

        train_tensor =  dataset.construct(flag='train')
        val_tensor = dataset.construct(flag='validation')

        train_loader = DataLoader(train_tensor, batch_size=args.batch_size, shuffle=True)
        val_loader = DataLoader(val_tensor, batch_size=args.val_batch_size, shuffle=True)

        model = NeuralLearner(arch_args)
        model = model.to(device)
        model_path = os.path.join(dir_path, str(idx) + '.pt')
        if os.path.exists(model_path):
            model_loaded = True
            model.load_state_dict(torch.load(model_path))
            print('idx: {} model loaded!\npath to model:\n{}'.format(idx, model_path), flush=True)
        else:
            model_loaded = False
            model.optimise(train_loader, **optim_parms)
            save_net(model, os.path.join(dir_path, str(idx) + '.pt'))
            print('idx: {} optimisation finished!'.format(idx), flush=True)

        start = time.time()
        info = next_step_update(dataset, train_tensor, model, device, flag='train')
        end = time.time()
        print('============= {} {:.4f} ============= \n'.format('training reconstruction', end-start), flush=True)
        for key in info.keys():
            print('{}: {} \n'.format(key, info[key]), flush=True)

        start = time.time()
        info = next_step_update(dataset, val_tensor, model, device, flag='validation')
        end = time.time()
        print('============= {} {:.4f} ============= \n'.format('validation reconstruction', end-start), flush=True)
        for key in info.keys():
            print('{}: {} \n'.format(key, info[key]), flush=True)

        vis.update(dataset, flag='validation')
        blocks_history['model'].append(model)

        # reconstruction
        reconstruction_dir_path = os.path.join(dir_path, str(idx))
        if model_loaded:
            reconstruction_dir_path = os.path.join(dir_path, str(idx), 're-loaded')

        if not os.path.isdir(reconstruction_dir_path):
            os.makedirs(reconstruction_dir_path)
        get_stats(dataset, blocks_history, device, reconstruction_dir_path)

    print('--- training time: %s seconds ---' % (time.time() - start_time), flush=True)
    vis.generate()
Example #10
    def __init__(self, environment, reweight, ensemble):

        self.env = environment
        self.reweight = reweight
        self.ensemble = ensemble

        # Create placeholders for all the inputs
        self.states_ = tf.placeholder("float", shape=(None, self.env.state_size), name='states_')  # Batch x State
        self.states = tf.placeholder("float", shape=(None, self.env.state_size), name='states')  # Batch x State
        self.actions = tf.placeholder("float", shape=(None, self.env.action_size), name='action')  # Batch x Action
        self.label = tf.placeholder("float", shape=(None, 1), name='label')
        self.gamma = tf.placeholder("float", shape=(), name='gamma')
        self.temp = tf.placeholder("float", shape=(), name='temperature')
        self.noise = tf.placeholder("float", shape=(), name='noise_flag')
        self.do_keep_prob = tf.placeholder("float", shape=(), name='do_keep_prob')

        self.states_e_ = tf.placeholder("float", shape=(None, self.env.state_size), name='states_e_')
        self.states_e = tf.placeholder("float", shape=(None, self.env.state_size), name='states_e')
        self.actions_e = tf.placeholder("float", shape=(None, self.env.action_size), name='action_e')
        self.ex_wts_ = tf.placeholder("float", shape=(self.ensemble, None), name='ex_wts')

        # Create MGAIL blocks
        self.forward_model = ForwardModel(state_size=self.env.state_size,
                                          action_size=self.env.action_size,
                                          encoding_size=self.env.fm_size,
                                          lr=self.env.fm_lr,
                                          ensemble=self.ensemble)

        self.discriminator = Discriminator(in_dim=self.env.state_size + self.env.action_size,
                                           out_dim=2,
                                           size=self.env.d_size,
                                           lr=self.env.d_lr,
                                           do_keep_prob=self.do_keep_prob,
                                           weight_decay=self.env.weight_decay)

        self.policy = Policy(in_dim=self.env.state_size,
                              out_dim=self.env.action_size,
                              size=self.env.p_size,
                              lr=self.env.p_lr,
                              do_keep_prob=self.do_keep_prob,
                              n_accum_steps=self.env.policy_accum_steps,
                              weight_decay=self.env.weight_decay)

        # Create experience buffers
        self.er_agent = ER(memory_size=self.env.er_agent_size,
                           state_dim=self.env.state_size,
                           action_dim=self.env.action_size,
                           reward_dim=1,  # stub connection
                           qpos_dim=self.env.qpos_size,
                           qvel_dim=self.env.qvel_size,
                           batch_size=self.env.batch_size,
                           history_length=1)

        self.er_expert = common.load_er(fname=os.path.join(self.env.run_dir, self.env.expert_data),
                                        batch_size=self.env.batch_size,
                                        history_length=1,
                                        traj_length=2)

        self.env.sigma = self.er_expert.actions_std / self.env.noise_intensity

        # Normalize the inputs
        states_ = common.normalize(self.states_, self.er_expert.states_mean, self.er_expert.states_std)
        states = common.normalize(self.states, self.er_expert.states_mean, self.er_expert.states_std)
        if self.env.continuous_actions:
            actions = common.normalize(self.actions, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            actions = self.actions

        states_e_ = common.normalize(self.states_e_, self.er_expert.states_mean, self.er_expert.states_std)
        states_e = common.normalize(self.states_e, self.er_expert.states_mean, self.er_expert.states_std)
        if self.env.continuous_actions:
            actions_e = common.normalize(self.actions_e, self.er_expert.actions_mean, self.er_expert.actions_std)
        else:
            actions_e = self.actions_e

        # 1. Forward Model
        if self.reweight:
            initial_gru_state = np.ones((1, self.forward_model.encoding_size))
            self.forward_model.train(x_=[states_, actions, initial_gru_state], y_=states, ex_wts=self.ex_wts_)

            initial_gru_state_rw = np.ones((1, self.forward_model.encoding_size))
            initial_gru_state_val = np.ones((1, self.forward_model.encoding_size))
            self.forward_model.reweight(x_=[states_, actions, initial_gru_state_rw], y_=states,
                                        x_val_=[states_e_, actions_e, initial_gru_state_val], y_val_=states_e,
                                        bsize_a=self.env.batch_size, bsize_b=self.env.batch_size)
        else:
            initial_gru_state = np.ones((1, self.forward_model.encoding_size))
            self.forward_model.train(x_=[states_, actions, initial_gru_state], y_=states, ex_wts=None)

        # 1.1 prediction (for development)
        # self.forward_model.predict(x_=[states_, actions, initial_gru_state], y_=states)

        # 2. Discriminator
        labels = tf.concat([1 - self.label, self.label], 1)
        d = self.discriminator.forward(states, actions)

        # 2.1 0-1 accuracy
        correct_predictions = tf.equal(tf.argmax(d, 1), tf.argmax(labels, 1))
        self.discriminator.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        # 2.2 prediction
        d_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=d, labels=labels)
        # cost sensitive weighting (weight true=expert, predict=agent mistakes)
        d_loss_weighted = self.env.cost_sensitive_weight * tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 1.)), d_cross_entropy) +\
                                                           tf.multiply(tf.to_float(tf.equal(tf.squeeze(self.label), 0.)), d_cross_entropy)
        discriminator_loss = tf.reduce_mean(d_loss_weighted)
        self.discriminator.train(objective=discriminator_loss)

        # 3. Collect experience
        mu = self.policy.forward(states)
        if self.env.continuous_actions:
            a = common.denormalize(mu, self.er_expert.actions_mean, self.er_expert.actions_std)
            eta = tf.random_normal(shape=tf.shape(a), stddev=self.env.sigma)
            self.action_test = tf.squeeze(a + self.noise * eta)
        else:
            a = common.gumbel_softmax(logits=mu, temperature=self.temp)
            self.action_test = tf.argmax(a, dimension=1)

        # 4.3 AL
        def policy_loop(state_, t, total_cost, total_trans_err, _):
            mu = self.policy.forward(state_, reuse=True)

            if self.env.continuous_actions:
                eta = self.env.sigma * tf.random_normal(shape=tf.shape(mu))
                action = mu + eta
            else:
                action = common.gumbel_softmax_sample(logits=mu, temperature=self.temp)

            # minimize the gap between agent logit (d[:,0]) and expert logit (d[:,1])
            d = self.discriminator.forward(state_, action, reuse=True)
            cost = self.al_loss(d)

            # add step cost
            total_cost += tf.multiply(tf.pow(self.gamma, t), cost)

            # get action
            if self.env.continuous_actions:
                a_sim = common.denormalize(action, self.er_expert.actions_mean, self.er_expert.actions_std)
            else:
                a_sim = tf.argmax(action, dimension=1)

            # get next state
            state_env, _, env_term_sig, = self.env.step(a_sim, mode='tensorflow')[:3]
            state_e = common.normalize(state_env, self.er_expert.states_mean, self.er_expert.states_std)
            state_e = tf.stop_gradient(state_e)

            # state_a, _ = self.forward_model.forward([state_, action, initial_gru_state], reuse=True)
            state_a, _ = self.forward_model.forward(inputs=[state_, action, initial_gru_state],
                                                    is_training=False, dtype=tf.float32,
                                                    w_dict=None, ex_wts=None, reuse=True)

            state, nu = common.re_parametrization(state_e=state_e, state_a=state_a)
            total_trans_err += tf.reduce_mean(abs(nu))
            t += 1

            return state, t, total_cost, total_trans_err, env_term_sig

        def policy_stop_condition(state_, t, cost, trans_err, env_term_sig):
            cond = tf.logical_not(env_term_sig)
            cond = tf.logical_and(cond, t < self.env.n_steps_train)
            cond = tf.logical_and(cond, trans_err < self.env.total_trans_err_allowed)
            return cond

        state_0 = tf.slice(states, [0, 0], [1, -1])
        loop_outputs = tf.while_loop(policy_stop_condition, policy_loop, [state_0, 0., 0., 0., False])
        self.policy.train(objective=loop_outputs[2])
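Several examples above feed the simulator state and the forward-model prediction through common.re_parametrization before accumulating the transition error. A minimal sketch of this re-parametrization trick, given as an assumption about the helper rather than its actual code:

import tensorflow as tf

def re_parametrization(state_e, state_a):
    # Gap between the environment state and the forward-model prediction.
    nu = state_e - state_a
    # The correction carries no gradient, so the returned state matches
    # state_e in value while gradients flow only through state_a.
    nu = tf.stop_gradient(nu)
    return state_a + nu, nu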