Example #1
 def _create_MPPI_controller(self):
     ctrl = mppi.MPPI(
         dynamics=self._predict,
         running_cost=self.cost_fn,
         nx=self.state_dim,
         noise_sigma=self.noise_sigma,
         num_samples=self.n_samples,
         horizon=self.horizon,
         lambda_=self.lambda_,
         device=TORCH_DEVICE,
         u_min=torch.tensor(self.action_lb,
                            dtype=torch.float,
                            device=TORCH_DEVICE),
         u_max=torch.tensor(self.action_ub,
                            dtype=torch.float,
                            device=TORCH_DEVICE),
     )
     return ctrl
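In pytorch_mppi, the `dynamics` and `running_cost` callables are evaluated in batch: each receives tensors with a leading dimension of `num_samples` rollouts. A minimal sketch of how the controller returned above would be queried each control step (the `agent` instance and the zero initial state are placeholders, not part of this example):

    # hypothetical caller: `agent` is an instance of the class that owns
    # _create_MPPI_controller; `state` stands in for the current observation
    ctrl = agent._create_MPPI_controller()
    state = torch.zeros(agent.state_dim, device=TORCH_DEVICE)
    action = ctrl.command(state)  # first action of the optimized sequence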
Example #2
    # bootstrap the learned dynamics model with random actions
    # (loop context reconstructed; BOOT_STRAP_ITER and nu are assumed to be
    #  defined with the other constants earlier in the script)
    if BOOT_STRAP_ITER:
        new_data = np.zeros((BOOT_STRAP_ITER, nx + nu))
        for i in range(BOOT_STRAP_ITER):
            pre_action_state = env.state
            action = np.random.uniform(low=ACTION_LOW, high=ACTION_HIGH)
            env.step([action])
            # env.render()
            new_data[i, :nx] = pre_action_state
            new_data[i, nx:] = action

        train(new_data)
        logger.info("bootstrapping finished")

    # wrap with Monitor to record the run; the wrapped env needs its own reset
    env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
    env.reset()
    if downward_start:
        env.env.state = [np.pi, 1]

    mppi_gym = mppi.MPPI(dynamics,
                         running_cost,
                         nx,
                         noise_sigma,
                         num_samples=N_SAMPLES,
                         horizon=TIMESTEPS,
                         lambda_=lambda_,
                         device=d,
                         u_min=torch.tensor(ACTION_LOW,
                                            dtype=torch.double,
                                            device=d),
                         u_max=torch.tensor(ACTION_HIGH,
                                            dtype=torch.double,
                                            device=d))
    total_reward, data = mppi.run_mppi(mppi_gym, env, train)
    logger.info("Total reward %f", total_reward)
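The fragment above relies on names defined earlier in the full script (`dynamics`, `train`, `nx`, `noise_sigma`, `N_SAMPLES`, `TIMESTEPS`, `lambda_`, `d`). Plausible definitions in the style of the pytorch_mppi pendulum experiment, shown here only as assumptions:

    ENV_NAME = "Pendulum-v0"
    TIMESTEPS = 15    # T: planning horizon
    N_SAMPLES = 100   # K: rollouts sampled per control step
    ACTION_LOW, ACTION_HIGH = -2.0, 2.0
    d = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    noise_sigma = torch.tensor(1.0, device=d, dtype=torch.double)
    lambda_ = 1.0     # temperature: lower values favor low-cost rollouts harder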
Example #3
    def running_cost(state, action):
        # quadratic swing-up cost on angle error, angular velocity, and torque
        theta = state[:, 0]
        theta_dt = state[:, 1]
        action = action[:, 0]
        cost = angle_normalize(theta) ** 2 + 0.1 * theta_dt ** 2 + 0.001 * action ** 2
        return cost
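
    def angle_normalize(x):
        # assumed helper, same as in gym's pendulum environment:
        # wraps an angle into [-pi, pi) so that theta == 0 is upright
        return ((x + np.pi) % (2 * np.pi)) - np.pi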

    def train(new_data):
        # no model learning here: this example assumes a known dynamics function
        pass

    downward_start = True
    env = gym.make(ENV_NAME).env  # bypass the default TimeLimit wrapper
    env.reset()
    if downward_start:
        env.state = [np.pi, 1]

    # wrap with Monitor for recording; note the state is set on the inner env
    env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
    env.reset()
    if downward_start:
        env.env.state = [np.pi, 1]

    nx = 2
    mppi_gym = mppi.MPPI(dynamics,
                         running_cost,
                         nx,
                         noise_sigma,
                         num_samples=N_SAMPLES,
                         horizon=TIMESTEPS,
                         lambda_=lambda_)
    total_reward, data = mppi.run_mppi(mppi_gym, env, train)
    logger.info("Total reward %f", total_reward)
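With `train` a no-op, Example #3 only makes sense if `dynamics` is the true model. A sketch of a batched analytic pendulum step mirroring gym's Pendulum-v0 transition (this function does not appear in the snippet; the constants are that environment's defaults):

    def dynamics(state, action):
        # state: (K, 2) with columns [theta, theta_dot]; action: (K, 1) torque
        th = state[:, 0].view(-1, 1)
        thdot = state[:, 1].view(-1, 1)
        g, m, l, dt = 10.0, 1.0, 1.0, 0.05
        u = torch.clamp(action, -2.0, 2.0)
        newthdot = thdot + (-3 * g / (2 * l) * torch.sin(th + np.pi)
                            + 3.0 / (m * l ** 2) * u) * dt
        newth = th + newthdot * dt
        newthdot = torch.clamp(newthdot, -8.0, 8.0)
        return torch.cat((newth, newthdot), dim=1)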