def _create_MPPI_controller(self):
    ctrl = mppi.MPPI(
        dynamics=self._predict,
        running_cost=self.cost_fn,
        nx=self.state_dim,
        noise_sigma=self.noise_sigma,
        num_samples=self.n_samples,
        horizon=self.horizon,
        lambda_=self.lambda_,
        device=TORCH_DEVICE,
        u_min=torch.tensor(self.action_lb, dtype=torch.float, device=TORCH_DEVICE),
        u_max=torch.tensor(self.action_ub, dtype=torch.float, device=TORCH_DEVICE),
    )
    return ctrl
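# A controller created this way is typically queried once per control step via
# pytorch_mppi's MPPI.command(state), which returns the first action of the
# optimized control sequence. Below is a minimal, self-contained sketch of that
# loop; the double-integrator dynamics and cost are illustrative stand-ins, not
# the _predict / cost_fn used above.
import torch
from pytorch_mppi import mppi

def toy_dynamics(state, action):
    # batched rollout step: state is (K, 2) = [position, velocity], action is (K, 1)
    dt = 0.05
    pos = state[:, 0] + dt * state[:, 1]
    vel = state[:, 1] + dt * action[:, 0]
    return torch.stack((pos, vel), dim=1)

def toy_cost(state, action):
    return state[:, 0] ** 2 + 0.1 * state[:, 1] ** 2 + 0.001 * action[:, 0] ** 2

d = "cpu"
ctrl = mppi.MPPI(toy_dynamics, toy_cost, nx=2,
                 noise_sigma=torch.tensor([[1.0]], device=d, dtype=torch.double),
                 num_samples=100, horizon=15, lambda_=1.0, device=d,
                 u_min=torch.tensor([-2.0], dtype=torch.double, device=d),
                 u_max=torch.tensor([2.0], dtype=torch.double, device=d))

state = torch.tensor([1.0, 0.0], dtype=torch.double, device=d)
for _ in range(5):
    action = ctrl.command(state)  # first action of the re-optimized sequence
    state = toy_dynamics(state.view(1, -1), action.view(1, -1)).squeeze(0)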
    # body of the bootstrapping loop: apply a random action and record the
    # (pre-action state, action) pair for fitting the approximate dynamics
    action = np.random.uniform(low=ACTION_LOW, high=ACTION_HIGH)
    env.step([action])
    # env.render()
    new_data[i, :nx] = pre_action_state
    new_data[i, nx:] = action

train(new_data)
logger.info("bootstrapping finished")

env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
env.reset()
if downward_start:
    env.env.state = [np.pi, 1]

mppi_gym = mppi.MPPI(dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
                     lambda_=lambda_, device=d,
                     u_min=torch.tensor(ACTION_LOW, dtype=torch.double, device=d),
                     u_max=torch.tensor(ACTION_HIGH, dtype=torch.double, device=d))
total_reward, data = mppi.run_mppi(mppi_gym, env, train)
logger.info("Total reward %f", total_reward)
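# In this approximate-dynamics variant, `train` is expected to refit the learned
# model that `dynamics` wraps, using the bootstrapped (state, action) rows collected
# above. The original training code is not shown here; the following is only a
# hedged sketch under simplified assumptions (a small hypothetical MLP `network`
# predicting state deltas, with no angle wrap-around handling).
import torch
import torch.nn.functional as F

nx, nu = 2, 1
network = torch.nn.Sequential(
    torch.nn.Linear(nx + nu, 32), torch.nn.Tanh(),
    torch.nn.Linear(32, nx),
)
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

def train(new_data):
    # rows are [theta, theta_dt, action] recorded before each step, so consecutive
    # rows give (x_t, u_t) -> x_{t+1} pairs; fit the network to the state change
    xu = torch.tensor(new_data[:-1], dtype=torch.float32)
    dx = torch.tensor(new_data[1:, :nx] - new_data[:-1, :nx], dtype=torch.float32)
    for _ in range(200):
        optimizer.zero_grad()
        loss = F.mse_loss(network(xu), dx)
        loss.backward()
        optimizer.step()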
    theta_dt = state[:, 1]
    action = action[:, 0]
    # quadratic penalty on angle error, angular velocity, and control effort
    cost = angle_normalize(theta) ** 2 + 0.1 * theta_dt ** 2 + 0.001 * action ** 2
    return cost


def train(new_data):
    # the true dynamics are used directly in this example, so there is nothing to fit
    pass


downward_start = True
env = gym.make(ENV_NAME).env  # bypass the default TimeLimit wrapper
env.reset()
if downward_start:
    env.state = [np.pi, 1]

env = wrappers.Monitor(env, '/tmp/mppi/', force=True)
env.reset()
if downward_start:
    env.env.state = [np.pi, 1]

nx = 2
mppi_gym = mppi.MPPI(dynamics, running_cost, nx, noise_sigma, num_samples=N_SAMPLES, horizon=TIMESTEPS,
                     lambda_=lambda_)
total_reward, data = mppi.run_mppi(mppi_gym, env, train)
logger.info("Total reward %f", total_reward)
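# `angle_normalize` is the standard pendulum helper; if it is not already defined
# earlier alongside `running_cost`, it simply wraps an angle into [-pi, pi):
import math

def angle_normalize(x):
    # works elementwise on torch tensors as well as on plain floats
    return ((x + math.pi) % (2 * math.pi)) - math.pi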