def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
    self.env = gym.make(env_name)
    self.task_horizon = TASK_HORIZON
    self.agent = Agent(self.env)

    mpc_params['use_gt_dynamics'] = False
    self.model = PENN(num_nets, STATE_DIM,
                      len(self.env.action_space.sample()), LR)
    self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                          NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                          **mpc_params, use_random_optimizer=False)
    self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                             NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                             **mpc_params, use_random_optimizer=True)
    self.random_policy_no_mpc = RandomPolicy(
        len(self.env.action_space.sample()))
def train(self):
    traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
    test_results = []
    samples = []

    # Collect initial rollouts with a random policy to seed the dynamics model.
    for i in range(NINIT_ROLLOUTS):
        samples.append(
            self.agent.sample(self.task_horizon, self.random_policy_no_mpc))
        traj_obs.append(samples[-1]["obs"])
        traj_acs.append(samples[-1]["ac"])
        traj_rews.append(samples[-1]["rewards"])

    if NINIT_ROLLOUTS > 0:
        self.policy.train(
            [sample["obs"] for sample in samples],
            [sample["ac"] for sample in samples],
            [sample["rewards"] for sample in samples],
            epochs=10)

    for i in range(NTRAIN_ITERS):
        print("####################################################################")
        print("Starting training iteration %d." % (i + 1))
        samples = []
        for j in range(NROLLOUTS_PER_ITER):
            samples.append(self.agent.sample(self.task_horizon, self.policy))
        print("Rewards obtained:",
              [sample["reward_sum"] for sample in samples])
        traj_obs.extend([sample["obs"] for sample in samples])
        traj_acs.extend([sample["ac"] for sample in samples])
        traj_rets.extend([sample["reward_sum"] for sample in samples])
        traj_rews.extend([sample["rewards"] for sample in samples])

        # Periodically checkpoint the model and log test performance.
        if i % 50 == 0:
            self.model.save_models()
            test_results.append((i, self.test(20)))
            with open("test_graph.txt", "w") as test_file:
                test_file.writelines(
                    [str(epoch) + "," + str(result) + "\n"
                     for (epoch, result) in test_results])

        self.policy.train(
            [sample["obs"] for sample in samples],
            [sample["ac"] for sample in samples],
            [sample["rewards"] for sample in samples])
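# A minimal sketch (not part of the assignment code) for plotting the
# "iteration,result" pairs that train() writes to test_graph.txt. It assumes
# matplotlib is installed and that self.test(20) returns a single scalar
# (e.g. a success rate); adapt the parsing if it returns something richer.
import matplotlib.pyplot as plt


def plot_test_results(path="test_graph.txt", out="test_graph.png"):
    """Read the (iteration, result) pairs written by train() and plot them."""
    iters, results = [], []
    with open(path) as f:
        for line in f:
            iteration, result = line.strip().split(",", 1)
            iters.append(int(iteration))
            results.append(float(result))
    plt.plot(iters, results, marker="o")
    plt.xlabel("Training iteration")
    plt.ylabel("Test result")
    plt.savefig(out)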
def __init__(self, env_name='Pushing2D-v1', mpc_params=None):
    self.env = gym.make(env_name)
    self.task_horizon = TASK_HORIZON
    self.agent = Agent(self.env)  # Does not need a learned model
    self.warmup = False
    mpc_params['use_gt_dynamics'] = True

    if mpc_params['use_mpc']:
        # Receding-horizon planning: only the first action of each plan is executed.
        self.cem_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE,
                              NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                              **mpc_params, use_random_optimizer=False)
        self.random_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE,
                                 NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                                 **mpc_params, use_random_optimizer=True)
    else:
        # Open-loop planning: the full planned action sequence is executed.
        self.cem_policy = CEMPolicy(
            self.env, len(self.env.action_space.low), INITIAL_MU,
            INITIAL_SIGMA, PLAN_HORIZON, POPSIZE, NUM_ELITES, MAX_ITERS,
            self.env.action_space.high, self.env.action_space.low,
            mpc_params['use_gt_dynamics'])
        self.random_policy = RandomPolicy(
            self.env, len(self.env.action_space.low), INITIAL_MU,
            INITIAL_SIGMA, PLAN_HORIZON, POPSIZE, MAX_ITERS,
            self.env.action_space.high, self.env.action_space.low,
            mpc_params['use_gt_dynamics'])
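# A minimal usage sketch, assuming the __init__ above belongs to a class named
# ExperimentGTDynamics (the class name is not shown in this listing, so it is
# an assumption). Note that MPC asserts num_particles == 1 when
# use_gt_dynamics is True.
mpc_params = {'use_mpc': True, 'num_particles': 1}
exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params)
sample = exp.agent.sample(exp.task_horizon, exp.cem_policy)  # one episode with the CEM planner
print("Episode return:", sample["reward_sum"])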
def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
    self.env = gym.make(env_name)
    # self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.device = torch.device('cpu')
    self.task_horizon = TASK_HORIZON

    # TensorBoard logging.
    self.timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    self.environment_name = "pusher"
    self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp)
    self.summary_writer = SummaryWriter(self.logdir)

    self.agent = Agent(self.env)
    mpc_params['use_gt_dynamics'] = False
    self.model = PENN(num_nets, STATE_DIM,
                      len(self.env.action_space.sample()), LR, self.device,
                      self.summary_writer, self.timestamp,
                      self.environment_name)
    self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                          NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                          use_random_optimizer=False, **mpc_params)
    self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE,
                             NUM_ELITES, MAX_ITERS, INITIAL_MU, INITIAL_SIGMA,
                             use_random_optimizer=True, **mpc_params)
    self.random_policy_no_mpc = RandomPolicy(
        len(self.env.action_space.sample()))
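# A minimal usage sketch for the learned-dynamics experiment, assuming the
# __init__ above belongs to a class named ExperimentModelDynamics (the class
# name is not shown in this listing, so it is an assumption). It also assumes
# the class defines the train() method shown earlier, along with the
# self.policy and self.test attributes that method uses, which are not shown.
mpc_params = {'use_mpc': True, 'num_particles': 6}
exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=2,
                              mpc_params=mpc_params)
exp.train()  # runs the training loop above and logs test results to test_graph.txt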
class MPC:
    def __init__(self, env, plan_horizon, model, popsize, num_elites,
                 max_iters, initial_mu, initial_sigma, num_particles=6,
                 use_gt_dynamics=True, use_mpc=True,
                 use_random_optimizer=False):
        """
        :param env: the environment to plan in
        :param plan_horizon: number of timesteps to plan ahead
        :param model: the learned dynamics model to use; may be None if
            use_gt_dynamics is True
        :param popsize: population size
        :param num_elites: CEM parameter
        :param max_iters: CEM parameter
        :param num_particles: number of trajectories for TS1
        :param use_gt_dynamics: whether to use the ground-truth dynamics
            from the environment
        :param use_mpc: whether to use only the first action of a planned
            trajectory
        :param use_random_optimizer: whether to use the random optimizer
            instead of CEM
        """
        self.env = env
        self.use_gt_dynamics, self.use_mpc, self.use_random_optimizer = \
            use_gt_dynamics, use_mpc, use_random_optimizer
        self.num_particles = num_particles
        self.plan_horizon = plan_horizon
        self.num_nets = None if model is None else model.num_nets
        # The planner works on the first 8 state dimensions; the last two
        # entries of the observation are the goal position (see act()).
        self.state_dim, self.action_dim = 8, env.action_space.shape[0]
        self.ac_ub, self.ac_lb = env.action_space.high, env.action_space.low
        self.initial_sigma = initial_sigma
        self.initial_mu = initial_mu
        self.popsize = popsize
        self.max_iters = max_iters

        # Set up the dynamics used for planning.
        self.model = model
        if use_gt_dynamics:
            self.predict_next_state = self.predict_next_state_gt
            assert num_particles == 1
        else:
            self.predict_next_state = self.predict_next_state_model

        # Initialize the planner: CEM or random-shooting optimizer.
        if not self.use_random_optimizer:
            print("Using CEM Policy")
            self.num_elites = num_elites
            self.policy = CEMPolicy(
                self.env, self.action_dim, self.initial_mu,
                self.initial_sigma, self.plan_horizon, self.popsize,
                self.num_elites, self.max_iters, self.ac_ub, self.ac_lb,
                self.use_gt_dynamics)
        else:
            print("Using Random Policy")
            self.policy = RandomPolicy(
                self.env, self.action_dim, self.initial_mu,
                self.initial_sigma, self.plan_horizon, self.popsize,
                self.max_iters, self.ac_ub, self.ac_lb,
                self.use_gt_dynamics)

    def predict_next_state_model(self, states, actions):
        """Given a batch of state-action pairs, use the learned model to
        predict the next states (see the hedged sketch after this class)."""
        # TODO: write your code here
        raise NotImplementedError

    def predict_next_state_gt(self, states, actions):
        """Given a batch of state-action pairs, use the ground-truth dynamics
        to predict the next states (see the hedged sketch after this class)."""
        # TODO: write your code here
        raise NotImplementedError

    def train(self, obs_trajs, acs_trajs, rews_trajs, epochs=5):
        """Append the given trajectories to the existing transitions, then
        train the dynamics model.

        Arguments:
          obs_trajs: states
          acs_trajs: actions
          rews_trajs: rewards (NOTE: this may not be used)
          epochs: number of epochs to train for
        """
        # TODO: write your code here
        raise NotImplementedError

    def reset(self):
        print("Resetting MPC policy")
        self.policy.reset()

    def act(self, state, present_timestep):
        """Use model predictive control to choose an action given the
        current state.

        Arguments:
          state: current state
          present_timestep: current timestep
        """
        # The last two entries of the state are the goal position.
        if present_timestep == 0:
            self.policy.goal = state[[-2, -1]]

        if self.use_random_optimizer:
            best_trajectory = self.policy.train(state)
            return best_trajectory[0, :].tolist()

        # CEM: execute the first planned action, then warm-start the next
        # plan by shifting the mean forward and padding with zeros.
        mu = self.policy.train(state)
        self.policy.mu = np.vstack((mu[1:, :], np.zeros((1, self.action_dim))))
        return mu[0, :].tolist()
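# A hedged sketch of how the three TODO methods in MPC could be filled in;
# this is not the assignment's reference solution. The environment helper
# env.get_nxt_state(state, action) and the PENN methods
# get_output(inputs, net_idx) and train(inputs, targets, epochs) are ASSUMED
# interfaces and should be adapted to the actual Pushing2D / PENN code.
import numpy as np


class MPCSketch(MPC):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Growing dataset of (state, action) -> next-state transitions.
        self.train_inputs = np.empty((0, self.state_dim + self.action_dim))
        self.train_targets = np.empty((0, self.state_dim))

    def predict_next_state_gt(self, states, actions):
        """Query the (assumed) ground-truth dynamics helper for each pair."""
        return [self.env.get_nxt_state(s, a)  # assumed environment helper
                for s, a in zip(states, actions)]

    def predict_next_state_model(self, states, actions):
        """TS1: every particle queries one randomly chosen ensemble member."""
        states = np.asarray(states)[:, :self.state_dim]
        inputs = np.concatenate([states, np.asarray(actions)], axis=1)
        member = np.random.randint(self.num_nets, size=len(inputs))
        next_states = np.zeros((len(inputs), self.state_dim))
        for n in range(self.num_nets):
            mask = member == n
            if not mask.any():
                continue
            # Assumed PENN API: mean and log-variance of the next state
            # predicted by ensemble member n.
            mean, logvar = self.model.get_output(inputs[mask], net_idx=n)
            next_states[mask] = mean + \
                np.exp(0.5 * logvar) * np.random.randn(*mean.shape)
        return next_states

    def train(self, obs_trajs, acs_trajs, rews_trajs, epochs=5):
        """Append new transitions to the dataset and refit the ensemble."""
        for obs, acs in zip(obs_trajs, acs_trajs):
            obs, acs = np.asarray(obs), np.asarray(acs)
            states = obs[:-1, :self.state_dim]
            next_states = obs[1:, :self.state_dim]
            self.train_inputs = np.vstack(
                [self.train_inputs, np.concatenate([states, acs], axis=1)])
            self.train_targets = np.vstack([self.train_targets, next_states])
        # Assumed PENN API: fit the ensemble on the aggregated dataset.
        self.model.train(self.train_inputs, self.train_targets, epochs=epochs)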