Example 1
    def __init__(self):
        self.env = gym.make('Pushing2D-v1')
        self.task_hor = TASK_HORIZON

        self.agent = Agent(self.env)
        # Learned dynamics model: an ensemble of NUM_NETS networks (PENN).
        self.model = PENN(NUM_NETS, STATE_DIM, ACTION_DIM, LR)
        # MPC policy that plans action sequences using the learned model.
        self.policy = MPC(self.env, NUM_PARTICLES, PLAN_HOR, self.model,
                          POPSIZE, NUM_ELITES, MAX_ITERS)
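These snippets all reference constants (TASK_HORIZON, NUM_NETS, STATE_DIM, ACTION_DIM, LR, NUM_PARTICLES, PLAN_HOR/PLAN_HORIZON, POPSIZE, NUM_ELITES, MAX_ITERS, NINIT_ROLLOUTS, NTRAIN_ITERS, NROLLOUTS_PER_ITER) and classes (Agent, PENN, MPC, RandomPolicy) that are defined elsewhere in the assignment code. A purely illustrative sketch of how such a module header might look; every value below is an assumption, not taken from the original sources:

# Hypothetical settings, for illustration only.
import gym
import numpy as np

TASK_HORIZON = 40        # steps per episode rollout
PLAN_HORIZON = 5         # MPC planning horizon
PLAN_HOR = PLAN_HORIZON  # some snippets use the shorter name
NUM_NETS = 2             # ensemble size for PENN
STATE_DIM = 8            # Pushing2D-v1 observation size (assumed)
ACTION_DIM = 2           # Pushing2D-v1 action size (assumed)
LR = 1e-3                # dynamics-model learning rate
NUM_PARTICLES = 6        # particles per action sequence during planning
POPSIZE = 200            # CEM population size
NUM_ELITES = 20          # CEM elites per iteration
MAX_ITERS = 5            # CEM iterations
NINIT_ROLLOUTS = 100     # random rollouts collected before training
NTRAIN_ITERS = 500       # training iterations
NROLLOUTS_PER_ITER = 1   # on-policy rollouts per training iteration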
Example 2
    def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
        self.env = gym.make(env_name)
        self.task_horizon = TASK_HORIZON

        self.agent = Agent(self.env)
        if mpc_params is None:
            mpc_params = {}
        mpc_params['use_gt_dynamics'] = False
        self.model = PENN(num_nets, STATE_DIM,
                          len(self.env.action_space.sample()), LR)
        # MPC policy that plans with the CEM optimizer.
        self.cem_policy = MPC(self.env,
                              PLAN_HORIZON,
                              self.model,
                              POPSIZE,
                              NUM_ELITES,
                              MAX_ITERS,
                              **mpc_params,
                              use_random_optimizer=False)
        # Same MPC policy, but using the random-sampling optimizer instead of CEM.
        self.random_policy = MPC(self.env,
                                 PLAN_HORIZON,
                                 self.model,
                                 POPSIZE,
                                 NUM_ELITES,
                                 MAX_ITERS,
                                 **mpc_params,
                                 use_random_optimizer=True)
        # Random policy with no MPC planning.
        self.random_policy_no_mpc = RandomPolicy(
            len(self.env.action_space.sample()))
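A minimal usage sketch for this constructor. ExperimentModelDynamics is a hypothetical name (the class line is not shown in the snippet), and the mpc_params keys are assumptions, not confirmed by the original code:

# Hypothetical class name and mpc_params keys; purely illustrative.
mpc_params = {'use_mpc': True, 'num_particles': 6}
exp = ExperimentModelDynamics(env_name='Pushing2D-v1',
                              num_nets=2,
                              mpc_params=mpc_params)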
Example 3
    def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None):
        self.env = gym.make(env_name)
        # self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = torch.device('cpu')

        self.task_horizon = TASK_HORIZON

        # Tensorboard logging.
        self.timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        self.environment_name = "pusher"
        self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp)
        self.summary_writer = SummaryWriter(self.logdir)

        self.agent = Agent(self.env)
        if mpc_params is None:
            mpc_params = {}
        mpc_params['use_gt_dynamics'] = False
        self.model = PENN(num_nets, STATE_DIM,
                          len(self.env.action_space.sample()), LR, self.device,
                          self.summary_writer, self.timestamp,
                          self.environment_name)
        self.cem_policy = MPC(self.env,
                              PLAN_HORIZON,
                              self.model,
                              POPSIZE,
                              NUM_ELITES,
                              MAX_ITERS,
                              use_random_optimizer=False,
                              **mpc_params)
        self.random_policy = MPC(self.env,
                                 PLAN_HORIZON,
                                 self.model,
                                 POPSIZE,
                                 NUM_ELITES,
                                 MAX_ITERS,
                                 use_random_optimizer=True,
                                 **mpc_params)
        self.random_policy_no_mpc = RandomPolicy(
            len(self.env.action_space.sample()))
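This variant wires a TensorBoard SummaryWriter into PENN. A small, hedged sketch of the kind of logging call that writer supports; the tag name and values below are assumptions, not taken from PENN's actual training loop:

# Illustrative only: in the snippet the writer is passed into PENN, which would
# make calls like these from inside its own training loop.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs/pusher/demo')
for step in range(3):
    loss = 1.0 / (step + 1)                                # placeholder value
    writer.add_scalar('dynamics/train_loss', loss, step)   # assumed tag name
writer.close()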
Example 4
class Experiment:
    def __init__(self):
        self.env = gym.make('Pushing2D-v1')
        self.task_hor = TASK_HORIZON

        self.agent = Agent(self.env)
        self.model = PENN(NUM_NETS, STATE_DIM, ACTION_DIM, LR)
        self.policy = MPC(self.env, NUM_PARTICLES, PLAN_HOR, self.model,
                          POPSIZE, NUM_ELITES, MAX_ITERS)


    def test(self, num_episodes):
        samples = []
        for j in range(num_episodes):
            samples.append(
                self.agent.sample(
                    self.task_hor, self.policy
                )
            )
        print("Rewards obtained:", np.mean([sample["reward_sum"] for sample in samples]))
        print("Percent success:", np.mean([sample["rewards"][-1]==0 for sample in samples]))
        return np.mean([sample["rewards"][-1]==0 for sample in samples])

    def train(self):
        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_results = []
        samples = []
        # Seed the dataset with rollouts from a random 2D-action policy.
        rand_pol = RandomPolicy(2)
        for i in range(NINIT_ROLLOUTS):
            samples.append(self.agent.sample(self.task_hor, rand_pol))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])

        if NINIT_ROLLOUTS > 0:
            # Pretrain the dynamics model on the random seed rollouts.
            self.policy.train(
                    [sample["obs"] for sample in samples],
                    [sample["ac"] for sample in samples],
                    [sample["rewards"] for sample in samples],
                    epochs=10
            )

        for i in range(NTRAIN_ITERS):
            print("####################################################################")
            print("Starting training iteration %d." % (i + 1))

            # Collect on-policy rollouts with the MPC policy.
            samples = []
            for j in range(NROLLOUTS_PER_ITER):
                samples.append(
                    self.agent.sample(
                        self.task_hor, self.policy
                    )
                )
            print("Rewards obtained:", [sample["reward_sum"] for sample in samples])
            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])

            if i % 50 == 0:
                # Periodically checkpoint the model and record test performance.
                self.model.save_models()
                test_results.append((i, self.test(20)))
                with open("test_graph.txt", "w") as test_file:
                    test_file.writelines([str(epoch) + "," + str(result) + "\n"
                                          for (epoch, result) in test_results])

            # Update the dynamics model with the newly collected rollouts.
            self.policy.train(
                    [sample["obs"] for sample in samples],
                    [sample["ac"] for sample in samples],
                    [sample["rewards"] for sample in samples]
            )
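A hedged driver sketch for the Experiment class above, assuming the class and the constants it references are defined in the same module; the entry point and the episode count passed to test are illustrative, not part of the original code:

# Hypothetical entry point, for illustration only.
if __name__ == "__main__":
    exp = Experiment()
    exp.train()                    # alternates data collection and model training
    success_rate = exp.test(50)    # success rate over 50 evaluation episodes
    print("Final success rate:", success_rate)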