Example #1
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # Override the default maximum episode length for CartPole-v0
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env,
                                 args.train_episodes, False,
                                 args.render_training)

        # Save the model
        tt = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # Tag the plot file with the same timestamp as the model
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()

        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes, args.render_test)
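
This main() (and the variant in the next example) relies on a get_space_dim helper that is not shown in the listing. Below is a minimal sketch of what such a helper could look like, assuming Gym's Discrete and Box space types; the body is an assumption, not taken from the original repository:

import numpy as np
from gym import spaces

def get_space_dim(space):
    # Assumed helper: number of discrete actions for a Discrete space,
    # flattened size for a Box space.
    if isinstance(space, spaces.Discrete):
        return space.n
    if isinstance(space, spaces.Box):
        return int(np.prod(space.shape))
    raise NotImplementedError("Unsupported space type: %s" % type(space))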
Example #2

def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent,
                                 env,
                                 args.train_episodes,
                                 False,
                                 args.render_training,
                                 x0=args.x0,
                                 args=args,
                                 policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
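
The args namespace used by both main() variants is built elsewhere; a plausible argparse definition covering the attributes referenced above (env, train_episodes, episode_length, render_training, render_test, test, x0, position) might look like the sketch below. The flag names, defaults, and help strings are assumptions:

import argparse

def parse_args():
    # Assumed CLI matching the attributes accessed in main(); defaults are guesses.
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="CartPole-v0")
    parser.add_argument("--train_episodes", type=int, default=500)
    parser.add_argument("--episode_length", type=int, default=1000)
    parser.add_argument("--render_training", action="store_true")
    parser.add_argument("--render_test", action="store_true")
    parser.add_argument("--test", type=str, default=None,
                        help="Path to a saved model; skips training")
    parser.add_argument("--x0", type=float, default=None,
                        help="Initial cart position")
    parser.add_argument("--position", type=float, default=0.0,
                        help="Target cart position (Example #1 variant)")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())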
Example #3
class Trainer:
    def __init__(self):
        # Prepare the environments
        self.envs = Envs()

        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)
        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)

        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0

        if settings.mode == "train":
            self.add_random_steps()

            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)

                    c1_loss, c2_loss, policy_loss = self.update_nets()

                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)

            torch.save(self.policy.state_dict(),
                       "policy_seed_{}".format(settings.seed))

        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor([1.0] * len(settings.env_names)).to(
            self.device)
        epsd_rewards = torch.FloatTensor([0.0] * len(settings.env_names)).to(
            self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts,
                                         states,
                                         names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states

        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():

            if not evaluate:
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                _, _, actions = self.policy.sample(leg_starts, states, names)

            return actions.cpu()

    def parameter_update(self, tau=p.tau):
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch, reward_batch,
         next_state_batch, mask_batch) = self.memory.sample()

        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
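
For reference, update_nets() and parameter_update() together follow the standard soft actor-critic recipe; in the notation of the code (m = mask, \alpha = p.alpha, \gamma = p.gamma, \tau = p.tau) the quantities being computed are:

y = r + \gamma\, m \left[ \min\big(Q'_1(s', a'), Q'_2(s', a')\big) - \alpha \log \pi(a' \mid s') \right], \quad a' \sim \pi(\cdot \mid s')
L_Q = \mathrm{MSE}\big(Q_1(s, a), y\big) + \mathrm{MSE}\big(Q_2(s, a), y\big)
L_\pi = \mathbb{E}_{a \sim \pi}\left[ \alpha \log \pi(a \mid s) - \min\big(Q_1(s, a), Q_2(s, a)\big) \right]
\theta_{\mathrm{target}} \leftarrow (1 - \tau)\, \theta_{\mathrm{target}} + \tau\, \theta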
Example #4
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])

            reward_sum += rewards[0]

            obs = next_obs
            state = next_state

    env.render()


# If no model was passed, train a policy from scratch.
# Otherwise load the policy from the file and go directly to testing.
if args.test is None:
    try:
        train(episodes, player, opponent)
    # Handle Ctrl+C: save the model and move on to testing
    except KeyboardInterrupt:
        print("Interrupted!")
    model_file = "%dqn.mdl" % args.env
    torch.save(policy.state_dict(), model_file)
    print("Model saved to", model_file)
else:
    state_dict = torch.load(args.test)
    policy.load_state_dict(state_dict)
    print("Testing...")
    test(100, player, opponent)

env.end()
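
The loop above relies on a stack_frame helper for frame stacking that is not included in the excerpt. A minimal deque-based sketch of such a helper, assuming preprocessed single frames and channel-first stacking (the signature is taken from the call site; the body is an assumption):

from collections import deque

import numpy as np

def stack_frame(stacked_frames, frame, stack_size=4):
    # Assumed helper: keep the last `stack_size` frames and return them
    # stacked along the channel axis, together with the updated deque.
    if stacked_frames is None:
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    state = np.stack(stacked_frames, axis=0)
    return state, stacked_frames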
Example #5
    epsilon = 1e-05

    env_id = 'Breakout-v0'
    envs = [make_env(env_id) for _ in range(n_env)]
    #    envs = DummyVecEnv(envs)
    #    envs = SubprocVecEnv(envs)
    envs = ShmemVecEnv(envs)
    envs = VecToTensor(envs)

    date = datetime.now().strftime('%m_%d_%H_%M')
    mon_file_name = "./tmp/" + date
    envs = VecMonitor(envs, mon_file_name)

    train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy.load_state_dict(train_policy.state_dict())
    step_policy.eval()

    runner = Runner(envs, step_policy, n_step, gamma)

    optimizer = optim.RMSprop(train_policy.parameters(),
                              lr=lr,
                              alpha=alpha,
                              eps=epsilon)

    for i in tqdm(range(num_updates)):
        mb_obs, mb_rewards, mb_values, mb_actions = runner.run()

        action_logits, values = train_policy(mb_obs)

        mb_adv = mb_rewards - mb_values
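
The excerpt stops at the advantage computation. A typical A2C update would continue the loop body roughly as below, combining a policy-gradient loss on the advantages with a value regression loss and an entropy bonus; the loss coefficients, the gradient-clipping value, and the torch.nn / torch.nn.functional imports (nn, F) are assumptions, not taken from the original:

        # Assumed continuation of the update loop (A2C-style losses)
        dist = torch.distributions.Categorical(logits=action_logits)
        log_probs = dist.log_prob(mb_actions)

        policy_loss = -(mb_adv.detach() * log_probs).mean()
        value_loss = F.mse_loss(values.squeeze(-1), mb_rewards)
        entropy_loss = -dist.entropy().mean()

        loss = policy_loss + 0.5 * value_loss + 0.01 * entropy_loss

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(train_policy.parameters(), 0.5)
        optimizer.step()

        # Keep the rollout policy in sync with the updated weights
        step_policy.load_state_dict(train_policy.state_dict())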