def main(load_path, num_episode):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_env = 1
    env_id = 'Breakout-v0'

    envs = [make_env(env_id) for _ in range(n_env)]
    envs = DummyVecEnv(envs)
    envs = VecToTensor(envs)

    # Load the trained policy and switch to evaluation mode
    policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    policy.load_state_dict(torch.load(load_path, map_location=device))
    policy.eval()

    for i in tqdm(range(num_episode)):
        obs = envs.reset()
        total_rewards = 0
        while True:
            action_logits, values = policy(obs)
            actions = choose_action(action_logits)
            next_obs, rewards, dones, info = envs.step(actions)
            total_rewards += rewards
            obs = next_obs  # was missing: act on the latest observation, not the initial one
            envs.render()
            if dones:
                break
        print('-' * 20 + str(total_rewards.item()) + '-' * 20)

    envs.close()
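# A possible entry point, assuming this function lives in a standalone
# script; the flag names and the default checkpoint path are illustrative,
# not taken from the original code.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--load-path', type=str, default='a2c_breakout.pt')
    parser.add_argument('--episodes', type=int, default=10)
    args = parser.parse_args()
    main(args.load_path, args.episodes)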
def test(env_name, episodes, params, render):
    # Create a Gym environment
    env = gym.make(env_name)

    # Get dimensionalities of actions and observations
    action_space_dim = env.action_space.shape[-1]
    observation_space_dim = env.observation_space.shape[-1]

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    policy.load_state_dict(params)
    agent = Agent(policy)

    test_reward, test_len = 0, 0
    for ep in range(episodes):
        done = False
        observation = env.reset()
        while not done:
            # Similar to the training loop above -
            # get the action, act on the environment, save total reward
            # (evaluation=True makes the agent always return what it thinks to be
            # the best action - there is no exploration at this point)
            action, _ = agent.get_action(observation, evaluation=True)
            observation, reward, done, info = env.step(
                action.detach().cpu().numpy())
            if render:
                env.render()
            test_reward += reward
            test_len += 1
    print("Average test reward:", test_reward / episodes,
          "episode length:", test_len / episodes)
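# Hypothetical invocation of test(); the environment name and the parameter
# file below are placeholders, not values taken from the original script.
if __name__ == '__main__':
    saved_params = torch.load("LunarLanderContinuous-v2_params.mdl")
    test("LunarLanderContinuous-v2", episodes=10, params=saved_params,
         render=True)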
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    # TODO: For CartPole-v0 - maximum episode length
    env._max_episode_steps = 1000

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(args.position, agent, env,
                                 args.train_episodes, False,
                                 args.render_training)

        # Save the model, tagged with the current date and time
        tt = "{}-{}-{}".format(datetime.datetime.now().date(),
                               datetime.datetime.now().hour,
                               datetime.datetime.now().minute)
        model_file = "%s_params.mdl" % (args.env + tt + "vel")
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        # time and day of plot
        plt.savefig("train_history" + tt + "vel" + ".jpg")
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(args.position, agent, env, args.train_episodes,
             args.render_test)
def main(args):
    # Create a Gym environment
    env = gym.make(args.env)

    # Exercise 1
    env._max_episode_steps = args.episode_length

    # Get dimensionalities of actions and observations
    action_space_dim = get_space_dim(env.action_space)
    observation_space_dim = get_space_dim(env.observation_space)

    # Instantiate agent and its policy
    policy = Policy(observation_space_dim, action_space_dim)
    agent = Agent(policy)

    # Print some stuff
    print("Environment:", args.env)
    print("Training device:", agent.train_device)
    print("Observation space dimensions:", observation_space_dim)
    print("Action space dimensions:", action_space_dim)

    # If no model was passed, train a policy from scratch.
    # Otherwise load the policy from the file and go directly to testing.
    if args.test is None:
        training_history = train(agent, env, args.train_episodes, False,
                                 args.render_training, x0=args.x0,
                                 args=args, policy=policy)

        # Save the model
        model_file = "%s_params.mdl" % args.env
        torch.save(policy.state_dict(), model_file)
        print("Model saved to", model_file)

        # Plot rewards
        sns.lineplot(x="episode", y="reward", data=training_history)
        sns.lineplot(x="episode", y="mean_reward", data=training_history)
        plt.legend(["Reward", "100-episode average"])
        plt.title("Reward history (%s)" % args.env)
        plt.show()
        print("Training finished.")
    else:
        print("Loading model from", args.test, "...")
        state_dict = torch.load(args.test)
        policy.load_state_dict(state_dict)
        print("Testing...")
        test(agent, env, args.train_episodes, args.render_test, x0=args.x0)
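# A sketch of the argument parser this main() appears to expect. The
# attribute names are taken from the usage above; types and defaults are
# assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--train_episodes', type=int, default=5000)
    parser.add_argument('--episode_length', type=int, default=1000)
    parser.add_argument('--x0', type=float, default=None)
    parser.add_argument('--test', type=str, default=None,
                        help='Model file to load and test')
    parser.add_argument('--render_training', action='store_true')
    parser.add_argument('--render_test', action='store_true')
    args = parser.parse_args()
    main(args)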
class Trainer:
    def __init__(self):
        # Prepare environments, replay buffer, and networks
        self.envs = Envs()
        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)

        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
        # Hard-copy the critic weights into the target network
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0
        if settings.mode == "train":
            self.add_random_steps()
            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)
                    c1_loss, c2_loss, policy_loss = self.update_nets()
                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)
                torch.save(self.policy.state_dict(),
                           "policy_seed_{}".format(settings.seed))
        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor(
            [1.0] * len(settings.env_names)).to(self.device)
        epsd_rewards = torch.FloatTensor(
            [0.0] * len(settings.env_names)).to(self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts, states, names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            # Only accumulate rewards for environments that are still running
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states
        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        # Fill the replay buffer with random transitions before training
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():
            if not evaluate:
                # Stochastic action for training
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                # Deterministic (mean) action for evaluation
                _, _, actions = self.policy.sample(leg_starts, states, names)
        return actions.cpu()

    def parameter_update(self, tau=p.tau):
        # Polyak-average the critic weights into the target network
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch,
         reward_batch, next_state_batch, mask_batch) = self.memory.sample()
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        # Soft Bellman backup for the twin critics
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * min_qf_next_target

        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy update: maximize expected Q minus the entropy penalty
        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()
        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
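# Hypothetical entry point for the Trainer above; `settings` and `p`
# (hyperparameters) are assumed to come from the project's config modules,
# so this is only a usage sketch.
if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()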
def train(episodes, player, opponent):
    # epsilon, policy, env and the hyperparameters below are module-level
    # globals; epsilon is reassigned at the end of each episode, so it must
    # be declared global here
    global epsilon

    # Target network starts as a copy of the online policy
    target_dqn = Policy(observation_space_dim, action_space_dim)
    target_dqn.load_state_dict(policy.state_dict())
    target_dqn.to(player.train_device)  # keep it on the same device as the batches

    # Stacked preprocessed frames
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)

    # Updates
    update_counter = 0

    # Memory initialisation:
    # take random actions to fill the replay memory
    memory = Memory(memory_size, batch_size)
    for i in range(memory_size):
        if i == 0:
            obs = env.reset()
            state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        action1 = random.randint(0, 3)
        action2 = random.randint(0, 3)
        next_obs, rewards, done, info = env.step((action1, action2))
        next_state, stacked_frames = stack_frame(stacked_frames, next_obs[0])
        memory.store((state, action1, rewards[0], next_state, done))
        state = next_state

    player.reset_score()
    opponent.reset_score()

    # Training
    for i in range(episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        timesteps = 0
        reward_sum = 0
        while not done:
            action1 = player.get_action(state, epsilon)
            action2 = opponent.get_action()
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])
            memory.store((state, action1, rewards[0], next_state, done))
            reward_sum += rewards[0]
            obs = next_obs
            state = next_state
            env.render()

            # Updating policy: sample a minibatch from memory
            samples = memory.sample()
            batch_states = np.asarray([x[0] for x in samples])
            batch_actions = np.asarray([x[1] for x in samples])
            batch_rewards = np.asarray([x[2] for x in samples])
            batch_next_states = np.asarray([x[3] for x in samples])
            batch_done = np.asarray([x[4] for x in samples])

            # Target network Q-values for the next states
            batch = torch.from_numpy(batch_next_states.squeeze()).float().to(
                player.train_device)
            batch_t_q_values = target_dqn.forward(batch)

            # Q-learning targets: r for terminal transitions,
            # r + gamma * max_a' Q'(s', a') otherwise
            batch_t_q_max, _ = batch_t_q_values.max(dim=1)
            y = torch.empty(batch_size, 1, device=player.train_device)
            batch_rewards = torch.from_numpy(batch_rewards).float().to(
                player.train_device)
            for j in range(batch_size):
                if batch_done[j]:
                    y[j] = batch_rewards[j]
                else:
                    y[j] = batch_rewards[j] + batch_t_q_max[j].mul(gamma)
            y = y.detach()  # was a no-op: detach() is not in-place

            # Gradient descent on the TD error of the actions actually taken
            # (was comparing y against all action values at once)
            batch_q_values = policy.forward(
                torch.from_numpy(batch_states.squeeze()).float().to(
                    player.train_device))
            actions_taken = torch.from_numpy(batch_actions).long().to(
                player.train_device).unsqueeze(1)
            q_taken = batch_q_values.gather(1, actions_taken)
            loss = torch.mean((y - q_taken) ** 2)
            loss.backward()
            player.update_policy()

            # Periodically sync the target network
            update_counter += 1
            if update_counter % update_step == 0:
                target_dqn.load_state_dict(policy.state_dict())

            timesteps += 1

        epsilon = epsilon * decay
        print("Episode {} finished. Total reward: {:.3g} ({} timesteps)".format(
            i, reward_sum, timesteps))
def test(episodes, player, opponent):
    # Evaluation loop. The original snippet begins mid-loop, so the function
    # head and the surrounding episode loop are reconstructed from the
    # training loop above; the greedy action call is an assumption.
    stacked_frames = deque(np.zeros((200, 210)), maxlen=4)
    for i in range(episodes):
        done = False
        obs = env.reset()
        state, stacked_frames = stack_frame(stacked_frames, obs[0], True)
        reward_sum = 0
        while not done:
            action1 = player.get_action(state, 0)  # greedy: no exploration
            action2 = opponent.get_action()
            next_obs, rewards, done, info = env.step((action1, action2))
            next_state, stacked_frames = stack_frame(stacked_frames,
                                                     next_obs[0])
            reward_sum += rewards[0]
            obs = next_obs
            state = next_state
            env.render()


# If no model was passed, train a policy from scratch.
# Otherwise load the policy from the file and go directly to testing.
if args.test is None:
    try:
        train(episodes, player, opponent)
    # Handle Ctrl+C - save model and go to tests
    except KeyboardInterrupt:
        print("Interrupted!")
    model_file = "%s_dqn.mdl" % args.env  # was "%dqn.mdl", a broken format string
    torch.save(policy.state_dict(), model_file)
    print("Model saved to", model_file)
else:
    state_dict = torch.load(args.test)
    policy.load_state_dict(state_dict)

print("Testing...")
test(100, player, opponent)
env.end()
epsilon = 1e-05
env_id = 'Breakout-v0'

envs = [make_env(env_id) for _ in range(n_env)]
# envs = DummyVecEnv(envs)
# envs = SubprocVecEnv(envs)
envs = ShmemVecEnv(envs)
envs = VecToTensor(envs)

# Log episode statistics to a time-stamped monitor file
date = datetime.now().strftime('%m_%d_%H_%M')
mon_file_name = "./tmp/" + date
envs = VecMonitor(envs, mon_file_name)

# Two copies of the network: one for acting, one being trained
train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
step_policy.load_state_dict(train_policy.state_dict())
step_policy.eval()

runner = Runner(envs, step_policy, n_step, gamma)
optimizer = optim.RMSprop(train_policy.parameters(), lr=lr, alpha=alpha,
                          eps=epsilon)

for i in tqdm(range(num_updates)):
    mb_obs, mb_rewards, mb_values, mb_actions = runner.run()
    action_logits, values = train_policy(mb_obs)
    # Advantage: n-step returns minus the value baseline
    mb_adv = mb_rewards - mb_values
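    # A minimal sketch of how the rest of the A2C update could look, given
    # the tensors above. The original snippet stops at the advantage
    # computation, so the loss terms and the coefficients (0.5, 0.01) below
    # are assumptions, not the author's code; mb_actions is assumed to be a
    # LongTensor of discrete action ids.
    dist = torch.distributions.Categorical(logits=action_logits)
    log_probs = dist.log_prob(mb_actions)
    policy_loss = -(log_probs * mb_adv.detach()).mean()  # policy gradient term
    value_loss = (values.squeeze(-1) - mb_rewards).pow(2).mean()  # critic regression
    entropy = dist.entropy().mean()  # exploration bonus
    loss = policy_loss + 0.5 * value_loss - 0.01 * entropy

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Keep the acting copy in sync with the freshly updated weights
    step_policy.load_state_dict(train_policy.state_dict())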