Example 1
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.train()

    if not args.on_policy:
        memory = EpisodicReplayMemory(args.memory_capacity,
                                      args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                hx = Variable(torch.zeros(1, args.hidden_size))
                avg_hx = Variable(torch.zeros(1, args.hidden_size))
                cx = Variable(torch.zeros(1, args.hidden_size))
                avg_cx = Variable(torch.zeros(1, args.hidden_size))
                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                action, reward, done, episode_length = 0, 0, False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                input = extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward)
                policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    Variable(input), (avg_hx, avg_cx))

                # Sample action (graph broken, as the loss for the stochastic action is calculated manually)
                action = policy.multinomial().data[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action)
                next_state = state_to_tensor(next_state)
                reward = min(max(reward, -1), 1) if args.reward_clip else reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(input, action, reward,
                                  policy.data)  # Save just tensors
                # Save outputs for online training
                for arr, el in zip((policies, Qs, Vs, actions, rewards, average_policies),
                                   (policy, Q, V, Variable(torch.LongTensor([[action]])),
                                    Variable(torch.Tensor([[reward]])), average_policy)):
                    arr.append(el)

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = Variable(torch.zeros(1, 1))

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(
                        extend_input(state,
                                     action_to_one_hot(action, action_size),
                                     reward), None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(Variable(input), (hx, cx))
                Qret = Qret.detach()

            # Train the network on-policy
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy on a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                hx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_hx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                cx = Variable(torch.zeros(args.batch_size, args.hidden_size))
                avg_cx = Variable(torch.zeros(args.batch_size, args.hidden_size))

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    input = torch.cat(tuple(trajectory.state
                                            for trajectory in trajectories[i]), 0)
                    action = Variable(torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ])).unsqueeze(1)
                    reward = Variable(torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ])).unsqueeze(1)
                    old_policy = Variable(torch.cat(tuple(trajectory.policy
                                                          for trajectory in trajectories[i]), 0))

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(Variable(input), (hx, cx))
                    average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                        Variable(input), (avg_hx, avg_cx))

                    # Save outputs for offline training
                    for arr, el in zip((policies, Qs, Vs, actions, rewards,
                                        average_policies, old_policies),
                                       (policy, Q, V, action, reward,
                                        average_policy, old_policy)):
                        arr.append(el)

                    # Unpack second half of transition
                    next_input = torch.cat(tuple(trajectory.state
                                                 for trajectory in trajectories[i + 1]), 0)
                    done = Variable(
                        torch.Tensor([
                            trajectory.action is None
                            for trajectory in trajectories[i + 1]
                        ]).unsqueeze(1))

                # Do forward pass for all transitions
                _, _, Qret, _ = model(Variable(next_input), (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach()

                # Train the network off-policy
                _train(args, T, model, shared_model, shared_average_model,
                       optimiser, policies, Qs, Vs, actions, rewards, Qret,
                       average_policies, old_policies=old_policies)
        done = True

    env.close()
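
The off-policy branch above draws the number of replay updates from _poisson(args.replay_ratio), a helper that is not shown in this listing. A minimal sketch of such a helper, assuming it implements Knuth's algorithm for sampling a Poisson-distributed integer with the given mean (the helper name and use of the standard random module are assumptions, not taken from the listing):

import random
from math import exp


def _poisson(lmbd):
    # Knuth's algorithm: multiply uniform samples until the running
    # product drops below exp(-lmbd); the number of multiplications
    # (minus one) is Poisson(lmbd)-distributed
    L, k, p = exp(-lmbd), 0, 1
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return k - 1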
Example 2
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise)
    model.eval()

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(torch.zeros(1, args.hidden_size),
                                      volatile=True)
                        cx = Variable(torch.zeros(1, args.hidden_size),
                                      volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        action, reward, done, episode_length = 0, 0, False, 0
                        reward_sum = 0
                        model.remove_noise()  # Run without noise

                    # Optionally render validation states
                    if args.render:
                        env.render()

                    # Calculate policy
                    input = extend_input(
                        state, action_to_one_hot(action, action_size), reward,
                        episode_length)
                    policy, _, (hx, cx) = model(
                        Variable(input, volatile=True),
                        (hx.detach(), cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily
                    action = policy.max(1)[1].data[0, 0]

                    # Step
                    state, reward, done, _ = env.step(action)
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + l +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                       t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))
            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(), 'model.pth')  # Save model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond

    env.close()
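
Both the training and test workers coordinate through the global step counter T, which exposes value() and increment() but is not defined in these examples. A minimal sketch of such a counter, assuming it wraps a torch.multiprocessing shared integer guarded by a lock (the class and attribute names here are illustrative, not taken from the listing):

from torch import multiprocessing as mp


class Counter():
    def __init__(self):
        self.val = mp.Value('i', 0)  # Shared integer visible to all worker processes
        self.lock = mp.Lock()  # Guards concurrent increments

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value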
Example 3
def train(rank, args, T, shared_model, optimiser):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    action_size = env.action_space.n
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size, args.no_noise, args.noise_entropy)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            action, reward, done, episode_length = 0, 0, False, 0
        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()
        model.sample_noise()  # Pick a new noise vector (until next optimisation step)

        # Lists of outputs for training
        values, log_probs, rewards, entropies = [], [], [], []

        while not done and t - t_start < args.t_max:
            input = extend_input(state, action_to_one_hot(action, action_size),
                                 reward, episode_length)
            # Calculate policy and value
            policy, value, (hx, cx) = model(Variable(input), (hx, cx))
            log_policy = policy.log()
            entropy = -(log_policy * policy).sum(1)

            # Sample action
            action = policy.multinomial()
            # Graph broken, as the loss for the stochastic action is calculated manually
            log_prob = log_policy.gather(1, action.detach())
            action = action.data[0, 0]

            # Step
            state, reward, done, _ = env.step(action)
            state = state_to_tensor(state)
            reward = min(max(reward, -1), 1) if args.reward_clip else reward  # Optionally clamp rewards
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for training
            for arr, el in zip((values, log_probs, rewards, entropies),
                               (value, log_prob, reward, entropy)):
                arr.append(el)

            # Increment counters
            t += 1
            T.increment()

        # Return R = 0 for terminal s or V(s_i; θ) for non-terminal s
        if done:
            R = Variable(torch.zeros(1, 1))
        else:
            _, R, _ = model(Variable(input), (hx, cx))
            R = R.detach()
        values.append(R)

        # Train the network
        policy_loss = 0
        value_loss = 0
        A_GAE = torch.zeros(1, 1)  # Generalised advantage estimator Ψ
        # Calculate n-step returns in forward view, stepping backwards from the last state
        trajectory_length = len(rewards)
        for i in reversed(range(trajectory_length)):
            # R ← r_i + γR
            R = rewards[i] + args.discount * R
            # Advantage A = R - V(s_i; θ)
            A = R - values[i]
            # dθ ← dθ - ∂A^2/∂θ
            value_loss += 0.5 * A**2  # Least squares error

            # TD residual δ = r + γV(s_i+1; θ) - V(s_i; θ)
            td_error = rewards[i] + args.discount * values[i + 1].data - values[i].data
            # Generalised advantage estimator Ψ (roughly of form ∑(γλ)^t∙δ)
            A_GAE = A_GAE * args.discount * args.trace_decay + td_error
            # dθ ← dθ + ∇θ∙log(π(a_i|s_i; θ))∙Ψ
            policy_loss -= log_probs[i] * Variable(A_GAE)  # Policy gradient loss
            if args.no_noise or args.noise_entropy:
                # dθ ← dθ + β∙∇θH(π(s_i; θ))
                policy_loss -= args.entropy_weight * entropies[i]  # Entropy maximisation loss

        # Optionally normalise loss by number of time steps
        if not args.no_time_normalisation:
            policy_loss /= trajectory_length
            value_loss /= trajectory_length

        # Zero shared and local grads
        optimiser.zero_grad()
        # Note that losses were defined as negatives of normal update rules for gradient descent
        (policy_loss + value_loss).backward()
        # Gradient L2 normalisation
        nn.utils.clip_grad_norm(model.parameters(), args.max_gradient_norm, 2)

        # Transfer gradients to shared model and update
        _transfer_grads_to_shared_model(model, shared_model)
        optimiser.step()
        if not args.no_lr_decay:
            # Linearly decay learning rate
            _adjust_learning_rate(
                optimiser,
                max(args.lr * (args.T_max - T.value()) / args.T_max, 1e-32))

    env.close()
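
The update step relies on _transfer_grads_to_shared_model and _adjust_learning_rate, neither of which appears in the listing. A minimal sketch of the gradient transfer, assuming it follows the usual asynchronous-update pattern of pointing each shared parameter's gradient at the locally computed gradient (the exact behaviour of the original helper is an assumption):

def _transfer_grads_to_shared_model(model, shared_model):
    # Point the shared parameters' gradients at the local worker's gradients
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return  # Shared gradients already set for this step
        shared_param._grad = param.grad

The _adjust_learning_rate helper presumably just writes the decayed value into each 'lr' entry of the optimiser's param_groups.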