Example #1
    def test_generalized_advantage(self):
        vector = th.randn(VECTOR_SIZE)
        for i in range(500):
            self.replay.append(vector, vector, random.random(), vector, False)
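        # Mark the last transition as terminal (its done flag becomes 1)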
        self.replay.done()[-1] += 1
        values = th.randn_like(self.replay.reward())
        rewards = self.replay.reward().view(-1).tolist()
        dones = self.replay.done().view(-1).tolist()
        next_value = random.random()
        ref = generalized_advantage_estimate(GAMMA, TAU, rewards, dones,
                                             values, next_value)
        advantages = generalized_advantage(GAMMA, TAU, self.replay.reward(),
                                           self.replay.done(), values,
                                           next_value + th.zeros(1))
        ref = th.Tensor(ref).view(advantages.size())
        self.assertTrue(close(ref, advantages))

        # Overlapping episodes
        overlap = self.replay[2:] + self.replay[:3]
        overlap_values = th.cat((values[2:], values[:3]), dim=0)
        overlap_next_value = th.randn(1)
        overlap_adv = generalized_advantage(GAMMA, TAU,
                                            overlap.reward().double(),
                                            overlap.done().double(),
                                            overlap_values.double(),
                                            overlap_next_value.double())
        values = overlap_values.view(-1).tolist()
        rewards = overlap.reward().view(-1).tolist()
        dones = overlap.done().view(-1).tolist()
        ref = generalized_advantage_estimate(GAMMA, TAU, rewards, dones,
                                             values, overlap_next_value.item())
        ref = th.Tensor(ref).view(overlap_adv.size()).double()
        self.assertTrue(close(overlap_adv, ref))
Example #2
def update(replay, optimizer, policy, env, lr_schedule):
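    # Bootstrap GAE with the critic's value of the rollout's final next-state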
    _, next_state_value = policy(replay[-1].next_state())
    advantages = pg.generalized_advantage(GAMMA, TAU, replay.reward(),
                                          replay.done(), replay.value(),
                                          next_state_value)

    advantages = ch.utils.normalize(advantages, epsilon=1e-5).view(-1, 1)
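    # Bootstrapped return targets: normalized advantage plus the stored value estimate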
    rewards = [a + v for a, v in zip(advantages, replay.value())]

    for i, sars in enumerate(replay):
        sars.reward = rewards[i].detach()
        sars.advantage = advantages[i].detach()

    # Logging
    policy_losses = []
    entropies = []
    value_losses = []
    mean = lambda a: sum(a) / len(a)

    # Perform some optimization steps
    for step in range(PPO_EPOCHS * PPO_NUM_BATCHES):
        batch = replay.sample(PPO_BSZ)
        masses, values = policy(batch.state())

        # Compute losses
        new_log_probs = masses.log_prob(batch.action()).sum(-1, keepdim=True)
        entropy = masses.entropy().sum(-1).mean()
        policy_loss = ppo.policy_loss(new_log_probs,
                                      batch.log_prob(),
                                      batch.advantage(),
                                      clip=PPO_CLIP)
        value_loss = ppo.state_value_loss(values,
                                          batch.value().detach(),
                                          batch.reward(),
                                          clip=PPO_CLIP)
        loss = policy_loss - ENT_WEIGHT * entropy + V_WEIGHT * value_loss

        # Take optimization step
        optimizer.zero_grad()
        loss.backward()
        th.nn.utils.clip_grad_norm_(policy.parameters(), GRAD_NORM)
        optimizer.step()

        policy_losses.append(policy_loss)
        entropies.append(entropy)
        value_losses.append(value_loss)

    # Log metrics
    if dist.get_rank() == 0:
        env.log('policy loss', mean(policy_losses).item())
        env.log('policy entropy', mean(entropies).item())
        env.log('value loss', mean(value_losses).item())
        ppt.plot(mean(env.all_rewards[-10000:]), 'PPO results')

    # Update the parameters on schedule
    if LINEAR_SCHEDULE:
        lr_schedule.step()
Example #3
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
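                # Zero next_value is fine here: each rollout ends on a terminal step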
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
Example #4
def compute_advantages(baseline,
                       tau,
                       gamma,
                       rewards,
                       dones,
                       states,
                       next_states,
                       update_vf=True):
    returns = ch.td.discount(gamma, rewards, dones)

    if update_vf:
        baseline.fit(states, returns)

    values = baseline(states)
    next_values = baseline(next_states)
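    # State values on non-terminal steps, next-state values on terminal steps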
    bootstraps = values * (1.0 - dones) + next_values * dones
    next_value = torch.zeros(1, device=values.device)
    return generalized_advantage(tau=tau,
                                 gamma=gamma,
                                 rewards=rewards,
                                 dones=dones,
                                 values=bootstraps,
                                 next_value=next_value)
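
A minimal call-site sketch for the helper above, assuming a rollout collected into a cherry ExperienceReplay; the names LinearValue, state_size, and replay, as well as the tau/gamma values, are illustrative assumptions rather than part of the example:

from cherry.models.robotics import LinearValue

baseline = LinearValue(state_size)  # linear state-value baseline; provides .fit(states, returns)
advantages = compute_advantages(baseline,
                                tau=0.95,
                                gamma=0.99,
                                rewards=replay.reward(),
                                dones=replay.done(),
                                states=replay.state(),
                                next_states=replay.next_state())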
Example #5
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)

    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            #batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(
                    DISCOUNT, TRACE_DECAY, batch.reward(), batch.done(),
                    batch.value(),
                    torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                #nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                #nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()

            replay.empty()
Example #6
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # Initialised here for readability; recomputed after the first epoch
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(
                        replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
Example #7
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
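        # Sample an action; the info dict (log_prob, value) is stored in the replay by the Runner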
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())
            replay.empty()

    result['weights'] = list(agent.parameters())
    return result