Example #1
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = gym.make(env_name)
    env.seed(seed)

    print('New model')
    policy = Policy('actor_critic', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)

    optimizer = PPO(policy, clip_param, ppo_epoch, mini_batch_size,
                    value_loss_coef, entropy_coef, learning_rate,
                    max_grad_norm)

    episode_rewards = deque(maxlen=50)

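    # Each iteration: evaluate the current policy, collect a fixed-length rollout, then run one PPO update.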
    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        policy.eval()

        episode_rewards.append(test_env(policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

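        # Collect n_steps transitions with the current policy into the rollout storage.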
        for step in range(n_steps):

            state = torch.FloatTensor(state).to(device)

            with torch.no_grad():
                value, action, log_prob = policy.act(state)

            next_state, reward, done, _ = env.step(action.item())

            storage.push(state, action, log_prob, value, reward, done)

            state = next_state

            if done:
                state = env.reset()

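        # Bootstrap the value of the state after the rollout so returns/advantages can be computed.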
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = policy.get_value(next_state).detach()

        storage.compute(next_value)

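        # Switch to training mode and run the PPO update on the collected rollout.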
        policy.train()

        value_loss, action_loss, dist_entropy = optimizer.update(storage)

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{},{}\n'.format(value_loss, action_loss, dist_entropy))
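The snippets call a test_env helper that the listing does not include. A minimal sketch of what it could look like for the actor-critic policy of Example #1 is below; the body, the max_steps parameter, and the greedy-rollout behaviour are assumptions, and the DQN examples would need the matching act() signature instead.

# Hypothetical helper (not from the original project); assumes the same
# module-level torch import and `device` as the snippets above.
def test_env(policy, env, max_steps=1000):
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        state = torch.FloatTensor(state).to(device)
        with torch.no_grad():
            _, action, _ = policy.act(state)  # (value, action, log_prob), as in Example #1
        state, reward, done, _ = env.step(action.item())
        total_reward += reward
        if done:
            break
    return total_reward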
Example #2
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = gym.make(env_name)
    env.seed(42)

    print('New model')
    policy = Policy('dqn', env.observation_space.shape[0], env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0],
                           env.action_space.n)
    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())
    optimizer = DQNOptimizer(policy, target_policy, mini_batch_size, discount,
                             learning_rate, update_epochs)

    episode_rewards = deque(maxlen=50)

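    # Epsilon-greedy exploration schedule: exponential decay in the episode index.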
    get_epsilon = lambda episode: np.exp(-episode * e_decay)

    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        episode_rewards.append(test_env(target_policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

        for step in range(n_steps):

            state = torch.FloatTensor(state).to(device)

            with torch.no_grad():
                action = policy.act(state, get_epsilon(eps))

            next_state, reward, done, _ = env.step(action.item())

            storage.push(state, action, reward, next_state, done)

            state = next_state

            if done:
                state = env.reset()

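        # Prepare the stored transitions and run a DQN update on the online network.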
        storage.compute()

        loss = optimizer.update(storage)

        # Periodically sync the target network with the online network.
        if eps % target_policy_update == 0:
            target_policy.load_state_dict(policy.state_dict())

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{}\n'.format(loss))
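The epsilon schedule above is a plain exponential decay in the episode index. As a quick sanity check, assuming e_decay = 0.01 (the constant is not shown in the listing):

import numpy as np

e_decay = 0.01  # assumed value, for illustration only
get_epsilon = lambda episode: np.exp(-episode * e_decay)
print(get_epsilon(0), get_epsilon(50), get_epsilon(200))
# 1.0, ~0.61, ~0.14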
Example #3
def get_policies(env, goal_object):
    meta_policy = Policy(env.observation_space.shape[0],
                         goal_object.get_size())
    target_meta_policy = Policy(env.observation_space.shape[0],
                                goal_object.get_size())
    policy = Policy(env.observation_space.shape[0] + 1, env.action_space.n)
    target_policy = Policy(env.observation_space.shape[0] + 1,
                           env.action_space.n)

    meta_policy.to(device)
    target_meta_policy.to(device)
    policy.to(device)
    target_policy.to(device)

    target_meta_policy.load_state_dict(meta_policy.state_dict())
    target_policy.load_state_dict(policy.state_dict())

    return meta_policy, target_meta_policy, policy, target_policy
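A usage sketch for get_policies; env_name and goal_object stand in for whatever the surrounding project defines and are assumptions here:

env = gym.make(env_name)
meta_policy, target_meta_policy, policy, target_policy = get_policies(env, goal_object)

Since each target network is initialised from its online counterpart, the pair starts out identical and only diverges as the online network is trained between target syncs.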
Example #4
def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)

    env = MountainCarEnvInherit()
    env.seed(42)

    meta_policy = Policy('dqn', env.observation_space.shape[0],
                         goal_object.get_size())
    target_meta_policy = Policy('dqn', env.observation_space.shape[0],
                                goal_object.get_size())

    policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(),
                    env.action_space.n)
    target_policy = Policy('dqn', env.observation_space.shape[0] + goal_object.get_size(),
                           env.action_space.n)

    meta_policy.to(device)
    target_meta_policy.to(device)
    target_meta_policy.load_state_dict(meta_policy.state_dict())

    policy.to(device)
    target_policy.to(device)
    target_policy.load_state_dict(policy.state_dict())

    optimizer_meta_policy = DQNOptimizer(meta_policy, target_meta_policy, mini_batch_size, discount, learning_rate, update_epochs)

    optimizer_policy = DQNOptimizer(policy, target_policy, mini_batch_size, discount, learning_rate, update_epochs)

    episode_rewards = deque(maxlen=50)

    get_meta_epsilon = lambda episode: np.exp(-episode * e_meta_decay)
    get_epsilon = lambda episode: np.exp(-episode * e_decay)

    frame = 0
    meta_frame = 0

    for eps in range(0, n_eps + 1):

        episode_rewards.append(test_env(meta_policy, policy, MountainCarEnvInherit()))
        print('Avg reward', np.mean(episode_rewards))

        storage = Storage(device=device)
        storage_meta = Storage(device=device)
        print('Game', eps)

        state0 = env.reset()
        state = state0.copy()
        state = torch.FloatTensor(state).to(device)

        done = False

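        # Outer loop: the meta-controller proposes a goal, then the controller tries to reach it.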
        for step in range(100):

            extrinsic_reward = 0
            # Epsilon for goal selection anneals over episodes.
            with torch.no_grad():
                goal = meta_policy.act(state, get_meta_epsilon(eps))
            onehot_goal = to_onehot(goal, goal_object.get_size())

            print('Goal', goal)

            goal_reached = False

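            # Inner loop: act on the joint (state, goal) input until the goal is reached or the episode ends.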
            for i in range(100):

                joint_state = torch.FloatTensor(np.concatenate([state.cpu().numpy(), onehot_goal], axis=0)).to(device)

                with torch.no_grad():
                    action = policy.act(joint_state, get_epsilon(frame))

                next_state, reward, done, _ = env.step(action.item())

                intrinsic_reward = get_intrinsic_reward(goal, next_state)
                goal_reached = bool(intrinsic_reward)

                joint_next_state = np.concatenate([next_state, onehot_goal], axis=0)
                storage.push(joint_state, action, intrinsic_reward, joint_next_state, done)

                extrinsic_reward += reward

                state = next_state
                state = torch.FloatTensor(state).to(device)

                frame += 1

                if done or goal_reached:
                    break

            goal = torch.LongTensor([goal]).to(device)
            storage_meta.push(torch.FloatTensor(state0).to(device), goal, extrinsic_reward, next_state, done)

            # The next meta-transition starts from where this one ended.
            state0 = next_state
            meta_frame += 1

            if done:
                break

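        # Update the controller and the meta-controller from their respective storages.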
        storage.compute()
        storage_meta.compute()

        loss_meta = optimizer_meta_policy.update(storage_meta)
        loss = optimizer_policy.update(storage)

        # Periodically sync both target networks with their online networks.
        if eps % target_policy_update == 0:
            target_meta_policy.load_state_dict(meta_policy.state_dict())
            target_policy.load_state_dict(policy.state_dict())

        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{}\n'.format(loss_meta, loss))
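Example #4 also relies on to_onehot and get_intrinsic_reward, which the listing does not include. The sketch below is hypothetical: the one-hot encoding is straightforward, while the bin-based goal test, its position bounds, and the default goal_size are assumptions rather than the original code.

import numpy as np

def to_onehot(index, size):
    # Encode a discrete goal index as a one-hot float vector.
    onehot = np.zeros(size, dtype=np.float32)
    onehot[int(index)] = 1.0
    return onehot

def get_intrinsic_reward(goal, state, goal_size=10, low=-1.2, high=0.6):
    # One plausible goal test for MountainCar: reward 1.0 when the car's
    # position lands in the goal's bin of a uniform discretisation.
    bin_index = min(int((state[0] - low) / (high - low) * goal_size), goal_size - 1)
    return 1.0 if bin_index == int(goal) else 0.0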