def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--eb',
                        type=int,
                        default=1,
                        help='episodes per batch (default: 1)')
    parser.add_argument('--episodes',
                        type=int,
                        default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy',
                        type=str,
                        default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon',
                        type=int,
                        default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render',
                        action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline',
                        action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval',
                        type=int,
                        default=100,
                        help='interval between rendered episodes (default: 100)')
    parser.add_argument('--env',
                        type=str,
                        default='CarRacing-v0',
                        help='environment to train on (default: CarRacing-v0)')
    parser.add_argument('--vae',
                        type=str,
                        default=None,
                        help='VAE checkpoint to load')
    parser.add_argument('--arch',
                        type=str,
                        default='base_car_racing',
                        help="""Model architecture.""")
    args = parser.parse_args()
    # Initialize environment
    env = gym.make(args.env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env = VAEObservation(env, args.vae, arch=args.arch)
    print(env.observation_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('policy_logs/' + alias)
    # Declare policy
    policy = Policy(env)
    if args.policy:
        policy.load_state_dict(torch.load(args.policy))
        policy.eval()
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0,
                            args.episodes,
                            args.eb,
                            desc="Episodes",
                            unit_scale=args.eb):
        # Sample trajectories
        trajectories = sampler.sample(args.eb,
                                      policy,
                                      render=(i_episode %
                                              args.render_interval == 0))
        # Update policy
        finish_episode(trajectories, policy, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward for 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward',
                              np.mean(running_reward), i_episode + sub_i)
            # Summaries: mean episode len
            writer.add_scalar('data/episode_len', episode_lens[sub_i],
                              i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i],
                              i_episode + sub_i)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(policy.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(policy.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()
Example #2
'''
    Car Racing action space:
    Box(3) floats
    action[0]: steer, -1 to 1
    action[1]: gas, 0 to 1
    action[2]: brake, 0 to 1
'''
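# Illustrative sketch (not part of the original snippet): given the Box(3)
# layout documented above, a hand-built action is simply a length-3 float
# array in [steer, gas, brake] order, e.g. a gentle right turn with light
# throttle and no braking:
#
#     manual_action = np.array([0.3, 0.2, 0.0], dtype=np.float32)
#     obs, reward, done, info = env.step(manual_action)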

env = gym.make('CarRacing-v0')
env = CropCarRacing(env)
env = ResizeObservation(env, (64, 64, 3))
#env = Scolorized(env)
env = NormalizeRGB(env)

dataset = []
env.seed(42)
obs = env.reset()
done = False

print(env.observation_space)
print(env.action_space)

for i in trange(50):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
    dataset.append(obs)
env.close()

plt.imshow(dataset[-1])
plt.show()
Example #3
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    # Logging
    parser.add_argument('--alias',
                        type=str,
                        default='base',
                        help="""Alias of the model.""")
    parser.add_argument('--render_interval',
                        type=int,
                        default=100,
                        help='interval between rendered episodes (default: 100)')
    # Learning parameters
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')

    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--eb',
                        type=int,
                        default=1,
                        help='episodes per batch (default: 1)')
    parser.add_argument('--episodes',
                        type=int,
                        default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy',
                        type=str,
                        default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon',
                        type=int,
                        default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--baseline',
                        action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    args = parser.parse_args()
    # Check cuda
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = '%s_%s' % (args.alias, time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Create VAE policy
    vape = VAEPolicy().to(device)
    optimizer = optim.Adam(vape.parameters(), lr=1e-04)

    # Animation of environment
    obs = env.reset()
    obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
    rebuild = vape.encode_decode(obs_torch)
    rebuild = NHWC(rebuild.detach().cpu().numpy()[0])

    fig1 = plt.figure()
    if len(obs.shape) == 3 and (obs.shape[-1] == 1):
        im = plt.imshow(side_by_side(obs, rebuild), cmap="Greys")
    else:
        im = plt.imshow(side_by_side(obs, rebuild))
    done = False
    HORIZON = 200
    timestep = 0

    # Setting animation update function
    def updatefig(*args):
        nonlocal done
        nonlocal obs
        nonlocal HORIZON
        nonlocal timestep
        obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
        if not done and timestep < HORIZON:
            action, action_proba = vape.act(obs_torch)
            action = action[0].detach().cpu().numpy()
            obs, reward, done, info = env.step(action)
            env.render(mode='human')
            timestep += 1
        else:
            done = False
            obs = env.reset()
            timestep = 0
        rebuild = vape.encode_decode(obs_torch)
        rebuild = NHWC(rebuild.detach().cpu().numpy()[0])
        im.set_array(side_by_side(obs, rebuild))
        vape.optimize_vae(obs_torch, optimizer)
        time.sleep(0.01)
        return im,

    # Start animation
    ani = animation.FuncAnimation(fig1, updatefig, interval=50, blit=True)
    plt.show()
    # Close env and writer
    env.close()
    writer.close()
Example #4
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99, help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
    parser.add_argument('--vae_lr', type=float, default=1e-04, help='VAE learning rate (default: 1e-04)')
    parser.add_argument('--eb', type=int, default=1, help='episodes per batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000, help='simulated episodes (default: 10000)')
    parser.add_argument('--controller', type=str, default=None, help="""Controller checkpoint to restore.""")
    parser.add_argument('--seed', type=int, default=42, help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000, help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true', help='render the environment')
    parser.add_argument('--baseline', action='store_true', help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100, help='interval between rendered episodes (default: 100)')
    parser.add_argument('--avoidance', type=str, default='self', help='Avoidance scheme')
    parser.add_argument('--dist', type=str, default='beta', help='Action probability distribution.')
    parser.add_argument('--avoidance_max', type=float, default=1.0, help='Avoidance max value')
    args = parser.parse_args()
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    #env = ActionScaler(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'weights/' + alias + '_final.torch'
    if not os.path.exists('weights/'):
        os.makedirs('weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Declare vae policy
    vape = VAEPolicy(avoidance=args.avoidance, avoidance_threshold=args.avoidance_max, vae_lr=args.vae_lr)
    if args.controller:
        vape.load_state_dict(torch.load(args.controller))
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories, losses_and_info = sampler.sample(args.eb, vape, render=False)  # was: render=(i_episode % args.render_interval == 0)
        reco_loss, norm_loss, total_loss, added_to_batch, avoidance_score = zip(*losses_and_info)
        # Update policy
        finish_episode(trajectories, vape, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward for 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: mean episode len
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        writer.add_scalar('data/added_to_batch', np.sum(added_to_batch), i_episode // args.eb)
        writer.add_scalar('data/mean_avoidance', np.mean(avoidance_score), i_episode // args.eb)
        writer.add_scalar('data/reco_loss', np.mean(reco_loss), i_episode // args.eb)
        writer.add_scalar('data/norm_loss', np.mean(norm_loss), i_episode // args.eb)

        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(vape.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(vape.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()