def make_env(seed):
    env = gym.make('CarRacing-v0')
    env = NormalizeRGB(env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env.seed(seed)
    np.random.seed(seed)
    return env
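For reference, a minimal rollout through make_env could look like the sketch below; it assumes the custom wrappers module imported in the later examples and the pre-0.26 gym step API that all of these snippets use.

import gym
import numpy as np
from wrappers import NormalizeRGB, CropCarRacing, ResizeObservation

env = make_env(seed=42)
obs = env.reset()
for _ in range(100):
    # Drive with random actions just to exercise the preprocessing stack
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()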
Example #2
def run_agent(layout: str):
    env = PacmanEnv(layout)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')
    n_actions = env.action_space.n

    # Load the trained network; select_action is called with epsilon = 0 below,
    # so the agent acts greedily.
    model = load_model(screen.shape, n_actions, 'pacman.pth')

    # Run 10 evaluation episodes
    for _ in range(10):

        env.render(mode='human')
        screen = env.reset(mode='rgb_array')

        for _ in count():
            env.render(mode='human')
            action = select_action(screen, 0, model, n_actions)
            screen, reward, done, info = env.step(action)

            if done:
                break
Example #3
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument('--eb',
                        type=int,
                        default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes',
                        type=int,
                        default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy',
                        type=str,
                        default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon',
                        type=int,
                        default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--render',
                        action='store_true',
                        help='render the environment')
    parser.add_argument('--baseline',
                        action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval',
                        type=int,
                        default=100,
                        help='interval between rendered epochs (default: 100)')
    parser.add_argument('--env',
                        type=str,
                        default='CarRacing-v0',
                        help='environment to train on (default: CarRacing-v0)')
    parser.add_argument('--vae',
                        type=str,
                        default=None,
                        help='VAE checkpoint to load')
    parser.add_argument('--arch',
                        type=str,
                        default='base_car_racing',
                        help="""Model architecture.""")
    args = parser.parse_args()
    # Initialize environment
    env = gym.make(args.env)
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env = VAEObservation(env, args.vae, arch=args.arch)
    print(env.observation_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('policy_logs/' + alias)
    # Declare policy
    policy = Policy(env)
    if args.policy:
        policy.load_state_dict(torch.load(args.policy))
        policy.eval()
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0,
                            args.episodes,
                            args.eb,
                            desc="Episodes",
                            unit_scale=args.eb):
        # Sample trajectories
        trajectories = sampler.sample(args.eb,
                                      policy,
                                      render=(i_episode %
                                              args.render_interval == 0))
        # Update policy
        finish_episode(trajectories, policy, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward for 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward',
                              np.mean(running_reward), i_episode + sub_i)
            # Summaries: mean episode len
            writer.add_scalar('data/episode_len', episode_lens[sub_i],
                              i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i],
                              i_episode + sub_i)
        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(policy.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(policy.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()
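The finish_episode update used here and in the later REINFORCE examples is not part of this listing. A rough sketch of what such an update could look like is given below; it is purely illustrative and assumes the sampler also stores per-step log-probabilities under trajectories['log_probs'], which is not visible anywhere in these snippets.

import torch

def finish_episode(trajectories, policy, args):
    # Illustrative REINFORCE step; 'rewards' and 'mask' appear in the examples
    # above, while 'log_probs' is an assumption about the sampler's output.
    rewards = torch.as_tensor(trajectories['rewards'], dtype=torch.float32)  # (eb, T)
    mask = torch.as_tensor(trajectories['mask'], dtype=torch.float32)        # (eb, T)
    log_probs = trajectories['log_probs']                                    # (eb, T), grad-tracking

    # Discounted returns-to-go, computed backwards over the time axis
    returns = torch.zeros_like(rewards)
    running = torch.zeros(rewards.shape[0])
    for t in reversed(range(rewards.shape[1])):
        running = rewards[:, t] + args.gamma * running
        returns[:, t] = running

    if args.baseline:
        # Crude baseline: subtract the batch-mean return
        returns = returns - returns.mean()

    loss = -(log_probs * returns * mask).sum() / mask.sum()

    # A real implementation would construct the optimizer once, outside this function
    optimizer = torch.optim.SGD(policy.parameters(), lr=args.lr)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()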
Example #4
from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)
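# Assuming these custom wrappers mirror the standard gym ones, observations are
# now stacks of four 84x84 grayscale frames scaled to [0, 1], i.e. shape (4, 84, 84).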

env.reset()

save_dir = Path('checkpoints') / datetime.datetime.now().strftime(
    '%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')

# Add in check to see if GPU is available (BM)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Using GPU!")
else:
    device = torch.device("cpu")
Example #5
def train_agent(layout: str, episodes: int = 10000, frames_to_skip: int = 4):
    GAMMA = 0.99
    EPSILON = 1.0
    EPS_END = 0.1
    EPS_DECAY = 1e7
    TARGET_UPDATE = 10
    BATCH_SIZE = 64

    epsilon_by_frame = lambda frame_idx: EPS_END + (
        EPSILON - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)
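    # With these constants the schedule is 0.1 + 0.9 * exp(-frame_idx / 1e7):
    # about 1.0 at frame_idx = 0, roughly 0.43 at 1e7, and approaching 0.1 in the
    # limit. Note that below it is evaluated per episode (i_episode), not per frame.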

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    env = PacmanEnv(layout=layout)
    env = SkipFrame(env, skip=frames_to_skip)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen.shape, n_actions).to(device)
    target_net = DQN(screen.shape, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayBuffer(BATCH_SIZE)

    for i_episode in range(episodes):
        # Initialize the environment and state
        state = env.reset(mode='rgb_array')
        ep_reward = 0.
        EPSILON = epsilon_by_frame(i_episode)

        for t in count():
            # Select and perform an action
            env.render(mode='human')
            action = select_action(state, EPSILON, policy_net, n_actions)
            next_state, reward, done, info = env.step(action)
            reward = max(-1.0, min(reward, 1.0))
            ep_reward += reward

            memory.cache(state, next_state, action, reward, done)

            # If the episode ended, there is no next state to observe
            if done:
                next_state = None

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(memory, policy_net, optimizer, target_net, GAMMA)
            if done:
                print("Episode #{}, lasts for {} timestep, total reward: {}".
                      format(i_episode, t + 1, ep_reward))
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if i_episode % 1000 == 0:
            save_model(target_net, 'pacman.pth')

    print('Complete')
    env.render()
    env.close()

    save_model(target_net, 'pacman.pth')
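The select_action, load_model and save_model helpers are never shown in this listing. A minimal epsilon-greedy select_action consistent with the calls above might look like the following sketch (a hypothetical implementation, not the original one):

import random
import numpy as np
import torch

def select_action(state, epsilon, policy_net, n_actions):
    # Explore with probability epsilon, otherwise act greedily w.r.t. the Q-network
    if random.random() < epsilon:
        return random.randrange(n_actions)
    device = next(policy_net.parameters()).device
    with torch.no_grad():
        state_t = torch.as_tensor(np.asarray(state), dtype=torch.float32,
                                  device=device).unsqueeze(0)
        q_values = policy_net(state_t)
        return int(q_values.argmax(dim=1).item())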
Example #6
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    # Logging
    parser.add_argument('--alias',
                        type=str,
                        default='base',
                        help="""Alias of the model.""")
    parser.add_argument('--render_interval',
                        type=int,
                        default=100,
                        help='interval between rendered epochs (default: 100)')
    # Learning parameters
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        help='discount factor (default: 0.99)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')

    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--eb',
                        type=int,
                        default=1,
                        help='episode batch (default: 1)')
    parser.add_argument('--episodes',
                        type=int,
                        default=10000,
                        help='simulated episodes (default: 10000)')
    parser.add_argument('--policy',
                        type=str,
                        default=None,
                        help="""Policy checkpoint to restore.""")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help='random seed (default: 42)')
    parser.add_argument('--horizon',
                        type=int,
                        default=1000,
                        help='horizon (default: 1000)')
    parser.add_argument('--baseline',
                        action='store_true',
                        help='use the baseline for the REINFORCE algorithm')
    args = parser.parse_args()
    # Check cuda
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (32, 32, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = '%s_%s' % (args.alias, time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'policy_weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'policy_weights/' + alias + '_final.torch'
    if not os.path.exists('policy_weights/'):
        os.makedirs('policy_weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Create VAE policy
    vape = VAEPolicy()
    optimizer = optim.Adam(vape.parameters(), lr=1e-04)

    # Animation of environment
    obs = env.reset()
    obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
    rebuild = vape.encode_decode(obs_torch)
    rebuild = NHWC(rebuild.detach().numpy()[0])

    fig1 = plt.figure()
    if len(obs.shape) == 3 and (obs.shape[-1] == 1):
        im = plt.imshow(side_by_side(obs, rebuild), cmap="Greys")
    else:
        im = plt.imshow(side_by_side(obs, rebuild))
    done = False
    HORIZON = 200
    timestep = 0

    # Setting animation update function
    def updatefig(*_):
        nonlocal done
        nonlocal obs
        nonlocal HORIZON
        nonlocal timestep
        obs_torch = torch.from_numpy(NCHW([obs])).float().to(device)
        if not done and timestep < HORIZON:
            action, action_proba = vape.act(obs_torch)
            action = action[0].detach().numpy()
            obs, reward, done, info = env.step(action)
            env.render(mode='human')
            timestep += 1
        else:
            done = False
            obs = env.reset()
            timestep = 0
        rebuild = vape.encode_decode(obs_torch)
        rebuild = NHWC(rebuild.detach().numpy()[0])
        im.set_array(side_by_side(obs, rebuild))
        vape.optimize_vae(obs_torch, optimizer)
        time.sleep(0.01)
        return im,

    # Start animation
    ani = animation.FuncAnimation(fig1, updatefig, interval=50, blit=True)
    plt.show()
    # Close env and writer
    env.close()
    writer.close()
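The NCHW and NHWC helpers used above are also not part of this listing; they are presumably simple axis-permutation utilities between channels-last and channels-first image layouts, along the lines of this sketch (an assumption, not the original code):

import numpy as np

def NCHW(x):
    # channels-last -> channels-first, for a single image or a batch of images
    x = np.asarray(x)
    return np.transpose(x, (2, 0, 1)) if x.ndim == 3 else np.transpose(x, (0, 3, 1, 2))

def NHWC(x):
    # channels-first -> channels-last, convenient for matplotlib display
    x = np.asarray(x)
    return np.transpose(x, (1, 2, 0)) if x.ndim == 3 else np.transpose(x, (0, 2, 3, 1))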
Example #7
    # Check cuda
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    # Loading the dataset
    if args.dataset:
        dataset = np.array(pickle.load(open(args.dataset, 'rb')))
        N_SAMPLES, W, H, CHANNELS = dataset.shape
        print("Dataset size:", N_SAMPLES)
        print("Channels:", CHANNELS)
        print("Image dim: (%d,%d)" % (W,H))
        dataset_torch = torch.from_numpy(NCHW(dataset)).float().to(device)
    else:
        print("Using gym environment directly.")
        env = gym.make('CarRacing-v0')
        env = CropCarRacing(env)
        env = ResizeObservation(env, (32, 32, 3))
        env = NormalizeRGB(env)
        env = Scolorized(env, weights=[0.0, 1.0, 0.0])
        env.seed(args.seed)

    # Network creation
    VAE_class = VAEbyArch(args.arch)
    vae = VAE_class(latent_size=args.latent_size).to(device)
    # Restore checkpoint
    assert args.vae, "No checkpoint provided."
    vae.load_state_dict(torch.load(args.vae))
    vae.eval()

    if args.dataset:
        # Single observation display
        mu, log_sigma, z, rebuild = vae(dataset_torch[args.sample:args.sample+1])
Example #8
import matplotlib.pyplot as plt
from tqdm import trange

from wrappers import ResizeObservation, CropCarRacing, Scolorized, NormalizeRGB

'''
    Car Racing action space:
    Box(3) floats
    action[0]: steer, -1 to 1
    action[1]: gas, 0 to 1
    action[2]: brake, 0 to 1
'''
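# For example, an action of [0.0, 0.5, 0.0] would mean no steering, half throttle
# and no brake (illustrative values only; the loop below samples random actions).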

env = gym.make('CarRacing-v0')
env = CropCarRacing(env)
env = ResizeObservation(env, (64, 64, 3))
#env = Scolorized(env)
env = NormalizeRGB(env)

dataset = []
env.seed(42)
obs = env.reset()
done = False

print(env.observation_space)
print(env.action_space)

for i in trange(50):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    env.render()
Example #9
def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='REINFORCE using PyTorch')
    parser.add_argument('--gamma', type=float, default=0.99, help='discount factor (default: 0.99)')
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)')
    parser.add_argument('--vae_lr', type=float, default=1e-04, help='VAE learning rate (default: 1e-4)')
    parser.add_argument('--eb', type=int, default=1, help='episode batch (default: 1)')
    parser.add_argument('--episodes', type=int, default=10000, help='simulated episodes (default: 10000)')
    parser.add_argument('--controller', type=str, default=None, help="""Controller checkpoint to restore.""")
    parser.add_argument('--seed', type=int, default=42, help='random seed (default: 42)')
    parser.add_argument('--horizon', type=int, default=1000, help='horizon (default: 1000)')
    parser.add_argument('--render', action='store_true', help='render the environment')
    parser.add_argument('--baseline', action='store_true', help='use the baseline for the REINFORCE algorithm')
    parser.add_argument('--render_interval', type=int, default=100, help='interval between rendered epochs (default: 100)')
    parser.add_argument('--avoidance', type=str, default='self', help='Avoidance scheme')
    parser.add_argument('--dist', type=str, default='beta', help='Action probability distribution.')
    parser.add_argument('--avoidance_max', type=float, default=1.0, help='Avoidance max value')
    args = parser.parse_args()
    # Initialize environment
    env = gym.make('CarRacing-v0')
    env = CropCarRacing(env)
    env = ResizeObservation(env, (64, 64, 3))
    env = Scolorized(env, weights=[0.0, 1.0, 0.0])
    env = NormalizeRGB(env)
    #env = ActionScaler(env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    print("Env final goal:", env.spec.reward_threshold)
    # Create the alias for the run
    alias = 'reinforce_lr=%s_eb=%s_seed=%s' % (args.lr, args.eb, args.seed)
    if args.baseline:
        alias += '_baseline'
    alias += '_%s' % (time.time())
    # Use alias for checkpoints
    checkpoint_best_filename = 'weights/' + alias + '_best.torch'
    checkpoint_final_filename = 'weights/' + alias + '_final.torch'
    if not os.path.exists('weights/'):
        os.makedirs('weights/')
    # Tensorboard writer
    writer = SummaryWriter('logs/' + alias)
    # Declare vae policy
    vape = VAEPolicy(avoidance=args.avoidance, avoidance_threshold=args.avoidance_max, vae_lr=args.vae_lr)
    if args.controller:
        vape.load_state_dict(torch.load(args.controller))
    # Declare sampler
    sampler = Sampler(env, args.horizon)
    # Run episodes
    running_reward = deque(maxlen=100)
    best_reward = None
    for i_episode in trange(0, args.episodes, args.eb, desc="Episodes", unit_scale=args.eb):
        # Sample trajectories
        trajectories, losses_and_info = sampler.sample(args.eb, vape, render=False)  # or render=(i_episode % args.render_interval == 0)
        reco_loss, norm_loss, total_loss, added_to_batch, avoidance_score = zip(*losses_and_info)
        # Update policy
        finish_episode(trajectories, vape, args)
        # Get quantities for summaries
        episode_rewards = np.sum(trajectories['rewards'], axis=1)
        mean_reward = np.mean(episode_rewards)
        episode_lens = np.sum(trajectories['mask'], axis=1)
        for sub_i in range(args.eb):
            # Summaries: mean episode reward for 100 episodes
            running_reward.append(episode_rewards[sub_i])
            writer.add_scalar('data/mean_100episode_reward', np.mean(running_reward), i_episode + sub_i)
            # Summaries: mean episode len
            writer.add_scalar('data/episode_len', episode_lens[sub_i], i_episode + sub_i)
            writer.add_scalar('data/episode_reward', episode_rewards[sub_i], i_episode + sub_i)
        writer.add_scalar('data/added_to_batch', np.sum(added_to_batch), i_episode // args.eb)
        writer.add_scalar('data/mean_avoidance', np.mean(avoidance_score), i_episode // args.eb)
        writer.add_scalar('data/reco_loss', np.mean(reco_loss), i_episode // args.eb)
        writer.add_scalar('data/norm_loss', np.mean(norm_loss), i_episode // args.eb)

        # Save best model if needed
        if (best_reward is None) or (mean_reward > best_reward):
            best_reward = mean_reward
            print("Saving best model:", best_reward)
            torch.save(vape.state_dict(), checkpoint_best_filename)
        # Check if completed
        if np.mean(running_reward) > env.spec.reward_threshold:
            print("Solved, stopping. Mean reward:", np.mean(running_reward))
            break

    # Save final model
    torch.save(vape.state_dict(), checkpoint_final_filename)
    # Close env and writer
    env.close()
    writer.close()