import argparse
import datetime
import os
import pickle

import hyperopt
import numpy as np
import tensorflow as tf

from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
# ReplayBuffer and Agent are provided by the surrounding project's own modules.


def rl_learner(*args, **kwargs):
    # Merge any hyperparameter dicts passed positionally (e.g. samples from hyperopt)
    # into kwargs so that a single Namespace can be built from them below.
    for t in args:
        for key, value in t.items():
            kwargs[key] = value

    print("TUNING HYPERPARAMETERS:")
    print(args)

    parser = argparse.Namespace(**kwargs)
    ratio = 0.5
    weights = np.array([ratio**3, ratio**2, ratio, 1.0], dtype=np.float32)

    # env = MainGymWrapper.wrap(gym.make(env_name))
    env = make_atari(parser.environment)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)

    if parser.mode == "Train":
        print("STARTING...")
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        model_path = "model-" + now
        dir_path = os.path.join(parser.job_dir, model_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        print("MODEL WILL BE STORED AT: ", dir_path)

        writer = tf.summary.create_file_writer(dir_path)

        replay_buffer = ReplayBuffer(parser.buffer_size)
        agent = Agent(parser)
        input_shape = env.observation_space.shape
        input_shape = (input_shape[1], input_shape[2], 1)
        agent.initilize(input_shape, dir_path, env.action_space.n)

        all_returns = []
        episode_return = 0
        episode_num = 1
        loss = 0
        state = env.reset()
        state = np.expand_dims(np.average(np.float32(state),
                                          axis=0,
                                          weights=weights),
                               axis=0)
        state = np.transpose(state, (1, 2, 0))

        for step in range(1, parser.steps + 1):
            action = agent.step(state, step)
            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(np.average(np.float32(next_state),
                                                   axis=0,
                                                   weights=weights),
                                        axis=0)
            next_state = np.transpose(next_state, (1, 2, 0))
            episode_return += reward

            replay_buffer.push((state, action, reward, next_state, done))

            state = next_state

            if step >= parser.start_train:
                loss = agent.train(replay_buffer)

            if step >= parser.start_train and step % parser.update_target == 0:
                agent.update_networks()
                agent.save_model()

            if step >= parser.start_train and step % parser.log_frequency == 0:
                with writer.as_default():
                    tf.summary.scalar("last_10_average_returns",
                                      sum(all_returns[-10:]) /
                                      float(max(len(all_returns[-10:]), 1)),
                                      step=step)
                    tf.summary.scalar("loss", loss, step=step)
                writer.flush()

            if done:
                print(
                    'CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {}. EPISODE DONE!'
                    .format(step, episode_num, episode_return))
                all_returns.append(episode_return)
                episode_return = 0
                episode_num += 1
                state = env.reset()
                state = np.expand_dims(np.average(np.float32(state),
                                                  axis=0,
                                                  weights=weights),
                                       axis=0)
                state = np.transpose(state, (1, 2, 0))

        return {
            "loss":
            -sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
            "model_dir": dir_path,
            "status": hyperopt.STATUS_OK,
            "attachment": {
                "return": pickle.dumps(all_returns)
                # os.path.join(dir_path, "returns.txt"):
                #     pickle.dump(all_returns, open(os.path.join(dir_path, "returns.txt"), "wb"))
            }
        }
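
# Usage sketch (not from the original source): rl_learner returns a hyperopt-style
# result dict ("loss", "status", attachments), so it can be driven by hyperopt.fmin.
# The search space and the fixed settings bound via functools.partial below are
# illustrative assumptions only; adapt them to the Agent implementation in use.
import functools
from hyperopt import Trials, fmin, hp, tpe

fixed_settings = {
    "environment": "PongNoFrameskip-v4",
    "mode": "Train",
    "job_dir": "./jobs",
    "buffer_size": 100000,
    "steps": 1000000,
    "start_train": 10000,
    "update_target": 1000,
    "log_frequency": 1000,
}
search_space = {
    "learning_rate": hp.loguniform("learning_rate", -10, -4),
    "gamma": hp.uniform("gamma", 0.95, 0.999),
}
trials = Trials()
best = fmin(fn=functools.partial(rl_learner, **fixed_settings),
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)
print("Best hyperparameters found:", best)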
Example #2
from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
import math, random
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from numpy import savetxt
from dqn import QLearner, compute_td_loss, ReplayBuffer

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"  # established environment that will be played
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to learn from
batch_size = 32  # number of transitions sampled from the replay buffer per update
gamma = 0.99  # discount factor for future rewards
record_idx = 10000  # not used in this snippet

replay_initial = 10000  # number of frames to collect before learning starts
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(
    torch.load("model_pretrained.pth",
               map_location='cpu'))  #loading in the pretrained model
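
# Evaluation sketch (not part of the original example): roll out one episode with the
# pretrained network acting greedily. This assumes QLearner exposes the same
# act(state, epsilon) interface used in Example #3 below.
if USE_CUDA:
    model = model.cuda()

state = env.reset()
done = False
episode_reward = 0.0
while not done:
    action = model.act(state, 0.0)  # epsilon = 0.0 -> greedy action, no exploration
    state, reward, done, _ = env.step(action)
    episode_reward += reward
print("Episode reward with the pretrained model:", episode_reward)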
Example #3
import math
import time

import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Project-local modules, imported as in Example #2.
from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from dqn import QLearner, compute_td_loss, ReplayBuffer


def main(args):
    # CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print("Using cuda: ", use_cuda)

    # Environment
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=args.frame_stack)
    env = wrap_pytorch(env)

    # Random seed
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    # Initializing
    replay_initial = 10000  #50000
    replay_buffer = ReplayBuffer(args.capacity)
    # model = QLearner(env, args, replay_buffer)
    # Initialize target q function and q function
    model_Q = QLearner(env, args, replay_buffer)
    model_target_Q = QLearner(env, args, replay_buffer)

    if args.optimizer == 'Adam':
        if args.use_optim_scheduler:
            optimizer = optim.Adam(model_Q.parameters(), lr=args.initial_lr)
            scheduler = StepLR(optimizer,
                               step_size=args.step_size,
                               gamma=args.gamma)
            # scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=1000, verbose=True)
        else:
            optimizer = optim.Adam(model_Q.parameters(), args.lr)

    elif args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(model_Q.parameters(), args.lr)

    if use_cuda:
        model_Q = model_Q.cuda()
        model_target_Q = model_target_Q.cuda()

    # Training loop
    epsilon_by_frame = lambda frame_idx: args.epsilon_final + (
        args.epsilon_start - args.epsilon_final) * math.exp(-1. * frame_idx /
                                                            args.epsilon_decay)

    losses = []
    learning_rates = []
    all_rewards = []
    episode_reward = 0
    num_param_updates = 0
    mean_reward = -float('nan')
    mean_reward2 = -float('nan')
    best_mean_reward = -float('inf')
    best_mean_reward2 = -float('inf')

    # Best last-10 mean reward recorded so far above each score threshold,
    # used to decide when to save interim checkpoints below.
    best_threshold_reward = {18.0: -float('inf'),
                             19.0: -float('inf'),
                             20.0: -float('inf'),
                             21.0: -float('inf')}

    time_history = []  # records time (in sec) of each episode
    old_lr = args.initial_lr
    state = env.reset()
    start_time_frame = time.time()
    for frame_idx in range(1, args.num_frames + 1):
        start_time = time.time()

        epsilon = epsilon_by_frame(frame_idx)
        action = model_Q.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward
        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            time_history.append(time.time() - start_time)
            episode_reward = 0

        if args.render == 1:
            env.render()

        if len(replay_buffer) > replay_initial:
            for nou in range(args.number_of_updates):
                loss = compute_td_loss(model_Q, model_target_Q,
                                       args.batch_size, args.gamma,
                                       replay_buffer, args.N)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                losses.append(loss.data.cpu().numpy())

                num_param_updates += 1
            # Periodically update the target network by Q network to target Q network
            if num_param_updates % args.target_update_freq == 0:
                model_target_Q.load_state_dict(model_Q.state_dict())

            if args.use_optim_scheduler:
                # scheduler.step(mean_reward2)
                scheduler.step()
                new_lr = scheduler.get_last_lr()[0]
                # new_lr = optimizer.param_groups[0]['lr']
                if new_lr != old_lr:
                    learning_rates.append(new_lr)
                    print('NewLearningRate: ', new_lr)
                old_lr = new_lr

        if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
            print("Preparing replay buffer with len -- ", len(replay_buffer),
                  "Frame:", frame_idx, "Total time so far:",
                  (time.time() - start_time_frame))

        if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
            mean_reward = np.mean(all_rewards[-10:])
            mean_reward2 = np.mean(all_rewards[-100:])
            best_mean_reward = max(best_mean_reward, mean_reward)
            best_mean_reward2 = max(best_mean_reward2, mean_reward2)
            print("Frame:", frame_idx, "Loss:", np.mean(losses),
                  "Total Rewards:",
                  all_rewards[-1], "Average Rewards over all frames:",
                  np.mean(all_rewards), "Last-10 average reward:", mean_reward,
                  "Best mean reward of last-10:", best_mean_reward,
                  "Last-100 average reward:", mean_reward2,
                  "Best mean reward of last-100:", best_mean_reward2, "Time:",
                  time_history[-1], "Total time so far:",
                  (time.time() - start_time_frame))
            # Save an interim checkpoint whenever the last-10 mean reward sets a new
            # best above one of the score thresholds (18, 19, 20, 21).
            for threshold in sorted(best_threshold_reward):
                if mean_reward >= threshold and mean_reward > best_threshold_reward[threshold]:
                    best_threshold_reward[threshold] = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path + \
                              'fmodel_best_%d_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth' \
                               %(int(threshold), args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))

        if frame_idx % args.save_freq_frame == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_model_path)
            np.save(args.save_result_path, results)
        if frame_idx == 10000:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path + \
                      'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'\
                       %(args.lr,frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path + \
                   'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy' \
                    %(args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn), \
                    results)

        if frame_idx % 500000 == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path + \
                      'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth' \
                      %(args.lr,frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path + \
                   'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy' \
                   %(args.lr,frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn), \
                    results)
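
# Argument-parser sketch (not from the original source): main(args) reads the
# attributes used above; the flags and defaults below are illustrative assumptions.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="DQN on PongNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--frame_stack", action="store_true")
    parser.add_argument("--capacity", type=int, default=100000)
    parser.add_argument("--num_frames", type=int, default=1000000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--N", type=int, default=1)  # passed through to compute_td_loss
    parser.add_argument("--optimizer", choices=["Adam", "RMSprop"], default="Adam")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--initial_lr", type=float, default=1e-4)
    parser.add_argument("--use_optim_scheduler", action="store_true")
    parser.add_argument("--step_size", type=int, default=100000)
    parser.add_argument("--epsilon_start", type=float, default=1.0)
    parser.add_argument("--epsilon_final", type=float, default=0.01)
    parser.add_argument("--epsilon_decay", type=int, default=30000)
    parser.add_argument("--number_of_updates", type=int, default=1)
    parser.add_argument("--target_update_freq", type=int, default=10000)
    parser.add_argument("--render", type=int, default=0)
    parser.add_argument("--save_freq_frame", type=int, default=100000)
    parser.add_argument("--save_model_path", type=str, default="model.pth")
    parser.add_argument("--save_result_path", type=str, default="results.npy")
    parser.add_argument("--save_interim_path", type=str, default="./")
    parser.add_argument("--interim_fn", type=str, default="run0")
    main(parser.parse_args())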