import argparse
import datetime
import os
import pickle

import hyperopt
import numpy as np
import tensorflow as tf

# make_atari, wrap_deepmind, wrap_pytorch, ReplayBuffer and Agent are assumed
# to be importable from the surrounding project.


def rl_learner(*args, **kwargs):
    # Merge any hyperparameter dicts passed positionally into the keyword arguments.
    for t in args:
        for key, value in t.items():
            kwargs[key] = value

    print("TUNING HYPERPARAMETERS:")
    print(args)

    parser = argparse.Namespace(**kwargs)
    # Exponentially decaying weights used to collapse the 4 stacked frames into one channel.
    ratio = 0.5
    weights = np.array([ratio**3, ratio**2, ratio, 1.0], dtype=np.float32)

    def preprocess(observation):
        # Weighted average over the frame-stack axis, reshaped to (H, W, 1).
        averaged = np.average(np.float32(observation), axis=0, weights=weights)
        return np.expand_dims(averaged, axis=-1)

    # env = MainGymWrapper.wrap(gym.make(env_name))
    env = make_atari(parser.environment)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)

    if parser.mode == "Train":
        print("STARTING...")
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        model_path = "model-" + now
        dir_path = os.path.join(parser.job_dir, model_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        print("MODEL WILL BE STORED AT: ", dir_path)

        writer = tf.summary.create_file_writer(dir_path)

        replay_buffer = ReplayBuffer(parser.buffer_size)
        agent = Agent(parser)
        input_shape = env.observation_space.shape
        input_shape = (input_shape[1], input_shape[2], 1)
        agent.initilize(input_shape, dir_path, env.action_space.n)

        all_returns = []
        episode_return = 0
        episode_num = 1
        loss = 0
        state = env.reset()
        state = preprocess(state)

        for step in range(1, parser.steps + 1):
            action = agent.step(state, step)
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            episode_return += reward

            replay_buffer.push((state, action, reward, next_state, done))

            state = next_state

            if step >= parser.start_train:
                loss = agent.train(replay_buffer)

            if step >= parser.start_train and step % parser.update_target == 0:
                agent.update_networks()
                agent.save_model()

            if step >= parser.start_train and step % parser.log_frequency == 0:
                with writer.as_default():
                    tf.summary.scalar("last_10_average_returns",
                                      sum(all_returns[-10:]) /
                                      float(max(len(all_returns[-10:]), 1)),
                                      step=step)
                    tf.summary.scalar("loss", loss, step=step)
                writer.flush()

            if done:
                print(
                    'CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {}. EPISODE DONE!'
                    .format(step, episode_num, episode_return))
                all_returns.append(episode_return)
                episode_return = 0
                episode_num += 1
                state = env.reset()
                state = preprocess(state)

        return {
            "loss":
            -sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
            "model_dir": dir_path,
            "status": hyperopt.STATUS_OK,
            "attachment": {
                "return": pickle.dumps(all_returns)
                # os.path.join(dir_path, "returns.txt"):
                #     pickle.dump(all_returns, open(os.path.join(dir_path, "returns.txt"), "wb"))
            }
        }
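
The dictionary returned above follows hyperopt's trial-result convention, so rl_learner can be used directly as an objective for hyperopt.fmin. The short driver below is a hypothetical sketch: the search-space key and the fixed settings are illustrative assumptions, and the exact fields the Agent reads from the parsed namespace are project-specific.

# Hypothetical driver for rl_learner -- not part of the original source.
from hyperopt import Trials, fmin, hp, tpe

fixed_settings = {
    "environment": "PongNoFrameskip-v4",
    "mode": "Train",
    "job_dir": "./jobs",
    "buffer_size": 100000,
    "steps": 1000000,
    "start_train": 10000,
    "update_target": 10000,
    "log_frequency": 1000,
    # ...plus any Agent-specific hyperparameters the project expects.
}

# hyperopt passes the sampled point as a positional dict, which rl_learner
# merges into its keyword arguments before building the namespace.
space = {"learning_rate": hp.loguniform("learning_rate", -12, -6)}

trials = Trials()
best = fmin(fn=lambda tuned: rl_learner(tuned, **fixed_settings),
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)
print("Best hyperparameters:", best)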
Example #2
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from numpy import savetxt
from dqn import QLearner, compute_td_loss, ReplayBuffer

# make_atari, wrap_deepmind and wrap_pytorch are assumed to come from the
# project's Atari wrapper module, as in the other examples.

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"  # Atari environment to train on
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to train on
batch_size = 32  # number of transitions sampled from the replay buffer per update
gamma = 0.99  # discount factor for future rewards
record_idx = 10000  # unused in this excerpt

replay_initial = 10000  # number of frames to collect before learning starts
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(
    torch.load("model_pretrained.pth",
               map_location='cpu'))  # load the pretrained weights

target_model = QLearner(env, num_frames, batch_size, gamma,
                        replay_buffer)  # target network, synced from model during training
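
The excerpt stops right after constructing the target network. A plausible continuation is sketched below, assuming QLearner.act(state, epsilon) and the gym step API behave as in Example #3; it is not part of the original example.

# Hypothetical continuation -- not from the original source.
target_model.load_state_dict(model.state_dict())  # sync the target network with the online network

if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()

# One greedy rollout (epsilon = 0.0) with the pretrained weights.
state = env.reset()
episode_reward, done = 0.0, False
while not done:
    action = model.act(state, 0.0)  # assumes QLearner.act(state, epsilon), as used in Example #3
    state, reward, done, _ = env.step(action)
    episode_reward += reward
print("Greedy episode reward:", episode_reward)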
Example #3
import math
import time

import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# QLearner, compute_td_loss, ReplayBuffer and the Atari wrappers (make_atari,
# wrap_deepmind, wrap_pytorch) are assumed to be imported from the project,
# as in Example #2.


def main(args):
    # CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print("Using cuda: ", use_cuda)

    # Environment
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=args.frame_stack)
    env = wrap_pytorch(env)

    # Random seed
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    # Initializing
    replay_initial = 10000  #50000
    replay_buffer = ReplayBuffer(args.capacity)
    # model = QLearner(env, args, replay_buffer)
    # Initialize target q function and q function
    model_Q = QLearner(env, args, replay_buffer)
    model_target_Q = QLearner(env, args, replay_buffer)

    if args.optimizer == 'Adam':
        if args.use_optim_scheduler:
            optimizer = optim.Adam(model_Q.parameters(), lr=args.initial_lr)
            scheduler = StepLR(optimizer,
                               step_size=args.step_size,
                               gamma=args.gamma)
            # scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=1000, verbose=True)
        else:
            optimizer = optim.Adam(model_Q.parameters(), args.lr)

    elif args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(model_Q.parameters(), args.lr)

    if use_cuda:
        model_Q = model_Q.cuda()
        model_target_Q = model_target_Q.cuda()

    # Epsilon-greedy exploration schedule: decays exponentially from
    # epsilon_start to epsilon_final with time constant epsilon_decay (in frames).
    epsilon_by_frame = lambda frame_idx: args.epsilon_final + (
        args.epsilon_start - args.epsilon_final) * math.exp(-1. * frame_idx /
                                                            args.epsilon_decay)

    losses = []
    learning_rates = []
    all_rewards = []
    episode_reward = 0
    num_param_updates = 0
    mean_reward = float('nan')
    mean_reward2 = float('nan')
    best_mean_reward = -float('inf')
    best_mean_reward2 = -float('inf')

    # Best last-10 mean reward seen at each checkpoint threshold.
    best_threshold_reward = {t: -float('inf') for t in (18.0, 19.0, 20.0, 21.0)}

    time_history = []  # records duration (in sec) of each episode
    old_lr = args.initial_lr
    state = env.reset()
    start_time_frame = time.time()  # start of training, for total-time reporting
    start_time = time.time()  # start of the current episode
    for frame_idx in range(1, args.num_frames + 1):

        epsilon = epsilon_by_frame(frame_idx)
        action = model_Q.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward
        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            time_history.append(time.time() - start_time)  # full episode duration
            start_time = time.time()  # reset the episode timer
            episode_reward = 0

        if args.render == 1:
            env.render()

        if len(replay_buffer) > replay_initial:
            for nou in range(args.number_of_updates):
                loss = compute_td_loss(model_Q, model_target_Q,
                                       args.batch_size, args.gamma,
                                       replay_buffer, args.N)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                losses.append(loss.data.cpu().numpy())

                num_param_updates += 1
            # Periodically update the target network by Q network to target Q network
            if num_param_updates % args.target_update_freq == 0:
                model_target_Q.load_state_dict(model_Q.state_dict())

            if args.use_optim_scheduler:
                # scheduler.step(mean_reward2)
                scheduler.step()
                new_lr = scheduler.get_last_lr()[0]
                # new_lr = optimizer.param_groups[0]['lr']
                if new_lr != old_lr:
                    learning_rates.append(new_lr)
                    print('NewLearningRate: ', new_lr)
                old_lr = new_lr

        if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
            print("Preparing replay buffer with len -- ", len(replay_buffer),
                  "Frame:", frame_idx, "Total time so far:",
                  (time.time() - start_time_frame))

        if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
            mean_reward = np.mean(all_rewards[-10:])
            mean_reward2 = np.mean(all_rewards[-100:])
            best_mean_reward = max(best_mean_reward, mean_reward)
            best_mean_reward2 = max(best_mean_reward2, mean_reward2)
            print("Frame:", frame_idx, "Loss:", np.mean(losses),
                  "Total Rewards:",
                  all_rewards[-1], "Average Rewards over all frames:",
                  np.mean(all_rewards), "Last-10 average reward:", mean_reward,
                  "Best mean reward of last-10:", best_mean_reward,
                  "Last-100 average reward:", mean_reward2,
                  "Best mean reward of last-100:", best_mean_reward2, "Time:",
                  time_history[-1], "Total time so far:",
                  (time.time() - start_time_frame))
            # Save an interim checkpoint whenever the last-10 mean reward crosses a
            # threshold (18/19/20/21) and improves on the best value seen at that threshold.
            for threshold in (18.0, 19.0, 20.0, 21.0):
                if mean_reward >= threshold and mean_reward > best_threshold_reward[threshold]:
                    best_threshold_reward[threshold] = mean_reward
                    torch.save(
                        model_Q.state_dict(), args.save_interim_path +
                        'fmodel_best_%d_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                        % (int(threshold), args.lr, frame_idx, args.frame_stack,
                           args.use_optim_scheduler, args.interim_fn))

        if frame_idx % args.save_freq_frame == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_model_path)
            np.save(args.save_result_path, results)

        # Interim checkpoints: once early at 10k frames, then every 500k frames.
        if frame_idx == 10000 or frame_idx % 500000 == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path +
                       'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                       % (args.lr, frame_idx, args.frame_stack,
                          args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path +
                    'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy'
                    % (args.lr, frame_idx, args.frame_stack,
                       args.use_optim_scheduler, args.interim_fn),
                    results)
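
main reads a large number of attributes from args; the argparse parser sketched below mirrors those attribute names so the function can be invoked from the command line. The defaults are illustrative assumptions, not values taken from the original source.

# Hypothetical command-line entry point -- defaults are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="DQN on PongNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--frame_stack", action="store_true")
    parser.add_argument("--capacity", type=int, default=100000)
    parser.add_argument("--num_frames", type=int, default=1000000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--N", type=int, default=1, help="passed through to compute_td_loss")
    parser.add_argument("--optimizer", choices=["Adam", "RMSprop"], default="Adam")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--initial_lr", type=float, default=1e-4)
    parser.add_argument("--use_optim_scheduler", action="store_true")
    parser.add_argument("--step_size", type=int, default=100000)
    parser.add_argument("--epsilon_start", type=float, default=1.0)
    parser.add_argument("--epsilon_final", type=float, default=0.01)
    parser.add_argument("--epsilon_decay", type=int, default=30000)
    parser.add_argument("--number_of_updates", type=int, default=1)
    parser.add_argument("--target_update_freq", type=int, default=10000)
    parser.add_argument("--render", type=int, default=0)
    parser.add_argument("--save_freq_frame", type=int, default=50000)
    parser.add_argument("--save_model_path", default="model.pth")
    parser.add_argument("--save_result_path", default="results.npy")
    parser.add_argument("--save_interim_path", default="./interim/")
    parser.add_argument("--interim_fn", default="run0")
    main(parser.parse_args())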