import argparse
import datetime
import os
import pickle

import hyperopt
import numpy as np
import tensorflow as tf

from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
# Project-local classes assumed importable from this repo's own modules:
# Agent (the TF-based agent this trainer drives) and ReplayBuffer.


def rl_learner(*args, **kwargs):
    # Merge any positional dicts (e.g. fixed settings plus a sampled
    # hyperparameter point) into a single keyword dict.
    for t in args:
        for key, value in t.items():
            kwargs[key] = value

    print("TUNING HYPERPARAMETERS:")
    print(args)
    params = argparse.Namespace(**kwargs)

    # Exponentially decaying weights for averaging the stacked frames:
    # the newest frame gets weight 1.0, older frames 0.5, 0.25, 0.125.
    ratio = 0.5
    weights = np.array([ratio**3, ratio**2, ratio, 1.0], dtype=np.float32)

    # env = MainGymWrapper.wrap(gym.make(env_name))
    env = make_atari(params.environment)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)

    if params.mode == "Train":
        print("STARTING...")
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        model_path = "model-" + now
        dir_path = os.path.join(params.job_dir, model_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        print("MODEL WILL BE STORED AT: ", dir_path)

        writer = tf.summary.create_file_writer(dir_path)
        replay_buffer = ReplayBuffer(params.buffer_size)
        agent = Agent(params)

        # wrap_pytorch yields (stack, height, width); the agent consumes a
        # single weighted-average frame shaped (height, width, 1).
        input_shape = env.observation_space.shape
        input_shape = (input_shape[1], input_shape[2], 1)
        agent.initilize(input_shape, dir_path, env.action_space.n)  # sic: spelling matches the Agent API

        all_returns = []
        episode_return = 0
        episode_num = 1
        loss = 0

        state = env.reset()
        state = np.expand_dims(
            np.average(np.float32(state), axis=0, weights=weights), axis=0)
        state = np.transpose(state, (1, 2, 0))

        for step in range(1, params.steps + 1):
            action = agent.step(state, step)
            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(
                np.average(np.float32(next_state), axis=0, weights=weights), axis=0)
            next_state = np.transpose(next_state, (1, 2, 0))
            episode_return += reward
            replay_buffer.push((state, action, reward, next_state, done))
            state = next_state

            if step >= params.start_train:
                loss = agent.train(replay_buffer)
            if step >= params.start_train and step % params.update_target == 0:
                agent.update_networks()
                agent.save_model()
            if step >= params.start_train and step % params.log_frequency == 0:
                with writer.as_default():
                    tf.summary.scalar(
                        "last_10_average_returns",
                        sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
                        step=step)
                    tf.summary.scalar("loss", loss, step=step)
                    writer.flush()

            if done:
                print('CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {}. EPISODE DONE!'
                      .format(step, episode_num, episode_return))
                all_returns.append(episode_return)
                episode_return = 0
                episode_num += 1
                state = env.reset()
                state = np.expand_dims(
                    np.average(np.float32(state), axis=0, weights=weights), axis=0)
                state = np.transpose(state, (1, 2, 0))

        # hyperopt minimizes, so report the negated last-10 average return.
        return {
            "loss": -sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
            "model_dir": dir_path,
            "status": hyperopt.STATUS_OK,
            # hyperopt's special result key is "attachments" (plural); the
            # original "attachment" would be stored but ignored by Trials.
            "attachments": {
                "return": pickle.dumps(all_returns)
                # os.path.join(dir_path, "returns.txt"):
                # pickle.dump(all_returns, open(os.path.join(dir_path, "returns.txt"), "wb"))
            }
        }
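# A minimal usage sketch, not part of the original file: driving rl_learner
# with hyperopt's fmin. The search-space keys and fixed settings below are
# illustrative assumptions; rl_learner only requires that the merged kwargs
# carry the fields it reads (mode, job_dir, environment, buffer_size, steps,
# start_train, update_target, log_frequency) plus whatever Agent expects.
#
# from hyperopt import fmin, tpe, hp, Trials
#
# def objective(sampled):
#     fixed = {
#         "mode": "Train",
#         "job_dir": "./jobs",
#         "environment": "PongNoFrameskip-v4",
#         "steps": 1000000,
#         "start_train": 10000,
#         "update_target": 1000,
#         "log_frequency": 1000,
#     }
#     # rl_learner merges its positional dicts into one kwargs dict.
#     return rl_learner(fixed, sampled)
#
# trials = Trials()
# best = fmin(fn=objective,
#             space={"buffer_size": hp.choice("buffer_size", [10000, 100000])},
#             algo=tpe.suggest,
#             max_evals=10,
#             trials=trials)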
from Wrapper.wrappers import make_atari, wrap_deepmind, wrap_pytorch
import math
import random

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from numpy import savetxt

USE_CUDA = torch.cuda.is_available()

from dqn import QLearner, compute_td_loss, ReplayBuffer

env_id = "PongNoFrameskip-v4"  # the Atari environment to be played
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to learn from
batch_size = 32       # number of samples fed to the model per update
gamma = 0.99          # discount factor for future rewards
record_idx = 10000

replay_initial = 10000  # number of frames collected before training starts
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(
    torch.load("model_pretrained.pth", map_location='cpu'))  # load the pretrained model
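# A minimal evaluation sketch, not part of the original script: roll the
# pretrained model through one episode. It assumes QLearner exposes the same
# act(state, epsilon) method the training loop below calls; epsilon is held
# at a small constant so the policy is nearly greedy.
def evaluate_one_episode(model, env, epsilon=0.02):
    state = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = model.act(state, epsilon)
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward

# print("Pretrained episode reward:", evaluate_one_episode(model, env))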
# Imports required by main() that were missing above.
import time

from torch.optim.lr_scheduler import StepLR


def main(args):
    # CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print("Using cuda: ", use_cuda)

    # Environment
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    # Pass frame_stack by keyword: positionally it would land on
    # wrap_deepmind's first optional parameter instead.
    env = wrap_deepmind(env, frame_stack=args.frame_stack)
    env = wrap_pytorch(env)

    # Random seed
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    # Initializing
    replay_initial = 10000  # 50000
    replay_buffer = ReplayBuffer(args.capacity)
    # model = QLearner(env, args, replay_buffer)
    # Initialize target Q function and Q function
    model_Q = QLearner(env, args, replay_buffer)
    model_target_Q = QLearner(env, args, replay_buffer)

    if args.optimizer == 'Adam':
        if args.use_optim_scheduler:
            optimizer = optim.Adam(model_Q.parameters(), lr=args.initial_lr)
            scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
            # scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=1000, verbose=True)
        else:
            optimizer = optim.Adam(model_Q.parameters(), args.lr)
    elif args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(model_Q.parameters(), args.lr)

    if USE_CUDA:
        model_Q = model_Q.cuda()
        model_target_Q = model_target_Q.cuda()

    # Training loop: epsilon anneals exponentially from epsilon_start toward
    # epsilon_final with time constant epsilon_decay.
    epsilon_by_frame = lambda frame_idx: args.epsilon_final + (
        args.epsilon_start - args.epsilon_final) * math.exp(
            -1. * frame_idx / args.epsilon_decay)

    losses = []
    learning_rates = []
    all_rewards = []
    episode_reward = 0
    num_param_updates = 0
    mean_reward = float('nan')
    mean_reward2 = float('nan')
    best_mean_reward = -float('inf')
    best_mean_reward2 = -float('inf')
    # Best last-10 mean seen above each reward threshold, for interim checkpoints.
    best_threshold_rewards = {18.0: -float('inf'), 19.0: -float('inf'),
                              20.0: -float('inf'), 21.0: -float('inf')}
    time_history = []  # records time (in sec) of each episode
    old_lr = args.initial_lr

    state = env.reset()
    start_time_frame = time.time()
    for frame_idx in range(1, args.num_frames + 1):
        start_time = time.time()
        epsilon = epsilon_by_frame(frame_idx)
        action = model_Q.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            time_history.append(time.time() - start_time)
            episode_reward = 0

        if args.render == 1:
            env.render()

        if len(replay_buffer) > replay_initial:
            for nou in range(args.number_of_updates):
                loss = compute_td_loss(model_Q, model_target_Q, args.batch_size,
                                       args.gamma, replay_buffer, args.N)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                losses.append(loss.data.cpu().numpy())
                num_param_updates += 1
                # Periodically copy the Q network's weights into the target Q network
                if num_param_updates % args.target_update_freq == 0:
                    model_target_Q.load_state_dict(model_Q.state_dict())
                if args.use_optim_scheduler:
                    # scheduler.step(mean_reward2)
                    scheduler.step()
                    new_lr = scheduler.get_last_lr()[0]  # get_last_lr() returns a list
                    # new_lr = optimizer.param_groups[0]['lr']
                    if new_lr != old_lr:
                        learning_rates.append(new_lr)
                        print('NewLearningRate: ', new_lr)
                        old_lr = new_lr

        if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
            print("Preparing replay buffer with len -- ", len(replay_buffer),
                  "Frame:", frame_idx,
                  "Total time so far:", (time.time() - start_time_frame))

        if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
            mean_reward = np.mean(all_rewards[-10:])
            mean_reward2 = np.mean(all_rewards[-100:])
            best_mean_reward = max(best_mean_reward, mean_reward)
            best_mean_reward2 = max(best_mean_reward2, mean_reward2)
            print("Frame:", frame_idx, "Loss:", np.mean(losses),
                  "Total Rewards:", all_rewards[-1],
                  "Average Rewards over all frames:", np.mean(all_rewards),
                  "Last-10 average reward:", mean_reward,
                  "Best mean reward of last-10:", best_mean_reward,
                  "Last-100 average reward:", mean_reward2,
                  "Best mean reward of last-100:", best_mean_reward2,
                  "Time:", time_history[-1],
                  "Total time so far:", (time.time() - start_time_frame))
            # Checkpoint whenever the last-10 mean sets a new best above a
            # threshold (behavior-preserving rewrite of four repeated blocks).
            for threshold in (18.0, 19.0, 20.0, 21.0):
                if mean_reward >= threshold and mean_reward > best_threshold_rewards[threshold]:
                    best_threshold_rewards[threshold] = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path +
                               'fmodel_best_%d_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                               % (int(threshold), args.lr, frame_idx, args.frame_stack,
                                  args.use_optim_scheduler, args.interim_fn))

        if frame_idx % args.save_freq_frame == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_model_path)
            np.save(args.save_result_path, results)

        # Interim snapshots: once early at 10k frames, then every 500k frames
        # (the two original blocks saved identical artifacts).
        if frame_idx == 10000 or frame_idx % 500000 == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path +
                       'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                       % (args.lr, frame_idx, args.frame_stack,
                          args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path +
                    'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy'
                    % (args.lr, frame_idx, args.frame_stack,
                       args.use_optim_scheduler, args.interim_fn),
                    results)
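# A minimal entry-point sketch, an assumption rather than part of the original
# file: every flag name below mirrors an attribute main() actually reads, but
# the default values are illustrative guesses. Note that args.gamma is shared
# by the TD loss and the StepLR decay, exactly as main() uses it above.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="DQN on PongNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--capacity", type=int, default=100000)
    parser.add_argument("--frame_stack", action="store_true")
    parser.add_argument("--optimizer", choices=["Adam", "RMSprop"], default="Adam")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--initial_lr", type=float, default=1e-4)
    parser.add_argument("--use_optim_scheduler", action="store_true")
    parser.add_argument("--step_size", type=int, default=100000)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--num_frames", type=int, default=1000000)
    parser.add_argument("--render", type=int, default=0)
    parser.add_argument("--number_of_updates", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--N", type=int, default=1)
    parser.add_argument("--target_update_freq", type=int, default=10000)
    parser.add_argument("--epsilon_start", type=float, default=1.0)
    parser.add_argument("--epsilon_final", type=float, default=0.01)
    parser.add_argument("--epsilon_decay", type=float, default=30000)
    parser.add_argument("--save_freq_frame", type=int, default=100000)
    parser.add_argument("--save_model_path", default="model.pth")
    parser.add_argument("--save_result_path", default="results.npy")
    parser.add_argument("--save_interim_path", default="./")
    parser.add_argument("--interim_fn", default="run0")
    main(parser.parse_args())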