import argparse
import datetime
import os
import pickle

import numpy as np
import tensorflow as tf
import hyperopt

# Local helpers used below; the module paths are assumptions about the project layout.
# from atari_wrappers import make_atari, wrap_deepmind, wrap_pytorch
# from agent import Agent
# from replay_buffer import ReplayBuffer


def rl_learner(*args, **kwargs):
    # Trains one DQN configuration and returns a hyperopt-style result dict.
    # Positional args are dicts of sampled hyperparameters; merge them into kwargs.
    for t in args:
        for key, value in t.items():
            kwargs[key] = value
    print("TUNING HYPERPARAMETERS:")
    print(args)
    parser = argparse.Namespace(**kwargs)

    # Weights for collapsing the 4-frame stack into a single channel,
    # emphasizing the most recent frame.
    ratio = 0.5
    weights = np.array([ratio**3, ratio**2, ratio, 1.0], dtype=np.float32)

    # env = MainGymWrapper.wrap(gym.make(env_name))
    env = make_atari(parser.environment)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)

    if parser.mode == "Train":
        print("STARTING...")
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        model_path = "model-" + now
        dir_path = os.path.join(parser.job_dir, model_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        print("MODEL WILL BE STORED AT: ", dir_path)
        writer = tf.summary.create_file_writer(dir_path)

        replay_buffer = ReplayBuffer(parser.buffer_size)
        agent = Agent(parser)
        input_shape = env.observation_space.shape
        input_shape = (input_shape[1], input_shape[2], 1)
        agent.initilize(input_shape, dir_path, env.action_space.n)

        all_returns = []
        episode_return = 0
        episode_num = 1
        loss = 0

        state = env.reset()
        state = np.expand_dims(np.average(np.float32(state), axis=0, weights=weights), axis=0)
        state = np.transpose(state, (1, 2, 0))

        for step in range(1, parser.steps + 1):
            action = agent.step(state, step)
            next_state, reward, done, _ = env.step(action)
            next_state = np.expand_dims(np.average(np.float32(next_state), axis=0, weights=weights), axis=0)
            next_state = np.transpose(next_state, (1, 2, 0))
            episode_return += reward
            replay_buffer.push((state, action, reward, next_state, done))
            state = next_state

            if step >= parser.start_train:
                loss = agent.train(replay_buffer)
            if step >= parser.start_train and step % parser.update_target == 0:
                agent.update_networks()
                agent.save_model()
            if step >= parser.start_train and step % parser.log_frequency == 0:
                with writer.as_default():
                    tf.summary.scalar("last_10_average_returns",
                                      sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
                                      step=step)
                    tf.summary.scalar("loss", loss, step=step)
                    writer.flush()

            if done:
                print('CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {}. EPISODE DONE!'
                      .format(step, episode_num, episode_return))
                all_returns.append(episode_return)
                episode_return = 0
                episode_num += 1
                state = env.reset()
                state = np.expand_dims(np.average(np.float32(state), axis=0, weights=weights), axis=0)
                state = np.transpose(state, (1, 2, 0))

        return {
            "loss": -sum(all_returns[-10:]) / float(max(len(all_returns[-10:]), 1)),
            "model_dir": dir_path,
            "status": hyperopt.STATUS_OK,
            "attachments": {  # hyperopt expects the plural key "attachments" for trial attachments
                "return": pickle.dumps(all_returns)
                # os.path.join(dir_path, "returns.txt"):
                # pickle.dump(all_returns, open(os.path.join(dir_path, "returns.txt"), "wb"))
            }
        }
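# --- Usage sketch (not part of the original code) ---------------------------
# rl_learner returns a hyperopt-style result dict, so it is presumably driven
# by hyperopt.fmin. Below is a minimal, hypothetical driver: the fixed settings
# mirror the parser attributes read above but their values are illustrative,
# and the tuned keys ("learning_rate", "gamma") are placeholders for whatever
# Agent(parser) actually consumes.
import functools
import hyperopt
from hyperopt import hp

fixed_settings = dict(
    environment="PongNoFrameskip-v4",
    mode="Train",
    job_dir="./jobs",
    buffer_size=100000,
    steps=1000000,
    start_train=10000,
    update_target=1000,
    log_frequency=10000,
)

search_space = {
    "learning_rate": hp.loguniform("learning_rate", -10, -5),
    "gamma": hp.choice("gamma", [0.95, 0.99]),
}

trials = hyperopt.Trials()
best = hyperopt.fmin(
    fn=functools.partial(rl_learner, **fixed_settings),  # the sampled dict arrives via *args
    space=search_space,
    algo=hyperopt.tpe.suggest,
    max_evals=10,
    trials=trials,
)
print("Best hyperparameters:", best)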
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from numpy import savetxt

from dqn import QLearner, compute_td_loss, ReplayBuffer
# The Atari wrappers below were used but not imported in the original;
# the module path is an assumption about the project layout.
# from atari_wrappers import make_atari, wrap_deepmind, wrap_pytorch

USE_CUDA = torch.cuda.is_available()

env_id = "PongNoFrameskip-v4"  # the Atari environment to be played
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to learn from
batch_size = 32       # number of samples fed to the model per update
gamma = 0.99          # discount factor for future rewards
record_idx = 10000

replay_initial = 10000  # number of frames collected before training begins
replay_buffer = ReplayBuffer(100000)

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))  # load the pretrained model

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)  # target network
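# --- Sanity-check rollout (a sketch, not part of the original script) -------
# Plays a few episodes with the pretrained weights before any further training.
# It assumes QLearner exposes the same act(state, epsilon) interface used in
# the training loop below; the small epsilon keeps the policy near-greedy.
def evaluate(policy, environment, episodes=3, epsilon=0.02):
    returns = []
    for _ in range(episodes):
        state = environment.reset()
        done, total_reward = False, 0.0
        while not done:
            action = policy.act(state, epsilon)  # near-greedy action
            state, reward, done, _ = environment.step(action)
            total_reward += reward
        returns.append(total_reward)
    return returns

# Example: print(evaluate(model, env))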
import math
import time

import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from dqn import QLearner, compute_td_loss, ReplayBuffer
# The Atari wrappers below are used but not imported in the original;
# the module path is an assumption about the project layout.
# from atari_wrappers import make_atari, wrap_deepmind, wrap_pytorch

USE_CUDA = torch.cuda.is_available()


def main(args):
    # CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print("Using cuda: ", use_cuda)

    # Environment
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env, args.frame_stack)
    env = wrap_pytorch(env)

    # Random seed
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    # Initializing
    replay_initial = 10000  # 50000
    replay_buffer = ReplayBuffer(args.capacity)
    # model = QLearner(env, args, replay_buffer)

    # Initialize target Q function and Q function
    model_Q = QLearner(env, args, replay_buffer)
    model_target_Q = QLearner(env, args, replay_buffer)

    if args.optimizer == 'Adam':
        if args.use_optim_scheduler:
            optimizer = optim.Adam(model_Q.parameters(), lr=args.initial_lr)
            scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
            # scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=1000, verbose=True)
        else:
            optimizer = optim.Adam(model_Q.parameters(), args.lr)
    elif args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(model_Q.parameters(), args.lr)

    if USE_CUDA:
        model_Q = model_Q.cuda()
        model_target_Q = model_target_Q.cuda()

    # Training loop
    epsilon_by_frame = lambda frame_idx: args.epsilon_final + (
        args.epsilon_start - args.epsilon_final) * math.exp(-1. * frame_idx / args.epsilon_decay)

    losses = []
    learning_rates = []
    all_rewards = []
    episode_reward = 0
    num_param_updates = 0
    mean_reward = -float('nan')
    mean_reward2 = -float('nan')
    best_mean_reward = -float('inf')
    best_mean_reward2 = -float('inf')
    best_18_reward = -float('inf')
    best_19_reward = -float('inf')
    best_20_reward = -float('inf')
    best_21_reward = -float('inf')
    time_history = []  # records time (in sec) of each episode
    old_lr = args.initial_lr

    state = env.reset()
    start_time_frame = time.time()
    for frame_idx in range(1, args.num_frames + 1):
        start_time = time.time()

        # Epsilon-greedy action selection
        epsilon = epsilon_by_frame(frame_idx)
        action = model_Q.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            time_history.append(time.time() - start_time)
            episode_reward = 0

        if args.render == 1:
            env.render()

        if len(replay_buffer) > replay_initial:
            for nou in range(args.number_of_updates):
                loss = compute_td_loss(model_Q, model_target_Q, args.batch_size,
                                       args.gamma, replay_buffer, args.N)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                losses.append(loss.data.cpu().numpy())
                num_param_updates += 1

                # Periodically copy the Q-network weights into the target Q network
                if num_param_updates % args.target_update_freq == 0:
                    model_target_Q.load_state_dict(model_Q.state_dict())

            if args.use_optim_scheduler:
                # scheduler.step(mean_reward2)
                scheduler.step()
                new_lr = scheduler.get_last_lr()
                # new_lr = optimizer.param_groups[0]['lr']
                if new_lr != old_lr:
                    learning_rates.append(new_lr)
                    print('NewLearningRate: ', new_lr)
                    old_lr = new_lr

        if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
            print("Preparing replay buffer with len -- ", len(replay_buffer),
                  "Frame:", frame_idx,
                  "Total time so far:", (time.time() - start_time_frame))

        if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
            mean_reward = np.mean(all_rewards[-10:])
            mean_reward2 = np.mean(all_rewards[-100:])
            best_mean_reward = max(best_mean_reward, mean_reward)
            best_mean_reward2 = max(best_mean_reward2, mean_reward2)
            print("Frame:", frame_idx,
                  "Loss:", np.mean(losses),
                  "Total Rewards:", all_rewards[-1],
                  "Average Rewards over all frames:", np.mean(all_rewards),
                  "Last-10 average reward:", mean_reward,
                  "Best mean reward of last-10:", best_mean_reward,
                  "Last-100 average reward:", mean_reward2,
                  "Best mean reward of last-100:", best_mean_reward2,
                  "Time:", time_history[-1],
                  "Total time so far:", (time.time() - start_time_frame))

            # Save interim checkpoints whenever a new best last-10 mean crosses a threshold
            if mean_reward >= 18.0:
                if mean_reward > best_18_reward:
                    best_18_reward = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path +
                               'fmodel_best_18_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                               % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            if mean_reward >= 19.0:
                if mean_reward > best_19_reward:
                    best_19_reward = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path +
                               'fmodel_best_19_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                               % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            if mean_reward >= 20.0:
                if mean_reward > best_20_reward:
                    best_20_reward = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path +
                               'fmodel_best_20_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                               % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            if mean_reward >= 21.0:
                if mean_reward > best_21_reward:
                    best_21_reward = mean_reward
                    torch.save(model_Q.state_dict(), args.save_interim_path +
                               'fmodel_best_21_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                               % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))

        if frame_idx % args.save_freq_frame == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_model_path)
            np.save(args.save_result_path, results)

        if frame_idx == 10000:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path +
                       'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                       % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path +
                    'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy'
                    % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn),
                    results)

        if frame_idx % 500000 == 0:
            results = [losses, all_rewards, time_history]
            torch.save(model_Q.state_dict(), args.save_interim_path +
                       'fmodel_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.pth'
                       % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn))
            np.save(args.save_interim_path +
                    'fresults_lr%s_frame_%s_framestack_%s_scheduler_%s_%s.npy'
                    % (args.lr, frame_idx, args.frame_stack, args.use_optim_scheduler, args.interim_fn),
                    results)
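# --- Hypothetical command-line entry point (a sketch, not the original) -----
# The flag names below match the attributes that main() reads; the default
# values are illustrative assumptions, not the author's settings.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="DQN training on PongNoFrameskip-v4")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--frame_stack", action="store_true")
    parser.add_argument("--capacity", type=int, default=100000)      # replay buffer size
    parser.add_argument("--num_frames", type=int, default=1000000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--gamma", type=float, default=0.99)         # discount factor (also reused as the StepLR decay factor above)
    parser.add_argument("--N", type=int, default=1)                  # multi-step return length passed to compute_td_loss
    parser.add_argument("--optimizer", choices=["Adam", "RMSprop"], default="Adam")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--use_optim_scheduler", action="store_true")
    parser.add_argument("--initial_lr", type=float, default=1e-4)
    parser.add_argument("--step_size", type=int, default=100000)     # StepLR step size
    parser.add_argument("--number_of_updates", type=int, default=1)  # gradient updates per environment step
    parser.add_argument("--target_update_freq", type=int, default=10000)
    parser.add_argument("--epsilon_start", type=float, default=1.0)
    parser.add_argument("--epsilon_final", type=float, default=0.01)
    parser.add_argument("--epsilon_decay", type=int, default=30000)
    parser.add_argument("--render", type=int, default=0)
    parser.add_argument("--save_freq_frame", type=int, default=100000)
    parser.add_argument("--save_model_path", default="model.pth")
    parser.add_argument("--save_result_path", default="results.npy")
    parser.add_argument("--save_interim_path", default="./")
    parser.add_argument("--interim_fn", default="run0")
    main(parser.parse_args())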