def __init__(self, env, params, net=None, reward=None, loss=None):
    self.params = params
    # Avoid mutable default arguments: fall back to fresh lists.
    self.r_sums = reward if reward is not None else []
    self.l_means = loss if loss is not None else []

    # PARAMS
    self.gamma = params["gamma"]
    self.freq_copy = params["freq_copy"]
    self.tau = params["max_tau"]
    self.tau_decay = params["tau_decay"]
    self.min_tau = params["min_tau"]
    self.exploration = params["exploration"]
    self.sigma = params["sigma"]
    self.alpha = params["alpha"]
    self.m = params["m"]
    self.frame_skip = params["frame_skip"]
    self.target_update_strategy = params["target_update_strategy"]
    self.batch_size = params["batch_size"]
    self.cuda = False

    # NEURAL NETWORK
    self.n_action = env.action_space.n
    self.net = QModel(self.n_action)
    if net is not None:
        self.net.load_state_dict(net)
    self.target = copy.deepcopy(self.net)
    self.optimizer = params["optimizer"](self.net.parameters(), lr=self.sigma)
    self.criterion = params["criterion"]()
    self.buff = Buffer(params["buffer_size"])
    self.env = wrappers.AtariPreprocessing(env, frame_skip=self.frame_skip, screen_size=84,
                                           grayscale_obs=True, scale_obs=True)
    self.env = wrappers.FrameStack(self.env, self.m)
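# The constructor above relies on `QModel`, `Buffer`, and a `params` dict that are
# defined elsewhere in the project. As a rough guide only, a `params` dict compatible
# with the keys read above might look like the sketch below; every value here is an
# assumption for illustration, not the project's actual configuration.
import torch

example_params = {
    "gamma": 0.99,                       # discount factor
    "freq_copy": 1000,                   # how often the target net is synced
    "max_tau": 1.0,                      # initial exploration temperature
    "tau_decay": 0.999,
    "min_tau": 0.05,
    "exploration": "epsilon_greedy",
    "sigma": 1e-4,                       # used above as the learning rate
    "alpha": 0.95,
    "m": 4,                              # number of stacked frames
    "frame_skip": 4,
    "target_update_strategy": "hard",
    "batch_size": 32,
    "buffer_size": 100_000,
    "optimizer": torch.optim.Adam,       # a class, instantiated in __init__
    "criterion": torch.nn.SmoothL1Loss,  # a class, instantiated in __init__
}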
def get_env_fn():
    return wrappers.AtariPreprocessing(gym.make(env_id))
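# `get_env_fn` closes over a module-level `env_id`. When the id needs to be passed in
# explicitly (or several environments are created), a parameterized factory is a common
# alternative; `make_env_fn` below is a hypothetical sketch, not part of the original code.
import gym
from gym import wrappers

def make_env_fn(env_id, num_stack=4):
    def _thunk():
        env = wrappers.AtariPreprocessing(gym.make(env_id), screen_size=84,
                                          grayscale_obs=True, scale_obs=True)
        return wrappers.FrameStack(env, num_stack)
    return _thunk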
with open(os.path.join(eval_args.path, 'args.json'), 'r') as f:
    train_args = ArgsStruct(**json.load(f))
env_name = eval_args.env if eval_args.env is not None else train_args.env
episodes = eval_args.episodes if eval_args.episodes is not None else train_args.optimize_freq

timestamp = time.strftime("%Y-%m-%d-%H%M")
log_path = os.path.join(exp_dir, f'eval-{timestamp}.log')
logging.basicConfig(filename=log_path, level=logging.INFO)
logging.getLogger('').addHandler(logging.StreamHandler())

if env_name == 'pong':
    env_id = 'PongNoFrameskip-v0'
    env = gym.make(env_id)
else:
    raise NotImplementedError(env_name)

device = torch.device('cuda:0' if not eval_args.no_cuda and torch.cuda.is_available() else 'cpu')
env = wrappers.FrameStack(wrappers.AtariPreprocessing(env), num_stack=4)
model = load_agent(train_args, env).to(device)
model.load_state_dict(torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))
eval_res = evaluate(model, env, train_args, device, episodes)
logging.info(pformat(eval_res))
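# `ArgsStruct` is not shown in this snippet; the way it is built from the saved
# args.json suggests a thin namespace-style wrapper around the training arguments.
# A minimal sketch, assuming nothing beyond attribute access, could be:
class ArgsStruct:
    def __init__(self, **entries):
        # Expose each saved argument as an attribute, e.g. train_args.env, train_args.lr.
        self.__dict__.update(entries)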
parser.add_argument('env_id', nargs='?', default='Breakout-v0',
                    help='Select the environment to run')
args = parser.parse_args()

env = gym.make(args.env_id)
# Appending "NoFrameskip" to the spec id is a workaround for AtariPreprocessing's
# NoFrameskip check; note it does not actually disable the base env's frame skipping.
env.spec.id += " NoFrameskip"

# You provide the directory to write to (can be an existing
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env = wrappers.Monitor(env, directory=outdir, force=True)
env = wrappers.AtariPreprocessing(env, screen_size=84, frame_skip=4, grayscale_obs=True)
env = wrappers.FrameStack(env, 4)
env.seed(0)

neural_network = ConvNet()
target_neural_network = copy.deepcopy(neural_network)
print(list(neural_network.parameters()))  # debug: inspect initial parameters
criterion = nn.MSELoss()
optim = torch.optim.SGD(neural_network.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
optim.zero_grad()
reward = 0
buffer = deque(maxlen=10000)
done = False
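# `ConvNet` is defined elsewhere in the project. The wrappers above produce 4 stacked
# 84x84 grayscale frames, so a plausible sketch is the standard DQN convolutional
# architecture; the layer sizes and the default of 4 actions (Breakout) are assumptions.
import torch.nn as nn

class ConvNetSketch(nn.Module):
    def __init__(self, n_actions=4, in_channels=4):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),  # 7x7 feature map for 84x84 input
            nn.Linear(512, n_actions),
        )

    def forward(self, x):
        # x: (batch, 4, 84, 84) float tensor -> (batch, n_actions) Q-values
        return self.head(self.features(x))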
parser.add_argument('env_id', nargs='?', default='BreakoutNoFrameskip-v4',
                    help='Select the environment to run')
args = parser.parse_args()

# You can set the level to logger.DEBUG or logger.WARN if you
# want to change the amount of output.
logger.set_level(logger.INFO)

env = gym.make(args.env_id)
rewards = []
# env = wrappers.Monitor(env, force=True)
# env.seed(0)
env = wrappers.AtariPreprocessing(env)
env = wrappers.FrameStack(env, 4)
# env.seed(0)
agent = Agent(env)

episode_count = 200
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    prev_ob = ob
    episode_reward = 0
    print(ob)
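# `Agent` is defined elsewhere; only its construction from `env` is visible in the
# (truncated) loop above. A minimal random-policy stand-in with an `act` method, a
# common gym example interface, is sketched below purely for illustration; the real
# class is a learning agent.
class RandomAgentSketch:
    def __init__(self, env):
        self.action_space = env.action_space

    def act(self, observation, reward=0.0, done=False):
        # Ignore the observation and sample a random action.
        return self.action_space.sample()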
def train(args: argparse.Namespace, env: gym.Env, exp_dir: str):
    seed = args.seed
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    device = torch.device(
        'cuda:0' if not args.no_cuda and torch.cuda.is_available() else 'cpu')
    env = wrappers.FrameStack(wrappers.AtariPreprocessing(env),
                              num_stack=args.stacked_frames)
    writer = SummaryWriter(log_dir=exp_dir)
    with open(os.path.join(exp_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    logging.info(args)

    n_actions = env.action_space.n
    current_model = load_agent(args, env).to(device)
    current_model.eval()
    target_model = load_agent(args, env).to(device)
    target_model.eval()
    if args.curiosity:
        curiosity = load_icm(args, env).to(device)
        curiosity.eval()
    target_model.load_state_dict(current_model.state_dict())  # Sync/update target model

    # rms-prop? https://www.reddit.com/r/reinforcementlearning/comments/ei9p3y/using_rmsprop_over_adam/
    if args.optimizer == 'adam':
        optimizer = optim.Adam(current_model.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
        if args.curiosity:
            curiosity_optimizer = optim.Adam(curiosity.parameters(), lr=args.lr,
                                             weight_decay=args.weight_decay)
    else:
        logging.error('Optimizer not implemented')
        raise NotImplementedError()

    logging.info(current_model)
    if args.curiosity:
        logging.info(curiosity)
    n_params = sum(p.numel() for p in current_model.parameters() if p.requires_grad)
    logging.info(f'Training {n_params} parameters')
    if args.curiosity:
        n_params = sum(p.numel() for p in curiosity.parameters() if p.requires_grad)
        logging.info(f'Training {n_params} parameters')

    criterion = nn.SmoothL1Loss if args.criterion == 'huber' else None
    if criterion is None:
        raise NotImplementedError(args.criterion)

    buffer = ReplayBuffer(capacity=args.replay_size, seed=args.seed)

    best_mean_reward = env.reward_range[0]
    updates_without_improvement = 0

    # Adapted from Mario Martin's Notebook
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 10000
    epsilon_by_episode = lambda e: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * e / epsilon_decay)
    if args.curiosity:
        epsilon_by_episode = lambda e: 0.0  # No epsilon needed if curiosity is used

    t0 = time.time()

    all_rewards = []
    all_steps = []
    all_mean_rewards = []
    all_mean_steps = []
    episode_set_rewards = 0.0
    episode_set_curiosity_rewards = 0.0
    episode_set_steps = 0
    updates = 0
    optimizations = 0
    initial_counter = 0

    for episode in range(args.episodes):
        state = env.reset()
        episode_reward = 0.0
        episode_curiosity_reward = 0.0
        steps = 0
        epsilon = epsilon_by_episode(episode)

        while True:
            current_model.eval()
            if args.curiosity:
                curiosity.eval()
            action = current_model.act(
                torch.tensor(transform(state.__array__())).unsqueeze(0).to(device),
                epsilon,
                torch.rand(1)[0].to(device),
                torch.randint(0, n_actions, (1, ))[0].to(device))
            current_model.train()
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            curiosity_reward = None
            if args.curiosity:
                with torch.no_grad():
                    curiosity_reward, _ = curiosity(
                        torch.tensor(transform(state.__array__())).unsqueeze(0).to(device),
                        torch.tensor(transform(next_state.__array__())).unsqueeze(0).to(device),
                        torch.tensor([action]).long().to(device))
                episode_curiosity_reward += curiosity_reward
            buffer.push(
                LazyTransition(
                    state, action, next_state, reward, done,
                    curiosity_reward.cpu().numpy() if curiosity_reward is not None else None))
            if done:
                initial_counter += 1
                writer.add_scalar('Reward/train', episode_reward, episode + 1)
                writer.add_scalar('Steps/train', steps, episode + 1)
                writer.add_scalar('Epsilon/train', epsilon, episode + 1)
                all_rewards.append(episode_reward)
                all_steps.append(steps)
                episode_set_rewards += episode_reward
                episode_set_steps += steps
                if args.curiosity:
                    writer.add_scalar('Curiosity/train', episode_curiosity_reward, episode + 1)
                    episode_set_curiosity_rewards += episode_curiosity_reward
            state = next_state
            steps += 1
            if args.render:
                env.render()
            if done:
                logging.info(
                    f'Finished episode {episode+1} with reward = {episode_reward:.2f} | '
                    f'steps = {steps+1} | epsilon = {epsilon:.2f}')
                if args.curiosity:
                    logging.info(f'curiosity = {curiosity_reward:.2f}')
                break

        if buffer.full and (episode + 1) % args.optimize_freq == 0:  # len(buffer) >= args.batch_size:
            transitions = buffer.sample(args.batch_size)
            if not args.curiosity:
                q_loss, _ = optimize(transitions, current_model, target_model, optimizer,
                                     device, epsilon, args.criterion)
            else:
                q_loss, curiosity_loss = optimize(transitions, current_model, target_model,
                                                  optimizer, device, epsilon, args.criterion,
                                                  curiosity, curiosity_optimizer)
            denominator = args.optimize_freq - 1 if optimizations > 0 else initial_counter
            mean_episode_set_rewards = episode_set_rewards / denominator
            mean_episode_set_steps = episode_set_steps / denominator
            writer.add_scalar('Mean-Reward/train', mean_episode_set_rewards, optimizations + 1)
            writer.add_scalar('Mean-Steps/train', mean_episode_set_steps, optimizations + 1)
            writer.add_scalar('Q-Loss/train', q_loss, optimizations + 1)
            if args.curiosity:
                writer.add_scalar('Curiosity-Loss/train', curiosity_loss, optimizations + 1)
            all_mean_rewards.append(mean_episode_set_rewards)
            all_mean_steps.append(mean_episode_set_steps)
            episode_set_rewards = 0.0
            episode_set_steps = 0
            torch.save(current_model.state_dict(), os.path.join(exp_dir, 'checkpoint_last.pt'))
            if args.curiosity:
                torch.save(curiosity.state_dict(),
                           os.path.join(exp_dir, 'curiosity_checkpoint_last.pt'))
            logging.info(f'Optimized model ({optimizations+1} optimizations)')
            optimizations += 1
            if mean_episode_set_rewards > best_mean_reward:
                updates_without_improvement = 0
                best_mean_reward = mean_episode_set_rewards
                torch.save(current_model.state_dict(),
                           os.path.join(exp_dir, 'checkpoint_best.pt'))
                logging.info(f'NEW: Best mean reward: {best_mean_reward:.2f}')
                if best_mean_reward == env.reward_range[1]:
                    logging.info('Reached max reward')
                    break
            else:
                updates_without_improvement += 1
                logging.info(f'Best mean reward: {best_mean_reward:.2f}')
                if args.early_stop != -1 and updates_without_improvement == args.early_stop:
                    break
            logging.info(f'{updates_without_improvement} updates without improvement')

        if buffer.full and (episode + 1) % args.update_target_freq == 0:
            target_model.load_state_dict(current_model.state_dict())
            logging.info(f'Updated target model (updates {updates+1})')
            updates += 1

    t1 = time.time()
    logging.info(f'Finished training in {t1-t0:.1f}s')
    if args.render:
        env.close()

    model = load_agent(args, env).to(device)
    # Load the best checkpoint from exp_dir, matching the path it was saved to above.
    model.load_state_dict(torch.load(os.path.join(exp_dir, 'checkpoint_best.pt')))
    eval_res = evaluate(model, env, args, device, episodes=args.optimize_freq)
    logging.info(pformat(eval_res))
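# The epsilon schedule used in train() can be sanity-checked in isolation. This
# standalone snippet reproduces the same formula (start 1.0, final 0.01, decay 10000)
# and prints a few sample values; the numbers in the trailing comment are approximate.
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 10000
epsilon_by_episode = lambda e: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * e / epsilon_decay)

for e in (0, 1000, 10000, 30000):
    print(e, round(epsilon_by_episode(e), 3))
# roughly: 1.0, 0.906, 0.374, 0.059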