import argparse

import gym
import ptan
import torch
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter

from lib import common

GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
NUM_ENVS = 50
BELLMAN_STEPS = 4
CLIP_GRAD = 0.1


if __name__ == "__main__":
    common.mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable CUDA")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    # Each environment is the raw Atari env wrapped with the standard DQN
    # preprocessing stack (frame skip, resize, frame stacking)
    make_env = lambda: ptan.common.wrappers.wrap_dqn(
        gym.make("PongNoFrameskip-v4"))
    envs = [make_env() for _ in range(NUM_ENVS)]
    writer = SummaryWriter(comment="-pong-a2c_" + args.name)

    net = common.AtariA2C(envs[0].observation_space.shape,
                          envs[0].action_space.n).to(device)
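# `common.AtariA2C` lives in lib/common and is not shown in this excerpt. The
# sketch below is an assumption of its typical shape for Atari A2C (the layer
# sizes are illustrative, not the project's actual values): a shared conv
# trunk feeding separate policy and value heads.
import torch
import torch.nn as nn

class AtariA2CSketch(nn.Module):
    def __init__(self, input_shape, n_actions):
        super().__init__()
        # Shared convolutional feature extractor
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        conv_out = self.conv(torch.zeros(1, *input_shape)).view(1, -1).size(1)
        # Actor head: logits over discrete actions
        self.policy = nn.Sequential(nn.Linear(conv_out, 512), nn.ReLU(),
                                    nn.Linear(512, n_actions))
        # Critic head: scalar state-value estimate
        self.value = nn.Sequential(nn.Linear(conv_out, 512), nn.ReLU(),
                                   nn.Linear(512, 1))

    def forward(self, x):
        fx = x.float() / 256  # scale byte observations to [0, 1)
        conv_out = self.conv(fx).view(x.size(0), -1)
        return self.policy(conv_out), self.value(conv_out)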
            sum_loss_total += loss
            sum_entropy += entropy
            count_steps += 1

    # Write averaged statistics for this update to the TensorBoard log
    writer.add_scalar("returns", sum_returns / count_steps, frame_idx)
    writer.add_scalar("advantage", sum_advantage / count_steps, frame_idx)
    writer.add_scalar("loss_actor", sum_loss_actor / count_steps, frame_idx)
    writer.add_scalar("loss_critic", sum_loss_critic / count_steps, frame_idx)
    writer.add_scalar("entropy", sum_entropy / count_steps, frame_idx)
    writer.add_scalar("loss_total", sum_loss_total / count_steps, frame_idx)


if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--name", default=ENV_ID, help="Name of the run")
    args = parser.parse_args()
    writer = SummaryWriter(comment="ppo_" + args.name)

    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)

    # Prepare environments: NUM_ENVS copies of ENV_ID for training, each run
    # in its own subprocess by SubprocVecEnv, plus one local env for testing
    envs = [make_env() for i in range(NUM_ENVS)]
    envs = SubprocVecEnv(envs)
    env = gym.make(ENV_ID)
    num_inputs = envs.observation_space.shape[0]
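# SubprocVecEnv expects a list of zero-argument callables (thunks) and invokes
# each one inside its own worker process, which is why `make_env()` is called
# before the list is handed over. A minimal sketch of such a factory, assuming
# ENV_ID names a registered Gym environment:
def make_env():
    def _thunk():
        # Constructed lazily inside the worker process
        return gym.make(ENV_ID)
    return _thunk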
                      action='store',
                      default='%s/reports' % os.path.dirname(os.path.realpath(__file__)),
                      required=False)
ob_group.add_argument('-d', '--debug',
                      help='enable full traceback on exceptions',
                      action='store_true',
                      default=False,
                      required=False)

args = parser.parse_args()

config = '%s/etc/omnibus.conf' % os.path.dirname(os.path.realpath(__file__))
output_dir = args.output
DEBUG = args.debug

info('Using configuration file (%s) ...' % config)
info('Debug: %s' % DEBUG)

if os.path.exists(output_dir):
    if not os.path.isdir(output_dir):
        error('Specified report output location is not a directory; exiting ...')
        sys.exit(1)
else:
    info('Creating report output directory (%s) ...' % output_dir)
    mkdir(output_dir)

console = Console()
console.cmdloop()
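# The `info`, `error`, and `mkdir` helpers are imported from elsewhere in the
# project and are not shown in this excerpt. Minimal stand-ins, purely for
# illustration of their assumed behavior:
import os
import sys

def info(msg):
    # Informational message to stdout
    print('[*] %s' % msg)

def error(msg):
    # Error message to stderr
    sys.stderr.write('[!] %s\n' % msg)

def mkdir(path):
    # Create the directory if it does not already exist
    os.makedirs(path, exist_ok=True)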
            obs, reward, done, _ = env.step(action)
            rewards += reward
            steps += 1
            if done:
                break
    return rewards / count, steps / count


def calc_logprob(mu_v, var_v, actions_v):
    # Log-density of a Gaussian policy:
    # log N(a | mu, sigma^2) = -(a - mu)^2 / (2 sigma^2) - log(sqrt(2 pi sigma^2));
    # the variance is clamped to keep the division numerically stable
    p1 = -((mu_v - actions_v) ** 2) / (2 * var_v.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var_v))
    return p1 + p2


if __name__ == "__main__":
    common.mkdir(".", "checkpoints")
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable CUDA")
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("./checkpoints/", "a2c-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)
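# Sanity check (an illustrative sketch, not part of the original script):
# apart from the variance clamp, calc_logprob matches the log-density of
# torch.distributions.Normal, which takes the standard deviation as its scale.
import math
import torch

mu = torch.tensor([0.5])
var = torch.tensor([0.2])
act = torch.tensor([0.3])
p1 = -((mu - act) ** 2) / (2 * var.clamp(min=1e-3))
p2 = -torch.log(torch.sqrt(2 * math.pi * var))
ref = torch.distributions.Normal(mu, torch.sqrt(var)).log_prob(act)
assert torch.allclose(p1 + p2, ref, atol=1e-6)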
    def Start(self, debugOutputQueue, pauseQueue, fromSavedModel=''):
        mkdir('.', 'checkpoints')
        parser = argparse.ArgumentParser()
        parser.add_argument("-n", "--name", default=self.settings['ENV_NAME'],
                            help="Name of the run")
        args = parser.parse_args()

        # Autodetect CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")
        self.logQueue.put(pprint.pformat('Device: ' + device.type))

        # Prepare environments
        envs = RemoteVecEnv(NUM_ENVS)
        num_inputs = envs.observation_space.shape[0]
        num_outputs = envs.action_space.shape[0]

        frame_idx = 0
        train_epoch = 0
        best_reward = None

        self.model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.writer = SummaryWriter(comment="ppo_" + args.name)

        if fromSavedModel == '':
            self.logQueue.put('Successfully created %d remote environments' % NUM_ENVS)
            self.logQueue.put(pprint.pformat(self.model))
        else:
            check_point = torch.load(fromSavedModel)
            self.model.load_state_dict(check_point['state_dict'])
            self.optimizer.load_state_dict(check_point['optimizer'])
            train_epoch = check_point['epoch']
            frame_idx = check_point['frame_idx']
            self.logQueue.put('Successfully loaded model from ' + fromSavedModel)

        state = envs.reset()
        early_stop = False
        save_count = 0

        while not early_stop:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []

            # Collect PPO_STEPS transitions from all parallel environments
            for _ in range(PPO_STEPS):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)
                action = dist.sample()
                # Each state, reward, done is a list of results from each
                # parallel environment
                action_exp = action.cpu().numpy()
                action_exp = np.clip(action_exp, -10, 10)
                next_state, reward, done, _ = envs.step(action_exp)
                log_prob = dist.log_prob(action)

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(device))
                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                debugData = (next_state, reward, done, action_exp)
                debugOutputQueue.put(debugData)

                # Block while a pause request is pending; once more than one
                # request has accumulated, drain the queue and resume
                while pauseQueue.qsize() > 0:
                    if pauseQueue.qsize() == 1:
                        time.sleep(1)
                    else:
                        while not pauseQueue.empty():
                            pauseQueue.get()

            # Bootstrap the value of the final state and compute GAE returns
            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values)

            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            advantage = self.normalize(advantage)

            self.ppo_update(frame_idx, states, actions, log_probs, returns,
                            advantage)
            train_epoch += 1

            if train_epoch % TEST_EPOCHS == 0:
                test_reward = np.mean([
                    self.test_env(envs, self.model, device)
                    for _ in range(NUM_TESTS)
                ])
                self.writer.add_scalar("test_rewards", test_reward, frame_idx)
                self.logQueue.put(pprint.pformat(
                    'Frame %s. reward: %s' % (frame_idx, test_reward)))

                # Save a checkpoint every time we achieve a best reward
                if best_reward is None or best_reward < test_reward:
                    if best_reward is not None:
                        self.logQueue.put(pprint.pformat(
                            "Best reward updated: %.3f -> %.3f" %
                            (best_reward, test_reward)))
                        name = "%s_best_%+.3f_%d.dat" % (
                            args.name, test_reward, frame_idx)
                        fname = os.path.join('.', 'checkpoints', name)
                        check_point = {
                            'epoch': train_epoch,
                            'state_dict': self.model.state_dict(),
                            'optimizer': self.optimizer.state_dict(),
                            'frame_idx': frame_idx,
                        }
                        torch.save(check_point, fname)
                    best_reward = test_reward
                if test_reward > TARGET_REWARD:
                    early_stop = True

            # Also save a periodic checkpoint every 15 training epochs
            save_count += 1
            if save_count >= 15:
                self.logQueue.put(pprint.pformat(
                    'Saving checkpoint for frame: ' + str(frame_idx)))
                name = "%s_frame_%d.dat" % (args.name, frame_idx)
                fname = os.path.join('.', 'checkpoints', name)
                check_point = {
                    'epoch': train_epoch,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'frame_idx': frame_idx,
                }
                torch.save(check_point, fname)
                save_count = 0
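# `self.compute_gae` is called above but not shown in this excerpt. A minimal
# sketch of Generalized Advantage Estimation (GAE-lambda) as it is typically
# written for this rollout format; the gamma and lam defaults here are
# illustrative assumptions, not the project's actual hyperparameters:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, lam=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    # Walk the rollout backwards, accumulating discounted TD residuals;
    # masks zero the bootstrap term at episode boundaries
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns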
                        default=True, action='store_false', dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()
    params = PARAMS[args.config]

    if args.config == 'doom':
        import vizdoomgym
        env = make_doom_env(params['env_name'])
    else:
        env = gym.make(params['env_name'])
        env = ptan.common.wrappers.wrap_dqn(env)
    if args.record:
        mkdir('.', args.record)
        env = wrappers.Monitor(env, args.record, force=True)

    net = model.NoisyDuelingDQN(env.observation_space.shape,
                                env.action_space.n)
    # Load weights onto the CPU regardless of the device they were saved from
    net.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()
    while True:
        start_ts = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([state], copy=False))
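# The excerpt is cut off above. A sketch of how such a play loop typically
# continues (greedy action from the Q-network, step, tally actions, cap the
# frame rate), kept as comments because it is an assumption rather than the
# script's actual code; FPS is an assumed constant not shown in the original:
#
#     q_vals = net(state_v).data.numpy()[0]
#     action = np.argmax(q_vals)
#     c[action] += 1
#     state, reward, done, _ = env.step(action)
#     total_reward += reward
#     if done:
#         break
#     if args.visualize:
#         delta = 1 / FPS - (time.time() - start_ts)
#         if delta > 0:
#             time.sleep(delta)
# print("Total reward: %.2f" % total_reward)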