def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    net.train()
    running_score = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            transition = [state, next_state, action, reward, mask]
            train_model(net, optimizer, transition, policy, value)

            score += reward
            state = next_state

        # Undo the -1 terminal penalty so the logged score matches the episode length.
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds {} so end'.format(args.goal_score))
            break
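# NOTE: the training and test scripts above call a `get_action` helper that is not
# defined in this excerpt. A minimal sketch (an assumption, not the original
# implementation), treating `policy` as a (1, num_actions) softmax output:
import numpy as np

def get_action(policy, num_actions):
    # Sample an action index from the policy's probability vector.
    probs = policy.detach().cpu().numpy()[0]
    return np.random.choice(num_actions, 1, p=probs)[0]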
def main():
    args = get_args()
    device = torch.device("cuda:0" if args.cuda else "cpu")

    env = gym.make(args.env_name)
    # extended state: the observation concatenated with the desired goal
    num_inputs = env.observation_space.spaces['observation'].shape[0] \
        + env.observation_space.spaces['desired_goal'].shape[0]
    num_actions = env.action_space.shape[0]

    network = ActorCritic(num_inputs, num_actions, layer_norm=args.layer_norm)
    network.to(device)

    # joint train
    reward_record = []
    for i in range(args.num_parallel_run):
        args.seed += 1
        reward_record.append(espd(args, network, device))
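# NOTE: for gym robotics-style dict observations, the "extended state" above is
# usually built by concatenating the observation with the desired goal. A sketch,
# assuming the standard dict keys (`extend_state` is a hypothetical helper):
import numpy as np

def extend_state(obs_dict):
    return np.concatenate([obs_dict['observation'], obs_dict['desired_goal']])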
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()

    running_score = 0
    steps = 0

    for e in range(5):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()
            steps += 1

            policy, value = net(state)
            action = get_action(policy, num_actions)

            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
def worker(gpu, ngpus_per_node, callback, args):
    args.gpu = gpu

    if args.distributed:
        args.seed += args.gpu
        torch.cuda.set_device(args.gpu)

        args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
        if args.multiprocessing_distributed:
            args.rank = args.rank * ngpus_per_node + args.gpu

        torch.distributed.init_process_group(
            backend='nccl', init_method='tcp://127.0.0.1:8632',
            world_size=args.world_size, rank=args.rank)
    else:
        args.rank = 0

    if args.lr_scale:
        scaled_lr = args.lr * math.sqrt((args.num_ales * args.world_size) / 16)
        if args.rank == 0:
            print('Scaled learning rate from {:4.4f} to {:4.4f}'.format(args.lr, scaled_lr))
        args.lr = scaled_lr

    args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available()
    args.no_cuda_train = (not args.no_cuda_train) and torch.cuda.is_available()
    args.verbose = args.verbose and (args.rank == 0)

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if args.use_cuda_env or (args.no_cuda_train == False):
        torch.cuda.manual_seed(np.random.randint(1, 10000))

    env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu')
    train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) else torch.device('cpu')

    if args.rank == 0:
        if args.output_filename:
            train_csv_file = open(args.output_filename, 'w', newline='')
            train_csv_writer = csv.writer(train_csv_file, delimiter=',')
            train_csv_writer.writerow(
                ['frames', 'fps', 'total_time',
                 'rmean', 'rmedian', 'rmin', 'rmax', 'rstd',
                 'lmean', 'lmedian', 'lmin', 'lmax', 'lstd',
                 'entropy', 'value_loss', 'policy_loss'])

            eval_output_filename = '.'.join([''.join(args.output_filename.split('.')[:-1] + ['_test']), 'csv'])
            eval_csv_file = open(eval_output_filename, 'w', newline='')
            eval_csv_file.write(json.dumps(vars(args)))
            eval_csv_file.write('\n')
            eval_csv_writer = csv.writer(eval_csv_file, delimiter=',')
            eval_csv_writer.writerow(
                ['frames', 'total_time',
                 'rmean', 'rmedian', 'rmin', 'rmax', 'rstd',
                 'lmean', 'lmedian', 'lmin', 'lmax', 'lstd'])
        else:
            train_csv_file, train_csv_writer = None, None
            eval_csv_file, eval_csv_writer = None, None

        if args.plot:
            from tensorboardX import SummaryWriter
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname())
            writer = SummaryWriter(log_dir=log_dir)
            for k, v in vars(args).items():
                writer.add_text(k, str(v))

        print()
        print('PyTorch : {}'.format(torch.__version__))
        print('CUDA : {}'.format(torch.version.cuda))
        print('CUDNN : {}'.format(torch.backends.cudnn.version()))
        print('APEX : {}'.format('.'.join([str(i) for i in apex.amp.__version__.VERSION])))
        print()

    if train_device.type == 'cuda':
        print(cuda_device_str(train_device.index), flush=True)

    if args.use_openai:
        train_env = create_vectorize_atari_env(
            args.env_name, args.seed, args.num_ales,
            episode_life=args.episodic_life, clip_rewards=False,
            max_frames=args.max_episode_length)
        observation = torch.from_numpy(train_env.reset()).squeeze(1)
    else:
        train_env = AtariEnv(
            args.env_name, args.num_ales, color_mode='gray',
            repeat_prob=0.0, device=env_device, rescale=True,
            episodic_life=args.episodic_life, clip_rewards=False, frameskip=4)
        train_env.train()
        observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).squeeze(-1)

    if args.use_openai_test_env:
        test_env = create_vectorize_atari_env(
            args.env_name, args.seed, args.evaluation_episodes,
            episode_life=False, clip_rewards=False)
        test_env.reset()
    else:
        test_env = AtariEnv(
            args.env_name, args.evaluation_episodes, color_mode='gray',
            repeat_prob=0.0, device='cpu', rescale=True,
            episodic_life=False, clip_rewards=False, frameskip=4)

    model = ActorCritic(args.num_stack, train_env.action_space, normalize=args.normalize, name=args.env_name)
    model = model.to(train_device).train()

    if args.rank == 0:
        print(model)

    args.model_name = model.name()

    if args.use_adam:
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=args.lr, eps=args.eps, alpha=args.alpha)

    model, optimizer = amp.initialize(
        model, optimizer,
        opt_level=args.opt_level,
        loss_scale=args.loss_scale)

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    num_frames_per_iter = args.num_ales * args.num_steps
    total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter))

    shape = (args.num_steps + 1, args.num_ales, args.num_stack, *train_env.observation_space.shape[-2:])
    states = torch.zeros(shape, device=train_device, dtype=torch.float32)
    states[0, :, -1] = observation.to(device=train_device, dtype=torch.float32)

    shape = (args.num_steps + 1, args.num_ales)
    values = torch.zeros(shape, device=train_device, dtype=torch.float32)
    returns = torch.zeros(shape, device=train_device, dtype=torch.float32)

    shape = (args.num_steps, args.num_ales)
    rewards = torch.zeros(shape, device=train_device, dtype=torch.float32)
    masks = torch.zeros(shape, device=train_device, dtype=torch.float32)
    actions = torch.zeros(shape, device=train_device, dtype=torch.long)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)
    final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    if args.use_gae:
        gae = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32)

    maybe_npy = lambda a: a.numpy() if args.use_openai else a

    torch.cuda.synchronize()

    iterator = range(total_steps)
    if args.rank == 0:
        iterator = tqdm(iterator)
        total_time = 0
        evaluation_offset = 0

    for update in iterator:

        T = args.world_size * update * num_frames_per_iter
        if (args.rank == 0) and (T >= evaluation_offset):
            evaluation_offset += args.evaluation_interval
            eval_lengths, eval_rewards = evaluate(args, T, total_time, model, test_env, eval_csv_writer, eval_csv_file)

            if args.plot:
                writer.add_scalar('eval/rewards_mean', eval_rewards.mean().item(), T, walltime=total_time)
                writer.add_scalar('eval/lengths_mean', eval_lengths.mean().item(), T, walltime=total_time)

        start_time = time.time()

        with torch.no_grad():
            for step in range(args.num_steps):
                value, logit = model(states[step])

                # store values
                values[step] = value.squeeze(-1)

                # convert actions to numpy and perform next step
                probs_action = F.softmax(logit, dim=1).multinomial(1).to(env_device)
                observation, reward, done, info = train_env.step(maybe_npy(probs_action))

                if args.use_openai:
                    # convert back to pytorch tensors
                    observation = torch.from_numpy(observation)
                    reward = torch.from_numpy(reward)
                    done = torch.from_numpy(done.astype(np.uint8))
                else:
                    observation = observation.squeeze(-1).unsqueeze(1)

                # move back to training memory
                observation = observation.to(device=train_device)
                reward = reward.to(device=train_device, dtype=torch.float32)
                done = done.to(device=train_device)
                probs_action = probs_action.to(device=train_device, dtype=torch.long)

                not_done = 1.0 - done.float()

                # update rewards and actions
                actions[step].copy_(probs_action.view(-1))
                masks[step].copy_(not_done)
                rewards[step].copy_(reward.sign())

                # update next observations
                states[step + 1, :, :-1].copy_(states[step, :, 1:].clone())
                states[step + 1] *= not_done.view(-1, *[1] * (observation.dim() - 1))
                states[step + 1, :, -1].copy_(observation.view(-1, *states.size()[-2:]))

                # update episodic reward counters
                episode_rewards += reward
                final_rewards[done] = episode_rewards[done]
                episode_rewards *= not_done

                episode_lengths += not_done
                final_lengths[done] = episode_lengths[done]
                episode_lengths *= not_done

            returns[-1] = values[-1] = model(states[-1])[0].data.squeeze(-1)

            if args.use_gae:
                gae.zero_()
                for step in reversed(range(args.num_steps)):
                    delta = rewards[step] + (args.gamma * values[step + 1] * masks[step]) - values[step]
                    gae = delta + (args.gamma * args.tau * masks[step] * gae)
                    returns[step] = gae + values[step]
            else:
                for step in reversed(range(args.num_steps)):
                    returns[step] = rewards[step] + (args.gamma * returns[step + 1] * masks[step])

        value, logit = model(states[:-1].view(-1, *states.size()[-3:]))

        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)

        action_log_probs = log_probs.gather(1, actions.view(-1).unsqueeze(-1))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = returns[:-1].view(-1).unsqueeze(-1) - value

        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages.clone().detach() * action_log_probs).mean()

        loss = value_loss * args.value_loss_coef + policy_loss - dist_entropy * args.entropy_coef
        optimizer.zero_grad()
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        optimizer.step()

        states[0].copy_(states[-1])

        torch.cuda.synchronize()

        if args.rank == 0:
            iter_time = time.time() - start_time
            total_time += iter_time

            if args.plot:
                writer.add_scalar('train/rewards_mean', final_rewards.mean().item(), T, walltime=total_time)
                writer.add_scalar('train/lengths_mean', final_lengths.mean().item(), T, walltime=total_time)
                # no LR scheduler is created in this snippet, so log the optimizer's current learning rate
                writer.add_scalar('train/learning_rate', optimizer.param_groups[0]['lr'], T, walltime=total_time)
                writer.add_scalar('train/value_loss', value_loss, T, walltime=total_time)
                writer.add_scalar('train/policy_loss', policy_loss, T, walltime=total_time)
                writer.add_scalar('train/entropy', dist_entropy, T, walltime=total_time)

            progress_data = callback(args, model, T, iter_time, final_rewards, final_lengths,
                                     value_loss.item(), policy_loss.item(), dist_entropy.item(),
                                     train_csv_writer, train_csv_file)
            iterator.set_postfix_str(progress_data)

    if args.plot:
        writer.close()

    if args.use_openai:
        train_env.close()
    if args.use_openai_test_env:
        test_env.close()
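# NOTE: worker(gpu, ngpus_per_node, callback, args) follows the calling convention of
# torch.multiprocessing.spawn, which passes the process index as the first argument.
# A minimal launch sketch; the `launch` helper is hypothetical and not part of the
# original code:
import torch
import torch.multiprocessing as mp

def launch(callback, args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One training process per GPU on this node.
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, callback, args))
    else:
        worker(args.gpu, ngpus_per_node, callback, args)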
def main():
    # Select the device for the network
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the network
    net = ActorCritic()
    net = net.to(device)

    # Set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

    # Prepare the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)

    # Start training
    for episode in range(EPISODES):
        # Collect one episode of data from multiple environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # easier to post-process on the CPU

                # Set the probability of illegal moves to 0
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # avoid an all-zero distribution

                actions = Categorical(probs=policys).sample()

                done, states = envs.step(actions)

        envs.setReturn()

        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

        # Train the network
        net.train()

        # Running metrics
        value_loss_total = 0.
        entropy_total = 0.
        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(device), Returns.to(device)

            values, policys = net(states)
            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            # We want a larger entropy to keep the policy exploratory
            dist_entropy = dist.entropy().mean()

            advantages = Returns.view(-1, 1) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader), entropy_total / len(loader)),
            flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(), 'models/{}.pt'.format(episode // SAVE_INTERVAL))
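# NOTE: EpisodeData is not defined in this excerpt. A minimal sketch of a compatible
# Dataset, under the assumption that envs.readHistory() returns a list of
# (state, action, return) tensor tuples:
from torch.utils.data import Dataset

class EpisodeData(Dataset):
    def __init__(self, history):
        self.history = history

    def __len__(self):
        return len(self.history)

    def __getitem__(self, idx):
        # Each item is a (state, action, Return) tuple, collated by the DataLoader.
        return self.history[idx]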
class A3C():
    '''Implementation of N-step Asynchronous Advantage Actor Critic.'''

    def __init__(self, args, env, train=True):
        self.args = args
        self.set_random_seeds()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Create the environment.
        self.env = gym.make(env)
        self.environment_name = env

        # Setup model.
        self.policy = ActorCritic(4, self.env.action_space.n)
        self.policy.apply(self.initialize_weights)

        # Setup critic model.
        self.critic = ActorCritic(4, self.env.action_space.n)
        self.critic.apply(self.initialize_weights)

        # Setup optimizer.
        self.eps = 1e-10  # To avoid divide-by-zero error.
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=args.policy_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=args.critic_lr)

        # Model weights path.
        self.timestamp = datetime.now().strftime('a2c-breakout-%Y-%m-%d_%H-%M-%S')
        self.weights_path = 'models/%s/%s' % (self.environment_name, self.timestamp)

        # Load pretrained weights.
        if args.weights_path:
            self.load_model()
        self.policy.to(self.device)
        self.critic.to(self.device)

        # Video render mode.
        if args.render:
            self.policy.eval()
            self.generate_episode(render=True)
            self.plot()
            return

        # Data for plotting.
        self.rewards_data = []  # n * [epoch, mean(returns), std(returns)]

        # Network training mode.
        if train:
            # Tensorboard logging.
            self.logdir = 'logs/%s/%s' % (self.environment_name, self.timestamp)
            self.summary_writer = SummaryWriter(self.logdir)

            # Save hyperparameters.
            with open(self.logdir + '/training_parameters.json', 'w') as f:
                json.dump(vars(self.args), f, indent=4)

    def initialize_weights(self, layer):
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def set_random_seeds(self):
        torch.manual_seed(self.args.random_seed)
        np.random.seed(self.args.random_seed)
        torch.backends.cudnn.benchmark = True

    def save_model(self, epoch):
        '''Helper function to save model state and weights.'''
        if not os.path.exists(self.weights_path):
            os.makedirs(self.weights_path)
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
                'critic_state_dict': self.critic.state_dict(),
                'critic_optimizer': self.critic_optimizer.state_dict(),
                'rewards_data': self.rewards_data,
                'epoch': epoch
            }, os.path.join(self.weights_path, 'model_%d.h5' % epoch))

    def load_model(self):
        '''Helper function to load model state and weights.'''
        if os.path.isfile(self.args.weights_path):
            print('=> Loading checkpoint', self.args.weights_path)
            self.checkpoint = torch.load(self.args.weights_path)
            self.policy.load_state_dict(self.checkpoint['policy_state_dict'])
            self.policy_optimizer.load_state_dict(self.checkpoint['policy_optimizer'])
            self.critic.load_state_dict(self.checkpoint['critic_state_dict'])
            self.critic_optimizer.load_state_dict(self.checkpoint['critic_optimizer'])
            self.rewards_data = self.checkpoint['rewards_data']
        else:
            raise Exception('No checkpoint found at %s' % self.args.weights_path)

    def train(self):
        '''Trains the model with n-step advantage actor-critic updates.'''
        for epoch in range(self.args.num_episodes):
            # Generate episode data.
            returns, log_probs, value_function, train_rewards = self.generate_episode()
            self.summary_writer.add_scalar('train/cumulative_rewards', train_rewards, epoch)
            self.summary_writer.add_scalar('train/trajectory_length', returns.size()[0], epoch)

            # Compute loss and policy gradient.
            self.policy_optimizer.zero_grad()
            policy_loss = ((returns - value_function.detach()) * -log_probs).mean()
            policy_loss.backward()
            self.policy_optimizer.step()

            self.critic_optimizer.zero_grad()
            critic_loss = F.mse_loss(returns, value_function)
            critic_loss.backward()
            self.critic_optimizer.step()

            # Test the model.
            if epoch % self.args.test_interval == 0:
                self.policy.eval()
                print('\nTesting')
                rewards = [
                    self.generate_episode(test=True)
                    for _ in range(self.args.test_episodes)
                ]
                rewards_mean, rewards_std = np.mean(rewards), np.std(rewards)
                print('Test Rewards (Mean): %.3f | Test Rewards (Std): %.3f\n'
                      % (rewards_mean, rewards_std))
                self.rewards_data.append([epoch, rewards_mean, rewards_std])
                self.summary_writer.add_scalar('test/rewards_mean', rewards_mean, epoch)
                self.summary_writer.add_scalar('test/rewards_std', rewards_std, epoch)
                self.policy.train()

            # Logging.
            if epoch % self.args.log_interval == 0:
                print('Epoch: {0:05d}/{1:05d} | Policy Loss: {2:.3f} | Value Loss: {3:.3f}'
                      .format(epoch, self.args.num_episodes, policy_loss, critic_loss))
                self.summary_writer.add_scalar('train/policy_loss', policy_loss, epoch)
                self.summary_writer.add_scalar('train/critic_loss', critic_loss, epoch)

            # Save the model.
            if epoch % self.args.save_interval == 0:
                self.save_model(epoch)

        self.save_model(epoch)
        self.summary_writer.close()

    def generate_episode(self, gamma=0.99, test=False, render=False, max_iters=10000):
        '''
        Generates an episode by executing the current policy in the given env.
        Returns:
        - the n-step discounted returns, indexed by time step
        - the log probabilities of the chosen actions, indexed by time step
        - the critic's value estimates, indexed by time step
        - the cumulative (undiscounted) episode reward
        '''
        iters = 0
        done = False
        state = self.env.reset()

        # Set video save path if render enabled.
        if render:
            save_path = 'videos/%s/epoch-%s' % (self.environment_name, self.checkpoint['epoch'])
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            monitor = gym.wrappers.Monitor(self.env, save_path, force=True)

        batches = []
        states = [torch.zeros(84, 84, device=self.device).float()] * 3
        rewards, returns = [], []
        actions, log_probs = [], []

        while not done:
            # Run policy on current state to log probabilities of actions.
            states.append(torch.tensor(preprocess(state), device=self.device).float().squeeze(0))
            batches.append(torch.stack(states[-4:]))
            action_probs = self.policy.forward(batches[-1].unsqueeze(0)).squeeze(0)

            # Sample action from the log probabilities.
            if test and self.args.det_eval:
                action = torch.argmax(action_probs)
            else:
                action = torch.argmax(torch.distributions.Multinomial(logits=action_probs).sample())
            actions.append(action)
            log_probs.append(action_probs[action])

            # Run simulation with current action to get new state and reward.
            if render:
                monitor.render()
            state, reward, done, _ = self.env.step(action.cpu().numpy())
            rewards.append(reward)

            # Break if the episode takes too long.
            iters += 1
            if iters > max_iters:
                break

        # Save video and close rendering.
        cum_rewards = np.sum(rewards)
        if render:
            monitor.close()
            print('\nCumulative Rewards:', cum_rewards)
            return

        # Return cumulative rewards for test mode.
        if test:
            return cum_rewards

        # Normalize rewards.
        rewards = np.array(rewards) / self.args.reward_normalizer

        # Compute values in minibatches with the critic.
        values = []
        minibatches = torch.split(torch.stack(batches), 256)
        for minibatch in minibatches:
            values.append(self.critic.forward(minibatch, action=False).squeeze(1))
        values = torch.cat(values)
        discounted_values = values * gamma**self.args.n

        # Compute the cumulative discounted returns.
        n_step_rewards = np.zeros((1, self.args.n))
        for i in reversed(range(rewards.shape[0])):
            if i + self.args.n >= rewards.shape[0]:
                V_end = 0
            else:
                V_end = discounted_values[i + self.args.n]
            n_step_rewards[0, :-1] = n_step_rewards[0, 1:] * gamma
            n_step_rewards[0, -1] = rewards[i]

            n_step_return = torch.tensor(n_step_rewards.sum(), device=self.device).unsqueeze(0) + V_end
            returns.append(n_step_return)

        # Normalize returns.
        # returns = torch.stack(returns)
        # mean_return, std_return = returns.mean(), returns.std()
        # returns = (returns - mean_return) / (std_return + self.eps)

        return torch.stack(returns[::-1]).detach().squeeze(1), torch.stack(
            log_probs), values.squeeze(), cum_rewards

    def plot(self):
        # Save the plot.
        filename = os.path.join(
            'plots', *self.args.weights_path.split('/')[-2:]).replace('.h5', '.png')
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # Make error plot with mean, std of rewards.
        data = np.asarray(self.rewards_data)
        plt.errorbar(data[:, 0], data[:, 1], data[:, 2], lw=2.5, elinewidth=1.5,
                     ecolor='grey', barsabove=True, capthick=2, capsize=3)
        plt.title('Cumulative Rewards (Mean/Std) Plot for A3C Algorithm')
        plt.xlabel('Number of Episodes')
        plt.ylabel('Cumulative Rewards')
        plt.grid()
        plt.savefig(filename, dpi=300)
        plt.show()
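# NOTE: the loop in generate_episode computes the standard n-step return
# R_i = sum_{k=0}^{n-1} gamma^k * r_{i+k} + gamma^n * V(s_{i+n}).
# A direct, unvectorized reference sketch of the same quantity (illustrative only):
def n_step_return(rewards, values, i, n, gamma):
    T = len(rewards)
    ret = sum(gamma**k * rewards[i + k] for k in range(min(n, T - i)))
    if i + n < T:
        # Bootstrap with the critic's value estimate n steps ahead.
        ret += gamma**n * float(values[i + n])
    return ret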
# in PyTorch 1.4, looks like we have print(torch.cuda.memory_summary(device)) and torch.cuda.memory_stats() amongst other functions

# In[8]:

print(torch.cuda.get_device_properties(device).total_memory / (1024.0 * 1024.0))

# In[9]:

torch.cuda.synchronize()

model = ActorCritic(num_stack, train_env.action_space, normalize=normalize, name=env_name)
model = model.to(device).train()

optimizer = optim.Adam(model.parameters(), lr=lr, amsgrad=False)  # savage, but AMSGrad was enabled by default!

opt_level = 'O0'
loss_scale = None

from apex.amp import __version__
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp, optimizers
from apex.multi_tensor_apply import multi_tensor_applier

if device.type == 'cuda':
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=opt_level,
                                      loss_scale=loss_scale)