def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # envs = [make_env(args.env_name, args.seed, i, args.log_dir)
    #         for i in range(args.num_processes)]
    envs = [make_env_test()]  # TODO: restore the multi-process env list above
    args.num_processes = len(envs)  # REMEMBER YOU CHANGED IT

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0], *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        # if args.num_stack > 1:
        #     current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Load model
    actor_critic = torch.load("./checkpoint.pt")
    print(actor_critic)
    actor_critic = actor_critic[0]

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # envs.envs[0].render()
            # time.sleep(0.1)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data, action_log_prob.data,
                            value.data, reward, masks)

        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                print("printed visdom plot")
                # Sometimes monitor doesn't properly flush the outputs
                # win = visdom_plot(viz, win, args.log_dir, args.env_name,
                #                   args.algo, args.num_frames)
            except IOError:
                pass
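# NOTE (editor sketch, not part of any of the scripts in this file): every variant below
# repeats the same mask-based reward bookkeeping, so here is a minimal, self-contained
# illustration of what those three lines compute. The tensor values are made up.
import torch

def _demo_reward_bookkeeping():
    episode_rewards = torch.tensor([[3.0], [5.0]])  # running return per process
    final_rewards = torch.tensor([[1.0], [2.0]])    # return of the last finished episode
    masks = torch.tensor([[1.0], [0.0]])            # 0.0 where the episode just ended
    final_rewards *= masks                          # keep the old value only where still running
    final_rewards += (1 - masks) * episode_rewards  # copy in the finished episode's return
    episode_rewards *= masks                        # reset the running return where done
    return episode_rewards, final_rewards           # -> [[3.], [0.]], [[1.], [5.]]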
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                               args.gamma, args.no_norm, args.num_stack,
                               args.log_dir, args.add_timestep, device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                  args.num_processes, args.gamma, args.no_norm,
                                  args.num_stack, eval_log_dir, args.add_timestep,
                                  device, allow_early_resets=True, eval=True)

        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    # FIXME this is very specific to Pommerman env right now
    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='pomm',
        nn_kwargs={
            'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': args.recurrent_policy,
            'hidden_size': 512,
        },
        train=True)
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr, lr_schedule=lr_update_schedule,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef,
                         lr=args.lr, lr_schedule=lr_update_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef or args.entropy_coef)
        replay = ReplayStorage(5e5, args.num_processes, args.gamma, 0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min / max reward {:.1f}/{:.1f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards),
                        np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards),
                        np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss),
                end=', ' if other_metrics else '\n')

            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'], other_metrics['sil_action_loss']))

        if args.eval_interval and len(episode_rewards) > 1 and j > 0 and j % args.eval_interval == 0:
            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
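# NOTE (editor sketch, not part of the script above): the save block stores a two-element
# list, [state_dict, ob_rms]. Loading it back for later evaluation would therefore look
# roughly like the following; the create_policy arguments are assumed to match the ones
# used above, and create_policy itself comes from the same module as in that script.
import torch

def _load_checkpoint_sketch(path, observation_space, action_space):
    state_dict, ob_rms = torch.load(path, map_location='cpu')
    policy = create_policy(observation_space, action_space, name='pomm',
                           nn_kwargs={'batch_norm': True, 'recurrent': False,
                                      'hidden_size': 512},
                           train=False)
    policy.load_state_dict(state_dict)
    return policy, ob_rms  # ob_rms is None when the envs were not VecNormalize-wrapped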
def main():
    import copy
    import glob
    import os
    import time

    import matplotlib.pyplot as plt
    import gym
    import numpy as np
    import torch
    torch.multiprocessing.set_start_method('spawn')
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from gym.spaces import Discrete

    from arguments import get_args
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from envs import make_env
    from img_env_corner import ImgEnv, IMG_ENVS
    from model import Policy
    from storage import RolloutStorage
    from utils import update_current_obs, eval_episode
    from torchvision import transforms
    from visdom import Visdom

    import algo

    viz = Visdom(port=8097)

    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    plot_rewards = []
    plot_policy_loss = []
    plot_value_loss = []
    # x = np.array([0])
    # y = np.array([0])
    # counter = 0
    # win = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test1",
    #     name='Line1',
    #     opts=dict(
    #         title='Reward',
    #     )
    # )
    # win2 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test2",
    #     name='Line2',
    #     opts=dict(
    #         title='Policy Loss',
    #     )
    # )
    # win3 = viz.line(
    #     X=x,
    #     Y=y,
    #     win="test3",
    #     name='Line3',
    #     opts=dict(
    #         title='Value Loss',
    #     )
    # )

    args = get_args()
    if args.no_cuda:
        args.cuda = False
    print(args)

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    toprint = ['seed', 'lr', 'nat', 'resnet']
    if args.env_name in IMG_ENVS:
        toprint += ['window', 'max_steps']
    toprint.sort()

    name = args.tag
    args_param = vars(args)
    os.makedirs(os.path.join(args.out_dir, args.env_name), exist_ok=True)
    for arg in toprint:
        if arg in args_param and (args_param[arg] or arg in ['gamma', 'seed']):
            if args_param[arg] is True:
                name += '{}_'.format(arg)
            else:
                name += '{}{}_'.format(arg, args_param[arg])
    model_dir = os.path.join(args.out_dir, args.env_name, args.algo)
    os.makedirs(model_dir, exist_ok=True)

    results_dict = {
        'episodes': [],
        'rewards': [],
        'args': args
    }
    torch.set_num_threads(1)

    eval_env = make_env(args, 'cifar10', args.seed, 1, None,
                        args.add_timestep, natural=args.nat, train=False)
    envs = make_env(args, 'cifar10', args.seed, 1, None,
                    args.add_timestep, natural=args.nat, train=True)
    # print(envs)
    # envs = envs[0]
    # if args.num_processes > 1:
    #     envs = SubprocVecEnv(envs)
    # else:
    #     envs = DummyVecEnv(envs)
    # eval_env = DummyVecEnv(eval_env)
    # if len(envs.observation_space.shape) == 1:
    #     envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy,
                          dataset=args.env_name, resnet=args.resnet,
                          pretrained=args.pretrained)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    action_space = envs.action_space
    if args.env_name in IMG_ENVS:
        action_space = np.zeros(2)
    # obs_shape = envs.observation_space.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        # envs.display_original(j)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # envs.display_step(step, j)
            # print("OBS", obs)
            # print("REWARD", reward)
            # print("DONE", done)
            # print("INFO", info)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)
            # print("envs.curr_img SHAPE: ", envs.curr_img.shape)
            # display_state = envs.curr_img
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = 5
            # display_state = custom_replace(display_state, 1, 0)
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = \
            #     envs.curr_img[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window]
            # img = transforms.ToPILImage()(display_state)
            # img.save("state_cifar/"+"state"+str(j)+"_"+str(step)+".png")

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0:
            torch.save((actor_critic.state_dict(), results_dict),
                       os.path.join(model_dir, name + 'cifar_model_ppo_ex1_corner.pt'))

        if j % args.log_interval == 0:
            end = time.time()
            total_reward = eval_episode(eval_env, actor_critic, args)
            results_dict['rewards'].append(total_reward)
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, reward {:.1f} entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         np.mean(results_dict['rewards'][-10:]),
                         dist_entropy, value_loss, action_loss))
            plot_rewards.append(np.mean(results_dict['rewards'][-10:]))
            plot_policy_loss.append(action_loss)
            plot_value_loss.append(value_loss)

            plt.plot(range(len(plot_rewards)), plot_rewards)
            plt.savefig("rewards_corner.png")
            plt.close()

            plt.plot(range(len(plot_policy_loss)), plot_policy_loss)
            plt.savefig("policyloss_corner.png")
            plt.close()

            plt.plot(range(len(plot_value_loss)), plot_value_loss)
            plt.savefig("valueloss_corner.png")
            plt.close()
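# NOTE (editor sketch): update_current_obs is imported from utils in the script above and not
# shown here. Based on the inline version in the first script in this file, it presumably
# shifts the frame stack and writes the newest observation into the last channel slot; the
# exact signature and dtype handling below are assumptions.
import numpy as np
import torch

def update_current_obs_sketch(obs, current_obs, obs_shape, num_stack):
    # number of channels contributed by a single (unstacked) observation
    shape_dim0 = obs_shape[0] // num_stack
    obs = torch.from_numpy(np.asarray(obs)).float()
    if num_stack > 1:
        # drop the oldest frame by shifting the stack one slot towards the front
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    # write the newest frame into the freed slot at the end
    current_obs[:, -shape_dim0:] = obs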
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, args.add_timestep, device, False)

    observation_space = Box(low=0, high=10000, shape=(26,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space,
                          base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                      eps=args.eps,
    #                      max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    f = open('poktr_rtmdp_20_2.txt', 'w')
    f.write("\noriginal loss(schedule 6 packets):")

    start = time.time()
    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # start node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # counts how many packets have been dropped
        end_time = startnode.messages[0].end_time
        pre_action_item = random.randint(0, 6)
        pre_action_item_oh = convert_one_hot(pre_action_item, 7)
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, end_time, pre_action_item_oh]
        states = [[0], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]  # stores the state of every node
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)
        pre_value = torch.FloatTensor([[0.1]])
        pre_action = torch.Tensor([[random.randint(0, 6)]])
        pre_action_log_prob = torch.FloatTensor([[-1.]])
        pre_recurrent_hidden_states = torch.FloatTensor([[0.]])
        pre_masks = torch.FloatTensor([[0.]])

        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            action_item = action.item()  # convert the tensor action to a Python int
            action_item_oh = convert_one_hot(action_item, 7)

            # Observe reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(
                pre_action_item, count, states, node_list, path_list, remove_count)
            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])

            obs.extend(pre_action_item_oh)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action,
                            action_log_prob, value, reward_ten, masks)
            # rollouts.insert(torch.Tensor(obs), pre_recurrent_hidden_states, pre_action,
            #                 pre_action_log_prob, pre_value, reward_ten, pre_masks)

            pre_action = action
            pre_action_item = action_item
            pre_action_log_prob = action_log_prob
            pre_recurrent_hidden_states = recurrent_hidden_states
            pre_value = value
            pre_action_item_oh = convert_one_hot(pre_action_item, 7)

        f.write("\ntime:" + str(time.strftime('%H:%M:%S', time.localtime(time.time()))) +
                "|" + str(j) + "|ep_r:" + str(ep_r) + "|packets:" + str(su_packets) +
                "|remove:" + str(remove_count) + "|ep_acc_r:" + str(ep_acc_r / 8000))
        f.flush()

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())),
              "|", j, "|ep_r:", ep_r, "|packets:", su_packets,
              "|remove:", remove_count, "|ep_acc_r:", ep_acc_r / 8000,
              "|value_loss:", value_loss, "|action_loss:", action_loss,
              "|entropy:", dist_entropy)

        rollouts.after_update()
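# NOTE (editor sketch): convert_one_hot is used above but not defined in this excerpt. Since
# its result is appended to a plain Python list via obs.extend(...), it presumably returns a
# length-n list with a 1 at the chosen index; this helper name and body are assumptions.
def convert_one_hot_sketch(index, n):
    one_hot = [0] * n   # e.g. n == 7 for the seven scheduling actions
    one_hot[index] = 1  # mark the selected action
    return one_hot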
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards),
                         dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(
                args.env_name, args.seed + args.num_processes, args.num_processes,
                args.gamma, eval_log_dir, args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                                                       actor_critic.recurrent_hidden_state_size,
                                                       device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                  format(len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
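# NOTE (editor sketch): get_vec_normalize is called above but not defined in this excerpt. In
# the upstream pytorch-a2c-ppo-acktr code it walks the vec-env wrapper chain until it finds a
# VecNormalize instance; a minimal version along those lines (attribute names assumed) is:
from baselines.common.vec_env.vec_normalize import VecNormalize

def get_vec_normalize_sketch(venv):
    if isinstance(venv, VecNormalize):
        return venv
    elif hasattr(venv, 'venv'):
        # unwrap one layer (e.g. a device or frame-stack wrapper) and keep looking
        return get_vec_normalize_sketch(venv.venv)
    return None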
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_options = None
        win_term = None

    envs = [make_env(args.env_name, args.seed, i, log_dir, args.add_timestep)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(obs_shape, envs.action_space, args.recurrent_policy,
                               args.value_loss_coef, args.entropy_coef, envs,
                               lr=args.lr, eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm, cuda=args.cuda)
    elif args.algo == 'a2oc':
        agent = algo.A2OC(envs, args, log_dir)
    elif args.algo == 'ppo':
        agent = algo.PPO(obs_shape, envs.action_space, args.recurrent_policy,
                         args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, envs,
                         lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm, cuda=args.cuda)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(obs_shape, envs.action_space, args.recurrent_policy,
                               args.value_loss_coef, args.entropy_coef, acktr=True)
    else:
        raise ValueError('args.algo does not match any expected algorithm')

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              agent.envs.action_space, agent.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = agent.envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = agent.envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    title_name = args.env_name + " | " + args.algo
    if args.algo == "a2oc":
        title_name += " | " + str(args.num_options) + " | " + str(args.delib)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states, obs, reward, done, _ = agent.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = agent.get_value(rollouts.observations[-1],
                                         rollouts.states[-1],
                                         rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, termination_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if args.cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()

            save_model = [save_model,
                          hasattr(agent.envs, 'ob_rms') and agent.envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, exp_config_str, exp_config_str + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            agent.log(total_num_steps)
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, termination_loss: {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(), final_rewards.median(),
                       final_rewards.min(), final_rewards.max(),
                       dist_entropy, value_loss, action_loss, termination_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, title_name, args.algo, args.num_frames)
                if args.algo == 'a2oc':
                    win_options = options_plot(viz, win_options, args.num_frames,
                                               title_name, agent.log_options_file)
                    win_term = term_prob_plot(viz, win_term, args.num_frames,
                                              title_name, agent.log_term_prob)
            except IOError:
                pass
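# NOTE (editor sketch): num_updates, log_dir, eval_log_dir and exp_config_str are referenced
# by several of the scripts above but are defined at module level outside these excerpts.
# The CIFAR script higher up shows the usual derivation of num_updates, repeated here for
# reference (argument names assumed to match that script's arguments module):
#   num_updates = int(args.num_frames) // args.num_steps // args.num_processes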