import logging
import time

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable

# Assumed to be provided elsewhere in this project (Agent, A3Clstm, model,
# atari_env, micropolis_env, setup_logger, ensure_shared_grads, ptitle,
# parser, read_config, mp); only the stdlib/torch/numpy imports above are
# added here.


def test(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    setup_logger('{}_map_log'.format(args.env),
                 r'{0}{1}_map_log'.format(args.log_dir, args.env))
    log['{}_map_log'.format(args.env)] = logging.getLogger(
        '{}_map_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if 'micropolis' in args.env.lower():
        import gym_micropolis  # ensures the micropolis envs are registered
        env = micropolis_env(args.env, env_conf, args)
    else:
        env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    entropy_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
        if 'arcade' not in args.env.lower():
            player.lstm_size = (1, 16, player.env.env.env.MAP_X,
                                player.env.env.env.MAP_Y)
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True  # True whenever fresh weights should be pulled from shared_model
    max_score = 0
    i = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        entropy_sum += player.entropy.data.item()

        if player.done and not player.info:
            # episode ended without a terminal signal from the env: just reset
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            # true end of episode: log, checkpoint, reset the counters
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1:1.5e}, entropy {4:1.5e}, "
                "episode length {2}, reward mean {3:1.5e}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, entropy_sum))
            if 'micropolis' in args.env.lower():
                # log an ASCII rendering of the final zone map
                # (guarded: only the micropolis envs expose micro.map)
                np.set_printoptions(threshold=400)
                log['{}_map_log'.format(args.env)].info('\n{}'.format(
                    np.array2string(
                        np.add(
                            player.env.env.env.micro.map.zoneMap[-1],
                            np.full((player.env.env.env.MAP_X,
                                     player.env.env.env.MAP_Y), 2)))
                    .replace('\n ', '').replace('][', ']\n[')
                    .replace('[[', '[').replace(']]', ']')))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}best_{1}.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}best_{1}.dat'.format(args.save_model_dir,
                                                 args.env))
            if i % 10 == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                       args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                   args.env))
            reward_sum = 0
            entropy_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            i += 1
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
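
# The checkpoints test() writes ('best_<env>.dat' / 'latest_<env>.dat') can be
# restored the same way the __main__ block below restores '<env>.dat'. A
# minimal sketch; the helper name and `path` argument are illustrative and
# nothing in the original script calls it:
def _load_checkpoint(net, path):
    """Load a state dict saved by test() into `net`, mapping storages to CPU."""
    saved_state = torch.load(path, map_location=lambda storage, loc: storage)
    net.load_state_dict(saved_state)
    return net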
def train(rank, args, shared_model, optimizer, env_conf):
    start_time = time.time()
    ptitle('Training Agent: {}'.format(rank))
    #log = {}
    #setup_logger('{}_train_log'.format(args.env), r'{0}{1}_train_log'.format(
    #    args.log_dir, args.env))
    #log['{}_train_log'.format(args.env)] = logging.getLogger(
    #    '{}_train_log'.format(args.env))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
    else:
        env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
        # assumption: A3Clstm has a single LSTM layer with hidden size 512, so
        # wrap it in the same list-of-sizes interface as getMemorySizes()
        player.lstm_sizes = [(1, 512)]
    lstm_size = 512
    if 'micropolis' in args.env.lower():
        if 'arcade' not in args.env.lower():
            lstm_size = (1, 16, env.env.env.MAP_X, env.env.env.MAP_Y)
    player.lstm_size = lstm_size
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    log_counter = 0
    while True:
        # pull the latest shared weights before every rollout
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        num_lstm_layers = len(player.lstm_sizes)
        if player.done:
            # fresh episode: zero the recurrent state of every LSTM layer
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
                    player.hx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
                player.hx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
        else:
            # continuing episode: detach the recurrent state from the old graph
            player.cx = [
                Variable(player.cx[i].data) for i in range(num_lstm_layers)
            ]
            player.hx = [
                Variable(player.hx[i].data) for i in range(num_lstm_layers)
            ]

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if args.randomize_exploration:
                player.certainty = np.random.uniform(0.5, 1.5)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # bootstrap the return from the value of the last state unless done
        R = torch.zeros(1, 1)
        if not player.done:
            values, logit, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            if values.size()[1] == 1:
                value = values
            else:
                # per-action value head: bootstrap with the sampled action's value
                prob = torch.nn.functional.softmax(logit, dim=1)
                action = prob.multinomial(1).data
                value = values[0][action]
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
                R = Variable(R).cuda()
        else:
            gae = torch.zeros(1, 1)
            R = Variable(R)
        player.values.append(R)
        policy_loss = 0
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.rewards[i] = torch.Tensor([player.rewards[i]]).cuda()
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = Variable(gae.cuda())
            else:
                gae = Variable(gae)
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        #if log_counter % 10 == 0:
        #    log['{}_train_log'.format(args.env)].info(
        #        "Time {0}, reward {1}, policy loss {2}, value loss {3}, entropy {4}".
        #        format(time.strftime("%Hh %Mm %Ss",
        #                             time.gmtime(time.time() - start_time)),
        #               '{:9.2e}'.format(float(sum(player.rewards) / len(player.rewards))),
        #               '{:9.2e}'.format(float(policy_loss.data.item())),
        #               '{:9.2e}'.format(float(value_loss.data.item())),
        #               '{:10.8e}'.format(float(sum(player.entropies)))))
        #log_counter += 1

        optimizer.zero_grad()
        a3c = args.lmbda * (policy_loss + 0.5 * value_loss)
        a3c.backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
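
# For reference, the backward pass over the rollout in train() interleaves two
# recursions: the discounted return R (for the value loss) and the GAE term
# (for the policy loss). A minimal sketch of the GAE recursion on plain
# floats; the helper name is illustrative and the training loop never calls it:
def _gae(rewards, values, gamma, tau):
    """Per-step generalized advantages for one rollout.

    `values` must contain len(rewards) + 1 entries, the last being the
    bootstrap value R that train() appends to player.values.
    """
    gae, advantages = 0.0, []
    for i in reversed(range(len(rewards))):
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta_t
        advantages.append(gae)
    return advantages[::-1]  # restore chronological order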
if __name__ == '__main__':
    __spec__ = None
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
        modelInit = getattr(model, args.design_head)
        shared_model = modelInit(env.observation_space.shape[0],
                                 env.action_space, env.env.env.MAP_X)
    else:
        env = atari_env(args.env, env_conf, args)
        shared_model = A3Clstm(env.observation_space.shape[0],
                               env.action_space)
    if args.load:
        saved_state = torch.load(
            '{0}{1}.dat'.format(args.load_model_dir, args.env),
            map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()
    if args.shared_optimizer:
    torch.cuda.manual_seed(args.seed)
    saved_state = torch.load(
        '{0}{1}.dat'.format(args.load_model_dir, args.env),
        map_location=lambda storage, loc: storage)
    log = {}
    setup_logger('{}_mon_log'.format(args.env),
                 r'{0}{1}_mon_log'.format(args.log_dir, args.env))
    log['{}_mon_log'.format(args.env)] = logging.getLogger(
        '{}_mon_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    if 'micropolis' in args.env.lower():
        env = micropolis_env("{}".format(args.env), env_conf, args)
    #else:
    #    env = atari_env("{}".format(args.env), env_conf, args)
    num_tests = 0
    start_time = time.time()
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    player.gpu_id = gpu_id
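    # Note on the getattr pattern used above and in test()/train(): the class
    # named by --design-head is looked up on the model module at runtime, e.g.
    # (hypothetical head name)
    #     modelInit = getattr(model, 'FullyConv')  # same as model.FullyConv
    # so alternative network heads can be selected from the command line
    # without editing this script.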