def test(args, shared_model, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) env = atari_env(args.env, env_conf) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.eval() for t in itertools.count(): if player.done: player.model.load_state_dict(shared_model.state_dict()) player.action_test(t) reward_sum += player.reward if player.done: num_tests += 1 player.current_life = 0 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if reward_sum > args.save_score_level: player.model.load_state_dict(shared_model.state_dict()) state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float()
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    tp_weight = args.tp
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space,
                           args.terminal_prediction,
                           args.reward_prediction)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    # Below is where the workers run episodes continuously ...
    average_ep_length = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.eps_len += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        reward_pred_loss = 0
        terminal_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)  # TODO: why is this here?
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[
                i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - player.log_probs[i] * Variable(
                gae) - 0.01 * player.entropies[i]

            if args.reward_prediction:
                reward_pred_loss = reward_pred_loss + (
                    player.reward_predictions[i] - player.rewards[i]).pow(2)

        if args.terminal_prediction:
            # Use the empirical episode length as a proxy for the current length.
            if player.average_episode_length is None:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.eps_len  # heuristic
            else:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.average_episode_length

            for i in range(len(player.terminal_predictions)):
                terminal_loss = terminal_loss + (
                    player.terminal_predictions[i] - end_predict_labels[i]).pow(2)
            terminal_loss = terminal_loss / len(player.terminal_predictions)

        player.model.zero_grad()
        # print(f"policy loss {policy_loss} and value loss {value_loss} and "
        #       f"terminal loss {terminal_loss} and reward pred loss {reward_pred_loss}")
        total_loss = policy_loss + 0.5 * value_loss + tp_weight * terminal_loss + 0.5 * reward_pred_loss
        total_loss.backward()  # will free memory ...

        # Visualize the computation graph:
        # graph = make_dot(total_loss)
        # from graphviz import Source
        # Source.view(graph)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if player.done:
            if player.average_episode_length is None:  # first finished episode
                player.average_episode_length = player.eps_len
            else:
                player.average_episode_length = int(
                    0.99 * player.average_episode_length + 0.01 * player.eps_len)
            # print(player.average_episode_length, 'current one is ', player.eps_len)
            player.eps_len = 0  # reset here
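# ensure_shared_grads is called by every training worker in this file but is not
# defined in this excerpt. Below is a minimal sketch of what it presumably does,
# following the common A3C pattern of copying each worker's gradients into the
# shared model before the shared optimizer steps; the early-return guard and the
# gpu-flag handling are assumptions inferred from the call sites, not the
# author's actual implementation.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # A CPU worker already contributed a gradient for this update.
            return
        if gpu:
            # Worker tensors live on the GPU while the shared model stays on CPU.
            shared_param._grad = param.grad.cpu()
        else:
            shared_param._grad = param.grad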
env_conf = setup_json["Default"]
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
torch.set_default_tensor_type('torch.FloatTensor')
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)
log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger(
    '{}_mon_log'.format(args.env))
env = atari_env("{}".format(args.env), env_conf)
model = A3Clstm(env.observation_space.shape[0], env.action_space)
num_tests = 0
reward_total_sum = 0
player = Agent(model, env, args, state=None)
player.env = gym.wrappers.Monitor(player.env,
                                  "{}_monitor".format(args.env),
                                  force=True)
player.model.eval()
for i_episode in range(args.num_episodes):
    state = player.env.reset()
    player.state = torch.from_numpy(state).float()
    player.eps_len = 0
    reward_sum = 0
    while True:
def test(args, shared_model, env_conf, lock, counter): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger( '{}_log'.format(args.env), r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests with lock: counter.value += 1 log['{}_log'.format(args.env)].info( "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}" .format( counter.value, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) else: state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: if player.info['ale.lives'] == 0 or player.max_length: player.eps_len = 0 state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
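# The reversed loop over player.rewards in the training functions above mixes an
# n-step return target for the critic with Generalized Advantage Estimation for
# the actor. Below is a small self-contained sketch of the same arithmetic on
# made-up numbers; rewards, values, gamma, and tau are illustrative only and not
# taken from this code.
import torch

gamma, tau = 0.99, 1.00
rewards = [0.0, 1.0, 0.0]                                # r_0 .. r_2 from a 3-step rollout
values = [torch.tensor([[0.5]]), torch.tensor([[0.6]]),
          torch.tensor([[0.7]]), torch.tensor([[0.4]])]  # V(s_0..s_2) plus the bootstrap V(s_3)

R = values[-1].clone()                                   # bootstrap from the last state value
gae = torch.zeros(1, 1)
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                           # n-step return; target for 0.5 * (R - V)^2
    advantage = R - values[i]
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]
    gae = gae * gamma * tau + delta_t                    # GAE term that scales -log_prob in the policy loss
    print(i, float(advantage), float(gae))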
os.mkdir(args.save_model_dir)
if args.seed:
    torch.manual_seed(args.seed)
if args.gpu_ids == -1:
    args.gpu_ids = [-1]
else:
    if args.seed:
        torch.cuda.manual_seed(args.seed)
    mp.set_start_method('spawn')
setup_json = read_config(args.env_config)
env_conf = setup_json["Default"]
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
env = atari_env(args.env, env_conf, args)
shared_model = A3Cff(env.observation_space.shape[0], env.action_space)
if args.load_path:
    saved_state = torch.load(args.load_path,
                             map_location=lambda storage, loc: storage)
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()
if args.optimizer == 'RMSprop':
    optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
if args.optimizer == 'Adam':
    optimizer = SharedAdam(shared_model.parameters(),
                           lr=args.lr,
                           amsgrad=args.amsgrad)
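# The setup above stops after the shared model and optimizer are created; the
# part that launches the workers is not shown. A hedged sketch of the usual next
# step, assuming torch.multiprocessing imported as mp, a SharedRMSprop/SharedAdam
# class that exposes share_memory(), and the train/test functions defined
# elsewhere in this repo; the process count and argument order are assumptions.
optimizer.share_memory()

processes = []
p = mp.Process(target=test, args=(args, shared_model, env_conf))  # evaluation process
p.start()
processes.append(p)
for rank in range(args.workers):                                  # training workers
    p = mp.Process(target=train,
                   args=(rank, args, shared_model, optimizer, env_conf))
    p.start()
    processes.append(p)
for p in processes:
    p.join()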
# left 3 5.
def grayshow(img):
    img = img.squeeze()
    # img = img / 2 + 0.5  # unnormalize
    # npimg = img.numpy()
    plt.imshow(img, cmap='gray')
    plt.show()


if __name__ == '__main__':
    pygame.init()
    screen = pygame.display.set_mode((300, 300))
    pygame.display.set_caption('Listening for keyboard input')
    screen.fill((255, 255, 255))
    pygame.key.set_repeat(70)
    pygame.display.flip()
    env = atari_env(args.env, env_conf, args)  # gym.make("SpaceInvaders-v0")
    # action1 = env.action_space.sample()
    # print(env.action_space.n)
    # while action1 in [0, 2, 3, 4, 5]:
    #     action1 = env.action_space.sample()
    #     print(action1)
    init_log = 3
    while True:
        trace_s = []
        trace_a = []
        s = env.reset()
        action = key_action[NO]
        while True:
            # grayshow(s)
            trace_s.append(s)
def train(rank, reward_type, args, shared_model, optimizer, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf) env.seed(args.seed + rank) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None, reward_type) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.train() for i in itertools.count(): if i % 10 == 0: print("reward type {0}, iter {1}".format(reward_type, i)) player.model.load_state_dict(shared_model.state_dict()) for step in range(args.num_steps): player.action_train() reward_sum += player.reward if args.count_lives: player.check_state() if player.done: break if player.done: num_tests += 1 player.current_life = 0 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) player.eps_len = 0 player.current_life = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] optimizer.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(player.model.parameters(), 40) ensure_shared_grads(player.model, shared_model) optimizer.step() player.clear_actions()
def train_robust(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] if args.seed: torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) if args.seed: env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Cff(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) p = optimizer.param_groups[0]['params'][0] step = optimizer.state[p]['step'] if step >= (args.total_frames / args.num_steps): return #increase linearly until 2/3 through halfway lin_coeff = min(1, (1.5 * int(step) + 1) / (args.total_frames / args.num_steps)) epsilon = lin_coeff * args.epsilon_end kappa = args.kappa_end #(1-lin_coeff)*1 + lin_coeff*args.kappa_end for step in range(args.num_steps): player.action_train(bound_epsilon=epsilon) if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: value, _ = player.model(Variable(player.state.unsqueeze(0))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t if gae >= 0: worst_case_loss = -player.min_log_probs[i] * Variable(gae) else: worst_case_loss = -player.max_log_probs[i] * Variable(gae) standard_loss = -player.log_probs[i] * Variable(gae) policy_loss = policy_loss + kappa * standard_loss + ( 1 - kappa) * worst_case_loss - 0.01 * player.entropies[i] #print(policy_loss + 0.5 * value_loss) player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
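# train_robust above ramps the perturbation radius epsilon linearly with the
# number of optimizer steps, reaching args.epsilon_end about two thirds of the
# way through training and staying flat afterwards. The same schedule in
# isolation (total_updates stands in for args.total_frames / args.num_steps;
# the values below are illustrative only):
def robust_epsilon(step, total_updates, epsilon_end):
    lin_coeff = min(1.0, (1.5 * int(step) + 1) / total_updates)
    return lin_coeff * epsilon_end

for s in (0, 333, 666, 999):
    print(s, robust_epsilon(s, 1000, 1 / 255))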
def test(args, shared_models, env_conf): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(env, args, gpu_id) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() flag = True max_score = 0 prev_reward = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.models[0].load_state_dict( shared_models[0].state_dict()) player.models[1].load_state_dict( shared_models[1].state_dict()) else: player.models[0].load_state_dict(shared_models[0].state_dict()) player.models[1].load_state_dict(shared_models[1].state_dict()) player.models[0].eval() player.models[1].eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) with open('./results', 'a') as f: line = f"{reward_total_sum - prev_reward}\n" f.write(line) prev_reward = reward_total_sum player.episodic_reward = 0 if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.models[0].state_dict() torch.save( state_to_save, '{0}{1}_early.dat'.format(args.save_model_dir, args.env)) state_to_save = player.models[1].state_dict() torch.save( state_to_save, '{0}{1}_late.dat'.format(args.save_model_dir, args.env)) else: state_to_save = player.models[0].state_dict() torch.save( state_to_save, '{0}{1}_early.dat'.format(args.save_model_dir, args.env)) state_to_save = player.models[1].state_dict() torch.save( state_to_save, '{0}{1}_late.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] writer = SummaryWriter(log_dir=args.log_dir + 'tb_train') log = {} setup_logger('{}_train_log'.format(rank), r'{0}{1}_train_log'.format(args.log_dir, rank)) log['{}_train_log'.format(rank)] = logging.getLogger( '{}_train_log'.format(rank)) torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(env_id=rank, args=args, type='train') if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[2], player.env.action_space.n) player.state = player.env.reset() player.state = normalize_rgb_obs(player.state) player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() num_trains = 0 if not os.path.exists(args.log_dir + "images/"): os.makedirs(args.log_dir + "images/") while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: num_trains += 1 log['{}_train_log'.format(rank)].info('entropy:{0}'.format( player.entropy.data[0])) writer.add_scalar("data/entropy_" + str(rank), player.entropy.data[0], num_trains) writer.add_image('FCN_' + str(rank), player.fcn, num_trains) writer.add_image('Depth_GroundTruth_' + str(rank), player.depth, num_trains) writer.add_image('RGB_' + str(rank), player.env.get_rgb(), num_trains) save_image( player.fcn.data, args.log_dir + "images/" + str(rank) + "_" + str(num_trains) + "_fcn.png") # print("player.fcn.data:", player.fcn.data) save_image( player.depth.data, args.log_dir + "images/" + str(rank) + "_" + str(num_trains) + "_depth.png") cv2.imwrite( args.log_dir + "images/" + str(rank) + "_" + str(num_trains) + "_rgb.png", player.env.get_rgb()) # print("player.depth.data:", player.depth.data) player.eps_len = 0 player.current_life = 0 state = player.env.reset() state = normalize_rgb_obs(state) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: with torch.cuda.device(gpu_id): value, _, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx), Variable( torch.from_numpy(player.env.target).type( torch.FloatTensor).cuda()))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = args.gamma * player.values[ i + 1].data + player.rewards[i] - player.values[i].data gae = gae * args.gamma * args.tau + delta_t # policy_loss = policy_loss - \ # player.log_probs[i] * \ # Variable(gae) - 0.01 * 
player.entropies[i] \ # + player.fcn_losses[i] # FCN policy_loss = policy_loss - 1e-5*(player.log_probs[i] * Variable(gae)) - 1e-5*(0.01 * player.entropies[i]) \ + player.fcn_losses[i] * DEPTH_LOSS_DISCOUNT # FCN # policy_loss = policy_loss + player.fcn_losses[i] # FCN writer.add_scalar("data/value_loss_" + str(rank), value_loss, num_trains) writer.add_scalar("data/policy_loss_" + str(rank), policy_loss, num_trains) player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(player.model.parameters(), 40.0) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
def test(args, shared_model, optimizer, env_conf): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] start_time = datetime.now().strftime('%Y-%m-%d_%H_%M_%S') log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_{2}_log'.format( args.log_dir, args.env, start_time)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) if not os.path.exists(args.save_model_dir): os.mkdir(args.save_model_dir) if args.seed: torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Cff(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = -10000 while True: p = optimizer.param_groups[0]['params'][0] step = optimizer.state[p]['step'] player.model.eval() if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) flag = False with torch.no_grad(): if args.robust: #player.action_test_losses(args.epsilon_end) lin_coeff = min(1, (1.5*int(step)+1)/(args.total_frames/args.num_steps)) epsilon = lin_coeff*args.epsilon_end player.action_train(epsilon) else: player.action_train() #player.action_test_losses() reward_sum += player.noclip_reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: # calculate losses for tracking R = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(R) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) standard_loss = 0 worst_case_loss = 0 value_loss = 0 entropy = 0 for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss += 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t if args.robust: if advantage >= 0: worst_case_loss += - player.min_log_probs[i] * Variable(gae) else: worst_case_loss += - player.max_log_probs[i] * Variable(gae) standard_loss += -player.log_probs[i] * Variable(gae) entropy += player.entropies[i] standard_loss = standard_loss/len(player.rewards) worst_case_loss = worst_case_loss/len(player.rewards) value_loss = value_loss/len(player.rewards) entropy = entropy/len(player.rewards) player.clear_actions() flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( ("Time {0}, steps {1}/{2}, ep reward {3}, ep length {4}, reward mean {5:.3f} \n"+ "Losses: Policy:{6:.3f}, Worst case: {7:.3f}, Value: {8:.3f}, Entropy: {9:.3f}"). 
format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start)), int(step), args.total_frames/args.num_steps, reward_sum, player.eps_len, reward_mean, float(standard_loss), float(worst_case_loss), float(value_loss), float(entropy))) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}_{2}_best.pt'.format( args.save_model_dir, args.env, start_time)) else: state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}_{2}_best.pt'.format( args.save_model_dir, args.env, start_time)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 #stop after total steps gradient updates have passed if step >= args.total_frames/args.num_steps: if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}_{2}_last.pt'.format( args.save_model_dir, args.env, start_time)) else: state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}_{2}_last.pt'.format( args.save_model_dir, args.env, start_time)) return time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def test(args, shared_model, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) env = atari_env(args.env, env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) state = env.reset() reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(model, env, args, state) player.state = torch.from_numpy(state).float() player.model.eval() while True: if player.starter and player.flag: player = player_start(player) else: player.flag = False if player.done and not player.flag: player.model.load_state_dict(shared_model.state_dict()) player.cx = Variable(torch.zeros(1, 512), volatile=True) player.hx = Variable(torch.zeros(1, 512), volatile=True) player.flag = False elif not player.flag: player.cx = Variable(player.cx.data, volatile=True) player.hx = Variable(player.hx.data, volatile=True) player.flag = False if not player.flag: player, reward = player_act(player, train=False) reward_sum += reward if not player.done: if player.current_life > player.info['ale.lives']: player.flag = True player.current_life = player.info['ale.lives'] else: player.current_life = player.info['ale.lives'] player.flag = False if player.done: num_tests += 1 player.current_life = 0 player.flag = True reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if reward_sum > args.save_score_level: player.model.load_state_dict(shared_model.state_dict()) state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float()
def train(rank, args, shared_models, optimizers, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    env.seed(args.seed + rank)
    player = Agent(env, args, gpu_id)
    player.rank = rank
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    player.models[0].train()
    player.models[1].train()
    player.eps_len += 2
    # player.test_models()
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                # player.model.load_state_dict(shared_model.state_dict())
                player.models[0].load_state_dict(shared_models[0].state_dict())
                player.models[1].load_state_dict(shared_models[1].state_dict())
        else:
            # player.model.load_state_dict(shared_model.state_dict())
            player.models[0].load_state_dict(shared_models[0].state_dict())
            player.models[1].load_state_dict(shared_models[1].state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        # if rank == 0:
        #     print(player.episodic_reward)
        player.episodic_reward = 0
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.models[player.curr_model_id](
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        # player.values.append(Variable(R))
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        # print("Length of values vector", len(player.values))
        # print("Length of rewards vector", len(player.rewards))
        # print("Length of model sequence vector", len(player.model_sequence))
        next_val = Variable(R)
        last_val = next_val
        R_vec = [Variable(R), Variable(R)]
        # last_id = player.model_sequence[-1]
        active_flags = [False, False]
        policy_loss = [0, 0]
        value_loss = [0, 0]
        for reward, value, model_id, log_prob, entropy in zip(
                reversed(player.rewards), reversed(player.values),
                reversed(player.model_sequence), reversed(player.log_probs),
                reversed(player.entropies)):
            active_flags[model_id] = True
            R_vec[model_id] = args.gamma * R_vec[model_id] + reward
            R_vec[(model_id + 1) % 2] *= args.gamma
            advantage = R_vec[model_id] - value
            value_loss[model_id] += 0.5 * advantage.pow(2)
            delta_t = reward + args.gamma * next_val.data - value.data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss[model_id] -= (log_prob * Variable(gae) + 0.01 * entropy)
            next_val = value

        try:
            if active_flags[0] is True:
                player.models[0].zero_grad()
                (policy_loss[0] + 0.5 * value_loss[0]).backward()
                ensure_shared_grads(player.models[0], shared_models[0],
                                    gpu=gpu_id >= 0)
                optimizers[0].step()
            if active_flags[1] is True:
                player.models[1].zero_grad()
                (policy_loss[1] + 0.5 * value_loss[1]).backward()
                ensure_shared_grads(player.models[1], shared_models[1],
                                    gpu=gpu_id >= 0)
                optimizers[1].step()
        except Exception as e:
            print("Exception caught. Ignoring:", e)
            if rank == 1:
                print(player.rewards)
                print(player.model_sequence)
        player.clear_actions()
torch.cuda.manual_seed(args.seed)
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)
log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger(
    '{}_mon_log'.format(args.env))
d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
env = atari_env(env_id=0, args=args, type='train')
num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[2],
                       player.env.action_space.n)
player.model.load_state_dict(saved_state)
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
# player.env = gym.wrappers.Monitor(
#     player.env, "{}_monitor".format(args.env), force=True)
player.model.eval()
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks, but it was not beneficial. Hogwild
# training was far superior.
if __name__ == '__main__':
    # -------- set the global random seed and multiprocessing ---------
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    # -------- gym environment preprocessing ---------------
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    envs = [
        atari_env(args.env, env_conf, args, rank)
        for rank in range(args.workers)
    ]
    observation_space, action_space = envs[0].observation_space.shape[0], envs[0].action_space
    # ------- shared LSTM network; load parameters
    envs = ParallelEnv(envs)
    train(args, envs, observation_space, action_space)
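# ParallelEnv is referenced above but not defined in this excerpt. Below is a
# minimal sequential stand-in for the interface the call site appears to expect
# (reset and step across all worker environments, returning stacked arrays); the
# real class presumably runs each environment in its own process, and the method
# names and auto-reset behaviour here are assumptions for illustration only.
import numpy as np

class ParallelEnv:
    def __init__(self, envs):
        self.envs = envs

    def reset(self):
        return np.stack([env.reset() for env in self.envs])

    def step(self, actions):
        states, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            state, reward, done, info = env.step(action)
            if done:
                state = env.reset()  # keep every worker stream producing data
            states.append(state)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        return np.stack(states), np.array(rewards), np.array(dones), infos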
def train(rank, args, shared_model, optimizer, env_conf, emb, bi_grams, instructions): # Changes the process name ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) # Define special vectors eos_vector = emb.get_vector("<eos>") oov_vector = emb.get_vector("<oov>") if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam( shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) # Create agent player = Agent(None, env, args, None, emb) player.gpu_id = gpu_id # Create DNN model for the agent player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space, emb) # Set env and move to gpu player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() # Set model to "training" mode. Not doing anything but is a good practice to add player.model.train() # Start iteration player.eps_len += 2 _counter = 0 while True: # Loading param values from shared model if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) # Reset LSTM state when episode ends if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, args.lstm_size).cuda()) player.hx = Variable(torch.zeros(1, args.lstm_size).cuda()) else: player.cx = Variable(torch.zeros(1, args.lstm_size)) player.hx = Variable(torch.zeros(1, args.lstm_size)) # If not ended, save current state value else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) # Make a step and record observations. Repeat until num_steps reached or game is over. 
for step in range(args.num_steps): player.action_train() if player.done: break # If episode finished before args.num_steps is reached, reset environment if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() # If episode not finished after args.num_steps: # Estimates value function of current state R = torch.zeros(1, 1) if not player.done: _, value, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() # Append reward for the final time step player.values.append(Variable(R)) # Initialise loss accumulator policy_loss = 0 value_loss = 0 language_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) # Accumulate the losses for i in reversed(range(len(player.rewards))): # Calculating language loss if args.use_language: # Calculating language loss # Get action of a time step a = np.argmax(player.action_logits[i].detach().cpu().numpy()) # Get produced vectors of the time step produced_logits = player.produced_logits[i] # print(produced_vectors) # Get target vectors of the time step (an instruction corresponding to the least cost) action_instructions = instructions[a] # Sample a few from the set for _ in range(10): idx = random.randrange(0, len(action_instructions)) instruction = action_instructions[idx] target_words = instruction.split() for pos, target_word in enumerate(target_words): target_class = torch.tensor(emb.get_index(target_word)).cuda() produced_logit = produced_logits[pos] # Cross_entropy combines log-softmax and nll # Here procuded_vec is one-hot while target is an integer language_loss += torch.nn.functional.cross_entropy(produced_logit, target_class.unsqueeze(0)) if target_word == '<eos>': break # Calculate other losses R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] # Initialise grad accumulator player.model.zero_grad() # Calculate grad and update if args.use_language: (policy_loss + 0.5 * value_loss + 0.1 * 0.01* language_loss).backward() else: (policy_loss + 0.5 * value_loss).backward() """ # (policy_loss + 0.5 * value_loss).backward() print("****************") print(policy_loss) print(value_loss) # """ if args.use_language and _counter % 10 == 0: print("****************") #print(policy_loss) #print(value_loss) print("language loss", language_loss) _counter += 1 # Copying over the parameters to shared model ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() # Clean agent observations player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf, iters, checkpoint_path): iters = dill.loads(iters) if args.enable_gavel_iterator and rank == 0: iters._init_logger() ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 elapsed_time = 0 start_time = time.time() for i in iters: if i % 100 == 0: print('GPU %d finished step %d' % (rank, i), flush=True) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 512).cuda()) player.hx = Variable(torch.zeros(1, 512).cuda()) else: player.cx = Variable(torch.zeros(1, 512)) player.hx = Variable(torch.zeros(1, 512)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() elapsed_time += time.time() - start_time start_time = time.time() if (args.throughput_estimation_interval is not None and i % args.throughput_estimation_interval == 0 and rank == 0): print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i)) if (args.max_duration is not None and elapsed_time >= args.max_duration): break if args.enable_gavel_iterator and rank == 0: state = shared_model.state_dict() iters.save_checkpoint(state, checkpoint_path) iters.complete()
def test(args, shared_model, env_conf, shared_counter): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu') log = {} setup_logger( '{}_log'.format(args.env), os.path.join(args.log_dir, '{}-{}_log'.format(args.env, args.exp_name))) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None, gpu_id=gpu_id) player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.model.apply(weights_init) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).to(torch.float32) player.model = player.model.to(device) player.state = player.state.to(device) flag = True max_score = 0 while True: if flag: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device) elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean, player.model.log_alpha.exp().detach().item())) if args.save_max and reward_sum >= max_score: max_score = reward_sum torch.save( player.model.state_dict(), os.path.join(args.save_model_dir, '{}-{}.dat'.format(args.env, args.exp_name))) with shared_counter.get_lock(): shared_counter.value += player.eps_len if shared_counter.value > args.interact_steps: break reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).to(torch.float32) player.state = player.state.to(device)
def test(args, shared_model, env_conf): log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format( args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger( '{}_log'.format(args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) env = atari_env(args.env, env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) model.eval() state = env.reset() state = torch.from_numpy(state).float() reward_sum = 0 done = True start_time = time.time() episode_length = 0 num_tests = 0 reward_total_sum = 0 while True: episode_length += 1 # Sync with the shared model if done: model.load_state_dict(shared_model.state_dict()) cx = Variable(torch.zeros(1, 512), volatile=True) hx = Variable(torch.zeros(1, 512), volatile=True) else: cx = Variable(cx.data, volatile=True) hx = Variable(hx.data, volatile=True) value, logit, (hx, cx) = model((Variable( state.unsqueeze(0), volatile=True), (hx, cx))) prob = F.softmax(logit) action = prob.max(1)[1].data.numpy() state, reward, done, _ = env.step(action[0, 0]) done = done or episode_length >= args.max_episode_length reward_sum += reward if done: num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length, reward_mean)) if reward_sum > args.save_score_level: model.load_state_dict(shared_model.state_dict()) state_to_save = model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format( args.save_model_dir, args.env)) reward_sum = 0 episode_length = 0 state = env.reset() time.sleep(60) state = torch.from_numpy(state).float()
def train(rank, args, shared_model, optimizer, env_conf): torch.manual_seed(args.seed + rank) env = atari_env(args.env, env_conf) model = A3Clstm(env.observation_space.shape[0], env.action_space) _ = env.reset() action = env.action_space.sample() _, _, _, info = env.step(action) start_lives = info['ale.lives'] if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) model.train() env.seed(args.seed + rank) state = env.reset() state = torch.from_numpy(state).float() done = True episode_length = 0 current_life = start_lives while True: episode_length += 1 # Sync with the shared model model.load_state_dict(shared_model.state_dict()) if done: cx = Variable(torch.zeros(1, 512)) hx = Variable(torch.zeros(1, 512)) else: cx = Variable(cx.data) hx = Variable(hx.data) values = [] log_probs = [] rewards = [] entropies = [] for step in range(args.num_steps): value, logit, (hx, cx) = model( (Variable(state.unsqueeze(0)), (hx, cx))) prob = F.softmax(logit) log_prob = F.log_softmax(logit) entropy = -(log_prob * prob).sum(1) entropies.append(entropy) action = prob.multinomial().data log_prob = log_prob.gather(1, Variable(action)) state, reward, done, info = env.step(action.numpy()) done = done or episode_length >= args.max_episode_length if args.count_lives: if current_life > info['ale.lives']: done = True else: current_life = info['ale.lives'] reward = max(min(reward, 1), -1) if done: episode_length = 0 current_life = start_lives state = env.reset() state = torch.from_numpy(state).float() values.append(value) log_probs.append(log_prob) rewards.append(reward) if done: break R = torch.zeros(1, 1) if not done: value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) R = value.data values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) gae = torch.zeros(1, 1) for i in reversed(range(len(rewards))): R = args.gamma * R + rewards[i] advantage = R - values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion delta_t = rewards[i] + args.gamma * \ values[i + 1].data - values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ log_probs[i] * Variable(gae) - 0.01 * entropies[i] optimizer.zero_grad() (policy_loss + 0.5 * value_loss).backward() torch.nn.utils.clip_grad_norm(model.parameters(), 40) ensure_shared_grads(model, shared_model) optimizer.step()
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False

        player.action_test()
        trace.append(player.state)

        if len(trace) > args.trace_length:
            # Train for a few hundred iterations on the collected trace.
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(trace[begin + TD:begin + TD + batch_size],
                                     dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()
                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net, shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net, shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []

        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
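# The td_class bins in train_rep turn the temporal distance between two sampled
# frames into a classification target for c_net. A tiny sketch of that mapping
# on its own; the helper name below is made up for illustration.
td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]

def temporal_distance_label(td):
    # Return the index of the half-open bin [lo, hi) containing td, mirroring
    # how range_c labels a pair of frames sampled TD steps apart.
    for label, (lo, hi) in enumerate(td_class):
        if lo <= td < hi:
            return label
    raise ValueError('temporal distance {} outside the binned range'.format(td))

print([temporal_distance_label(td) for td in range(9)])  # -> [0, 1, 2, 3, 3, 4, 4, 5, 5]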
def test(args, shared_model, env_conf):
    # print('IN TEST')
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    setup_logger('{}_map_log'.format(args.env),
                 r'{0}{1}_map_log'.format(args.log_dir, args.env))
    log['{}_map_log'.format(args.env)] = logging.getLogger(
        '{}_map_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    if 'micropolis' in args.env.lower():
        import gym_micropolis
        env = micropolis_env(args.env, env_conf, args)
    else:
        # print('using atari env for test')
        env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    entropy_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
        if 'arcade' not in args.env.lower():
            player.lstm_size = (1, 16, player.env.env.env.MAP_X,
                                env.env.env.MAP_Y)
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    i = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False
        player.action_test()
        reward_sum += player.reward
        entropy_sum += player.entropy.data.item()
        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            # Episode (including all lives) finished: log, checkpoint, reset.
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1:1.5e}, entropy {4:1.5e}, episode length {2}, reward mean {3:1.5e}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, entropy_sum))
            import numpy as np
            np.set_printoptions(threshold=400)
            log['{}_map_log'.format(args.env)].info('\n{}'.format(
                np.array2string(
                    np.add(
                        player.env.env.env.micro.map.zoneMap[-1],
                        np.full((player.env.env.env.MAP_X,
                                 player.env.env.env.MAP_Y), 2))).replace(
                                     '\n ', '').replace('][', ']\n[').replace(
                                         '[[', '[').replace(']]', ']')))
            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}best_{1}.dat'.format(args.save_model_dir,
                                                     args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}best_{1}.dat'.format(args.save_model_dir,
                                                 args.env))
            if i % 10 == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                       args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}latest_{1}.dat'.format(args.save_model_dir,
                                                   args.env))
            reward_sum = 0
            entropy_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            i += 1
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, shared_counter,
          targ_shared):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')
    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    # player.targ_model = copy.deepcopy(player.model)
    player.model.train()
    # player.targ_model.eval()
    player.eps_len += 2
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        # player.targ_model.load_state_dict(targ_shared.state_dict())
        if player.done:
            player.cx = torch.zeros(1, 512).to(device)
            player.hx = torch.zeros(1, 512).to(device)
            # player.targ_cx = copy.deepcopy(player.cx).detach()
            # player.targ_hx = copy.deepcopy(player.hx).detach()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
        # Entropy temperature; a learned alpha is kept here for reference.
        # alpha = player.model.log_alpha.exp().detach()
        alpha = .01
        # alpha = 0
        x_R = torch.zeros(1, 1)
        if not player.done:
            with torch.no_grad():
                action, value, logit, q_value, _ = player.model(
                    (player.state.unsqueeze(0), (player.hx, player.cx)))
                x_R = q_value[1].detach() - alpha * F.log_softmax(
                    logit, -1).gather(-1, action)
        x_R = x_R.to(device)
        policy_loss = 0
        adv_gae_loss = 0
        for i in reversed(range(len(player.rewards))):
            x_R = args.gamma * x_R + player.rewards[i]
            adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] -
                                           x_R.detach()).pow(2) * .5
            # policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()
            policy_loss = policy_loss - (
                F.softmax(player.values[i], -1) *
                player.tra_adv_gae[i][0].detach()).sum(
                    -1) - alpha * player.entropies[i].unsqueeze(-1)
            # Alternative policy-loss formulations kept for reference:
            # policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) *
            #     player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()
            # prob = F.softmax(player.values[i], -1)
            # ent_alpha = alpha * player.entropies[i].unsqueeze(-1)
            # advs = (player.tra_adv_gae[i][0] -
            #         ((player.tra_adv_gae[i][0] * prob).sum(-1, True) +
            #          ent_alpha)).detach()
            # policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha
            x_R = x_R - alpha * player.log_probs[i].detach()
        player.model.zero_grad()
        (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        # Count environment steps before the rollout buffers are cleared.
        with shared_counter.get_lock():
            shared_counter.value += len(player.rewards)
        player.clear_actions()
        if shared_counter.value > args.interact_steps:
            break
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            # The model has two LSTM cells, so hidden/cell states are pairs.
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]
        # Check whether r_net updates propagate to this worker:
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]), (player.hx[1], player.cx[1])))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        with lock:
            counter.value += 1
        # Update the representation network with a weighted actor-critic loss.
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net,
                            shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()
        # Update the full model; zero r_net grads so the main optimizer does
        # not apply them a second time.
        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32,
          num_tau_prime_samples=32, kappa=1.0, num_quantiles=32):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        R = torch.zeros(1, num_tau_prime_samples)
        if not player.done:
            # Bootstrap from the quantile estimates of the greedy action.
            logit, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            q_vals = torch.mean(logit, 0)
            _, action = torch.max(q_vals, 0)
            logit, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = logit[:, action]
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        # R = R.detach()
        R = Variable(R)
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            # Pairwise TD errors between target and predicted quantiles.
            advantage = R.repeat(num_tau_samples, 1) - \
                player.logits_array[i].repeat(1, num_tau_prime_samples)
            # print("Ad: ", advantage)
            # Huber loss with threshold kappa.
            loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2
            # print("loss: ", loss.sum(0).sum(0), loss)
            loss += (torch.abs(advantage) > kappa).float() * kappa * \
                (torch.abs(advantage) - 0.5 * kappa)
            # print("loss: ", loss.sum(0).sum(0), loss)
            # Quantile-regression weighting (note: .cuda() here assumes a GPU).
            step_loss = torch.abs(player.quantiles_array[i].cuda() -
                                  (advantage.detach() < 0).float()) * loss / kappa
            value_loss += step_loss.sum(0).mean(0)
        player.model.zero_grad()
        value_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
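# Illustrative only (not from the original repo): the quantile-regression
# Huber loss used in the training loop above, written as a standalone function
# so the broadcasting is easier to follow. `pred` holds num_tau predicted
# quantiles, `target` holds num_tau_prime target samples, and `taus` the
# quantile fractions in (0, 1); all names here are assumptions.
import torch


def quantile_huber_loss(pred, target, taus, kappa=1.0):
    # Pairwise TD errors u[j, k] = target_k - pred_j, shape (num_tau, num_tau_prime).
    u = target.view(1, -1) - pred.view(-1, 1)
    abs_u = u.abs()
    # Huber loss with threshold kappa.
    huber = torch.where(abs_u <= kappa,
                        0.5 * u.pow(2),
                        kappa * (abs_u - 0.5 * kappa))
    # Asymmetric quantile weighting: over- and under-estimation are penalised
    # according to the quantile fraction tau_j.
    weight = (taus.view(-1, 1) - (u.detach() < 0).float()).abs()
    # Sum over predicted quantiles, average over target samples.
    return (weight * huber / kappa).sum(0).mean()


# Example usage with random tensors of the shapes used above.
loss_example = quantile_huber_loss(torch.randn(32), torch.randn(32),
                                   torch.rand(32))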
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break
        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
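# Not part of the original source: a compact illustration of the Generalized
# Advantage Estimation recursion used in the training loops above, with plain
# Python floats instead of tensors. `values` must contain one extra bootstrap
# entry at the end, matching player.values after the appended Variable(R).
def gae_advantages(rewards, values, gamma=0.99, tau=1.00):
    """Return the GAE advantage for each step of a rollout."""
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for i in reversed(range(len(rewards))):
        # One-step TD error, then the exponentially weighted accumulation.
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta_t
        advantages[i] = gae
    return advantages


# Example: a two-step rollout with a bootstrap value of 0.5 appended.
print(gae_advantages([1.0, 0.0], [0.2, 0.3, 0.5]))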
    for key in convertor_config.hyperparameters:
        exec('hyperparameters[\'%s\'] = convertor_config.hyperparameters[\'%s\']'
             % (key, key))
    trainer = []
    exec("trainer=%s(convertor_config.hyperparameters)" %
         convertor_config.hyperparameters['trainer'])
    trainer.gen.load_state_dict(
        torch.load(
            '/home/amittel/Desktop/CMU/DRL/rl_a3c_pytorch/conversion_models/attentionbreakout2pong_v0_gen_00003500.pkl'
        ))
    trainer.gen.eval()
    # trainer.cuda(args.gpu)
    trainer.share_memory()
    distance_gan = trainer
else:
    convertor_config = None
    distance_gan = None
convertor = distance_gan
env = atari_env(args.env, env_conf, args, None, None, mapFrames=False)
model_env = None
if args.use_convertor:
    setup_json = read_config(args.model_env_config)
    model_env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.model_env:
            model_env_conf = setup_json[i]
    model_env = atari_env(args.model_env, model_env_conf, args)
shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
# (TODO): We need to load the pretrained Pong weights so that the last layer (Ac-
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)
        for step in range(args.num_steps):
            player = player_act(player, train=True)
            if player.done:
                break
            # Track life loss so an episode can be cut short when counting lives.
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break
            if player.starter and player.flag:
                player = player_start(player, train=True)
                if player.done:
                    break
        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data
        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - \
                0.01 * player.entropies[i]
        optimizer.zero_grad()
        (policy_loss + value_loss).backward()
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
def test(rank, args, shared_model):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    writer = SummaryWriter(log_dir=args.log_dir + 'tb_test')
    log = {}
    setup_logger('{}_log'.format('Test_' + str(rank)),
                 r'{0}{1}_log'.format(args.log_dir, 'Test_' + str(rank)))
    log['{}_log'.format('Test_' + str(rank))] = logging.getLogger(
        '{}_log'.format('Test_' + str(rank)))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format('Test_' + str(rank))].info('{0}: {1}'.format(
            k, d_args[k]))
    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(env_id=rank, args=args, type='train')
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    num_inside_target_room = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[2],
                           player.env.action_space.n)
    player.state = player.env.reset()
    player.state = normalize_rgb_obs(player.state)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()
    action_times = 0
    while True:
        action_times += 1
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
        player.action_test()
        reward_sum += player.reward
        # Dump one frame per step so finished episodes can be stitched into a video.
        if not os.path.exists(args.log_dir + "video/" + str(rank) + "_" +
                              str(num_tests)):
            os.makedirs(args.log_dir + "video/" + str(rank) + "_" +
                        str(num_tests))
        cv2.imwrite(
            args.log_dir + "video/" + str(rank) + "_" + str(num_tests) + "/" +
            str(action_times) + ".png",
            player.env.get_rgb())  # (90, 120, 3)
        if player.done:
            frame_to_video(
                fileloc=args.log_dir + "video/" + str(rank) + "_" +
                str(num_tests) + "/%d.png",
                t_w=120,
                t_h=90,
                destination=args.log_dir + "video/" + str(rank) + "_" +
                str(num_tests) + ".mp4")
            shutil.rmtree(args.log_dir + "video/" + str(rank) + "_" +
                          str(num_tests))
            action_times = 0
            num_tests += 1
            num_inside_target_room += player.env.inside_target_room
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            success_rate = num_inside_target_room / num_tests
            log['{}_log'.format('Test_' + str(rank))].info(
                "Time {0}, Tester {1}, test counter {2}, episode reward {3}, episode length {4}, reward mean {5:.4f}, success rate {6}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    rank, num_tests, reward_sum, player.eps_len, reward_mean,
                    success_rate))
            # Tensorboard
            writer.add_scalar("data/episode_reward", reward_sum, num_tests)
            writer.add_scalar("data/episode_length", player.eps_len,
                              num_tests)
            writer.add_scalar("data/reward_mean", reward_mean, num_tests)
            writer.add_scalar("data/success_rate", success_rate, num_tests)
            if reward_sum > args.save_score_level:
                # player.model.load_state_dict(shared_model.state_dict())
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                    'Test_' + str(rank),
                                                    reward_sum))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                'Test_' + str(rank),
                                                reward_sum))
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(10)
            state = normalize_rgb_obs(state)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
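# The frame_to_video helper called in the test loop above is not shown in this
# file. The sketch below is one plausible OpenCV-based implementation with the
# same call signature (fileloc is a %d-style frame pattern, t_w/t_h the target
# frame size); it is an assumption, not the project's actual code.
import glob
import os

import cv2


def frame_to_video(fileloc, t_w, t_h, destination, fps=10):
    """Stitch numbered PNG frames matching `fileloc` into an mp4 at `destination`."""
    frame_dir = os.path.dirname(fileloc)
    frames = sorted(
        glob.glob(os.path.join(frame_dir, '*.png')),
        key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(destination, fourcc, fps, (t_w, t_h))
    for path in frames:
        frame = cv2.imread(path)
        out.write(cv2.resize(frame, (t_w, t_h)))
    out.release()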