def init_process(mode: str, id: int, to_close_on_termination: OnPolicyRLEngine):
    ptitle(f"{mode}-{id}")

    def create_handler(termination_type: str):
        def handler(_signo, _frame):
            prefix = f"{termination_type} signal sent to worker {mode}-{id}."
            if to_close_on_termination._is_closed:
                get_logger().info(
                    f"{prefix} Worker {mode}-{id} is already closed, exiting."
                )
                sys.exit(0)
            elif not to_close_on_termination._is_closing:
                get_logger().info(
                    f"{prefix} Forcing worker {mode}-{id} to close and exiting."
                )
                try:
                    to_close_on_termination.close(True)
                except Exception:
                    get_logger().error(
                        f"Error occurred when closing the RL engine used by worker {mode}-{id}."
                        f" We cannot recover from this and will simply exit. The exception:"
                    )
                    get_logger().exception(traceback.format_exc())
                    sys.exit(1)
                sys.exit(0)
            else:
                get_logger().info(
                    f"{prefix} Worker {mode}-{id} is already closing, ignoring this signal."
                )

        return handler

    signal.signal(signal.SIGTERM, create_handler("Termination"))
    signal.signal(signal.SIGINT, create_handler("Interrupt"))
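# A minimal sketch (not from the original code) of how init_process above might be
# wired into spawned workers: the engine is built inside the child process so the
# SIGTERM/SIGINT handlers registered there can close it cleanly. The worker_loop
# target and the _DummyEngine stand-in below are illustrative assumptions.
import multiprocessing as mp


class _DummyEngine:
    # Stand-in for the real OnPolicyRLEngine, only so this sketch is self-contained.
    _is_closed = False
    _is_closing = False

    def close(self, verbose=False):
        self._is_closed = True


def worker_loop(mode: str, worker_id: int):
    engine = _DummyEngine()
    init_process(mode, worker_id, to_close_on_termination=engine)
    # ... the real rollout / training loop would run here ...


if __name__ == "__main__":
    procs = [mp.Process(target=worker_loop, args=("train", i)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()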
def rearrangement_datagen_worker(
    input_queue: mp.Queue,
    output_queue: mp.Queue,
    scene_to_obj_name_to_avoid_positions: Optional[
        Dict[str, Dict[str, np.ndarray]]
    ] = None,
):
    ptitle("Rearrange Datagen Worker")
    env = RearrangeTHOREnvironment(
        force_cache_reset=True, controller_kwargs={"commit_id": THOR_COMMIT_ID}
    )

    while True:
        try:
            scene, stage, seed = input_queue.get(timeout=2)
        except queue.Empty:
            break

        data = generate_rearrangements_for_scenes(
            stage_seed=seed,
            stage_scenes=[scene],
            env=env,
            object_types_to_not_move=OBJECT_TYPES_TO_NOT_MOVE,
            obj_name_to_avoid_positions=None
            if scene_to_obj_name_to_avoid_positions is None
            else scene_to_obj_name_to_avoid_positions[scene],
        )
        output_queue.put((scene, stage, data[scene]))
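# A hedged sketch of how a datagen worker like the one above is typically driven:
# the parent process fills the input queue with (scene, stage, seed) work items,
# spawns several workers, and collects results from the output queue. The scene
# names and worker count below are purely illustrative.
import multiprocessing as mp

if __name__ == "__main__":
    input_queue: mp.Queue = mp.Queue()
    output_queue: mp.Queue = mp.Queue()

    scenes = ["FloorPlan1", "FloorPlan2"]  # illustrative scene names
    for i, scene in enumerate(scenes):
        input_queue.put((scene, "train", 1234 + i))

    workers = [
        mp.Process(target=rearrangement_datagen_worker,
                   args=(input_queue, output_queue))
        for _ in range(2)
    ]
    for w in workers:
        w.start()

    results = [output_queue.get() for _ in range(len(scenes))]  # (scene, stage, data)
    for w in workers:
        w.join()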
def init_process(mode: str, id: int):
    ptitle("{}-{}".format(mode, id))

    def sigterm_handler(_signo, _stack_frame):
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, sigterm_handler)
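# A short illustration (not part of the original code) of why the handler above
# re-raises SIGTERM as KeyboardInterrupt: a single try/except block can then clean
# up after both Ctrl-C and Process.terminate() / `kill <pid>`.
import signal
import time


def _sigterm_handler(_signo, _stack_frame):
    raise KeyboardInterrupt


def worker_main():
    signal.signal(signal.SIGTERM, _sigterm_handler)
    try:
        while True:
            time.sleep(1)  # stand-in for the real work loop
    except KeyboardInterrupt:
        print("cleaning up and exiting")  # runs on Ctrl-C and on SIGTERM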
def test(args, shared_model): ptitle('Test Agent') log = {} setup_logger( '{}_log'.format(args.env), r'{0}{1}_log_{2}'.format( args.log_dir, args.env, time.strftime("%Y:%m:%d_%Hh:%Mm:%Ss", time.gmtime(time.time())))) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) env = create_env(args.env, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) if args.model == 'CONV': player.model = A3C_CONV(env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.eval() max_score = 0 while True: if player.done: player.model.load_state_dict(shared_model.state_dict()) player.action_test() reward_sum += player.reward if player.done: num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float()
def main(): args = get_args() init_logging(args.log_level) get_logger().info("Running with args {}".format(args)) ptitle( "Master: {}".format("Training" if args.eval is None else "Evaluation")) cfg, srcs = load_config(args) if not args.eval: OnPolicyRunner( config=cfg, output_dir=args.output_dir, save_dir_fmt=args.save_dir_fmt, loaded_config_src_files=srcs, seed=args.seed, mode="train", deterministic_cudnn=args.deterministic_cudnn, deterministic_agents=args.deterministic_agents, extra_tag=args.extra_tag, disable_tensorboard=args.disable_tensorboard, disable_config_saving=args.disable_config_saving, distributed_ip_and_port=args.distributed_ip_and_port, machine_id=args.machine_id, ).start_train( checkpoint=args.checkpoint, restart_pipeline=args.restart_pipeline, max_sampler_processes_per_worker=args. max_sampler_processes_per_worker, collect_valid_results=args.collect_valid_results, ) else: OnPolicyRunner( config=cfg, output_dir=args.output_dir, save_dir_fmt=args.save_dir_fmt, loaded_config_src_files=srcs, seed=args.seed, mode="test", deterministic_cudnn=args.deterministic_cudnn, deterministic_agents=args.deterministic_agents, extra_tag=args.extra_tag, disable_tensorboard=args.disable_tensorboard, disable_config_saving=args.disable_config_saving, distributed_ip_and_port=args.distributed_ip_and_port, machine_id=args.machine_id, ).start_test( checkpoint_path_dir_or_pattern=args.checkpoint, infer_output_dir=args.infer_output_dir, approx_ckpt_step_interval=args.approx_ckpt_step_interval, max_sampler_processes_per_worker=args. max_sampler_processes_per_worker, inference_expert=args.test_expert, )
def run_sim(rank, params, shared_model, shared_optimizer, count, lock): if not os.path.exists('./' + params.weight_dir): os.mkdir('./' + params.weight_dir) if not os.path.exists('./log'): os.mkdir('./log') logging.basicConfig(filename='./log/' + params.log_file + '.log', level=logging.INFO) ptitle('Training Agent: {}'.format(rank)) gpu_id = params.gpu_ids_train[rank % len(params.gpu_ids_train)] api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id) cfg = load_config('config.json') torch.manual_seed(random.randint(0, 1000) + rank) if gpu_id >= 0: torch.cuda.manual_seed(random.randint(0, 1000) + rank) model = A3C_LSTM_GA() with torch.cuda.device(gpu_id): model = model.cuda() Agent = run_agent(model, gpu_id) house_id = params.house_id if house_id == -1: house_id = rank if house_id > 50: house_id = house_id % 50 env = Environment(api, get_house_id(house_id, params.difficulty), cfg) task = RoomNavTask(env, hardness=params.hardness, segment_input=params.semantic_mode, max_steps=params.max_steps, discrete_action=True) n_train = 0 best_rate = 0.0 save_model_index = 0 while True: n_train += 1 training(task, gpu_id, shared_model, Agent, shared_optimizer, params, lock, count) if n_train % 1000 == 0: with lock: n_update = count.value with torch.cuda.device(gpu_id): Agent.model.load_state_dict(shared_model.state_dict()) start_time = time.time() best_rate, save_model_index = testing(lock, n_update, gpu_id, Agent, task, best_rate, params, save_model_index, start_time, logging, house_id)
def test(args, nn): ptitle('Test Agent') log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) env = environment.make(args.env, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.model = MLP(player.env.observation_space.shape[0], player.env.action_space, args.n_frames) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() player.model.eval() max_score = 0 while True: if player.done: player.model.load_state_dict(nn.state_dict()) player.action_test() reward_sum += player.reward if player.done: num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, reward {1}, average reward {2:.4f}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, reward_mean)) if reward_sum >= max_score: max_score = reward_sum state_to_save = player.model.state_dict() torch.save(state_to_save, '{}.dat'.format(args.model_save_dir)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() time.sleep(60) player.state = torch.from_numpy(state).float()
def main(): args = get_args() init_logging(args.log_level) get_logger().info("Running with args {}".format(args)) ptitle("Master: {}".format( "Training" if args.test_date is None else "Testing")) cfg, srcs = load_config(args) if args.test_date is None: OnPolicyRunner( config=cfg, output_dir=args.output_dir, loaded_config_src_files=srcs, seed=args.seed, mode="train", deterministic_cudnn=args.deterministic_cudnn, deterministic_agents=args.deterministic_agents, extra_tag=args.extra_tag, disable_tensorboard=args.disable_tensorboard, disable_config_saving=args.disable_config_saving, ).start_train( checkpoint=args.checkpoint, restart_pipeline=args.restart_pipeline, max_sampler_processes_per_worker=args. max_sampler_processes_per_worker, ) else: OnPolicyRunner( config=cfg, output_dir=args.output_dir, loaded_config_src_files=srcs, seed=args.seed, mode="test", deterministic_cudnn=args.deterministic_cudnn, deterministic_agents=args.deterministic_agents, extra_tag=args.extra_tag, disable_tensorboard=args.disable_tensorboard, disable_config_saving=args.disable_config_saving, ).start_test( experiment_date=args.test_date, checkpoint=args.checkpoint, skip_checkpoints=args.skip_checkpoints, max_sampler_processes_per_worker=args. max_sampler_processes_per_worker, )
def train(rank, args, shared_model, optimizer): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = create_env(args.env, args) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr) env.seed(args.seed + rank) player = Agent(None, env, args, None) player.gpu_id = gpu_id if args.model == 'MLP': player.model = A3C_MLP(player.env.observation_space.shape[0], player.env.action_space, args.stack_frames) if args.model == 'CONV': player.model = A3C_CONV(args.stack_frames, player.env.action_space) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, 128).cuda()) player.hx = Variable(torch.zeros(1, 128).cuda()) else: player.cx = Variable(torch.zeros(1, 128)) player.hx = Variable(torch.zeros(1, 128)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train() if player.done: break if player.done: player.eps_len = 0 state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() if gpu_id >= 0: with torch.cuda.device(gpu_id): R = torch.zeros(1, 1).cuda() else: R = torch.zeros(1, 1) if not player.done: state = player.state if args.model == 'CONV': state = state.unsqueeze(0) value, _, _, _ = player.model( (Variable(state), (player.hx, player.cx))) R = value.data player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 R = Variable(R) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = torch.zeros(1, 1).cuda() else: gae = torch.zeros(1, 1) for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) # Generalized Advantage Estimataion # print(player.rewards[i]) delta_t = player.rewards[i] + args.gamma * \ player.values[i + 1].data - player.values[i].data gae = gae * args.gamma * args.tau + delta_t policy_loss = policy_loss - \ (player.log_probs[i].sum() * Variable(gae)) - \ (0.01 * player.entropies[i].sum()) player.model.zero_grad() (policy_loss + 0.5 * value_loss).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
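# The backward pass in the training workers above follows the standard A3C recipe:
# an n-step discounted return for the value loss and generalized advantage
# estimation (GAE) for the policy loss. Below is a minimal, self-contained sketch
# of that accumulation on plain Python floats; the function name and arguments are
# illustrative, not taken from the code above.
def a3c_targets(rewards, values, bootstrap_value, gamma=0.99, tau=1.0):
    # `values` holds V(s_0..s_{T-1}); the bootstrap V(s_T) is appended so that
    # values[i + 1] is always defined, mirroring player.values above.
    values = list(values) + [bootstrap_value]
    R = bootstrap_value
    gae = 0.0
    returns, advantages = [], []
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                               # n-step return
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta_t                        # GAE accumulator
        returns.append(R)
        advantages.append(gae)
    return list(reversed(returns)), list(reversed(advantages))


# e.g. a3c_targets([1.0, 0.0, 1.0], [0.5, 0.4, 0.6], bootstrap_value=0.3)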
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock, counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]

        # Check whether updates to r_net have propagated to this worker
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]),
                 (player.hx[1], player.cx[1])))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        with lock:
            counter.value += 1

        # r_net update
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net, shared_model.r_net, gpu=gpu_id >= 0)
        optimizer_r.step()

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
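# Nearly every training worker in this collection finishes its update with
# ensure_shared_grads(model, shared_model, gpu=...), whose definition is not shown
# here. A typical implementation in A3C codebases copies each local gradient into
# the shared (CPU) model, moving it off the GPU first when needed; the sketch below
# is that common pattern, offered as an assumption rather than the exact original.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # another worker already populated the shared grads
        elif not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()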
def train(rank, args, shared_model, optimizer, train_modes, n_iters, env=None): n_steps = 0 n_iter = 0 writer = SummaryWriter(os.path.join(args.log_dir, 'Agent:{}'.format(rank))) ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) training_mode = args.train_mode env_name = args.env train_modes.append(training_mode) n_iters.append(n_iter) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) device = torch.device('cuda:' + str(gpu_id)) else: device = torch.device('cpu') if env == None: env = create_env(env_name) params = shared_model.parameters() if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(params, lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(params, lr=args.lr) env.seed(args.seed + rank) player = Agent(None, env, args, None, device) player.gpu_id = gpu_id player.env.reset() # prepare model player.model = build_model(action_space=player.env.action_space, pose_space=player.reset_cam_pose(), args=args,) player.model = player.model.to(device) player.model.train() player.reset() reward_sum = torch.zeros(player.num_agents).to(device) count_eps = 0 print('Start training...') while True: # sys to the shared model player.model.load_state_dict(shared_model.state_dict()) if player.done: player.reset() reward_sum = torch.zeros(player.num_agents).to(device) count_eps += 1 player.update_rnn_hidden() fps_counter = 0 t0 = time.time() for i in range(args.num_steps): player.action_train() reward_sum += player.reward fps_counter += 1 n_steps += 1 if player.done: for i, r_i in enumerate(reward_sum): # add for Pose Only if i not in player.env.random_ids: continue # writer.add_scalar('train/reward_' + str(i), r_i, n_steps) break fps = fps_counter / (time.time() - t0) policy_loss, value_loss, entropies, pred_loss, values0 = player.optimize(params, optimizer, shared_model, gpu_id) writer.add_scalar('train/policy_loss_sum', policy_loss.sum(), n_steps) writer.add_scalar('train/value_loss_sum', value_loss.sum(), n_steps) writer.add_scalar('train/entropies_sum', entropies.sum(), n_steps) writer.add_scalar('train/values0', values0.sum(), n_steps) writer.add_scalar('train/pred_R_loss', pred_loss, n_steps) writer.add_scalar('train/fps', fps, n_steps) # writer.add_scalar('train/lr', lr[0], n_iter) n_iter += 1 n_iters[rank] = n_iter if train_modes[rank] == -100: env.close() break
def test(args, shared_model, env_conf): # print('IN TEST') ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) setup_logger('{}_map_log'.format(args.env), r'{0}{1}_map_log'.format(args.log_dir, args.env)) log['{}_map_log'.format(args.env)] = logging.getLogger('{}_map_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) if 'micropolis' in args.env.lower(): import gym_micropolis env = micropolis_env(args.env, env_conf, args) else: # print('using atari env for test') env = atari_env(args.env, env_conf, args) reward_sum = 0 entropy_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id if 'micropolis' in args.env.lower(): modelInit = getattr(model, args.design_head) player.model = modelInit(player.env.observation_space.shape[0], player.env.action_space, player.env.env.env.MAP_X) player.lstm_sizes = player.model.getMemorySizes() if not 'arcade' in args.env.lower(): player.lstm_size = (1, 16, player.env.env.env.MAP_X, env.env.env.MAP_Y) else: player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 i = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward entropy_sum += player.entropy.data.item() if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1:1.5e}, entropy {4:1.5e} episode length {2}, reward mean {3:1.5e}" .format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean, entropy_sum)) import numpy as np np.set_printoptions(threshold=400) log['{}_map_log'.format(args.env)].info('\n{}'.format( np.array2string( np.add( player.env.env.env.micro.map.zoneMap[-1], np.full((player.env.env.env.MAP_X, player.env.env.env.MAP_Y), 2))).replace('\n ', '').replace('][', ']\n[').replace( '[[', '[').replace(']]', ']'))) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}best_{1}.dat'.format(args.save_model_dir, args.env)) else: state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}best_{1}.dat'.format(args.save_model_dir, args.env)) if i % 10 == 0: if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}latest_{1}.dat'.format(args.save_model_dir, args.env)) else: state_to_save = player.model.state_dict() 
torch.save( state_to_save, '{0}latest_{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 entropy_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 i += 1 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def test(rank, params, shared_model, count, lock): logging.basicConfig(filename='./2blocks_rew.log', level=logging.INFO) ptitle('Test Process: {}'.format(rank)) gpu_id = params.gpu_ids_test[rank % len(params.gpu_ids_test)] env = Env(True, 1, down_period=2) # model = A3C() model = A3C_LSTM() with torch.cuda.device(gpu_id): model = model.cuda() agent = run_agent(model, gpu_id) episode = 0 while episode <= params.episode_test: env.reset() with lock: n_update = count.value agent.synchronize(shared_model) num_steps = 0 accumulated_reward = 0 nAction = 0 line1 = 0 line2 = 0 line3 = 0 line4 = 0 nMove = 0 rew_height = 0 rew_move = 0 while True: num_steps += 1 obs = pre_processing(env.shadow_map, env._get_curr_block_pos()) # env.map action = agent.action_test(obs) if action == 5: action = 100000 rew, shadow_reward, done, putting, height = env.step( action) # what is the 'is_new_block'? if rew == 0.0 and action != 3 and action != 4: nMove += 1 if nMove < 6: rew_move += 0.2 if putting: rew_height += -(height / 20.0) nMove = 0 if rew == 1.0: line1 += 1 elif rew == 8.0: line2 += 1 elif rew == 27.0: line3 += 1 elif rew == 64: line4 += 1 ''' if nAction < 9: obs = pre_processing(env.map, env._get_curr_block_pos()) action = agent.action_test(obs) rew, shadow_reward, is_new_block = env.step(action) # what is the 'is_new_block'? nAction += 1 else: rew, is_new_block = env.step(100000) # falling nAction = 0 ''' accumulated_reward = rew + rew_move + rew_height if env.is_game_end(): episode += 1 print(" ".join([ "-------------episode stats-------------\n", "nUpdate: {}\n".format(n_update), "line1: {}\n".format(line1), "line2: {}\n".format(line2), "line3: {}\n".format(line3), "line4: {}\n".format(line4), "all_lines: {}\n".format( str(line1 + line2 + line3 + line4)), "score: {}\n".format(env.score), "rew_move: {}\n".format(rew_move), "rew_height: {}\n".format(rew_height), "steps: {}\n".format(num_steps) ])) logging.info(" ".join([ "-------------episode stats-------------\n", "nUpdate: {}\n".format(n_update), "line1: {}\n".format(line1), "line2: {}\n".format(line2), "line3: {}\n".format(line3), "line4: {}\n".format(line4), "all_lines: {}\n".format( str(line1 + line2 + line3 + line4)), "score: {}\n".format(env.score), "rew_move: {}\n".format(rew_move), "rew_height: {}\n".format(rew_height), "steps: {}\n".format(num_steps) ])) break if env.score > 1000: episode += 1 print(" ".join([ "-------------episode stats-------------\n", "nUpdate: {}\n".format(n_update), "line1: {}\n".format(line1), "line2: {}\n".format(line2), "line3: {}\n".format(line3), "line4: {}\n".format(line4), "all_lines: {}\n".format( str(line1 + line2 + line3 + line4)), "score: {}\n".format(env.score), "rew_move: {}\n".format(rew_move), "rew_height: {}\n".format(rew_height), "steps: {}\n".format(num_steps) ])) with torch.cuda.device(gpu_id): torch.save(agent.model.state_dict(), './weight/model' + str(n_update) + '.ckpt') logging.info(" ".join([ "-------------episode stats-------------\n", "nUpdate: {}\n".format(n_update), "line1: {}\n".format(line1), "line2: {}\n".format(line2), "line3: {}\n".format(line3), "line4: {}\n".format(line4), "all_lines: {}\n".format( str(line1 + line2 + line3 + line4)), "score: {}\n".format(env.score), "rew_move: {}\n".format(rew_move), "rew_height: {}\n".format(rew_height), "steps: {}\n".format(num_steps) ])) break
def trainhoc(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = OC_env(args.env) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = HOCAgent(None, env, args, None) player.gpu_id = gpu_id player.model = HOCModel(player.env.observation_space.shape[0], player.env.action_space, args.options, args.width) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 threshold = 0 EnvNumSteps = 0 while True: if EnvNumSteps > threshold: threshold += 5000 print("thread:", rank, "steps:", EnvNumSteps) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: ### add in option selection part probo1, logpo1, player.o1 = player.model.getPolicyO1( Variable(player.state)) probo2, logpo2, player.o2 = player.model.getPolicyO2( Variable(player.state), player.o1) else: player.o1 = player.o1 player.o2 = player.o2 for step in range(args.num_steps): EnvNumSteps += 1 player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: q = player.model(Variable(player.state)) v = q.max(-1)[0] R = v.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = torch.zeros(1, 1) value_loss = torch.zeros(1, 1) phi_loss = torch.zeros(1, 1) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) thesize = len(player.rewards) for i in reversed(range(len(player.rewards))): ### update discounted reward before = R R = args.gamma * R + player.rewards[i] ### update value function difference1 = R - player.qs1[i] value_loss = value_loss + 0.5 * difference1.pow(2) difference2 = R - player.qs2[i] value_loss = value_loss + 0.5 * difference2.pow(2) if i + 1 < thesize: difference3 = before - player.values[i + 1] difference4 = before - player.qs1[i + 1] ### update policy # adv1 = R - player.qs1[i] delta2 = R - player.qs2[i] policy_loss = policy_loss - \ player.log_probsa[i] * \ Variable(delta2) - 0.1 * player.entropiesA[i] if i + 1 < thesize: beta1 = player.termprobs1[i + 1].data beta2 = player.termprobs2[i + 1].data policy_loss = policy_loss - \ args.gamma * player.log_probso1[i+1] * \ Variable(beta1 * beta2 * difference3.data) - 0.1 * player.entropieso1[i+1] policy_loss = policy_loss - \ args.gamma * player.log_probso2[i+1] * \ Variable(beta2 * difference4.data) - 0.1 * player.entropieso2[i+1] advantage1 = player.qs1[i + 1].data - player.values[ i + 1].data + args.delib phi_loss = phi_loss + \ args.gamma * player.termprobs1[i+1] * \ Variable(advantage1 * beta2, requires_grad=False) advantage2 = player.qs2[ i + 1].data - (1 - beta1) * player.qs1[i + 1].data - ( beta1 * player.values[i + 1].data) + args.delib phi_loss = phi_loss + \ args.gamma * 
player.termprobs2[i+1] * \ Variable(advantage2, requires_grad=False) player.model.zero_grad() (phi_loss.sum() + policy_loss.sum() + 0.5 * value_loss.sum()).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions()
def train_func(rank, args, shared_model, optimizer, env_conf, datasets=None, shared_dict=None): if args.deploy: return ptitle('Train {0}'.format(rank)) print('Start training agent: ', rank) if rank == 0: logger = Logger(args.log_dir[:-1] + '_losses/') train_step = 0 gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] env_conf["env_gpu"] = gpu_id torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) raw_list, gt_lbl_list = datasets env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list, seed=args.seed + rank) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = get_model(args, args.model, env.observation_space.shape, args.features, atrous_rates=args.atr_rate, num_actions=2, split=args.data_channel, gpu_id=gpu_id, multi=args.multi) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() if rank == 0: eps_reward = 0 pinned_eps_reward = 0 while True: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) if player.done: player.eps_len = 0 if rank == 0: if train_step % args.train_log_period == 0 and train_step > 0: print("train: step", train_step, "\teps_reward", eps_reward) if train_step > 0: pinned_eps_reward = player.env.sum_reward.mean() eps_reward = 0 if args.lstm_feats: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx, player.hx = player.model.lstm.init_hidden( batch_size=1, use_cuda=True) else: player.cx, player.hx = player.model.lstm.init_hidden( batch_size=1, use_cuda=False) elif args.lstm_feats: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): if rank < args.lbl_agents: player.action_train(use_lbl=True) else: player.action_train() if rank == 0: eps_reward = player.env.sum_reward.mean() if player.done: break if player.done: state = player.env.reset(player.model, gpu_id) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() if "3D" in args.data: R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1], env_conf["size"][2]) else: R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1]) if args.lowres: R = torch.zeros(1, 1, env_conf["size"][0] // 2, env_conf["size"][1] // 2) if not player.done: if args.lstm_feats: value, _, _ = player.model( (Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) else: value, _ = player.model(Variable(player.state.unsqueeze(0))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 if "3D" in args.data: gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1], env_conf["size"][2]) else: gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1]) if args.rew_drop: keep_map = torch.tensor(player.env.keep_map) if args.lowres: gae = torch.zeros(1, 1, env_conf["size"][0] // 2, env_conf["size"][1] // 2) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() if args.rew_drop: keep_map = keep_map.cuda() R = 
Variable(R) for i in reversed(range(len(player.rewards))): if gpu_id >= 0: with torch.cuda.device(gpu_id): reward_i = torch.tensor(player.rewards[i]).cuda() else: reward_i = torch.tensor(player.rewards[i]) R = args.gamma * R + reward_i if args.rew_drop: advantage = R - player.values[i] value_loss = value_loss + (0.5 * advantage * advantage * keep_map).mean() delta_t = player.values[ i + 1].data * args.gamma + reward_i - player.values[i].data gae = gae * args.gamma * args.tau + delta_t else: advantage = R - player.values[i] value_loss = value_loss + (0.5 * advantage * advantage).mean() delta_t = player.values[ i + 1].data * args.gamma + reward_i - player.values[i].data gae = gae * args.gamma * args.tau + delta_t if args.noisy: policy_loss = policy_loss - \ (player.log_probs[i] * Variable(gae)).mean () else: if args.rew_drop: policy_loss = policy_loss - \ (player.log_probs[i] * Variable(gae) * keep_map).mean () - \ (args.entropy_alpha * player.entropies[i] * keep_map).mean () else: policy_loss = policy_loss - \ (player.log_probs[i] * Variable(gae)).mean () - \ (args.entropy_alpha * player.entropies[i]).mean () player.model.zero_grad() sum_loss = (policy_loss + value_loss) curtime = time.time() # print ("backward curtime:", curtime) sum_loss.backward() # print ("backward done", time.time () - curtime) ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) curtime = time.time() # print ("optim curtime:", curtime) optimizer.step() # print ("optim done", time.time () - curtime) player.clear_actions() if args.wctrl == "s2m": player.env.config["spl_w"] = shared_dict["spl_w"] player.env.config["mer_w"] = shared_dict["mer_w"] if rank == 0: train_step += 1 if train_step % args.log_period == 0 and train_step > 0: log_info = { 'train: value_loss': value_loss, 'train: policy_loss': policy_loss, 'train: eps reward': pinned_eps_reward, } if "EX" in args.model: log_info["cell_prob_loss"] = cell_prob_loss for tag, value in log_info.items(): logger.scalar_summary(tag, value, train_step)
def train (rank, args, shared_model, optimizer, env_conf, datasets=None): ptitle('Training Agent: {}'.format(rank)) print ('Start training agent: ', rank) if rank == 0: logger = Logger (args.log_dir) train_step = 0 gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] env_conf ["env_gpu"] = gpu_id torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) if "EM_env" in args.env: raw, lbl, prob, gt_lbl = datasets env = EM_env (raw, lbl, prob, env_conf, 'train', gt_lbl) else: env = Voronoi_env (env_conf) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop (shared_model.parameters (), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam (shared_model.parameters (), lr=args.lr, amsgrad=args.amsgrad) # env.seed (args.seed + rank) if not args.continuous: player = Agent (None, env, args, None) else: player = Agent_continuous (None, env, args, None) player.gpu_id = gpu_id if not args.continuous: player.model = A3Clstm (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) else: player.model = A3Clstm_continuous (env.observation_space.shape, env_conf["num_action"], args.hidden_feat) player.state = player.env.reset () player.state = torch.from_numpy (player.state).float () old_score = player.env.old_score final_score = 0 if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () player.model = player.model.cuda () player.model.train () if rank == 0: eps_reward = 0 pinned_eps_reward = 0 mean_log_prob = 0 # print ("rank: ", rank) while True: if gpu_id >= 0: with torch.cuda.device (gpu_id): player.model.load_state_dict (shared_model.state_dict ()) else: player.model.load_state_dict (shared_model.state_dict ()) if player.done: player.eps_len = 0 if rank == 0: if 0 <= (train_step % args.train_log_period) < args.max_episode_length: print ("train: step", train_step, "\teps_reward", eps_reward, "\timprovement", final_score - old_score) old_score = player.env.old_score pinned_eps_reward = eps_reward eps_reward = 0 mean_log_prob = 0 if gpu_id >= 0: with torch.cuda.device(gpu_id): player.cx = Variable(torch.zeros(1, args.hidden_feat).cuda()) player.hx = Variable(torch.zeros(1, args.hidden_feat).cuda()) else: player.cx = Variable(torch.zeros(1, args.hidden_feat)) player.hx = Variable(torch.zeros(1, args.hidden_feat)) else: player.cx = Variable(player.cx.data) player.hx = Variable(player.hx.data) for step in range(args.num_steps): player.action_train () if rank == 0: # if 0 <= (train_step % args.train_log_period) < args.max_episode_length: # print ("train: step", train_step, "\taction = ", player.action) eps_reward += player.reward # print (eps_reward) mean_log_prob += player.log_probs [-1] / env_conf ["T"] if player.done: break if player.done: # if rank == 0: # print ("----------------------------------------------") final_score = player.env.old_score state = player.env.reset () player.state = torch.from_numpy (state).float () if gpu_id >= 0: with torch.cuda.device (gpu_id): player.state = player.state.cuda () R = torch.zeros (1, 1) if not player.done: if not args.continuous: value, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) else: value, _, _, _ = player.model((Variable(player.state.unsqueeze(0)), (player.hx, player.cx))) R = value.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = 0 value_loss = 0 gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) 
for i in reversed(range(len(player.rewards))): R = args.gamma * R + player.rewards[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * advantage.pow(2) delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \ player.values[i].data gae = gae * args.gamma * args.tau + delta_t # print (player.rewards [i]) if not args.continuous: policy_loss = policy_loss - \ player.log_probs[i] * \ Variable(gae) - 0.01 * player.entropies[i] else: policy_loss = policy_loss - \ player.log_probs[i].sum () * Variable(gae) - \ 0.01 * player.entropies[i].sum () player.model.zero_grad () sum_loss = (policy_loss + value_loss) sum_loss.backward () ensure_shared_grads (player.model, shared_model, gpu=gpu_id >= 0) optimizer.step () player.clear_actions () if rank == 0: train_step += 1 if train_step % args.log_period == 0: log_info = { # 'train: sum_loss': sum_loss, 'train: value_loss': value_loss, 'train: policy_loss': policy_loss, 'train: advanage': advantage, # 'train: entropy': entropy, 'train: eps reward': pinned_eps_reward, # 'train: mean log prob': mean_log_prob } for tag, value in log_info.items (): logger.scalar_summary (tag, value, train_step)
def test(args, shared_model, env_conf): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(args.log_dir, args.env)) log['{}_log'.format(args.env)] = logging.getLogger( '{}_log'.format(args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) print("test proc:") env = AllowBacktracking(make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False)) print("test got env:", env.observation_space) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm( player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward """ if player.done and player.info['ale.lives'] > 0 and not player.max_length: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() """ if player.done or player.max_length: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests log['{}_log'.format(args.env)].info( "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}". format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) else: state_to_save = player.model.state_dict() torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def trainac(rank, args, shared_model, optimizer, env_conf): ptitle('Training Agent: {}'.format(rank)) gpu_id = args.gpu_ids[rank % len(args.gpu_ids)] torch.manual_seed(args.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(args.seed + rank) env = OC_env(args.env) if optimizer is None: if args.optimizer == 'RMSprop': optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr) if args.optimizer == 'Adam': optimizer = optim.Adam(shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad) env.seed(args.seed + rank) player = ACAgent(None, env, args, None) player.gpu_id = gpu_id player.model = ACModel(player.env.observation_space.shape[0], player.env.action_space, args.options, args.width) player.state = player.env.reset() player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() player.model = player.model.cuda() player.model.train() player.eps_len += 2 threshold = 0 EnvNumSteps = 0 reward_mean = 0. while True: if EnvNumSteps > threshold: threshold += 5000 print("thread:", rank, "steps:", EnvNumSteps) if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) for step in range(args.num_steps): EnvNumSteps += 1 player.action_train() if player.done: break if player.done: state = player.env.reset() player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() R = torch.zeros(1, 1) if not player.done: q, logit = player.model(Variable(player.state)) v = q.max(-1)[0] R = v.data if gpu_id >= 0: with torch.cuda.device(gpu_id): R = R.cuda() player.values.append(Variable(R)) policy_loss = torch.zeros(1, 1) value_loss = torch.zeros(1, 1) gae = torch.zeros(1, 1) if gpu_id >= 0: with torch.cuda.device(gpu_id): gae = gae.cuda() R = Variable(R) thesize = len(player.rewards) reward_sum = sum(player.rewards) reward_mean = reward_mean + (reward_sum - thesize * reward_mean) / EnvNumSteps for i in reversed(range(len(player.rewards))): before = R R = args.gamma * R + player.rewards[i] difference = R - player.qs[i] advantage = R - player.values[i] value_loss = value_loss + 0.5 * difference.pow(2) policy_loss = policy_loss - player.log_probs[i] * Variable( advantage.data) - 0.1 * player.entropies[i] player.model.zero_grad() (policy_loss.sum() + 0.5 * value_loss.sum()).backward() ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) optimizer.step() player.clear_actions() if str(rank) == "1": fullname = args.save_model_dir + args.env + str(rank) + ".torch" tmpname = args.save_model_dir + args.env + str(rank) + ".tmp" torch.save(optimizer.state_dict(), tmpname) #optimizer.state_dict() os.rename(tmpname, fullname)
def run(self):
    ptitle('Training Agent: {}'.format(self.rank))
    # env is running on CPU
    bufferReward, bufferLogProbs, bufferEntropies = [], [], []
    done = True
    for self.epIdx in range(self.trainStep):
        print("episode index:" + str(self.epIdx) + " from " + current_process().name + "\n")
        if done:
            state = self.env.reset()
            state = self.processState(state)
            done = False
            rewardSum = 0

        stepCount = 0
        while not done and stepCount < self.numSteps:
            stepCount += 1
            # the calculation is on GPU if available
            value, logit = self.localNet(state)
            prob = F.softmax(logit, dim=1)
            logProb = F.log_softmax(logit, dim=1)
            entropy = -(logProb * prob).sum(1)
            action = prob.multinomial(1).data
            logProb = logProb.gather(1, action)
            nextState, reward, done, info = self.env.step(action.cpu().item())
            state = self.processState(nextState)
            bufferReward.append(reward)
            bufferLogProbs.append(logProb)
            rewardSum += reward

        # if done or stepCount == self.numSteps, we will update the net
        R = torch.zeros(1, 1, requires_grad=True)
        value, _ = self.localNet(state)
        R.data = value.data
        if self.gpuId >= 0:
            R = R.cuda()
        self.values.append(value)

        policyLoss = 0.0
        valueLoss = 0.0
        GAE = torch.zeros(1, 1, requires_grad=True)
        if self.gpuId >= 0:
            with torch.cuda.device(self.gpuId):
                GAE = GAE.cuda()

        for i in reversed(range(len(self.rewards))):
            R = self.gamma * R + self.rewards[i]
            advantage = R - self.values[i]
            valueLoss += 0.5 * advantage.pow(2)
            # generalized advantage estimation
            # we use values.data to ensure delta_t is a torch tensor without grad
            delta_t = self.rewards[i] + self.gamma * self.values[i + 1].data - self.values[i].data
            GAE = GAE * self.gamma * self.tau + delta_t
            policyLoss = policyLoss - self.logProb[i] * GAE - self.entropyPenality * self.entropies[i]

        self.localNet.zero_grad()
        (policyLoss + 0.5 * valueLoss).backward()
        ensure_shared_grads(self.localNet, self.globalNet, gpu=self.gpuId >= 0)
        self.globalOptimizer.step()
        self.clearup()
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()
    player.model = A3Clstm(Config.STACKED_FRAMES,  # A3C model
                           num_actions)
    player.state, available = player.env.reset()  # initial environment state
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())  # sync with the shared network
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))  # episode finished: re-initialize the LSTM state
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):  # T-max = 20
            player.action_train()
            if player.done:
                break

        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()

        R = torch.zeros(1, 1)  # if done: R_t-max = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data  # R_t-max = V(s)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from util.arg_parser import init_parser
from setproctitle import setproctitle as ptitle
import numpy as np

if __name__ == '__main__':
    parser = init_parser()
    args = parser.parse_args()
    ptitle('Nor_a{}_o{}'.format(args.a, args.omega))
    df1 = pd.read_csv(
        '../data/others_d9to13_a{}_o{}.csv'.format(args.a, args.omega),
        sep=',',
        header='infer')
    # 41720 + 44375 + 35902 + 36972 + 45863 --> 40966.4
    df2 = pd.read_csv('../data/others_d16to20_a{}_o{}.csv'.format(
        args.a, args.omega),
        sep=',',
        header='infer')
    list1 = [
        tuple(i)
        for i in np.array(df1[['o', 'd', 'tau']], dtype=np.int).tolist()
    ]
    index1 = len(df1.index)
    list2 = [
        tuple(i)
        for i in np.array(df2[['o', 'd', 'tau']], dtype=np.int).tolist()
def test(args, shared_model, env_conf, lock, counter): ptitle('Test Agent') gpu_id = args.gpu_ids[-1] log = {} setup_logger( '{}_log'.format(args.env), r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target)) log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format( args.env)) d_args = vars(args) for k in d_args.keys(): log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k])) torch.manual_seed(args.seed) if gpu_id >= 0: torch.cuda.manual_seed(args.seed) env = atari_env(args.env, env_conf, args) reward_sum = 0 start_time = time.time() num_tests = 0 reward_total_sum = 0 player = Agent(None, env, args, None) player.gpu_id = gpu_id player.model = A3Clstm(player.env.observation_space.shape[0], player.env.action_space) player.state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(player.state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model = player.model.cuda() player.state = player.state.cuda() flag = True max_score = 0 while True: if flag: if gpu_id >= 0: with torch.cuda.device(gpu_id): player.model.load_state_dict(shared_model.state_dict()) else: player.model.load_state_dict(shared_model.state_dict()) player.model.eval() flag = False player.action_test() reward_sum += player.reward if player.done and not player.info: state = player.env.reset() player.eps_len += 2 player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda() elif player.info: flag = True num_tests += 1 reward_total_sum += reward_sum reward_mean = reward_total_sum / num_tests with lock: counter.value += 1 log['{}_log'.format(args.env)].info( "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}" .format( counter.value, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, player.eps_len, reward_mean)) if args.save_max and reward_sum >= max_score: max_score = reward_sum if gpu_id >= 0: with torch.cuda.device(gpu_id): state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) else: state_to_save = player.model.state_dict() torch.save( state_to_save, '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env, args.log_target)) reward_sum = 0 player.eps_len = 0 state = player.env.reset() player.eps_len += 2 time.sleep(10) player.state = torch.from_numpy(state).float() if gpu_id >= 0: with torch.cuda.device(gpu_id): player.state = player.state.cuda()
def run_sim(rank, params, shared_model, shared_optimizer, count, lock): ptitle('Training Agent: {}'.format(rank)) gpu_id = params.gpu_ids_train[rank % len(params.gpu_ids_train)] api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id) cfg = load_config('config.json') if shared_optimizer is None: optimizer = optim.Adam(shared_model.parameters(), lr=params.lr, amsgrad=params.amsgrad, weight_decay=params.weight_decay) #optimizer.share_memory() else: optimizer = shared_optimizer torch.manual_seed(params.seed + rank) if gpu_id >= 0: torch.cuda.manual_seed(params.seed + rank) model = A3C_LSTM_GA() with torch.cuda.device(gpu_id): model = model.cuda() Agent = run_agent(model, gpu_id) house_id = params.house_id if house_id == -1: house_id = rank if house_id >= 20: house_id = house_id % 20 env = Environment(api, get_house_id(house_id), cfg) task = RoomNavTask(env, hardness=params.hardness, segment_input=params.semantic_mode, max_steps=params.max_steps, discrete_action=True) for episode in range(params.max_episode): next_observation = task.reset() target = task.info['target_room'] target = get_instruction_idx(target) with torch.cuda.device(gpu_id): target = Variable(torch.LongTensor(target)).cuda() Agent.model.load_state_dict(shared_model.state_dict()) Agent.cx = Variable(torch.zeros(1, 256).cuda()) Agent.hx = Variable(torch.zeros(1, 256).cuda()) Agent.target = target total_reward, num_steps, good = 0, 0, 0 Agent.done = False done = False Agent.eps_len = 0 while not done: num_steps += 1 observation = next_observation act, entropy, value, log_prob = Agent.action_train( observation, target) next_observation, reward, done, info = task.step(actions[act[0]]) rew = np.clip(reward, -1.0, 1.0) Agent.put_reward(rew, entropy, value, log_prob) if num_steps % params.num_steps == 0 or done: if done: Agent.done = done with lock: count.value += 1 Agent.training(next_observation, shared_model, optimizer, params) if done: break
def test(args, shared_model, env_conf, datasets):
    ptitle('Test agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    logger = Logger(args.log_dir)

    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    raw, gt_lbl = datasets
    env = EM_env(raw, gt_lbl, env_conf)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    # player.model = A3Clstm(env.observation_space.shape, env_conf["num_action"], args.hidden_feat)
    player.model = SimpleCNN(env.observation_space.shape, env_conf["num_action"])

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()

    flag = True
    create_dir(args.save_model_dir)

    recent_episode_scores = []
    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores += [reward_sum]
            if len(recent_episode_scores) > 200:
                recent_episode_scores.pop(0)

            if args.save_max and np.mean(recent_episode_scores) >= max_score:
                max_score = np.mean(recent_episode_scores)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                args.env + '_' + str(num_tests)))

            if num_tests % args.log_period == 0:
                print("------------------------------------------------")
                print("Log test #:", num_tests)
                print("Prob: ")
                for i in range(player.env.agent_out_shape[1]):
                    for j in range(player.env.agent_out_shape[2]):
                        print("{:.3f}\t".format(player.prob_cpu[0, i, j]), end='')
                    print()
                print("Actions :", player.actions)
                print("Actions transformed: ")
                print(player.actions_explained)
                print("rewards: ", player.rewards)
                print("sum rewards: ", reward_sum)
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist, 0)
                log_info = {"test: training_sample": log_img}
                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                log_info = {'test: mean_reward': reward_mean}
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0
            time.sleep(30)
            player.clear_actions()
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

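# NOTE (illustrative sketch, assumed helper, not part of the original code):
# create_dir above is presumably a small utility that creates a directory if it
# does not already exist, roughly equivalent to:
import os


def create_dir_sketch(path):
    os.makedirs(path, exist_ok=True)
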
def test_func(args,
              shared_model,
              env_conf,
              datasets=None,
              tests=None,
              shared_dict=None):
    ptitle('Valid agent')

    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id

    if not args.deploy:
        log = {}
        logger = Logger(args.log_dir)

        create_dir(args.log_dir + "models/")
        create_dir(args.log_dir + "tifs/")
        create_dir(args.log_dir + "tifs_test/")

        os.system("cp *.py " + args.log_dir)
        os.system("cp *.sh " + args.log_dir)
        os.system("cp models/*.py " + args.log_dir + "models/")

        setup_logger('{}_log'.format(args.env),
                     r'{0}{1}_log'.format(args.log_dir, args.env))
        log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
        d_args = vars(args)
        env_conf_log = env_conf

    if tests is not None:
        if args.testlbl:
            test_env = EM_env(tests[0], env_conf, type="test", gt_lbl_list=tests[1])
        else:
            test_env = EM_env(tests[0], env_conf, type="test")

    if not args.deploy:
        for k in d_args.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
        for k in env_conf_log.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, env_conf_log[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args,
                             args.model,
                             env_conf["observation_shape"],
                             args.features,
                             atrous_rates=args.atr_rate,
                             num_actions=2,
                             split=args.data_channel,
                             gpu_id=gpu_id,
                             multi=args.multi)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    if not args.deploy:
        create_dir(args.save_model_dir)

    recent_episode_scores = ScalaTracker(100)
    recent_FgBgDice = ScalaTracker(100)
    recent_bestDice = ScalaTracker(100)
    recent_diffFG = ScalaTracker(100)
    recent_MUCov = ScalaTracker(100)
    recent_MWCov = ScalaTracker(100)
    recent_AvgFP = ScalaTracker(100)
    recent_AvgFN = ScalaTracker(100)
    recent_rand_i = ScalaTracker(100)

    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    # ----------------------------------------- Deploy / Inference -----------------------------------------
    if args.deploy:
        with torch.cuda.device(gpu_id):
            player.model.load_state_dict(shared_model.state_dict())

        # inference(args, None, player.model, tests[0], test_env, gpu_id, player.env.rng, len(tests[0]))
        if len(tests) == 4:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]), tests[3])
        else:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]))
        return
    # ----------------------------------------- End Deploy / Inference -----------------------------------------

    merge_ratios = []
    split_ratios = []

    if args.wctrl == "s2m":
        schedule = args.wctrl_schedule

        delta = (shared_dict['spl_w'] - shared_dict['mer_w']) / (2 * len(schedule))

        mer_w_delta = delta
        mer_w_var = shared_dict['mer_w']
        mer_w_scheduler = Scheduler(mer_w_var, schedule, mer_w_delta)

        split_delta = -delta / len(args.out_radius)
        split_var = shared_dict['spl_w'] / len(args.out_radius)
        spl_w_scheduler = Scheduler(split_var, schedule, split_delta)

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            num_tests += 1

            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir,
                                                'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}.dat'.format(args.save_model_dir, str(num_tests)))

            if num_tests % args.log_period == 0:
                if tests is not None and not args.DEBUG:
                    inference(args, logger, player.model, tests[0], test_env,
                              gpu_id, player.env.rng, num_tests)

                if np.max(env.lbl) != 0 and np.max(env.gt_lbl) != 0:
                    bestDice, FgBgDice, diffFG, MWCov, MUCov, AvgFP, AvgFN, rand_i = evaluate(
                        args, player.env)
                    recent_FgBgDice.push(FgBgDice)
                    recent_diffFG.push(abs(diffFG))
                    recent_bestDice.push(bestDice)
                    recent_MWCov.push(MWCov)
                    recent_MUCov.push(MUCov)
                    recent_AvgFP.push(AvgFP)
                    recent_AvgFN.push(AvgFN)
                    recent_rand_i.push(rand_i)

                    log_info = {
                        "bestDice": recent_bestDice.mean(),
                        "FgBgDice": recent_FgBgDice.mean(),
                        "diffFG": recent_diffFG.mean(),
                        "MWCov": recent_MWCov.mean(),
                        "MUCov": recent_MUCov.mean(),
                        "AvgFP": recent_AvgFP.mean(),
                        "AvgFN": recent_AvgFN.mean(),
                        "rand_i": recent_rand_i.mean()
                    }

                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)
                else:
                    bestDice, FgBgDice, diffFG = 0, 0, 0
                    MWCov, MUCov, AvgFP, AvgFN = 0, 0, 0, 0
                    rand_i = 0

                print("----------------------VALID SET--------------------------")
                print(args.env)
                print("bestDice:", bestDice, "FgBgDice:", FgBgDice, "diffFG:",
                      diffFG, "MWCov:", MWCov, "MUCov:", MUCov, "AvgFP:",
                      AvgFP, "AvgFN:", AvgFN, "rand_i:", rand_i)
                # print("mean bestDice")
                print("Log test #:", num_tests)
                print("rewards: ", player.reward.mean())
                print("sum rewards: ", reward_sum)
                print("#gt_values:", len(np.unique(player.env.gt_lbl)))
                print("values:")
                values = player.env.unique()
                print(np.concatenate([values[0][None], values[1][None]], 0))
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist[::-1], 0)

                if "3D" not in args.data:
                    for i in range(3):
                        player.probs.insert(0, np.zeros_like(player.probs[0]))
                    while (len(player.probs) - 3 < args.max_episode_length):
                        player.probs.append(np.zeros_like(player.probs[0]))
                    probslist = [
                        np.repeat(np.expand_dims(prob, -1), 3, -1)
                        for prob in player.probs
                    ]
                    probslist = np.concatenate(probslist, 1)
                    probslist = (probslist * 256).astype(np.uint8, copy=False)

                    # log_img = renderlist[-1]
                    print(probslist.shape, log_img.shape)
                    log_img = np.concatenate([probslist, log_img], 0)

                log_info = {"valid_sample": log_img}

                print(log_img.shape)

                io.imsave(
                    args.log_dir + "tifs/" + str(num_tests) + "_sample.tif",
                    log_img.astype(np.uint8))
                io.imsave(
                    args.log_dir + "tifs/" + str(num_tests) + "_pred.tif",
                    player.env.lbl.astype(np.uint8))
                io.imsave(
                    args.log_dir + "tifs/" + str(num_tests) + "_gt.tif",
                    player.env.gt_lbl.astype(np.int32))

                if args.seg_scale:
                    log_info["scaler"] = player.env.scaler

                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                if not args.deploy:
                    log_info = {
                        'mean_valid_reward': reward_mean,
                        '100_mean_reward': recent_episode_scores.mean(),
                        'split_ratio': player.env.split_ratio_sum.sum() /
                                       np.count_nonzero(player.env.gt_lbl),
                        'merge_ratio': player.env.merge_ratio_sum.sum() /
                                       np.count_nonzero(player.env.gt_lbl),
                    }

                    if args.wctrl == 's2m':
                        log_info.update({
                            'mer_w': mer_w_scheduler.value(),
                            'spl_w': spl_w_scheduler.value() * len(args.out_radius),
                        })

                    merge_ratios.append(player.env.merge_ratio_sum.sum() /
                                        np.count_nonzero(player.env.gt_lbl))
                    split_ratios.append(player.env.split_ratio_sum.sum() /
                                        np.count_nonzero(player.env.gt_lbl))

                    print("split ratio: ", np.max(player.env.split_ratio_sum),
                          np.min(player.env.split_ratio_sum))
                    print("merge ratio: ", np.max(player.env.merge_ratio_sum),
                          np.min(player.env.merge_ratio_sum))

                    print("merge ratio: ", merge_ratios)
                    print("split ratio: ", split_ratios)

                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0

            if args.wctrl == "s2m":
                shared_dict["spl_w"] = spl_w_scheduler.next()
                shared_dict["mer_w"] = mer_w_scheduler.next()
                player.env.config["spl_w"] = shared_dict["spl_w"]
                player.env.config["mer_w"] = shared_dict["mer_w"]

            player.clear_actions()
            state = player.env.reset(player.model, gpu_id)
            renderlist.append(player.env.render())

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

def train(rank, args, shared_model, optimizer, train_modes, n_iters, env=None):
    n_iter = 0
    writer = SummaryWriter(os.path.join(args.log_dir, 'Agent:{}'.format(rank)))
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    training_mode = args.train_mode
    env_name = args.env

    train_modes.append(training_mode)
    n_iters.append(n_iter)

    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
        device = torch.device('cuda:' + str(gpu_id))
        if len(args.gpu_ids) > 1:
            device_share = torch.device('cpu')
        else:
            device_share = torch.device('cuda:' + str(args.gpu_ids[-1]))
    else:
        device = device_share = torch.device('cpu')

    if env is None:
        env = create_env(env_name, args)

    if args.train_mode == 0:
        params = shared_model.player0.parameters()
    elif args.train_mode == 1:
        params = shared_model.player1.parameters()
    else:
        params = shared_model.parameters()

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(params, lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(params, lr=args.lr)

    env.seed(args.seed)
    player = Agent(None, env, args, None, device)
    player.w_entropy_target = args.entropy_target
    player.gpu_id = gpu_id

    # prepare model
    player.model = build_model(player.env.observation_space,
                               player.env.action_space, args, device)
    player.model = player.model.to(device)
    player.model.train()

    player.reset()
    reward_sum = torch.zeros(player.num_agents).to(device)
    reward_sum_org = np.zeros(player.num_agents)
    ave_reward = np.zeros(2)
    ave_reward_longterm = np.zeros(2)
    count_eps = 0

    while True:
        # sync with the shared model
        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.reset()
            reward_sum = torch.zeros(player.num_agents).to(device)
            reward_sum_org = np.zeros(player.num_agents)
            count_eps += 1

        player.update_rnn_hiden()
        t0 = time.time()
        for i in range(args.num_steps):
            player.action_train()
            reward_sum += player.reward
            reward_sum_org += player.reward_org
            if player.done:
                for j, r_i in enumerate(reward_sum):
                    writer.add_scalar('train/reward_' + str(j), r_i, player.n_steps)
                break
        fps = i / (time.time() - t0)

        # training mode: 0 = tracker, 1 = target, -1 = joint (train all)
        training_mode = train_modes[rank]

        policy_loss, value_loss, entropies, pred_loss = player.optimize(
            params, optimizer, shared_model, training_mode, device_share)

        for i in range(min(player.num_agents, 3)):
            writer.add_scalar('train/policy_loss_' + str(i),
                              policy_loss[i].mean(), player.n_steps)
            writer.add_scalar('train/value_loss_' + str(i), value_loss[i],
                              player.n_steps)
            writer.add_scalar('train/entropies' + str(i), entropies[i].mean(),
                              player.n_steps)
        writer.add_scalar('train/pred_R_loss', pred_loss, player.n_steps)
        writer.add_scalar('train/ave_reward',
                          ave_reward[0] - ave_reward_longterm[0], player.n_steps)
        writer.add_scalar('train/mode', training_mode, player.n_steps)
        writer.add_scalar('train/fps', fps, player.n_steps)

        n_iter += 1
        n_iters[rank] = n_iter

        if train_modes[rank] == -100:
            env.close()
            break

def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #     player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

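# NOTE (illustrative sketch, assumed helper, not part of the original code):
# ensure_shared_grads above is presumably the usual A3C utility that copies the
# worker's locally computed gradients onto the shared (CPU-resident) model
# before optimizer.step(). A common minimal version looks roughly like this:
def ensure_shared_grads_sketch(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # On CPU the shared gradient tensor is already wired up; nothing to do.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            # Move GPU gradients to the CPU-resident shared model.
            shared_param._grad = param.grad.cpu()
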
def test(args, shared_model, env_conf, shared_counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        os.path.join(args.log_dir, '{}-{}_log'.format(args.env, args.exp_name)))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.model = player.model.to(device)
    player.state = player.state.to(device)

    flag = True
    max_score = 0
    while True:
        if flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean,
                    player.model.log_alpha.exp().detach().item()))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                torch.save(
                    player.model.state_dict(),
                    os.path.join(args.save_model_dir,
                                 '{}-{}.dat'.format(args.env, args.exp_name)))

            with shared_counter.get_lock():
                shared_counter.value += player.eps_len
                if shared_counter.value > args.interact_steps:
                    break

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)

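# NOTE (illustrative sketch, assumed pattern, not part of the original code):
# shared_counter above is presumably a multiprocessing.Value incremented under
# its lock by every worker. This self-contained example shows that pattern.
import multiprocessing as mp


def _bump_counter_sketch(counter, n):
    # Each worker adds its step count under the lock so increments are not lost.
    for _ in range(n):
        with counter.get_lock():
            counter.value += 1


if __name__ == '__main__':
    counter = mp.Value('l', 0)  # signed long, starts at 0
    workers = [mp.Process(target=_bump_counter_sketch, args=(counter, 1000))
               for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(counter.value)  # 4000
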
def testac(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    env = OC_env(args.env)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    num_frames = 0
    reward_mean = 0

    player = ACAgent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = ACModel(player.env.observation_space.shape[0],
                           player.env.action_space, args.options, args.width)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()

    flag = True
    max_score = 0
    EpisodeLength = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        player.env.render()
        EpisodeLength += 1
        reward_sum += player.reward

        if player.done or EpisodeLength > args.num_steps:
            flag = True
            num_tests += 1
            reward_mean = reward_mean + (
                reward_sum - player.eps_len * reward_mean) / num_tests
            num_frames += player.eps_len
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            # if args.save_max and reward_sum >= max_score:
            max_score = reward_sum
            datname = '{0}{1}.dat'.format(args.save_model_dir, args.env)
            tmpname = '{0}{1}.tmp'.format(args.save_model_dir, args.env)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, tmpname)
            else:
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, tmpname)
            os.rename(tmpname, datname)

            EpisodeLength = 0
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

def _task_sampling_loop_worker(
    worker_id: Union[int, str],
    connection_read_fn: Callable,
    connection_write_fn: Callable,
    make_sampler_fn: Callable[..., TaskSampler],
    sampler_fn_args_list: List[Dict[str, Any]],
    auto_resample_when_done: bool,
    should_log: bool,
    child_pipe: Optional[Connection] = None,
    parent_pipe: Optional[Connection] = None,
) -> None:
    """Process worker for creating and interacting with the Tasks/TaskSampler."""

    ptitle("VectorSampledTask: {}".format(worker_id))

    sp_vector_sampled_tasks = SingleProcessVectorSampledTasks(
        make_sampler_fn=make_sampler_fn,
        sampler_fn_args_list=sampler_fn_args_list,
        auto_resample_when_done=auto_resample_when_done,
        should_log=should_log,
    )

    if parent_pipe is not None:
        parent_pipe.close()
    try:
        while True:
            read_input = connection_read_fn()

            if len(read_input) == 3:
                sampler_index, command, data = read_input

                assert command != CLOSE_COMMAND, "Must close all processes at once."
                assert command != RESUME_COMMAND, "Must resume all task samplers at once."

                if command == PAUSE_COMMAND:
                    sp_vector_sampled_tasks.pause_at(sampler_index=sampler_index)
                    connection_write_fn("done")
                else:
                    connection_write_fn(
                        sp_vector_sampled_tasks.command_at(
                            sampler_index=sampler_index, command=command, data=data
                        )
                    )
            else:
                commands, data_list = read_input

                assert commands != PAUSE_COMMAND, "Cannot pause all task samplers at once."

                if commands == CLOSE_COMMAND:
                    sp_vector_sampled_tasks.close()
                    break
                elif commands == RESUME_COMMAND:
                    sp_vector_sampled_tasks.resume_all()
                    connection_write_fn("done")
                else:
                    if isinstance(commands, str):
                        commands = [commands] * sp_vector_sampled_tasks.num_unpaused_tasks

                    connection_write_fn(
                        sp_vector_sampled_tasks.command(
                            commands=commands, data_list=data_list
                        )
                    )

        if child_pipe is not None:
            child_pipe.close()
    except KeyboardInterrupt as e:
        if should_log:
            get_logger().info("Worker {} KeyboardInterrupt".format(worker_id))
        raise e
    except Exception as e:
        get_logger().error(traceback.format_exc())
        raise e
    finally:
        if should_log:
            get_logger().info("Worker {} closing.".format(worker_id))

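# NOTE (illustrative sketch, assumed wiring, not part of the original code):
# _task_sampling_loop_worker is typically launched in its own process with a
# pair of multiprocessing pipes, roughly as below. Every name other than the
# worker function itself is hypothetical.
import multiprocessing as mp


def _spawn_task_sampling_worker_sketch(worker_id, make_sampler_fn, sampler_fn_args_list):
    parent_conn, child_conn = mp.Pipe()
    process = mp.Process(
        target=_task_sampling_loop_worker,
        kwargs=dict(
            worker_id=worker_id,
            connection_read_fn=child_conn.recv,
            connection_write_fn=child_conn.send,
            make_sampler_fn=make_sampler_fn,
            sampler_fn_args_list=sampler_fn_args_list,
            auto_resample_when_done=True,
            should_log=True,
            child_pipe=child_conn,
            parent_pipe=parent_conn,
        ),
    )
    process.daemon = True
    process.start()
    # The parent then communicates over parent_conn, e.g. broadcasting a string
    # command to all unpaused samplers and reading back the list of results:
    #     parent_conn.send(("some_command", None))
    #     results = parent_conn.recv()
    return process, parent_conn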