def run_sim(rank, params, shared_model, shared_optimizer, count, lock):
    if not os.path.exists('./' + params.weight_dir):
        os.mkdir('./' + params.weight_dir)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    logging.basicConfig(filename='./log/' + params.log_file + '.log',
                        level=logging.INFO)

    ptitle('Training Agent: {}'.format(rank))
    gpu_id = params.gpu_ids_train[rank % len(params.gpu_ids_train)]
    api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id)
    cfg = load_config('config.json')

    torch.manual_seed(random.randint(0, 1000) + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(random.randint(0, 1000) + rank)

    model = A3C_LSTM_GA()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
    Agent = run_agent(model, gpu_id)

    house_id = params.house_id
    if house_id == -1:
        house_id = rank
    if house_id > 50:
        house_id = house_id % 50

    env = Environment(api, get_house_id(house_id, params.difficulty), cfg)
    task = RoomNavTask(env,
                       hardness=params.hardness,
                       segment_input=params.semantic_mode,
                       max_steps=params.max_steps,
                       discrete_action=True)

    n_train = 0
    best_rate = 0.0
    save_model_index = 0

    while True:
        n_train += 1
        training(task, gpu_id, shared_model, Agent, shared_optimizer, params,
                 lock, count)

        if n_train % 1000 == 0:
            with lock:
                n_update = count.value
            with torch.cuda.device(gpu_id):
                Agent.model.load_state_dict(shared_model.state_dict())

            start_time = time.time()
            best_rate, save_model_index = testing(
                lock, n_update, gpu_id, Agent, task, best_rate, params,
                save_model_index, start_time, logging, house_id)
def train_dagger_agent(expert_policy_file, config):
    # Get the actions that the expert would do given the observations
    expert = load_policy(expert_policy_file)
    agent, train_losses, test_losses, dataset = train_bc_agent(
        expert_policy_file, config)

    for i in range(config.num_dagger_iterations):
        # Get observations using the policy we just trained
        _, observations, _ = run_agent(agent_wrapper(agent, config), config)
        actions = [expert(obs[None, :]) for obs in observations]

        # Create a new training set that uses all the data
        observations = torch.Tensor(observations).to(config.device)
        actions = torch.Tensor(actions).squeeze(1).to(config.device)
        dagger_data = data.TensorDataset(observations, actions)
        dataset = data.ConcatDataset([dataset, dagger_data])

        N = len(dataset)
        train_length = int(config.split * N)
        test_length = N - train_length
        trainset, valset = data.random_split(dataset, [train_length, test_length])
        train_loader = data.DataLoader(trainset, batch_size=128, shuffle=True)
        test_loader = data.DataLoader(valset, batch_size=128)

        train_loss, test_loss = train_epochs(agent, train_loader, test_loader,
                                             epochs=config.epochs, lr=config.lr)
        train_losses += train_loss
        test_losses += test_loss

    return agent, train_losses, test_losses
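# Hypothetical usage sketch for train_dagger_agent (added for illustration; not
# part of the original code). It assumes `config` is a simple attribute-style
# object carrying the fields referenced above; the field values and the expert
# policy path are illustrative only.
from types import SimpleNamespace

import torch

dagger_config = SimpleNamespace(
    num_dagger_iterations=10,  # DAgger outer iterations
    epochs=5,                  # supervised epochs per iteration
    lr=1e-3,                   # learning rate passed to train_epochs
    split=0.9,                 # train/validation split ratio
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# agent, train_losses, test_losses = train_dagger_agent('expert_policy.pkl', dagger_config)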
def learning(params, state_Queue, action_done, actions, reward_Queue):
    if not os.path.exists('./' + params.weight_dir):
        os.mkdir('./' + params.weight_dir)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    logging.basicConfig(filename='./log/' + params.log_file + '.log',
                        level=logging.INFO)

    n_process = params.n_process
    n_inference = 0
    n_update = 0

    Agent = run_agent(params)
    # Agent.initialize()

    list_dones = [False] * n_process
    n_result = 0
    test_done = [0] * n_process
    test_succ = [0] * n_process
    best_rate = 0
    start_time = time.time()

    while True:
        list_obs, list_target, list_dones = getState(n_process, state_Queue,
                                                     list_dones)

        # do inference and make action
        # actions.value = [random.randrange(len(action_list))] * n_process
        inference, value, log_prob, entropy = Agent.action_train(
            list_obs, list_target)
        n_inference += 1

        actions = putActions(n_process, inference, actions, state_Queue)
        torch_rewards, list_dones, masks = getReward(n_process, reward_Queue,
                                                     list_dones)
        Agent.insert(log_prob, value, torch_rewards, masks, entropy)

        if n_inference % 30 == 0 and n_inference != 0:
            next_obs = list_obs, list_target
            # Agent.update(params, next_obs, list_dones, entropy)
            Agent.update_sync(params, next_obs)
            n_update += 1

            if n_update % 200 == 0:  # test
                actions, test_done, test_succ, list_rewards = EnvReset(
                    n_process, state_Queue, action_done, list_dones, actions)
                endTest = False

                while True:
                    list_obs, list_target, list_dones = getState(
                        n_process, state_Queue, list_dones)

                    if endTest is True:
                        best_rate = writeResult(n_process, test_done, test_succ,
                                                best_rate, Agent, params,
                                                n_update, start_time, logging)
                        actions, test_done, test_succ = callReset(
                            n_process, actions, state_Queue)
                        break

                    # do inference and make action
                    inference = Agent.action_test(list_obs, list_target)
                    putActions(n_process, inference, actions, state_Queue)

                    for i in range(n_process):
                        rank, done, reward = reward_Queue.get()
                        list_rewards[rank] = [reward]
                        list_dones[rank] = True

                        if done:
                            if test_done[rank] < params.n_eval:
                                test_done[rank] += 1
                                if reward == 10:  # success
                                    test_succ[rank] += 1

                    for i in range(n_process):
                        reward_Queue.task_done()

                    endTest = DoneOrNot(n_process, test_done, params.n_eval)
def run_sim(rank, params, shared_model, shared_optimizer, count, lock):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = params.gpu_ids_train[rank % len(params.gpu_ids_train)]
    api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id)
    cfg = load_config('config.json')

    if shared_optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=params.lr,
                               amsgrad=params.amsgrad,
                               weight_decay=params.weight_decay)
        # optimizer.share_memory()
    else:
        optimizer = shared_optimizer

    torch.manual_seed(params.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(params.seed + rank)

    model = A3C_LSTM_GA()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
    Agent = run_agent(model, gpu_id)

    house_id = params.house_id
    if house_id == -1:
        house_id = rank
    if house_id >= 20:
        house_id = house_id % 20

    env = Environment(api, get_house_id(house_id), cfg)
    task = RoomNavTask(env,
                       hardness=params.hardness,
                       segment_input=params.semantic_mode,
                       max_steps=params.max_steps,
                       discrete_action=True)

    for episode in range(params.max_episode):
        next_observation = task.reset()
        target = task.info['target_room']
        target = get_instruction_idx(target)

        with torch.cuda.device(gpu_id):
            target = Variable(torch.LongTensor(target)).cuda()
            Agent.model.load_state_dict(shared_model.state_dict())
            Agent.cx = Variable(torch.zeros(1, 256).cuda())
            Agent.hx = Variable(torch.zeros(1, 256).cuda())
        Agent.target = target

        total_reward, num_steps, good = 0, 0, 0
        Agent.done = False
        done = False
        Agent.eps_len = 0

        while not done:
            num_steps += 1
            observation = next_observation
            act, entropy, value, log_prob = Agent.action_train(observation,
                                                               target)
            next_observation, reward, done, info = task.step(actions[act[0]])

            rew = np.clip(reward, -1.0, 1.0)
            Agent.put_reward(rew, entropy, value, log_prob)

            if num_steps % params.num_steps == 0 or done:
                if done:
                    Agent.done = done
                with lock:
                    count.value += 1
                Agent.training(next_observation, shared_model, optimizer, params)

            if done:
                break
def test(rank, params, shared_model, count, lock):
    logging.basicConfig(filename='./2blocks_rew.log', level=logging.INFO)
    ptitle('Test Process: {}'.format(rank))
    gpu_id = params.gpu_ids_test[rank % len(params.gpu_ids_test)]
    env = Env(True, 1, down_period=2)

    # model = A3C()
    model = A3C_LSTM()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
    agent = run_agent(model, gpu_id)

    episode = 0
    while episode <= params.episode_test:
        env.reset()
        with lock:
            n_update = count.value
        agent.synchronize(shared_model)

        num_steps = 0
        accumulated_reward = 0
        nAction = 0
        line1 = 0
        line2 = 0
        line3 = 0
        line4 = 0
        nMove = 0
        rew_height = 0
        rew_move = 0

        while True:
            num_steps += 1
            obs = pre_processing(env.shadow_map, env._get_curr_block_pos())  # env.map
            action = agent.action_test(obs)
            if action == 5:
                action = 100000
            rew, shadow_reward, done, putting, height = env.step(action)  # what is the 'is_new_block'?

            if rew == 0.0 and action != 3 and action != 4:
                nMove += 1
                if nMove < 6:
                    rew_move += 0.2
            if putting:
                rew_height += -(height / 20.0)
                nMove = 0

            if rew == 1.0:
                line1 += 1
            elif rew == 8.0:
                line2 += 1
            elif rew == 27.0:
                line3 += 1
            elif rew == 64:
                line4 += 1

            '''
            if nAction < 9:
                obs = pre_processing(env.map, env._get_curr_block_pos())
                action = agent.action_test(obs)
                rew, shadow_reward, is_new_block = env.step(action)  # what is the 'is_new_block'?
                nAction += 1
            else:
                rew, is_new_block = env.step(100000)  # falling
                nAction = 0
            '''
            accumulated_reward = rew + rew_move + rew_height

            if env.is_game_end():
                episode += 1
                msg = " ".join([
                    "-------------episode stats-------------\n",
                    "nUpdate: {}\n".format(n_update),
                    "line1: {}\n".format(line1),
                    "line2: {}\n".format(line2),
                    "line3: {}\n".format(line3),
                    "line4: {}\n".format(line4),
                    "all_lines: {}\n".format(str(line1 + line2 + line3 + line4)),
                    "score: {}\n".format(env.score),
                    "rew_move: {}\n".format(rew_move),
                    "rew_height: {}\n".format(rew_height),
                    "steps: {}\n".format(num_steps)
                ])
                print(msg)
                logging.info(msg)
                break

            if env.score > 1000:
                episode += 1
                msg = " ".join([
                    "-------------episode stats-------------\n",
                    "nUpdate: {}\n".format(n_update),
                    "line1: {}\n".format(line1),
                    "line2: {}\n".format(line2),
                    "line3: {}\n".format(line3),
                    "line4: {}\n".format(line4),
                    "all_lines: {}\n".format(str(line1 + line2 + line3 + line4)),
                    "score: {}\n".format(env.score),
                    "rew_move: {}\n".format(rew_move),
                    "rew_height: {}\n".format(rew_height),
                    "steps: {}\n".format(num_steps)
                ])
                print(msg)
                with torch.cuda.device(gpu_id):
                    torch.save(agent.model.state_dict(),
                               './weight/model' + str(n_update) + '.ckpt')
                logging.info(msg)
                break
def run_expert(expert_policy_file, config):
    policy_net = load_policy.load_policy(expert_policy_file)
    return run_agent(policy_net, config)
def test(rank, params, shared_model, count, lock, best_acc, evaluation=True):
    if not os.path.exists('./' + params.weight_dir):
        os.mkdir('./' + params.weight_dir)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    logging.basicConfig(filename='./log/' + params.log_file + '.log',
                        level=logging.INFO)

    ptitle('Test Agent: {}'.format(rank))
    gpu_id = params.gpu_ids_test[rank % len(params.gpu_ids_test)]
    api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id)
    cfg = load_config('config.json')

    best_rate = 0.0
    save_model_index = 0
    n_update = 0

    torch.manual_seed(params.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(params.seed + rank)

    model = A3C_LSTM_GA()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
    Agent = run_agent(model, gpu_id)

    house_id = params.house_id
    if house_id == -1:
        house_id = rank
    if house_id >= 20:
        house_id = house_id % 20

    # time.sleep(rank * 30)
    env = Environment(api, get_house_id(house_id), cfg)
    task = RoomNavTask(env,
                       hardness=params.hardness,
                       segment_input=params.semantic_mode,
                       max_steps=params.max_steps,
                       discrete_action=True)  # reward_type='indicator'

    start_time = time.time()

    if evaluation is True:
        max_episode = params.max_episode
        n_try = params.n_eval
    else:
        max_episode = 1  # for loaded model test
        n_try = params.n_test

    for episode in range(max_episode):
        eval = []
        if evaluation is True:
            with lock:
                n_update = count.value
            with torch.cuda.device(gpu_id):
                Agent.model.load_state_dict(shared_model.state_dict())
        else:
            with torch.cuda.device(gpu_id):
                Agent.model.load_state_dict(shared_model)
        Agent.model.eval()

        for i in range(n_try):
            next_observation = task.reset()
            target = task.info['target_room']
            target = get_instruction_idx(target)

            with torch.cuda.device(gpu_id):
                target = Variable(torch.LongTensor(target)).cuda()
                Agent.cx = Variable(torch.zeros(1, 256).cuda())
                Agent.hx = Variable(torch.zeros(1, 256).cuda())
            Agent.target = target

            step, total_rew, good = 0, 0, 0
            done = False

            while not done:
                observation = next_observation
                act = Agent.action_test(observation, target)
                next_observation, rew, done, info = task.step(actions[act[0]])
                total_rew += rew

                if rew == 10:  # success
                    good = 1
                step += 1
                if done:
                    break

            eval.append((step, total_rew, good))

        if len(eval) > 0:
            succ = [e for e in eval if e[2] > 0]
            succ_rate = (len(succ) / len(eval)) * 100

            if evaluation is True:  # evaluation mode
                with lock:
                    # if best_acc.value >= best_rate:
                    #     best_rate = best_acc.value
                    if succ_rate >= best_rate:
                        best_rate = succ_rate
                        with torch.cuda.device(gpu_id):
                            torch.save(Agent.model.state_dict(),
                                       params.weight_dir + 'model' +
                                       str(n_update) + '.ckpt')
                        save_model_index += 1
                    # if best_rate > best_acc.value:
                    #     best_acc.value = best_rate

            avg_reward = sum([e[1] for e in eval]) / len(eval)
            avg_length = sum([e[0] for e in eval]) / len(eval)

            msg = " ".join([
                "++++++++++ Task Stats +++++++++++\n",
                "Time {}\n".format(
                    time.strftime("%dd %Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time))),
                "Episode Played: {:d}\n".format(len(eval)),
                "N_Update = {:d}\n".format(n_update),
                "House id: {:d}\n".format(house_id),
                "Avg Reward = {:5.3f}\n".format(avg_reward),
                "Avg Length = {:.3f}\n".format(avg_length),
                "Best rate {:3.2f}, Success rate {:3.2f}%".format(
                    best_rate, succ_rate)
            ])
            print(msg)
            logging.info(msg)
def run_test(rank, params, loaded_model, lock, seen_succ, seen_length,
             unseen_succ, unseen_length):
    logging.basicConfig(filename='./log/' + params.log_file + '.log',
                        level=logging.INFO)
    ptitle('Test Agent: {}'.format(rank))
    gpu_id = params.gpu_ids_test[rank % len(params.gpu_ids_test)]
    api = objrender.RenderAPI(w=params.width, h=params.height, device=gpu_id)
    cfg = load_config('config.json')

    torch.manual_seed(params.seed + rank)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            torch.cuda.manual_seed(params.seed + rank)

    model = A3C_LSTM_GA()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
        load_model = torch.load(
            loaded_model,
            map_location=lambda storage, loc: storage.cuda(gpu_id))
        model.load_state_dict(load_model)
        model.eval()
    Agent = run_agent(model, gpu_id)

    n_test = 0
    start_time = time.time()

    while True:
        house_id = rank + (n_test * params.n_process)

        if house_id >= 70:
            break
        else:
            if house_id < 20:
                seen = True
                house = get_house_id(house_id)
            else:
                seen = False
                house = get_eval_house_id(house_id - (n_test * params.n_process))

        env = Environment(api, house, cfg)
        task = RoomNavTask(env,
                           hardness=params.hardness,
                           segment_input=params.semantic_mode,
                           max_steps=params.max_steps,
                           discrete_action=True)  # reward_type='indicator'

        eval = []
        for i in range(params.n_test):
            next_observation = task.reset()
            target = task.info['target_room']
            target = get_instruction_idx(target)

            with torch.cuda.device(gpu_id):
                target = Variable(torch.LongTensor(target)).cuda()
                Agent.cx = Variable(torch.zeros(1, 256).cuda())
                Agent.hx = Variable(torch.zeros(1, 256).cuda())
            Agent.target = target

            step, total_rew, good = 0, 0, 0
            done = False

            while not done:
                observation = next_observation
                act = Agent.action_test(observation, target)
                next_observation, rew, done, info = task.step(actions[act[0]])
                total_rew += rew

                if rew == 10:  # success
                    good = 1
                step += 1
                if done:
                    break

            eval.append((step, total_rew, good))

        if len(eval) > 0:
            succ = [e for e in eval if e[2] > 0]
            succ_rate = (len(succ) / len(eval)) * 100
            avg_reward = sum([e[1] for e in eval]) / len(eval)
            avg_length = sum([e[0] for e in eval]) / len(eval)

            if seen:
                msg_seen = "Seen"
                msg_house = house_id
            else:
                msg_seen = "Unseen"
                msg_house = house_id - 20

            msg = " ".join([
                "++++++++++ Task Stats +++++++++++\n",
                "Time {}\n".format(
                    time.strftime("%dd %Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time))),
                "Episode Played: {:d}\n".format(len(eval)),
                "{:s} House id: {:d}\n".format(msg_seen, msg_house),
                "Avg Reward = {:5.3f}\n".format(avg_reward),
                "Avg Length = {:.3f}\n".format(avg_length),
                "Success rate {:3.2f}%".format(succ_rate)
            ])
            print(msg)
            logging.info(msg)

            with lock:
                if seen:
                    seen_succ.value += len(succ)
                    seen_length.value += sum([e[0] for e in eval])
                else:
                    unseen_succ.value += len(succ)
                    unseen_length.value += sum([e[0] for e in eval])

        n_test += 1
def run_loop(rank, params, shared_model, shared_optimizer, count, lock):
    ptitle('Training Process: {}'.format(rank))
    gpu_id = params.gpu_ids_train[rank % len(params.gpu_ids_train)]
    env = Env(False, 1, down_period=2)

    # model = A3C()
    model = A3C_LSTM()
    with torch.cuda.device(gpu_id):
        model = model.cuda()
    agent = run_agent(model, gpu_id)

    episode = 0
    while episode <= params.episode:
        env.reset()
        agent.done = False
        num_steps = 0
        agent.synchronize(shared_model)
        nAction = 0
        nMove = 0

        while True:
            num_steps += 1
            # random_action = random.randrange(0, 5)
            '''
            if nAction < 9:
                obs = pre_processing(env.map, env._get_curr_block_pos())
                action, value, log_prob, entropy = agent.action_train(obs)
                rew, is_new_block = env.step(action)  # what is the 'is_new_block'?
                nAction += 1
                if nAction != 9:
                    rew = np.clip(rew, 0.0, 64.0)
                    agent.put_reward(rew, value, log_prob, entropy)
            else:
                rew, is_new_block = env.step(100000)  # falling
                rew = np.clip(rew, 0.0, 64.0)
                agent.put_reward(rew, value, log_prob, entropy)
                nAction = 0
            '''
            obs = pre_processing(env.shadow_map, env._get_curr_block_pos())  # env.map
            action, value, log_prob, entropy = agent.action_train(obs)
            if action == 5:
                action = 100000
            rew, shadow_rew, done, putting, height = env.step(action)  # what is the 'is_new_block'?
            rew = np.clip(rew, -1.0, 64.0)

            if rew == 0.0 and action != 3 and action != 4:
                nMove += 1
                if nMove < 6:
                    rew = 0.2
            if putting:
                rew = -(height / 20.0)
                nMove = 0

            agent.put_reward(rew, value, log_prob, entropy)
            # pdb.set_trace()

            if env.is_game_end():
                episode += 1
                agent.done = True

            # if num_steps % params.num_steps == 0:
            # if env.is_game_end() or rew >= 1.0:
            if env.is_game_end():
                next_obs = pre_processing(env.map, env._get_curr_block_pos())
                agent.training(next_obs, shared_model, shared_optimizer, params)
                with lock:  # synchronize the update count across all processes
                    count.value += 1

            if env.is_game_end():
                break