def tester(model, device, n=5, task_config_file="config_files/config_example.json"):
    episode_reward = []
    rnn_size = 128
    env = AI2ThorEnv(config_file=task_config_file)
    for _ in range(n):
        total_r = 0.
        d = False
        x = reset(env, rnn_size, device)
        while not d:
            with torch.no_grad():
                a_t, _, _, _, state_t = model(x)
            # interact with environment
            o, r, d, _ = env.step(a_t.data.item())
            total_r += r  # accumulate reward within one rollout
            # prepare inputs for next step
            x["observation"] = torch.Tensor(o / 255.).to(device)
            x["memory"]["state"] = state_t
            x["memory"]["mask"] = torch.tensor((d + 1) % 2, dtype=torch.float32).to(device)
            x["memory"]["action"] = a_t
        episode_reward.append(total_r)
        print("Episode reward:", total_r)
    env.close()
    print(f"Average episode reward: {np.mean(episode_reward)}")
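# The tester above (and the worker further below) calls a reset() helper that is not shown
# in this section. The following is only a minimal sketch of such a helper, assuming the
# policy consumes the same observation/memory dictionary that the rollout loops maintain;
# the zero-initialised recurrent state, the mask value and the placeholder action are
# assumptions, while the field names and the /255. scaling are taken from the surrounding code.
import torch


def reset(env, state_size, device):
    """Hypothetical sketch: reset the environment and build the initial policy input."""
    o = env.reset()
    return {
        "observation": torch.Tensor(o / 255.).to(device),
        "memory": {
            "state": torch.zeros(1, state_size).to(device),            # assumed zero RNN state
            "mask": torch.tensor(1., dtype=torch.float32).to(device),  # 1.0 = episode not done
            "action": torch.zeros(1, dtype=torch.long).to(device),     # assumed dummy action
        },
    }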
def test_environments_runs(self):
    """
    Checks that the environment still runs and nothing breaks. Useful for continuous
    deployment and keeping master stable. Also times how long 10 steps take within the
    environment. The final assert checks that the number of steps taken equals num_steps
    (no off-by-one errors). Prints the execution time at the end of the test as a
    performance check.
    """
    num_steps = 10
    env = AI2ThorEnv()
    start = time.time()
    all_step_times = []
    env.reset()
    for step_num in range(num_steps):
        start_of_step = time.time()
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)

        time_for_step = time.time() - start_of_step
        print('Step: {}. env.task.step_num: {}. Time taken for step: {:.3f}'.format(
            step_num, env.task.step_num, time_for_step))
        all_step_times.append(time_for_step)

        if done:
            break

    print('Time taken altogether: {}\nAverage time taken per step: {:.3f}'.format(
        time.time() - start, sum(all_step_times) / len(all_step_times)))

    self.assertTrue(len(all_step_times) == num_steps)
    env.close()
def test_config_override(self):
    """
    Check that reading both a config file and a config dict at the same time works and
    that the correct warning is raised for overwriting. Afterwards, check that scene_id
    was correctly changed by the override.
    """
    with warnings.catch_warnings(record=True) as warning_objs:
        env = AI2ThorEnv(config_dict={'scene_id': 'FloorPlan27'})
        # check that the expected warning appears (there could be multiple depending on user)
        self.assertTrue([w for w in warning_objs
                         if 'Key: scene_id already in config file' in w.message.args[0]])
        self.assertTrue(env.scene_id == 'FloorPlan27')
        env.close()
def test_cup_task_and_interaction_actions(self):
    """
    Check if picking up and putting down a cup works and the agent receives a reward of 2
    for doing it twice. For putting the cup down, the agent places it in the microwave and
    then picks it up again. This also implicitly checks that there is no random
    initialisation and that the same actions in the same environment achieve the same
    reward each time.
    """
    # actions_to_look_at_cup = ['RotateRight', 'RotateRight', 'MoveAhead', 'MoveAhead',
    #     'RotateRight', 'MoveAhead', 'MoveAhead', 'RotateLeft', 'MoveAhead', 'MoveAhead',
    #     'MoveAhead', 'RotateLeft', 'LookDown', 'PickupObject', 'PutObject', 'LookUp',
    #     'MoveRight', 'OpenObject', 'PutObject', 'PickupObject', 'CloseObject']
    actions_to_look_at_cup = [
        'MoveAhead', 'MoveBack', 'RotateRight', 'RotateLeft', 'LookUp', 'LookDown', 'Stop'
    ]
    env = AI2ThorEnv(config_dict={
        'scene_id': 'FloorPlan28',
        'gridSize': 0.25,
        'acceptable_receptacles': [
            'Microwave'  # the receptacle used below
        ],
        'target_objects': {'Mug': 1}
    })
    movement_penalty = len(actions_to_look_at_cup) * env.task.movement_reward

    for episode in range(2):  # twice to make sure there is no random initialisation
        env.reset()
        rewards = []
        for action_str in actions_to_look_at_cup:
            action = env.action_names.index(action_str)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        # self.assertAlmostEqual(sum(rewards), 2 + movement_penalty)
    env.close()
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    # Environment initialisation
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 2000}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()
        state_dim = state.shape
        action_dim = env.action_space.n
    elif args.is_atari:
        # Atari environment wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    else:
        # Standard environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    algo = A3C(action_dim, state_dim, args.consecutive_frames,
               is_atari=args.is_atari, is_ai2thor=args.is_ai2thor)
    algo.load_weights(args.actor_path, args.critic_path)

    # Display agent
    old_state, t = env.reset(), 0
    while True:
        a = algo.policy_action(old_state)
        old_state, r, done, _ = env.step(a)
        t += 1
        if done:
            print('----- done, resetting env -----')
            env.reset()
def test_all_task_init(self):
    """
    Test that the creation of all tasks still works by taking a few random steps after
    resetting the environment.
    """
    param_list = [{
        'pickup_objects': ['Mug', 'Apple'],
        'task': {
            'task_name': 'PickUpTask',
            'target_objects': {'Mug': 1, 'Apple': 5}
        }
    }]

    for params in param_list:
        env = AI2ThorEnv(config_dict=params)
        state = env.reset()
        for i in range(5):
            action = env.action_space.sample()
            state, reward, done, _ = env.step(action)
        env.close()
def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.train()

    state = env.reset()
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    n_episode = 0
    total_reward_for_episode = 0
    all_rewards_in_episode = []

    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            log_probs.append(log_prob)

            action_int = action.cpu().numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int, verbose=False)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                state = env.reset()
                print('Process {} Episode {} Over with Length: {} and Reward: {: .2f}. '
                      'Total Trained Length: {}'.format(
                          rank, n_episode, episode_length, total_reward_for_episode,
                          total_length))
                sys.stdout.flush()
                episode_length = 0
                n_episode += 1

            values.append(value)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        if args.synchronous:
            if total_reward_for_episode >= args.solved_reward:
                print("Process {} Solved with Reward {}".format(rank,
                                                                total_reward_for_episode))
                env.close()
                break

        total_reward_for_num_steps = sum(rewards)
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if args.cuda:
            if args.point_cloud_model:
                state = (state[0].cuda(), state[1].cuda())
            else:
                state = state.cuda()
            R = R.cuda()
            gae = gae.cuda()
        if not done:  # bootstrap: use the critic's value estimate of the last state as R
            value, _, _ = model((state, (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see the training cycle
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
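# Note on the update above: the delta_t/gae recursion implements Generalized Advantage
# Estimation (Schulman et al., 2016):
#     delta_t   = r_t + gamma * V(s_{t+1}) - V(s_t)
#     A_t^{GAE} = sum_{l >= 0} (gamma * tau)^l * delta_{t+l}
# computed backwards in time as gae = gamma * tau * gae + delta_t. The policy loss uses
# gae.detach() so the advantage acts as a fixed weight on log pi(a_t|s_t), while
# value_loss regresses V(s_t) towards the bootstrapped discounted return R.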
from gym_ai2thor.envs.ai2thor_env import AI2ThorEnv
import numpy as np

if __name__ == '__main__':
    env = AI2ThorEnv()
    n = env.action_space.n
    env.reset()
    episodes = []
    for i in range(5):
        env.reset()
        d = False
        total_r = 0.
        while not d:
            a = np.random.choice(n)
            o, r, d, _ = env.step(a)
            total_r += r
        episodes.append(total_r)
        print(f'Total reward in episode {i} is {total_r}')
    print("AVG episode rewards:", episodes, np.mean(episodes))
mp.set_start_method("spawn")
os.environ['OMP_NUM_THREADS'] = '1'
# os.environ['CUDA_VISIBLE_DEVICES'] = ""

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
if args.cuda:
    print('Using', torch.cuda.get_device_name(0))
    torch.cuda.init()

torch.manual_seed(args.seed)
args.config_dict = {
    'max_episode_length': args.max_episode_length,
    'point_cloud_model': args.point_cloud_model
}
env = AI2ThorEnv(config_dict=args.config_dict)

if args.point_cloud_model:
    shared_model = ActorCritic(env.action_space.n)
else:
    args.frame_dim = env.config['resolution'][-1]
    shared_model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                               args.frame_dim)
if args.cuda:
    shared_model = shared_model.cuda()
shared_model.share_memory()

env.close()  # the env above was initialised only to find certain params needed
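# The snippet above stops after the shared model is prepared. Below is only a minimal
# sketch of how the worker processes could then be launched with torch.multiprocessing,
# reusing the train()/test() signatures shown elsewhere in this section; the shared
# counter/lock and the plain Adam optimizer (rather than a shared optimizer) are
# assumptions, not the repository's actual launcher.
import torch.multiprocessing as mp
import torch.optim as optim

counter = mp.Value('i', 0)  # global step counter shared across processes
lock = mp.Lock()
optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

processes = []
# one evaluation process
p = mp.Process(target=test, args=(args.num_processes, args, shared_model, counter))
p.start()
processes.append(p)
# training processes
for rank in range(args.num_processes):
    p = mp.Process(target=train,
                   args=(rank, args, shared_model, counter, lock, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()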
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(np.random.randint(1, 10000))
    # Disable non-deterministic ops (not sure if critical, but better safe than sorry)
    torch.backends.cudnn.enabled = False
else:
    args.device = torch.device('cpu')


# Simple ISO 8601 timestamped logger
def log(s):
    print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)


# Environment selection
if args.game == 'ai2thor':
    env = FrameStackEnv(AI2ThorEnv(config_file=args.config_file),
                        args.history_length, args.device)
    args.resolution = env.config['resolution']
    args.img_channels = env.observation_space.shape[0]
else:
    env = Env(args)
    env.train()
    args.resolution = (84, 84)
    args.img_channels = 1
action_space = env.action_space

# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
"""
Priority weights are linearly annealed and increase every step by priority_weight_increase
from args.priority_weight to 1.
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state.unsqueeze(0).float(), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action selection at test time
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'.format(
                actions.maxlen))
            done = True

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           "solved_{}.pth".format("atari" if args.atari else "ai2thor"))
                env.close()
                break

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)

        state = torch.from_numpy(state)
def worker(worker_id, policy, storage, ready_to_work, queue, exit_flag, use_priors=False,
           task_config_file="config_files/config_example.json"):
    """
    Worker function to collect experience based on the policy and store it in storage.
    :param worker_id: id used to store the experience in storage
    :param policy: function/actor-critic
    :param storage: experience storage buffer
    :param ready_to_work: condition used to synchronise worker and learner
    :param queue: message queue used to send episode rewards to the learner
    :param exit_flag: flag set by the learner to exit the job
    :param use_priors: if True, pass raw observations through without scaling
    :param task_config_file: the task configuration file
    :return:
    """
    print(f"Worker with Id:{worker_id} pid ({os.getpid()}) starts ...")
    steps_per_epoch = storage.block_size
    state_size = storage.h_buf.shape[1]
    device = storage.device
    env = AI2ThorEnv(config_file=task_config_file)
    x = reset(env, state_size, device)
    episode_rewards, episode_steps = [], []
    r_sum, step_sum = 0., 0

    # Wait for the learner to start the job
    ready_to_work.wait()

    while exit_flag.value != 1:
        for i in range(steps_per_epoch):
            with torch.no_grad():
                a_t, logp_t, _, v_t, state_t = policy(x)
            # interact with environment
            o, r, d, _ = env.step(a_t.item())
            r_sum += r  # accumulate reward within one rollout
            step_sum += 1
            r_t = torch.tensor(r, dtype=torch.float32).to(device)
            # save experience
            storage.store(worker_id, x["observation"], a_t, r_t, v_t, logp_t,
                          x["memory"]["state"], x["memory"]["mask"])
            # prepare inputs for next step
            if use_priors:
                x["observation"] = o
            else:
                x["observation"] = torch.Tensor(o / 255.).to(device)
            x["memory"]["state"] = state_t
            x["memory"]["mask"] = torch.tensor((d + 1) % 2, dtype=torch.float32).to(device)
            x["memory"]["action"] = a_t
            # check terminal state
            if d:
                # calculate the returns and GAE and reset the environment
                storage.finish_path(worker_id, 0)
                episode_rewards.append(r_sum)
                episode_steps.append(step_sum)
                x = reset(env, state_size, device)
                r_sum, step_sum = 0., 0

        # the episode did not reach a terminal state: bootstrap with the value estimate
        if not d:
            _, _, _, last_val, _ = policy(x)
            storage.finish_path(worker_id, last_val)

        queue.put((episode_rewards, episode_steps, worker_id))
        episode_rewards, episode_steps = [], []

        # Wait for the next job
        ready_to_work.clear()
        ready_to_work.wait()

    env.close()
    print(f"Worker with pid ({os.getpid()}) finished job")
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.frame_dim)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True  # True at the beginning

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    win = vis.line(X=[0.], Y=[0.], win='testing_Rewards',
                   opts=dict(title='testing_Rewards'))

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0).float(), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'.format(
                actions.maxlen))
            done = True

        if done:
            print("In test. Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            vis.line(X=[episodes], Y=[reward_sum], win='testing_Rewards', update='append')
            episodes += 1
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)  # avoid wasting resources between test episodes
            print('testing...')

        state = torch.from_numpy(state)
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)
    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)
    if args.cuda:
        model = model.cuda()
    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    save = '{}-steps{}-process{}-lr{}-entropy_coef{}'.format(
        "point" if args.point_cloud_model else "conv",
        args.num_steps, args.num_processes, args.lr, args.entropy_coef)
    save = os.path.join('logs', save)
    os.makedirs(save, exist_ok=True)
    if args.model:
        shared_model.load_state_dict(torch.load(os.path.join(save, "solved_ai2thor.pth")))
    else:
        logger = CSVLogger(os.path.join(save, 'test.csv'))
        fields = ['episode_reward', 'frames_rendered']
        logger.log(fields)

    start_time = time.time()

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(deepcopy(shared_model.state_dict()))
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            # greedy action selection at test time
            action = prob.max(1, keepdim=True)[1].cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, "
                  "episode reward {: .2f}, episode length {}".format(
                      time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                      counter.value, counter.value / (time.time() - start_time),
                      reward_sum, episode_length))
            if not args.model:
                logger.log(["{: .2f}".format(reward_sum), counter.value])

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(), os.path.join(save, "solved_ai2thor.pth"))
                env.close()
                if not args.model:
                    logger.close()
                break

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(args.test_sleep_time)
def train(rank, args, shared_model, counter, lock, device, optimizer=None):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.frame_dim)
    model = model.to(device)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    all_rewards_in_episode = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    vis.line(X=[0.], Y=[0.], win='training_Rewards' + str(rank),
             opts=dict(title='training_Rewards' + str(rank)))

    while True:
        episodes += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float().cuda(), (hx.cuda(), cx.cuda())))
            value = value.cpu()
            logit = logit.cpu()
            hx = hx.cpu()
            cx = cx.cpu()
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                vis.line(X=[episodes], Y=[total_reward_for_episode],
                         win='training_Rewards' + str(rank), update='append')
                print('In Train. Episode Over. Total Length: {}. Total reward for '
                      'episode: {}'.format(total_length, total_reward_for_episode))
                print('In Train. Step no: {}. total length: {}'.format(
                    episode_length, total_length))
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        # No interaction with the environment below.
        # Monitoring
        total_reward_for_num_steps = sum(rewards)  # accumulated over the rollout
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # bootstrap: use the critic's value estimate of the last state as R
            value, _, _ = model((state.unsqueeze(0).float(), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # good place to breakpoint to see the training cycle
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).cuda().backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--lr', type=float, default=3e-4)
parser.add_argument('--clip-param', type=float, default=0.2)
parser.add_argument('--value_loss_coef', type=float, default=0.2)
parser.add_argument('--entropy_coef', type=float, default=0.001)
parser.add_argument('--max-kl', type=float, default=0.01)
parser.add_argument('--use-priors', type=bool, default=False)
parser.add_argument('--use-attention', type=bool, default=True)
parser.add_argument('--attention', type=str, default='CBAM')

args = parser.parse_args()
torch.multiprocessing.set_start_method('spawn')

# get observation dimension
env = AI2ThorEnv(config_file="config_files/multiMugTaskTrain.json")
env.reset()
obs_dim = env.observation_space.shape
# Share information about the action space with the policy architecture
ac_kwargs = dict()
ac_kwargs['action_space'] = env.action_space
ac_kwargs['state_size'] = args.state_size
ac_kwargs['use_attention'] = args.use_attention
ac_kwargs['attention'] = args.attention
env.close()

# Main model
print("Initialize Model...")
# Construct model
ac_model = ActorCritic(obs_shape=obs_dim, **ac_kwargs)
if args.model_path:
    ac_model.load_state_dict(torch.load(args.model_path))
    'cameraY': -0.85,
    'gridSize': 0.1,  # 0.01
    'continuous_movement': True,
    'build_file_name': args.build_file_name,
    'task': {
        'task_name': 'PickUpTask',
        'target_objects': {
            'Mug': 1,
            'Bowl': 5
        }
    }
}

# Passing config_dict to the env overwrites a few values given in the default config file,
# so a few warnings will occur
env = AI2ThorEnv(config_dict=config_dict)
max_episode_length = env.task.max_episode_length
N_EPISODES = 3
for episode in range(N_EPISODES):
    start = time.time()
    state = env.reset()
    for step_num in range(max_episode_length):
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)
        if done:
            break
        if (step_num + 1) % 100 == 0:
            print('Episode: {}. Step: {}/{}. Time taken: {:.3f}s'.format(
                episode + 1, step_num + 1, max_episode_length, time.time() - start))
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Environment initialisation
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 500}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()
        state_dim = state.shape
        action_dim = env.action_space.n
        args.env = 'ai2thor'
    elif args.is_atari:
        # Atari environment wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
        print(state_dim)
        print(action_dim)
    else:
        # Standard environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    set_session(get_session())
    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" + args.env)

    algo = A3C(action_dim, state_dim, args.consecutive_frames,
               is_atari=args.is_atari, is_ai2thor=args.is_ai2thor)

    # Train
    stats = algo.train(env, args, summary_writer)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv", header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    export_path = '{}{}_ENV_{}_NB_EP_{}_BS_{}'.format(exp_dir, args.type, args.env,
                                                      args.nb_episodes, args.batch_size)
    algo.save_weights(export_path)
    env.close()