# Standard and third-party imports used by the snippets below.
import signal

import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp

# Project-local imports; the module paths below are assumptions based on the
# names used in this file and may differ in the actual repository layout.
# from agent.environment import THORDiscreteEnvironment
# from agent.network import SharedNetwork, SceneSpecificNetwork, ActorCriticLoss
# from agent.evaluation import TASK_LIST, NUM_EVAL_EPISODES, VERBOSE, export_to_csv


def run(self):
    scene_stats = dict()
    resultData = []
    for scene_scope, items in TASK_LIST.items():
        scene_net = self.scene_nets[scene_scope]
        scene_stats[scene_scope] = list()
        for task_scope in items:
            env = THORDiscreteEnvironment(
                scene_name=scene_scope,
                h5_file_path=(lambda scene: self.config.get(
                    "h5_file_path",
                    "D:\\datasets\\visual_navigation_precomputed\\{scene}.h5").replace('{scene}', scene)),
                terminal_state_id=int(task_scope))

            ep_rewards = []
            ep_lengths = []
            ep_collisions = []
            for i_episode in range(NUM_EVAL_EPISODES):
                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0
                while not terminal:
                    state = torch.Tensor(env.render(mode='resnet_features'))
                    target = torch.Tensor(env.render_target(mode='resnet_features'))
                    # Evaluation never needs gradients, so the whole forward
                    # pass can run under no_grad.
                    with torch.no_grad():
                        (policy, value,) = scene_net.forward(self.shared_net.forward((state, target,)))
                        action = F.softmax(policy, dim=0).multinomial(1).data.numpy()[0]
                    env.step(action)

                    terminal = env.is_terminal
                    if ep_t == 10000:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)
                if VERBOSE:
                    print("episode #{} ends after {} steps".format(i_episode, ep_t))

            print('evaluation: %s %s' % (scene_scope, task_scope))
            print('mean episode reward: %.2f' % np.mean(ep_rewards))
            print('mean episode length: %.2f' % np.mean(ep_lengths))
            print('mean episode collision: %.2f' % np.mean(ep_collisions))

            scene_stats[scene_scope].extend(ep_lengths)
            resultData.append((scene_scope, str(task_scope), np.mean(ep_rewards), np.mean(ep_lengths), np.mean(ep_collisions),))

    print('\nResults (average trajectory length):')
    for scene_scope in scene_stats:
        print('%s: %.2f steps' % (scene_scope, np.mean(scene_stats[scene_scope])))

    if 'csv_file' in self.config and self.config['csv_file'] is not None:
        export_to_csv(resultData, self.config['csv_file'])
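# The evaluation loop above samples actions stochastically rather than acting
# greedily: the policy head returns unnormalized logits, which are pushed
# through a softmax and sampled with multinomial. A minimal self-contained
# sketch of that pattern (the logit values here are made up for illustration):
import torch
import torch.nn.functional as F

logits = torch.tensor([1.2, 0.3, -0.5, 0.0])  # one score per discrete action
probs = F.softmax(logits, dim=0)              # normalize into a distribution
action = probs.multinomial(1).item()          # sample one action index
assert 0 <= action < len(logits)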
class Agent:
    # Note: `scene_name`, `parent` and `net` are not defined in this snippet;
    # they are captured from the enclosing scope this class is nested in.
    def __init__(self, initial_state, target):
        self.env = THORDiscreteEnvironment(
            scene_name=scene_name,
            initial_state_id=initial_state,
            terminal_state_id=target,
            # The lambda ignores its `scene` argument and always substitutes
            # the enclosing `scene_name`.
            h5_file_path=(lambda scene: parent.config["h5_file_path"].replace("{scene}", scene_name)))
        self.env.reset()
        self.net = net

    @staticmethod
    def get_parameters():
        return net.parameters()

    def act(self):
        with torch.no_grad():
            state = torch.Tensor(self.env.render(mode='resnet_features')).to(parent.device)
            target = torch.Tensor(self.env.render_target(mode='resnet_features')).to(parent.device)
            (policy, value,) = net.forward((state, target,))
            action = F.softmax(policy, dim=0).multinomial(1).cpu().data.numpy()[0]
        self.env.step(action)
        return (self.env.is_terminal, self.env.collided, self.env.reward)
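# A minimal sketch of how this Agent might be driven from its enclosing scope.
# It assumes `scene_name`, `net` and `parent` already exist there, and the
# state ids 0 and 26 are purely illustrative:
agent = Agent(initial_state=0, target=26)
terminal = False
while not terminal:
    terminal, collided, reward = agent.act()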
class TrainingThread(mp.Process):
    def __init__(self, id: int, network: torch.nn.Module, saver, optimizer, scene: str, **kwargs):
        super(TrainingThread, self).__init__()

        # The environment is created lazily in _initialize_thread, inside the
        # child process.
        self.env = None
        self.init_args = kwargs
        self.scene = scene
        self.saver = saver
        self.local_backbone_network = SharedNetwork()
        self.id = id

        self.master_network = network
        self.optimizer = optimizer

    def _sync_network(self):
        # Pull the latest master weights into the local copy.
        self.policy_network.load_state_dict(self.master_network.state_dict())

    def _ensure_shared_grads(self):
        # Point the shared parameters' gradients at the local gradients
        # (the standard A3C trick); stop early if they are already set.
        for param, shared_param in zip(self.policy_network.parameters(), self.master_network.parameters()):
            if shared_param.grad is not None:
                return
            shared_param._grad = param.grad

    def get_action_space_size(self):
        return len(self.env.actions)

    def _initialize_thread(self):
        h5_file_path = self.init_args.get('h5_file_path')
        # self.logger = logging.getLogger('agent')
        # self.logger.setLevel(logging.INFO)
        self.init_args['h5_file_path'] = lambda scene: h5_file_path.replace('{scene}', scene)
        self.env = THORDiscreteEnvironment(self.scene, **self.init_args)
        self.gamma: float = self.init_args.get('gamma', 0.99)
        self.grad_norm: float = self.init_args.get('grad_norm', 40.0)
        entropy_beta: float = self.init_args.get('entropy_beta', 0.01)
        self.max_t: int = self.init_args.get('max_t', 1)  # TODO: 5)
        self.local_t = 0
        self.action_space_size = self.get_action_space_size()
        self.criterion = ActorCriticLoss(entropy_beta)
        self.policy_network = nn.Sequential(SharedNetwork(), SceneSpecificNetwork(self.get_action_space_size()))

        # Initialize the episode
        self._reset_episode()
        self._sync_network()

    def _reset_episode(self):
        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
        self.env.reset()

    def _forward_explore(self):
        # Does the episode end naturally?
        is_terminal = False
        terminal_end = False

        results = {"policy": [], "value": []}
        rollout_path = {"state": [], "action": [], "rewards": [], "done": []}

        # Play out one game to the end, or to max_t
        for t in range(self.max_t):
            state = {
                "current": self.env.render('resnet_features'),
                "goal": self.env.render_target('resnet_features'),
            }

            x_processed = torch.from_numpy(state["current"])
            goal_processed = torch.from_numpy(state["goal"])

            (policy, value) = self.policy_network((x_processed, goal_processed,))

            # Store raw network output to use in backprop
            results["policy"].append(policy)
            results["value"].append(value)

            with torch.no_grad():
                # Sample the next action from the softmax of the policy logits.
                action = F.softmax(policy, dim=0).multinomial(1).item()

            policy = policy.data.numpy()
            value = value.data.numpy()

            # Make the step in the environment
            self.env.step(action)

            # Receive the game reward
            is_terminal = self.env.is_terminal

            # Ad-hoc reward for navigation
            reward = 10.0 if is_terminal else -0.01

            # Cap the episode length
            if self.episode_length > 5e3:
                is_terminal = True

            # Update episode stats
            self.episode_length += 1
            self.episode_reward += reward
            self.episode_max_q = max(self.episode_max_q, np.max(value))

            # Clip the reward
            reward = np.clip(reward, -1, 1)

            # Increase local time
            self.local_t += 1

            rollout_path["state"].append(state)
            rollout_path["action"].append(action)
            rollout_path["rewards"].append(reward)
            rollout_path["done"].append(is_terminal)

            if is_terminal:
                # TODO: add logging
                print('playout finished')
                print(f'episode length: {self.episode_length}')
                print(f'episode reward: {self.episode_reward}')
                print(f'episode max_q: {self.episode_max_q}')

                terminal_end = True
                self._reset_episode()
                break

        if terminal_end:
            return 0.0, results, rollout_path
        else:
            # Bootstrap the return from the value of the last state reached.
            x_processed = torch.from_numpy(self.env.render('resnet_features'))
            goal_processed = torch.from_numpy(self.env.render_target('resnet_features'))
            (_, value) = self.policy_network((x_processed, goal_processed,))
            return value.data.item(), results, rollout_path

    def _optimize_path(self, playout_reward: float, results, rollout_path):
        policy_batch = []
        value_batch = []
        action_batch = []
        temporal_difference_batch = []
        playout_reward_batch = []

        for i in reversed(range(len(results["value"]))):
            reward = rollout_path["rewards"][i]
            value = results["value"][i]
            action = rollout_path["action"][i]

            # Accumulate the discounted return and its advantage over the
            # predicted value.
            playout_reward = reward + self.gamma * playout_reward
            temporal_difference = playout_reward - value.data.item()

            policy_batch.append(results['policy'][i])
            value_batch.append(results['value'][i])
            action_batch.append(action)
            temporal_difference_batch.append(temporal_difference)
            playout_reward_batch.append(playout_reward)

        policy_batch = torch.stack(policy_batch, 0)
        value_batch = torch.stack(value_batch, 0)
        action_batch = torch.from_numpy(np.array(action_batch, dtype=np.int64))
        temporal_difference_batch = torch.from_numpy(np.array(temporal_difference_batch, dtype=np.float32))
        playout_reward_batch = torch.from_numpy(np.array(playout_reward_batch, dtype=np.float32))

        # Compute loss
        loss = self.criterion.forward(
            policy_batch,
            value_batch,
            action_batch,
            temporal_difference_batch,
            playout_reward_batch)
        loss = loss.sum()

        self.optimizer.optimize(loss, self.policy_network.parameters(), self.master_network.parameters())

    def run(self, master=None):
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        print(f'Thread {self.id} ready')

        # We need to silence all errors on the new process
        h5py._errors.silence_errors()
        self._initialize_thread()

        if master is not None:
            print(f'Master thread {self.id} started')
        else:
            print(f'Thread {self.id} started')

        try:
            self.env.reset()
            while True:
                self._sync_network()
                # Play out some samples
                playout_reward, results, rollout_path = self._forward_explore()
                # Train on the collected samples
                self._optimize_path(playout_reward, results, rollout_path)
                print(f'Step finished {self.optimizer.get_global_step()}')

                # Trigger a save or other bookkeeping
                self.saver.after_optimization()
        except Exception as e:
            # TODO: add logging
            # self.logger.error(e.msg)
            print(e)
            raise
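# A minimal sketch of how TrainingThread workers might be launched in the
# usual A3C style. ACTION_SPACE_SIZE is an assumed constant, and the `saver`
# and `optimizer` parameters stand in for whatever checkpointing helper and
# shared-optimizer wrapper the repository actually provides. The key point is
# that the master network's weights live in shared memory, so every worker
# syncs from, and pushes gradients to, the same parameters.
def spawn_training_workers(saver, optimizer, scenes, h5_file_path, num_workers=4):
    ACTION_SPACE_SIZE = 4  # assumed size of the discrete action space
    master_network = nn.Sequential(SharedNetwork(), SceneSpecificNetwork(ACTION_SPACE_SIZE))
    master_network.share_memory()  # share weights across processes

    workers = [
        TrainingThread(i, master_network, saver, optimizer,
                       scenes[i % len(scenes)], h5_file_path=h5_file_path)
        for i in range(num_workers)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return master_network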
def run(self):
    scene_stats = dict()
    resultData = []
    for scene_scope, items in TASK_LIST.items():
        # Optionally restrict the evaluation to a subset of scenes.
        if len(self.config['test_scenes']) != 0 and scene_scope not in self.config['test_scenes']:
            continue

        scene_net = self.scene_nets[scene_scope]
        scene_stats[scene_scope] = list()
        for task_scope in items:
            env = THORDiscreteEnvironment(
                scene_name=scene_scope,
                h5_file_path=(lambda scene: self.config.get(
                    "h5_file_path",
                    "D:\\datasets\\visual_navigation_precomputed\\{scene}.h5").replace('{scene}', scene)),
                terminal_state_id=int(task_scope),
            )

            # Precomputed random-walk hitting times and shortest-path
            # distances, read straight from the scene's HDF5 file.
            graph = env._get_graph_handle()
            hitting_times = graph['hitting_times'][()]
            shortest_paths = graph['shortest_path_distance'][()]

            ep_rewards = []
            ep_lengths = []
            ep_collisions = []
            ep_normalized_lengths = []
            for (i_episode, start) in enumerate(env.get_initial_states(int(task_scope))):
                env.reset(initial_state_id=start)
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0
                hitting_time = hitting_times[start, int(task_scope)]
                shortest_path = shortest_paths[start, int(task_scope)]

                while not terminal:
                    state = torch.Tensor(env.render(mode='resnet_features'))
                    target = torch.Tensor(env.render_target(mode='resnet_features'))
                    with torch.no_grad():
                        (policy, value,) = scene_net.forward(self.shared_net.forward((state, target,)))
                        action = F.softmax(policy, dim=0).multinomial(1).data.numpy()[0]
                    env.step(action)

                    terminal = env.is_terminal
                    # Give up once the random-walk hitting time is reached.
                    if ep_t == hitting_time:
                        break
                    if env.collided:
                        ep_collision += 1
                    ep_reward += env.reward
                    ep_t += 1

                ep_lengths.append(ep_t)
                ep_rewards.append(ep_reward)
                ep_collisions.append(ep_collision)
                ep_normalized_lengths.append(min(ep_t, hitting_time) / shortest_path)
                if VERBOSE:
                    print("episode #{} ends after {} steps".format(i_episode, ep_t))

            print('evaluation: %s %s' % (scene_scope, task_scope))
            print('mean episode reward: %.2f' % np.mean(ep_rewards))
            print('mean episode length: %.2f' % np.mean(ep_lengths))
            print('mean episode collision: %.2f' % np.mean(ep_collisions))
            print('mean normalized episode length: %.2f' % np.mean(ep_normalized_lengths))

            scene_stats[scene_scope].extend(ep_lengths)
            resultData.append((scene_scope, str(task_scope), np.mean(ep_rewards), np.mean(ep_lengths), np.mean(ep_collisions), np.mean(ep_normalized_lengths),))

    print('\nResults (average trajectory length):')
    for scene_scope in scene_stats:
        print('%s: %.2f steps' % (scene_scope, np.mean(scene_stats[scene_scope])))

    if 'csv_file' in self.config and self.config['csv_file'] is not None:
        export_to_csv(resultData, self.config['csv_file'])
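# The normalized episode length above divides the realized trajectory length
# (capped at the random-walk hitting time) by the shortest-path distance, so
# 1.0 means an optimal path and larger values mean proportionally longer
# routes. A tiny worked example with made-up numbers:
def normalized_length(ep_t, hitting_time, shortest_path):
    return min(ep_t, hitting_time) / shortest_path

assert normalized_length(ep_t=12, hitting_time=80, shortest_path=6) == 2.0   # twice the optimal length
assert normalized_length(ep_t=95, hitting_time=80, shortest_path=10) == 8.0  # capped at the hitting time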
def convertToStateDict(data):
    return {key: torch.Tensor(v) for (key, v) in data.items()}


# `data`, `shared_net` and `scene_nets` are defined earlier in the original
# file (the checkpoint load and network construction are not shown here).
shared_net.load_state_dict(convertToStateDict(data['navigation']))
for key in TASK_LIST.keys():
    scene_nets[key].load_state_dict(convertToStateDict(data[f'navigation/{key}']))

scene_stats = dict()
for scene_scope, items in TASK_LIST.items():
    scene_net = scene_nets[scene_scope]
    scene_stats[scene_scope] = list()
    for task_scope in items:
        env = THORDiscreteEnvironment(
            scene_name=scene_scope,
            h5_file_path=(lambda scene: f"D:\\datasets\\visual_navigation_precomputed\\{scene}.h5"),
            terminal_state_id=int(task_scope))

        ep_rewards = []
        ep_lengths = []
        ep_collisions = []
        for i_episode in range(NUM_EVAL_EPISODES):
            env.reset()
            terminal = False
            ep_reward = 0
            ep_collision = 0
            ep_t = 0
            while not terminal:
                state = torch.Tensor(env.render(mode='resnet_features'))
                target = torch.Tensor(env.render_target(mode='resnet_features'))