def __init__(self, cfg, core_out_size, action_space):
    super().__init__(cfg, action_space)

    self.num_action_outputs = calc_num_logits(action_space)
    self.num_options = cfg.num_options

    self.distribution_linear = nn.Linear(
        core_out_size, self.num_action_outputs * cfg.num_options,
    )
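# Hedged shape sketch for the multi-option head above (standalone, all sizes
# hypothetical; not part of the original class): the flat output of
# distribution_linear can be viewed as one set of action logits per option.
def _example_multi_option_logits_shape():
    import torch

    batch_size, core_out_size, num_action_outputs, num_options = 4, 512, 6, 3
    linear = torch.nn.Linear(core_out_size, num_action_outputs * num_options)

    flat_logits = linear(torch.rand(batch_size, core_out_size))
    per_option_logits = flat_logits.view(batch_size, num_options, num_action_outputs)
    assert per_option_logits.shape == (batch_size, num_options, num_action_outputs)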
def test_gumbel_trick(self):
    """
    We use Gumbel noise, which seems to be faster than pytorch multinomial sampling.
    Here we test that the two methods are actually equivalent.
    """
    timing = Timing()

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    with torch.no_grad():
        action_space = gym.spaces.Discrete(8)
        num_logits = calc_num_logits(action_space)
        device_type = 'cpu'
        device = torch.device(device_type)
        logits = torch.rand(self.batch_size, num_logits, device=device) * 10.0 - 5.0

        if device_type == 'cuda':
            torch.cuda.synchronize(device)

        count_gumbel, count_multinomial = np.zeros([action_space.n]), np.zeros([action_space.n])

        # estimate probability mass by actually sampling both ways
        num_samples = 20000

        action_distribution = get_action_distribution(action_space, logits)
        sample_actions_log_probs(action_distribution)
        action_distribution.sample_gumbel()

        with timing.add_time('gumbel'):
            for i in range(num_samples):
                action_distribution = get_action_distribution(action_space, logits)
                samples_gumbel = action_distribution.sample_gumbel()
                count_gumbel[samples_gumbel[0]] += 1

        action_distribution = get_action_distribution(action_space, logits)
        action_distribution.sample()

        with timing.add_time('multinomial'):
            for i in range(num_samples):
                action_distribution = get_action_distribution(action_space, logits)
                samples_multinomial = action_distribution.sample()
                count_multinomial[samples_multinomial[0]] += 1

        estimated_probs_gumbel = count_gumbel / float(num_samples)
        estimated_probs_multinomial = count_multinomial / float(num_samples)

        log.debug('Gumbel estimated probs: %r', estimated_probs_gumbel)
        log.debug('Multinomial estimated probs: %r', estimated_probs_multinomial)
        log.debug('Sampling timing: %s', timing)
        time.sleep(0.1)  # to finish logging
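# Reference sketch of the Gumbel-max trick exercised by the test above
# (standalone helper, not part of the test class): adding i.i.d. Gumbel(0, 1)
# noise to the logits and taking the argmax is equivalent to sampling from the
# categorical distribution defined by softmax(logits).
def _gumbel_max_sample(logits):
    import torch

    # Gumbel(0, 1) noise via inverse CDF: -log(-log(U)), U ~ Uniform(0, 1)
    uniform = torch.rand_like(logits)
    gumbel_noise = -torch.log(-torch.log(uniform))
    return torch.argmax(logits + gumbel_noise, dim=-1)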
def test_tuple_sanity_check(self):
    num_spaces, num_actions = 3, 2
    simple_space = gym.spaces.Discrete(num_actions)
    spaces = [simple_space for _ in range(num_spaces)]
    tuple_space = gym.spaces.Tuple(spaces)

    self.assertEqual(calc_num_logits(tuple_space), num_spaces * num_actions)

    simple_logits = torch.zeros(1, num_actions)
    tuple_logits = torch.zeros(1, calc_num_logits(tuple_space))

    simple_distr = get_action_distribution(simple_space, simple_logits)
    tuple_distr = get_action_distribution(tuple_space, tuple_logits)

    tuple_entropy = tuple_distr.entropy()
    self.assertEqual(tuple_entropy, simple_distr.entropy() * num_spaces)

    simple_logprob = simple_distr.log_prob(torch.ones(1))
    tuple_logprob = tuple_distr.log_prob(torch.ones(1, num_spaces))
    self.assertEqual(tuple_logprob, simple_logprob * num_spaces)
def test_simple_distribution(self):
    simple_action_space = gym.spaces.Discrete(3)
    simple_num_logits = calc_num_logits(simple_action_space)
    self.assertEqual(simple_num_logits, simple_action_space.n)

    simple_logits = torch.rand(self.batch_size, simple_num_logits)
    simple_action_distribution = get_action_distribution(simple_action_space, simple_logits)

    simple_actions = simple_action_distribution.sample()
    self.assertEqual(list(simple_actions.shape), [self.batch_size])
    self.assertTrue(all(0 <= a < simple_action_space.n for a in simple_actions))
def __init__(self, cfg, core_out_size, action_space):
    super().__init__(cfg, action_space)

    assert not cfg.adaptive_stddev
    assert is_continuous_action_space(self.action_space), \
        'Non-adaptive stddev makes sense only for continuous action spaces'

    num_action_outputs = calc_num_logits(action_space)

    # calculate only action means using the policy neural network
    self.distribution_linear = nn.Linear(core_out_size, num_action_outputs // 2)

    # stddev is a single learned parameter
    initial_stddev = torch.empty([num_action_outputs // 2])
    initial_stddev.fill_(math.log(self.cfg.initial_stddev))
    self.learned_stddev = nn.Parameter(initial_stddev, requires_grad=True)
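# Hedged sketch of how the non-adaptive stddev parameterization above could be
# turned into a distribution (standalone, all sizes and the forward logic are
# hypothetical; not part of the original class): the linear layer predicts
# action means, and the learned parameter is interpreted as a log-stddev.
def _example_nonadaptive_stddev_distribution():
    import math
    import torch

    batch_size, core_out_size, num_action_means = 4, 512, 6
    distribution_linear = torch.nn.Linear(core_out_size, num_action_means)
    learned_stddev = torch.nn.Parameter(torch.full([num_action_means], math.log(1.0)))

    action_means = distribution_linear(torch.rand(batch_size, core_out_size))
    action_stddevs = learned_stddev.exp().expand_as(action_means)
    return torch.distributions.Normal(action_means, action_stddevs)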
def __init__(self, cfg, action_space):
    super().__init__()
    self.k: int = cfg.cpc_forward_steps
    self.time_subsample: int = cfg.cpc_time_subsample
    self.forward_subsample: int = cfg.cpc_forward_subsample
    self.hidden_size: int = cfg.hidden_size
    self.num_actions: int = calc_num_actions(action_space)
    if isinstance(action_space, gym.spaces.Discrete):
        self.action_sizes = [action_space.n]
    else:
        self.action_sizes = [space.n for space in action_space.spaces]

    self.rnn = nn.GRU(32 * self.num_actions, cfg.hidden_size)
    self.action_embed = nn.Embedding(calc_num_logits(action_space), 32)
    self.predictor = nn.Sequential(
        nn.Linear(2 * self.hidden_size, self.hidden_size),
        nn.ReLU(True),
        nn.Linear(self.hidden_size, self.hidden_size),
        nn.ReLU(True),
        nn.Linear(self.hidden_size, 1),
    )
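# Hedged sketch (standalone, not part of the original module) of how
# per-component action embeddings could be concatenated to form the GRU input
# expected above: each sub-action is looked up in a shared embedding table of
# size calc_num_logits(action_space), and the 32-dim embeddings are
# concatenated into a vector of size 32 * num_actions. The action space and
# offsets below are hypothetical.
def _example_cpc_action_embedding():
    import gym
    import torch

    action_space = gym.spaces.Tuple([gym.spaces.Discrete(3), gym.spaces.Discrete(5)])
    num_actions = len(action_space.spaces)                       # 2 sub-actions
    num_logits = sum(space.n for space in action_space.spaces)   # 3 + 5 = 8 embedding rows
    action_embed = torch.nn.Embedding(num_logits, 32)

    actions = torch.tensor([[1, 3]])    # one sampled action per sub-space
    offsets = torch.tensor([0, 3])      # cumulative sizes of preceding sub-spaces
    embedded = action_embed(actions + offsets)                   # [1, num_actions, 32]
    gru_input = embedded.view(1, num_actions * 32)               # matches nn.GRU(32 * num_actions, ...)
    assert gru_input.shape == (1, 64)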
def test_tuple_distribution(self):
    num_spaces = random.randint(1, 4)
    spaces = [gym.spaces.Discrete(random.randint(2, 5)) for _ in range(num_spaces)]
    action_space = gym.spaces.Tuple(spaces)

    num_logits = calc_num_logits(action_space)
    logits = torch.rand(self.batch_size, num_logits)

    self.assertEqual(num_logits, sum(s.n for s in action_space.spaces))

    action_distribution = get_action_distribution(action_space, logits)

    tuple_actions = action_distribution.sample()
    self.assertEqual(list(tuple_actions.shape), [self.batch_size, num_spaces])

    log_probs = action_distribution.log_prob(tuple_actions)
    self.assertEqual(list(log_probs.shape), [self.batch_size])

    entropy = action_distribution.entropy()
    self.assertEqual(list(entropy.shape), [self.batch_size])
def __init__(self, cfg, num_agents, obs_space, action_space):
    self.cfg = cfg
    self.num_agents = num_agents
    self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
    self.num_traj_buffers = self.calc_num_trajectory_buffers()

    num_actions = calc_num_actions(action_space)
    num_action_logits = calc_num_logits(action_space)

    hidden_size = get_hidden_size(self.cfg)

    log.debug('Allocating shared memory for trajectories')
    self.tensors = TensorDict()

    # policy inputs
    obs_dict = TensorDict()
    self.tensors['obs'] = obs_dict
    if isinstance(obs_space, spaces.Dict):
        for space_name, space in obs_space.spaces.items():
            obs_dict[space_name] = self.init_tensor(space.dtype, space.shape)
    else:
        raise Exception('Only Dict observations spaces are supported')

    # env outputs
    self.tensors['rewards'] = self.init_tensor(torch.float32, [1])
    self.tensors['dones'] = self.init_tensor(torch.bool, [1])

    # policy outputs
    policy_outputs = [
        ('actions', num_actions),
        ('action_logits', num_action_logits),
        ('log_prob_actions', 1),
        ('values', 1),
        ('policy_version', 1),
        ('rnn_states', hidden_size),
    ]

    policy_outputs = [PolicyOutput(*po) for po in policy_outputs]
    policy_outputs = sorted(policy_outputs, key=lambda policy_output: policy_output.name)

    for po in policy_outputs:
        self.tensors[po.name] = self.init_tensor(torch.float32, [po.size])

    ensure_memory_shared(self.tensors)

    # this is for performance optimization
    # indexing in numpy arrays is faster than in PyTorch tensors
    self.tensors_individual_transitions = self.tensor_dict_to_numpy(len(self.tensor_dimensions()))
    self.tensor_trajectories = self.tensor_dict_to_numpy(len(self.tensor_dimensions()) - 1)

    # create a shared tensor to indicate when the learner is done with the trajectory buffer and
    # it can be used to store the next trajectory
    traj_buffer_available_shape = [
        self.cfg.num_workers,
        self.cfg.worker_num_splits,
        self.envs_per_split,
        self.num_agents,
        self.num_traj_buffers,
    ]
    self.is_traj_tensor_available = torch.ones(traj_buffer_available_shape, dtype=torch.uint8)
    self.is_traj_tensor_available.share_memory_()
    self.is_traj_tensor_available = to_numpy(self.is_traj_tensor_available, 2)

    # copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is a
    # bottleneck on the policy worker. For optimization purposes we create additional tensors to hold
    # just concatenated policy outputs. Rollout workers parse the data and add it to the trajectory buffers
    # in a proper format
    policy_outputs_combined_size = sum(po.size for po in policy_outputs)
    policy_outputs_shape = [
        self.cfg.num_workers,
        self.cfg.worker_num_splits,
        self.envs_per_split,
        self.num_agents,
        policy_outputs_combined_size,
    ]
    self.policy_outputs = policy_outputs
    self.policy_output_tensors = torch.zeros(policy_outputs_shape, dtype=torch.float32)
    self.policy_output_tensors.share_memory_()
    self.policy_output_tensors = to_numpy(self.policy_output_tensors, 4)

    self.policy_versions = torch.zeros([self.cfg.num_policies], dtype=torch.int32)
    self.policy_versions.share_memory_()

    # a list of boolean flags to be shared among components that indicate that experience collection should be
    # temporarily stopped (e.g. due to too much experience accumulated on the learner)
    self.stop_experience_collection = torch.ones([self.cfg.num_policies], dtype=torch.bool)
    self.stop_experience_collection.share_memory_()
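# Hedged sketch (standalone, output names and sizes are hypothetical; not part
# of the original class) of how a rollout worker could split the concatenated
# policy-output vector stored in policy_output_tensors back into named outputs.
def _example_split_policy_outputs():
    import numpy as np

    policy_outputs = [('actions', 1), ('log_prob_actions', 1), ('values', 1)]
    combined = np.zeros(sum(size for _, size in policy_outputs), dtype=np.float32)

    parsed, offset = {}, 0
    for name, size in policy_outputs:
        parsed[name] = combined[offset:offset + size]
        offset += size
    assert set(parsed) == {'actions', 'log_prob_actions', 'values'}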