def _log_one_step(self, user_obs, doc_obs, slate, responses, reward,
                  is_terminal, sequence_example):
    """Adds one step of agent-environment interaction into SequenceExample.

    Args:
      user_obs: An array of floats representing user state observations
      doc_obs: A list of observations of the documents
      slate: An array of indices to doc_obs
      responses: A list of observations of responses for items in the slate
      reward: A float for the reward returned after this step
      is_terminal: A boolean for whether a terminal state has been reached
      sequence_example: A SequenceExample proto for logging current episode
    """

    def _add_float_feature(feature, values):
        feature.feature.add(float_list=tf.train.FloatList(value=values))

    def _add_int64_feature(feature, values):
        feature.feature.add(int64_list=tf.train.Int64List(value=values))

    if self._episode_writer is None:
        return
    fl = sequence_example.feature_lists.feature_list
    if isinstance(self._env.environment, environment.MultiUserEnvironment):
        for i, (single_user, single_slate, single_user_responses,
                single_reward) in enumerate(
                    zip(user_obs, slate, responses, reward)):
            user_space = list(
                self._env.observation_space.spaces['user'].spaces)[i]
            _add_float_feature(fl['user_%d' % i],
                               spaces.flatten(user_space, single_user))
            _add_int64_feature(fl['slate_%d' % i], single_slate)
            _add_float_feature(fl['reward_%d' % i], [single_reward])
            for j, response in enumerate(single_user_responses):
                resp_space = self._env.observation_space.spaces['response'][i][0]
                for k in response:
                    _add_float_feature(
                        fl['response_%d_%d_%s' % (i, j, k)],
                        spaces.flatten(resp_space, response))
    else:  # single-user environment
        _add_float_feature(
            fl['user'],
            spaces.flatten(self._env.observation_space.spaces['user'],
                           user_obs))
        _add_int64_feature(fl['slate'], slate)
        for i, response in enumerate(responses):
            resp_space = self._env.observation_space.spaces['response'][0]
            for k in response:
                _add_float_feature(fl['response_%d_%s' % (i, k)],
                                   spaces.flatten(resp_space, response))
        _add_float_feature(fl['reward'], [reward])
    for i, doc in enumerate(list(doc_obs.values())):
        doc_space = list(
            self._env.observation_space.spaces['doc'].spaces.values())[i]
        _add_float_feature(fl['doc_%d' % i], spaces.flatten(doc_space, doc))
    _add_int64_feature(fl['is_terminal'], [is_terminal])
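# A minimal standalone sketch of the logging pattern used in _log_one_step
# above: appending one step's flattened observation into a
# tf.train.SequenceExample feature list. The feature key 'user' and the
# example values are illustrative, not taken from a real episode.
import tensorflow as tf

seq = tf.train.SequenceExample()
fl = seq.feature_lists.feature_list
step_obs = [0.1, 0.2, 0.3]  # stand-in for spaces.flatten(user_space, user_obs)
fl['user'].feature.add(float_list=tf.train.FloatList(value=step_obs))
fl['is_terminal'].feature.add(int64_list=tf.train.Int64List(value=[0]))
assert len(fl['user'].feature) == 1  # one feature appended per step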
def test_fast_obs_2():
    env = Warehouse(3, 8, 3, 3, 2, 1, 5, 10, None, RewardType.GLOBAL,
                    fast_obs=False)
    env.reset()
    slow_obs_space = env.observation_space

    for _ in range(10):
        slow_obs = [env._make_obs(agent) for agent in env.agents]
        env._use_fast_obs()
        fast_obs = [env._make_obs(agent) for agent in env.agents]
        assert len(fast_obs) == 3
        assert len(slow_obs) == 3

        flattened_slow = [
            spaces.flatten(osp, obs)
            for osp, obs in zip(slow_obs_space, slow_obs)
        ]
        for i in range(len(fast_obs)):
            assert list(fast_obs[i]) == list(flattened_slow[i])

        env._use_slow_obs()
        env.step(env.action_space.sample())
def test_partial_hand():
    """Given a partial hand, ensure that we are able to map it to a partial
    deck rather than returning a whole array. This is where the data and
    numpy representations differ:

    * data - just return a list of the data points
    * numpy array - return a fixed numpy array of max size
    """
    deck_empty = PartialDeck(cards=[])
    assert deck_empty.to_data() == []
    assert deck_empty.to_data_for_numpy() == (
        [Card.get_null_data()] * PartialDeck.get_max_size()
    )

    expected_first_card = Card.from_str("A,S")
    deck = PartialDeck(cards=[expected_first_card] * 2)
    assert deck.to_data() == [expected_first_card.to_data()] * 2

    # Test for flattened numpy data
    numpy_data = deck.to_data_for_numpy()
    assert len(numpy_data) == PartialDeck.get_max_size()

    obs_space = deck.get_observation_space()
    flattened_numpy = spaces.flatten(obs_space, numpy_data)
    assert (flattened_numpy[0:2] == expected_first_card.to_data_for_numpy()).all()
    assert (flattened_numpy[2:4] == expected_first_card.to_data_for_numpy()).all()
    assert (flattened_numpy[4:6] == Card.get_null_data()).all()
def get_action(self, observation):
    """Return the action chosen by the agent.

    :param observation: state of the environment
    :type observation: gym.Space
    """
    if not self.greedy_exploration.be_greedy(
            self.step) and self.with_exploration:
        return self.action_space.sample()

    observation = torch.tensor(
        [flatten(self.observation_space, observation)],
        device=self.device).float()

    prediction = self.network.forward(observation)

    def return_values(values):
        if isinstance(values, list):
            return [return_values(v) for v in values]
        q_values = values * self.z
        q_values = torch.sum(q_values, dim=2)
        return torch.argmax(q_values).detach().item()

    return return_values(prediction)
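# A hedged sketch of the distributional value readout in get_action above:
# with a categorical (C51-style) head, the expected Q per action is the
# probability-weighted sum over the value support z. The shapes and the
# support range here are illustrative assumptions, not taken from the agent.
import torch

n_actions, n_atoms = 4, 51
probs = torch.softmax(torch.randn(1, n_actions, n_atoms), dim=2)  # fake network output
z = torch.linspace(-10.0, 10.0, n_atoms)                          # value support
q_values = torch.sum(probs * z, dim=2)                            # shape (1, n_actions)
greedy_action = torch.argmax(q_values).item()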
def step(self, action):
    self.step_count += 1
    self.reward = self.compute_reward(action)
    self.utility += self.reward
    self.move_vehicles()
    self.done = self.step_count >= self.task_num_per_episode
    self.s["snr"] = np.array([
        min(self.snr_ref * (abs(v["position"]) / 200)**-2, 1)
        for v in self.vehicles
    ] + [0] * (self.max_v - self.num_vehicles))
    self.s["freq_remain"] = np.array(
        [v["freq_remain"] for v in self.vehicles] +
        [0] * (self.max_v - self.num_vehicles))
    self.s["u_max"] = np.array([v["u_max"] for v in self.vehicles] +
                               [0] * (self.max_v - self.num_vehicles))
    task = self.tasks[self.step_count]
    self.s["serv_prob"] = np.array([
        self.compute_service_availability(task, v) for v in self.vehicles
    ] + [0] * (self.max_v - self.num_vehicles))
    self.s["task"] = np.array(task)
    return (spaces.flatten(self.observation_space, self.s), self.reward,
            self.done, {})
def step(self, action):
    action = np.clip(action, self.action_space.low, self.action_space.high)
    if self._flatten_actions:
        action = spaces.unflatten(self.env.action_space, action)
    obs, reward, done, info = self.env.step(action)
    if self._flatten_obs:
        obs = spaces.flatten(self.env.observation_space, obs)
    return obs, reward, done, info
def observation(self, observation):
    """Flattens an observation.

    Args:
        observation: The observation to flatten

    Returns:
        The flattened observation
    """
    return spaces.flatten(self.env.observation_space, observation)
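# For reference, gym ships this exact pattern as a built-in wrapper,
# gym.wrappers.FlattenObservation. A minimal usage sketch, assuming the
# older gym API used throughout these snippets (reset() returns only the
# observation):
import gym
from gym.wrappers import FlattenObservation

env = FlattenObservation(gym.make("CartPole-v1"))
obs = env.reset()
assert obs.ndim == 1  # observations now arrive as flat 1-D arrays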
def reset(self):
    """Resets the environment and returns the start state"""
    self.move_vehicles()
    self.step_count = 0
    self.next_state = None
    self.reward = None
    self.done = False
    for v in self.vehicles:
        v["freq"] = v["freq_init"]
        v["freq_remain"] = max(
            0, v["freq_init"] - sum([i[1] / i[2] for i in v["tasks"]]))
        v["position"] = v["position_init"]
        alpha_max = v["freq_remain"] / v["freq"]
        v["u_max"] = sum(
            [np.log(1 + alpha_max * i[2]) for i in v["tasks"]])
    with open(self.count_file, 'a') as f:
        f.write(
            str(self.utility) + ' ' +
            ' '.join([str(i) for i in self.low_count]) + ' ' +
            ' '.join([str(i) for i in self.low_delay]) + ' ' +
            ' '.join([str(i) for i in self.high_count]) + ' ' +
            ' '.join([str(i) for i in self.high_delay]) + ' ' + '\n')
    self.high_count = [0, 0, 0, 0]
    self.high_delay = [0, 0, 0, 0]
    self.low_count = [0, 0, 0, 0]
    self.low_delay = [0, 0, 0, 0]
    self.utility = 0
    task = self.tasks[0]
    self.s = {
        "snr":
            np.array([
                min(self.snr_ref * (abs(v["position"]) / 200)**-2, 1)
                for v in self.vehicles
            ] + [0] * (self.max_v - self.num_vehicles)),
        "freq_remain":
            np.array([v["freq_remain"] for v in self.vehicles] +
                     [0] * (self.max_v - self.num_vehicles)),
        "u_max":
            np.array([v["u_max"] for v in self.vehicles] +
                     [0] * (self.max_v - self.num_vehicles)),
        "serv_prob":
            np.array([
                self.compute_service_availability(task, v)
                for v in self.vehicles
            ] + [0] * (self.max_v - self.num_vehicles)),
        "task":
            np.array(task)
    }
    return spaces.flatten(self.observation_space, self.s)
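# A note on the layout of the flattened state returned above: gym flattens
# a Dict space key by key in the order of space.spaces (gym sorts plain-dict
# keys at construction), so each named block occupies a fixed slice of the
# vector. A small self-contained sketch with made-up fields:
import numpy as np
from gym import spaces

s_space = spaces.Dict({
    "snr": spaces.Box(0.0, 1.0, shape=(3,), dtype=np.float32),
    "task": spaces.Box(0.0, 100.0, shape=(2,), dtype=np.float32),
})
state = {"snr": np.zeros(3, dtype=np.float32),
         "task": np.ones(2, dtype=np.float32)}
flat = spaces.flatten(s_space, state)
assert flat.shape == (5,)  # "snr" slice [0:3], then "task" slice [3:5]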
def test_flatten_unflatten(self, observation_space, ordered_values):
    """Test the flatten and unflatten functions directly."""
    original = observation_space.sample()

    flattened = flatten(observation_space, original)
    unflattened = unflatten(observation_space, flattened)

    self._check_observations(original, flattened, unflattened, ordered_values)
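# A self-contained round-trip illustration of the property tested above
# (the space and values are made up): flatten yields a 1-D array of length
# spaces.flatdim(space), and unflatten restores the original structure.
import numpy as np
from gym import spaces

space = spaces.Dict({
    "mode": spaces.Discrete(3),  # flattened to a one-hot vector of length 3
    "position": spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
})
sample = space.sample()
flat = spaces.flatten(space, sample)
assert flat.shape == (spaces.flatdim(space),)  # 3 + 2 = 5
restored = spaces.unflatten(space, flat)
assert space.contains(restored)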
def make_traj_opt_align(
    traj_optimizer: TrajOptimizer,
    env: Env,
    true_reward: np.ndarray,
    test_rewards: np.ndarray,
    epsilon: float,
    parallel: Optional[Parallel] = None,
    n_test_states: Optional[int] = None,
) -> np.ndarray:
    state_shape = env.observation_space.sample().shape
    action_shape = env.action_space.sample().shape

    if n_test_states is not None:
        raw_states = np.array([
            flatten(env.observation_space, env.observation_space.sample())
            for _ in range(n_test_states)
        ])
    else:
        n_test_states = 1
        raw_states = np.array([env.state])
    assert raw_states.shape == (n_test_states, *state_shape)

    opt_plans = make_plans(
        true_reward.reshape(1, 4),
        raw_states,
        traj_optimizer,
        parallel,
        action_shape,
        memorize=True,
    )
    assert opt_plans.shape == (
        1,
        n_test_states,
        50,
        *action_shape,
    ), f"opt_plans shape={opt_plans.shape} is not expected {(1, n_test_states, 50, *action_shape)}"
    opt_values: np.ndarray = rollout_plans(env, opt_plans, raw_states)

    plans = make_plans(test_rewards, raw_states, traj_optimizer, parallel,
                       action_shape)
    assert plans.shape == (
        len(test_rewards),
        n_test_states,
        50,
        *action_shape,
    ), f"plans shape={plans.shape} is not expected {(len(test_rewards), n_test_states, 50, *action_shape)}"
    values = rollout_plans(env, plans, raw_states)
    assert values.shape == (
        len(test_rewards),
        n_test_states,
    ), f"Values shape={values.shape} is not expected {(len(test_rewards), n_test_states)}"

    alignment = cast(np.ndarray, np.all(opt_values - values < epsilon, axis=1))
    return alignment
def encode(self, observation):
    """Encodes user observation and document observations to an image."""
    # Converts the observation from the simulator to a numpy array to be
    # consumed by the DQN agent, which assumes the input is an "image".
    # The first row is the user's observation. The remaining rows are the
    # documents' observations, one row for each document.
    image = np.zeros(self._observation_shape + (self._stack_size,),
                     dtype=self._observation_dtype)
    image[0, :, 0] = self._pad_with_zeros(
        spaces.flatten(self._input_observation_space.spaces['user'],
                       observation['user']))
    doc_spaces_and_obs = zip(
        self._input_observation_space.spaces['doc'].spaces.values(),
        observation['doc'].values())
    image[1:, :, 0] = np.array([
        self._pad_with_zeros(spaces.flatten(doc_space, doc_ob))
        for doc_space, doc_ob in doc_spaces_and_obs
    ])
    return image
def forward(self, states):
    # Forward flattened state
    states_flattened = [
        spaces.flatten(self.env.observation_space, s) for s in states
    ]
    states_tensor = Tensor(states_flattened)

    # Move tensor to GPU if available
    if torch.cuda.is_available():
        states_tensor = states_tensor.cuda()

    return self.network(states_tensor)
def step(self, action):
    try:
        action = np.split(action, self.n_agents)
    except (AttributeError, IndexError):
        action = [action]
    observation, reward, done, info = super().step(action)
    observation = np.concatenate([
        spaces.flatten(s, o)
        for s, o in zip(self.observation_space, observation)
    ])
    reward = np.sum(reward)
    done = all(done)
    return observation, reward, done, info
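# A standalone sketch of the joint-observation trick above: per-agent
# observations are flattened against their own spaces and concatenated into
# one vector. The agent spaces here are invented for illustration.
import numpy as np
from gym import spaces

agent_spaces = (spaces.Box(0.0, 1.0, shape=(2,)),
                spaces.Box(0.0, 1.0, shape=(3,)))
agent_obs = tuple(s.sample() for s in agent_spaces)
joint = np.concatenate(
    [spaces.flatten(s, o) for s, o in zip(agent_spaces, agent_obs)])
assert joint.shape == (5,)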
def get_qvalues(self, state):
    # Flatten state
    state = tuple(spaces.flatten(self.env.observation_space, state))

    # Generate new entry in table for new states
    if state not in self.q_table:
        # By adding an entry in the Q-table, we make the agent's
        # behavior dependent on previous runs and hence previous seeds!
        # This is not expected in greedy mode.
        if self.is_greedy:
            return np.random.rand(self.env.action_space.n)
        self.q_table[state] = np.random.rand(self.env.action_space.n)

    return self.q_table[state]
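# Why get_qvalues wraps the flattened state in tuple(...): numpy arrays are
# not hashable, so a flat observation must become a tuple before it can key
# the Q-table dict. A minimal sketch (the space and table sizes are made up):
import numpy as np
from gym import spaces

space = spaces.Box(0.0, 1.0, shape=(2,), dtype=np.float32)
obs = space.sample()
key = tuple(spaces.flatten(space, obs))  # hashable
q_table = {key: np.zeros(4)}             # 4 actions, illustrative
assert key in q_table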
def learn(self, observation, action, reward, next_observation, done) -> None:
    """Learn from the given transition.

    :param observation: state of the environment
    :type observation: gym.Space
    :param action: action taken by the agent
    :type action: int, float, list
    :param reward: reward received
    :type reward: int, float
    :param next_observation: state of the environment after the action
    :type next_observation: gym.Space
    :param done: whether the episode is finished
    :type done: bool
    """
    self.memory.append([flatten(self.observation_space, observation)],
                       action, reward,
                       [flatten(self.observation_space, next_observation)],
                       done)

    self.step += 1

    if (self.step % self.step_train) == 0:
        self.train()
def __get_all_players_observation_with_action(
        self, state: FullState,
        decision: BaseDecision) -> List[np.ndarray]:
    """Returns per-player observations, where the next player's observation
    includes a multi-discrete action observation for checking
    next_accepted_action.
    """
    obs = [None] * self.n_agents
    next_player: int = self.next_player
    action_obs: Dict[str, np.ndarray] = decision.action_range_to_numpy()
    player_obs_space = self.state.to_player_data(next_player, for_numpy=True)
    # TODO: only set action for the next player
    obs[next_player] = spaces.flatten(
        # Note this includes the action observation
        self.observation_space,
        [action_obs, player_obs_space],
    )
    return obs
def transform(self, attr: AttributationLike) -> AttributationLike:
    obs_space = self.obs_space
    if self.obs_image_channel_dim is not None:
        attr = np.sum(attr, axis=self.obs_image_channel_dim)
        obs_space = remove_channel_dim_from_image_space(obs_space)
    # Flatten against the (possibly channel-reduced) obs_space, not
    # self.obs_space, so flatten and the final unflatten agree on the space.
    attr = flatten(obs_space, attr)
    if self.mode == AttributationNormalizationMode.ALL:
        scaling_factor = self._calculate_safe_scaling_factor(np.abs(attr))
    elif self.mode == AttributationNormalizationMode.POSITIVE:
        attr = (attr > 0) * attr
        scaling_factor = self._calculate_safe_scaling_factor(attr)
    elif self.mode == AttributationNormalizationMode.NEGATIVE:
        attr = (attr < 0) * attr
        scaling_factor = -self._calculate_safe_scaling_factor(np.abs(attr))
    elif self.mode == AttributationNormalizationMode.ABSOLUTE_VALUE:
        attr = np.abs(attr)
        scaling_factor = self._calculate_safe_scaling_factor(attr)
    else:
        raise EnumValueNotFound(self.mode, AttributationNormalizationMode)
    attr_norm = self._scale(attr, scaling_factor)
    return unflatten(obs_space, attr_norm)
def flatten_observation(space, x=None):
    # Note that it does not preserve dtype
    def _flatten_bounds(space, bounds_type):
        if isinstance(space, spaces.Box):
            if bounds_type == 'high':
                return np.asarray(space.high).flatten()
            else:
                return np.asarray(space.low).flatten()
        elif isinstance(space, spaces.Discrete):
            if bounds_type == 'high':
                return np.ones(space.n)
            else:
                return np.zeros(space.n)
        elif isinstance(space, spaces.Tuple):
            return np.concatenate(
                [_flatten_bounds(s, bounds_type) for s in space.spaces])
        elif isinstance(space, spaces.Dict):
            return np.concatenate([
                _flatten_bounds(s, bounds_type)
                for s in space.spaces.values()
            ])
        elif isinstance(space, spaces.MultiBinary):
            if bounds_type == 'high':
                return np.ones(space.n)
            else:
                return np.zeros(space.n)
        elif isinstance(space, spaces.MultiDiscrete):
            if bounds_type == 'high':
                return np.ones(reduce(__mul__, space.nvec))
            else:
                return np.zeros(reduce(__mul__, space.nvec))
        else:
            raise NotImplementedError

    if x is None:
        return spaces.Box(low=_flatten_bounds(space, 'low'),
                          high=_flatten_bounds(space, 'high'),
                          dtype=np.float64)
    else:
        return spaces.flatten(space, x)
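# Note: recent gym versions provide spaces.flatten_space, which builds the
# flattened Box directly and covers most of the hand-rolled bounds logic
# above. A short sketch of the equivalent call (space is made up):
from gym import spaces

space = spaces.Tuple((spaces.Box(-1.0, 1.0, shape=(2,)), spaces.Discrete(3)))
flat_box = spaces.flatten_space(space)
assert flat_box.shape == (spaces.flatdim(space),)  # (5,)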
def observation(self, observation):
    return flatten(self.env.observation_space['state'], observation)
def test_call_network(self):
    for ob, ac in self.list_work:
        self.network(observation_space=ob, action_space=ac)(
            torch.tensor([flatten(ob, ob.sample())]).float())
def observation(self, observation):
    return spaces.flatten(self.env.observation_space, observation)
def observation(self, observation):
    return flatten(self.observation_space, observation)
def inspect_memory(self, top_n=10, max_col=80):
    # Functions to encode/decode states
    encode_state = lambda s: tuple(
        spaces.flatten(self.env.observation_space, s))
    decode_state = lambda s: spaces.unflatten(self.env.observation_space, s)

    # Function to create barchart from counter
    def count_barchart(counter, ax, xlabel=None, normalize=True):
        # Sort and extract keys, counts
        sorted_tuples = counter.most_common()
        sorted_keys = [key for key, count in sorted_tuples]
        sorted_counts = [count for key, count in sorted_tuples]

        # Normalize counts
        if normalize:
            total = sum(counter.values())
            sorted_counts = [c / total for c in sorted_counts]

        # Plotting
        x_indexes = range(len(sorted_counts))
        ax.bar(x_indexes, sorted_counts)
        ax.set_xticks(x_indexes)
        ax.set_xticklabels(sorted_keys)
        ax.set_ylabel('proportion')
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        ax.set_title('Replay Memory')

    # Function to print top states from counter
    def top_states(counter):
        for i, (state, count) in enumerate(counter.most_common(top_n), 1):
            state_label = str(decode_state(state))
            state_label = state_label.replace('\n', ' ')
            state_label = state_label[:max_col] + '..' if len(
                state_label) > max_col else state_label
            print('{:>2}) Count: {} state: {}'.format(i, count, state_label))

    # Count statistics
    counters = defaultdict(Counter)
    for state, action, reward, next_state, done in self.memory:
        counters['state'][encode_state(state)] += 1
        counters['action'][action] += 1
        counters['reward'][reward] += 1
        counters['next_state'][encode_state(next_state)] += 1
        counters['done'][done] += 1

    # Plot reward/action
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
    count_barchart(counters['reward'], ax1, 'rewards')
    count_barchart(counters['action'], ax2, 'actions')
    plt.show()

    # Print top states
    print('Top state:')
    top_states(counters['state'])
    print()
    print('Top next_state:')
    top_states(counters['next_state'])
    print()

    # Done signal
    print('Proportion of done: {:.2f}%'.format(
        100 * counters['done'][True] / sum(counters['done'].values())))
def _extract_state(self, observation):
    user_space = self._observation_space.spaces["user"]
    return spaces.flatten(user_space, observation["user"])
def _get_observations(self):
    # Rotate played_cards so the first player with a non-empty play comes first
    first = min(
        [i for i in range(self.players) if not self.played_cards[i][0] == 0],
        default=0)
    obs = (self.hands[0],
           self.played_cards[first:] + self.played_cards[:first],
           self.scores)
    return spaces.flatten(self.unflattened_observation_space, obs)
def observation(self, observation):
    return spaces.flatten(self.env.observation_space,
                          np.moveaxis(observation, -1, 0)) / 255.
def observation(self, observation):
    return spaces.flatten(self.env.observation_space, observation) / 255.
def reset(self, **kwargs):
    if self._flatten_obs:
        return spaces.flatten(self.env.observation_space,
                              self.env.reset(**kwargs))
    else:
        return self.env.reset(**kwargs)
def to_flattened_numpy_data(self, player_id: int):
    return spaces.flatten(self.get_observation_space(),
                          self.to_data_for_numpy())
def observation(self, observation):
    return tuple([
        spaces.flatten(obs_space, obs)
        for obs_space, obs in zip(self.env.observation_space, observation)
    ])