# Assumed imports for these snippets (rllab-style): numpy as np,
# rllab.misc.special as special, and Step from rllab.envs.base.
def step(self, action):
    # Alternative locomotion-style step (disabled): forward reward from
    # torso velocity, minus control and contact costs, plus a survival bonus.
    # self.forward_dynamics(action)
    # comvel = self.get_body_comvel("torso")
    # forward_reward = self.goal_direction * comvel[0]
    # lb, ub = self.action_bounds
    # scaling = (ub - lb) * 0.5
    # ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
    # contact_cost = 0.5 * 1e-3 * np.sum(
    #     np.square(np.clip(self.model.data.cfrc_ext, -1, 1)))
    # survive_reward = 0.05
    # reward = forward_reward - ctrl_cost - contact_cost + survive_reward
    # state = self._state
    # notdone = np.isfinite(state).all() \
    #     and state[2] >= 0.2 and state[2] <= 1.0
    # done = not notdone
    # ob = self.get_current_obs()
    # return Step(ob, float(reward), done)

    # Alternative multi-armed-bandit step (disabled): Bernoulli reward
    # drawn from the selected arm's mean.
    # obs = self.get_current_obs()
    # selected_arm_mean = self.arm_means[action]
    # reward = float(np.random.random() < selected_arm_mean)
    # self.ts += 1
    # done = self.ts >= self.max_path_length
    # state = np.zeros(2)
    # state[0] = reward
    # state[1] = 1
    # return Step(state, reward, done)

    # Tabular MDP step: sample the next state from the transition
    # distribution Ps[state, action], and draw a Gaussian reward with
    # mean Rs[state, action] and precision tau.
    ps = self.Ps[self.state, action]
    next_state = special.weighted_sample(ps, np.arange(self.n_states))
    reward_mean = self.Rs[self.state, action]
    reward = reward_mean + np.random.normal() * 1 / np.sqrt(self.tau)
    self.ts += 1
    self.state = next_state
    done = self.ts >= self.max_path_length
    # Observation layout: one-hot of the new state, then the reward,
    # then the done flag.
    state = np.zeros(2 + self.n_states)
    state[self.state] = 1
    state[self.n_states] = reward
    state[self.n_states + 1] = done
    return Step(state, reward, done)
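# The step() above relies on special.weighted_sample to draw the next
# state. A minimal sketch of that helper, assuming it draws one item with
# probability proportional to its weight (the real rllab helper may use
# an inverse-CDF loop; this sketch uses numpy's categorical sampler):
import numpy as np

def weighted_sample_sketch(weights, items):
    weights = np.asarray(weights, dtype=float)
    probs = weights / weights.sum()              # normalize to a distribution
    idx = np.random.choice(len(probs), p=probs)  # categorical draw
    return list(items)[idx]

# Example: sampling a next state from one transition row, as in step().
ps = np.array([0.1, 0.7, 0.2])
next_state = weighted_sample_sketch(ps, np.arange(3))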
def get_action(self, observation):
    if self.state_include_action:
        if self.prev_action is None:
            prev_action = np.zeros((self.action_space.flat_dim,))
        else:
            prev_action = self.action_space.flatten(self.prev_action)
        all_input = np.concatenate([
            self.observation_space.flatten(observation),
            prev_action
        ])
    else:
        all_input = self.observation_space.flatten(observation)
        # Placeholder only; should not be used when actions are excluded.
        prev_action = np.nan
    # Single recurrent step: feed the input and previous hidden state,
    # then unbatch the resulting action probabilities and hidden state.
    probs, hidden_vec = [
        x[0] for x in self.f_step_prob([all_input], [self.prev_hidden])
    ]
    action = special.weighted_sample(probs, range(self.action_space.n))
    self.prev_action = action
    self.prev_hidden = hidden_vec
    agent_info = dict(prob=probs)
    if self.state_include_action:
        agent_info["prev_action"] = prev_action
    return action, agent_info
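# Standalone illustration (hypothetical helper, not part of the policy
# above) of the input layout get_action builds when state_include_action
# is True: the flattened observation concatenated with a one-hot encoding
# of the previous discrete action, which is all zeros on the first step.
import numpy as np

def build_recurrent_input(obs_flat, prev_action, n_actions):
    if prev_action is None:
        prev_one_hot = np.zeros(n_actions)             # first step of an episode
    else:
        prev_one_hot = np.eye(n_actions)[prev_action]  # one-hot flatten
    return np.concatenate([obs_flat, prev_one_hot])

obs = np.array([0.3, -1.2])
print(build_recurrent_input(obs, None, 3))  # [ 0.3 -1.2  0.   0.   0. ]
print(build_recurrent_input(obs, 2, 3))     # [ 0.3 -1.2  0.   0.   1. ]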
def weighted_sample(self, weights):
    return special.weighted_sample(weights, self._items_arr)
def weighted_sample(self, weights):
    # range() replaces the Python 2-only xrange() here.
    return special.weighted_sample(weights, range(self.n))
def weighted_sample(self, weights):
    return special.weighted_sample(weights, range(self.n))
def weighted_sample(self, weights):
    # Optional interactive breakpoint for debugging.
    if config.TF_NN_SETTRACE:
        ipdb.set_trace()
    return special.weighted_sample(weights, range(self.n))
def weighted_sample_normalized(self, weights):
    # Sample one index per component from its slice of the weight vector,
    # then shift and scale each index by the component's base value.
    return [
        (special.weighted_sample(weights[s:s + n], range(n)) - b) / b
        for s, n, b in zip(self._slice, self._comp_dim, self._comp_base)
    ]
def weighted_sample(self, weights):
    # Sample one raw index per component; the base values are unused here.
    return [
        special.weighted_sample(weights[s:s + n], range(n))
        for s, n, _ in zip(self._slice, self._comp_dim, self._comp_base)
    ]
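# Hypothetical usage of the product-space samplers above. Assume a space
# with component sizes (3, 2), so self._slice = (0, 3) and self._comp_dim
# = (3, 2): weights[0:3] parameterizes the first component and
# weights[3:5] the second, with one index drawn independently per
# component.
import numpy as np

comp_dim = (3, 2)
slices = (0, 3)
weights = np.array([0.2, 0.5, 0.3, 0.9, 0.1])

draws = [
    np.random.choice(n, p=weights[s:s + n] / weights[s:s + n].sum())
    for s, n in zip(slices, comp_dim)
]
print(draws)  # e.g. [1, 0]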