def _step(self, action):
    # update states of rnn
    self.frame_count += 1
    self.rnn_states = rnn_next_state(self.rnn, self.z, action, self.rnn_states)

    # actual action in wrapped env:
    threshold = 0.3333
    full_action = [0] * 43
    if action < -threshold:
        full_action[11] = 1
    if action > threshold:
        full_action[10] = 1

    obs, reward, done, _ = super(DoomTakeCoverMDNRNN, self)._step(full_action)
    small_obs = self._process_frame(obs)
    self.current_obs = small_obs
    self.z = self._encode(small_obs)

    self.restart = 1 if done else 0

    if self.with_obs:
        return [self._current_state(), self.current_obs], reward, done, {}
    return self._current_state(), reward, done, {}

def get_action(self, z):
    # TODO: h is 256+32 (the 32 comes first), so we could have 288*2*18 params,
    # or 288*2*environment.action_space.n (6 for Pong).
    h = rnn_output(self.state, z, EXP_MODE)

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        raise Exception("Not ported to atari")
    else:
        # could probabilistically sample from softmax, but greedy
        action = np.argmax(np.matmul(h, self.weight) + self.bias)

    action_one_hot = np.zeros(self.num_actions)
    action_one_hot[action] = 1
    self.state = rnn_next_state(self.rnn, z, action_one_hot, self.state)
    return action

def get_action(self, z, epsilon=0.0):
    h = rnn_output(self.state, z, EXP_MODE)

    if np.random.rand() < epsilon:
        # epsilon-greedy exploration: pick a random discrete action
        action = np.random.randint(0, self.na)
    else:
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.maximum(np.dot(h, self.weight_hidden) + self.bias_hidden, 0)
            action = np.argmax(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = np.argmax(np.dot(h, self.weight) + self.bias)

    oh_action = np.zeros(self.na)
    oh_action[action] = 1
    # TODO: check this function
    self.state = rnn_next_state(self.rnn, z, oh_action, self.state)
    return action

def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = sigmoid(np.dot(h, self.weight) + self.bias)

    # collapse the controller output to a single discrete action in [0, 62)
    action_mean = np.mean(action) * 62
    action = np.array([int(action_mean)])

    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action

def get_action(self, feature):
    h = rnn_output(self.rnn_state, feature, EXP_MODE)
    action, v_preds = self.net.policy.get_action(h, verbose=False)
    action_one_hot = get_one_hot(np.array(action), ACTION_SPACE)
    self.rnn_state = rnn_next_state(self.net.rnn, feature, action_one_hot, self.rnn_state)
    return h, action, v_preds

def _step(self, action):
    obs, reward, done, _ = super(CarRacingMDNRNN, self)._step(action)
    z, _, _ = self.encode_obs(obs)
    h = tf.squeeze(self.rnn_states[0])
    z_h = tf.concat([z, h], axis=-1)
    if action is not None:  # don't compute state on reset
        self.rnn_states = rnn_next_state(self.rnn, z, action, self.rnn_states)
    return z_h, reward, done, {}

def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    action[1] = (action[1] + 1.0) / 2.0  # rescale gas from [-1, 1] to [0, 1]
    action[2] = clip(action[2])          # brake
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action

def encode_obs(self, obs, prev_state, action):
    # convert raw obs to z, mu, logvar
    result = np.copy(obs).astype(float) / 255.0  # np.float was removed from NumPy
    result = result.reshape(1, 64, 64, 3)
    mu, logvar = self.vae.encode_mu_logvar(result)
    mu = mu[0]
    logvar = logvar[0]
    s = logvar.shape
    # reparameterization trick: z = mu + sigma * eps
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*s)
    h = rnn_output(prev_state, z, 4)  # mode 4 is MODE_ZH (concat z and h) in World Models
    next_state = rnn_next_state(self.rnn, z, np.array(action), prev_state)
    return np.concatenate([h, z]), next_state

def encode_obs(obs, prev_state, action):
    # convert raw obs to z, mu, logvar
    result = np.copy(obs).astype(float) / 255.0  # np.float was removed from NumPy
    result = result.reshape(1, 64, 64, 3)
    mu, logvar = vae.encode_mu_logvar(result)
    mu = mu[0]
    logvar = logvar[0]
    s = logvar.shape
    # reparameterization trick: z = mu + sigma * eps
    z = mu + np.exp(logvar / 2.0) * np.random.randn(*s)
    next_state = rnn_next_state(rnn, z, action, prev_state)
    h = rnn_output(prev_state, z, 4)  # was `state`, an undefined name
    return h, next_state

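The two `encode_obs` variants above return the controller input (`[h, z]` or `h`) together with the advanced RNN state. A minimal usage sketch, assuming module-level `vae` and `rnn` objects as in the snippet, the companion `rnn_init_state` helper from the same World Models RNN module (an assumption here), and `env` being any Gym-style environment producing 64x64x3 RGB frames:

import numpy as np

# Hypothetical wiring: reset the environment, start from a fresh RNN state,
# and encode the first frame with a zero action (nothing has been acted on yet).
state = rnn_init_state(rnn)                      # assumed helper from the RNN module
obs = env.reset()
h, state = encode_obs(obs, state, np.zeros(3))   # 3-dim action, e.g. CarRacing
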
def get_action(self, z):
    a = random_linear_sample(-1, 1)
    b = random_linear_sample(0, 1)
    c = random_linear_sample(0, 1)
    actions = dp(a, b, c)
    action, self.mct = mcts.mcts(z, self.env, actions, old_tree=self.mct,
                                 tree_depth=6, simulate_depth=200)
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action

def _step(self, action):
    obs, reward, done, _ = super(CarRacingMDNRNN, self)._step(action)
    z = tf.squeeze(self.encode_obs(obs))
    h = tf.squeeze(self.rnn_states[0])
    c = tf.squeeze(self.rnn_states[1])
    if self.rnn.args.state_space == 2:
        z_state = tf.concat([z, c, h], axis=-1)
    else:
        z_state = tf.concat([z, h], axis=-1)
    if action is not None:  # don't compute state on reset
        self.rnn_states = rnn_next_state(self.rnn, z, action, self.rnn_states)
    if self.with_obs:
        return [z_state, obs], reward, done, {}
    return z_state, reward, done, {}

def step(self, action):
    # Advance RNN
    # NOTE: Uses z from the previous frame, which makes sense as this is the frame
    # where the action was input. The original code advanced the RNN with the z
    # generated from the next frame (after this action was input), but only in the
    # CarRacing example, not in the Doom example.
    if action is not None:  # no action is given on reset
        self.rnn_state = rnn_next_state(self.rnn, self.z, action, self.rnn_state)

    # Advance simulation
    obs, reward, done, info = self.env.step(action)

    # Encode pixel observation
    self.z = self.encode_image_to_z(obs)
    obs = self.modify_observation(obs)
    return obs, reward, done, info

def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action

def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)

    if self.arglist.inference:
        # infer each opponent's intent from its action trajectory
        oppo_intents = []
        for i in range(self.arglist.agent_num - 1):
            act_traj = self.act_traj[i]
            intent = self.oppo_model.get_inference(act_traj)
            oppo_intents.append(intent)
        # oppo_intents has shape (agent_num - 1, action_space); flatten it
        oppo_intents = np.reshape(
            oppo_intents, ((self.arglist.agent_num - 1) * self.arglist.action_space))
        controller_input = np.concatenate((h, oppo_intents))
    else:
        controller_input = h

    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        x = np.tanh(np.dot(controller_input, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(x, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(controller_input, self.weight) + self.bias)

    for i in range(self.action_space):
        action[i] = clip(action[i])

    self.state = rnn_next_state(self.rnn, z, action, self.act_traj, self.state)

    # epsilon exploration: with probability 0.2 replace the action with a
    # single random value broadcast across all dimensions
    if np.random.uniform(0, 1) < 0.2:
        action = [np.random.uniform(-3, 3)] * len(action)
    return action

def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)
    # advance the MDN-RNN hidden state (not its weights) with the chosen action
    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action

def get_action(self, z, arglist):
    h = rnn_output(self.state, z, EXP_MODE)
    if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
        h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
        action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
        action = np.tanh(np.dot(h, self.weight) + self.bias)

    if arglist.competitive:
        obs, rewards, done, win = self.env.step([action[0], 'script'])
    else:
        obs, rewards, done, win = self.env.step(action)

    # penalize for turning too frequently
    extra_reward = 0.0
    if arglist.competitive:
        if arglist.train_mode and penalize_turning:
            extra_reward -= np.abs(action[0]) / 10.0
            rewards[0] += extra_reward
        reward = rewards[0]
    else:
        reward = np.sum(rewards)  # was only set when penalizing; define it unconditionally
        if arglist.train_mode and penalize_turning:
            extra_reward -= np.abs(action[0]) / 10.0
            reward += extra_reward

    self.state = rnn_next_state(self.rnn, z, action, self.state)
    return action
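
Taken together, every variant above implements the same World Models control loop: encode the current frame to a latent z with the VAE, feed the concatenated [z, h] through a small linear (or one-hidden-layer) controller, and advance the MDN-RNN hidden state with the chosen action. A minimal rollout sketch of that loop, assuming a hypothetical `controller` object with `encode_obs(obs) -> z` and `get_action(z) -> action` methods (a simplified interface, not any one repo's exact API) and a Gym-style `env`:

def rollout(controller, env, max_steps=1000):
    """One episode of the encode -> act -> advance-RNN loop (a sketch under the
    assumptions above, not a specific repo's implementation)."""
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        z = controller.encode_obs(obs)     # VAE: pixels -> latent z
        action = controller.get_action(z)  # also advances the MDN-RNN state internally
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward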