Code example #1
    def _step(self, action):

        # update states of rnn
        self.frame_count += 1

        self.rnn_states = rnn_next_state(self.rnn, self.z, action,
                                         self.rnn_states)

        # actual action in wrapped env:

        threshold = 0.3333
        full_action = [0] * 43

        if action < -threshold:
            full_action[11] = 1

        if action > threshold:
            full_action[10] = 1

        obs, reward, done, _ = super(DoomTakeCoverMDNRNN,
                                     self)._step(full_action)
        small_obs = self._process_frame(obs)
        self.current_obs = small_obs
        self.z = self._encode(small_obs)

        if done:
            self.restart = 1
        else:
            self.restart = 0

        if self.with_obs:
            return [self._current_state(), self.current_obs], reward, done, {}
        else:
            return self._current_state(), reward, done, {}
Code example #2
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        # print(len(h), " h:", h) #TODO: 256+32 (the 32 comes first)
        # So we could have 288*2*18 params, or 288*2*environment.action_space.n (6 for Pong)
        '''
    action = np.dot(h, self.weight) + self.bias
    action[0] = np.tanh(action[0])
    action[1] = sigmoid(action[1])
    action[2] = clip(np.tanh(action[2]))
    '''
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            raise Exception("Not ported to atari")
            # h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            # action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            # could sample probabilistically from the softmax; here we act greedily
            action = np.argmax(np.matmul(h, self.weight) + self.bias)

        # action[1] = (action[1]+1.0) / 2.0
        # action[2] = clip(action[2])
        # print("Action:", action)
        action_one_hot = np.zeros(self.num_actions)
        action_one_hot[action] = 1
        # print("Action hot:", action_one_hot)

        self.state = rnn_next_state(self.rnn, z, action_one_hot, self.state)

        return action
Code example #3
    def get_action(self, z, epsilon=0.0):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
    action = np.dot(h, self.weight) + self.bias
    action[0] = np.tanh(action[0])
    action[1] = sigmoid(action[1])
    action[2] = clip(np.tanh(action[2]))
    '''
        if np.random.rand() < epsilon:
            action = np.random.randint(0, self.na)
        else:
            if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
                h = np.maximum(
                    np.dot(h, self.weight_hidden) + self.bias_hidden, 0)
                action = np.argmax(
                    np.dot(h, self.weight_output) + self.bias_output)
            else:
                action = np.argmax(np.dot(h, self.weight) + self.bias)

        oh_action = np.zeros(self.na)
        oh_action[action] = 1

        # action[1] = (action[1]+1.0) / 2.0
        # action[2] = clip(action[2])

        # TODO: check this function
        self.state = rnn_next_state(self.rnn, z, oh_action, self.state)

        return action
Code example #4
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
        action = np.dot(h, self.weight) + self.bias
        action[0] = np.tanh(action[0])
        action[1] = sigmoid(action[1])
        action[2] = clip(np.tanh(action[2]))
        '''
        # print(h)
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = sigmoid(np.dot(h, self.weight) + self.bias)
        # action = sigmoid(h)
        # print(np.mean(action))
        # print(action)

        action_mean = np.mean(action)
        action_mean = action_mean * 62
        action = np.array([int(action_mean)])

        # action[1] = (action[1] + 1.0) / 2.0
        # action[2] = clip(action[2])

        # action = np.array([int(action[2])])

        print("action", action)

        self.state = rnn_next_state(self.rnn, z, action, self.state)

        return action
Code example #5
    def get_action(self, feature):
        h = rnn_output(self.rnn_state, feature, EXP_MODE)
        action, v_preds = self.net.policy.get_action(h, verbose=False)

        action_one_hot = get_one_hot(np.array(action), ACTION_SPACE)

        self.rnn_state = rnn_next_state(self.net.rnn, feature, action_one_hot,
                                        self.rnn_state)
        return h, action, v_preds
Code example #6
  def _step(self, action):
    obs, reward, done, _ = super(CarRacingMDNRNN, self)._step(action)
    z, _, _ = self.encode_obs(obs)
    h = tf.squeeze(self.rnn_states[0])
    z_h = tf.concat([z, h], axis=-1)

    if action is not None: # don't compute state on reset
        self.rnn_states = rnn_next_state(self.rnn, z, action, self.rnn_states)
    return z_h, reward, done, {}
Code example #7
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)
        if EXP_MODE == MODE_Z_HIDDEN:
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(h, self.weight) + self.bias)
        action[1] = (action[1] + 1.0) / 2.0
        action[2] = clip(action[2])
        self.state = rnn_next_state(self.rnn, z, action, self.state)
        return action
Code example #8
File: environment.py  Project: zhl001/worldmodels_dqn
 def encode_obs(self, obs, prev_state, action):
     # convert raw obs to z, mu, logvar
     result = np.copy(obs).astype(float)/255.0
     result = result.reshape(1, 64, 64, 3)
     mu, logvar = self.vae.encode_mu_logvar(result)
     mu = mu[0]
     logvar = logvar[0]
     s = logvar.shape
     z = mu + np.exp(logvar/2.0) * np.random.randn(*s)
     h = rnn_output(prev_state, z, 4)
     next_state = rnn_next_state(self.rnn, z, np.array(action), prev_state)
     return np.concatenate([h, z]), next_state
Code example #9
 def encode_obs(obs, prev_state, action):
   # convert raw obs to z, mu, logvar
   result = np.copy(obs).astype(float)/255.0
   result = result.reshape(1, 64, 64, 3)
   mu, logvar = vae.encode_mu_logvar(result)
   mu = mu[0]
   logvar = logvar[0]
   s = logvar.shape
   z = mu + np.exp(logvar/2.0) * np.random.randn(*s)
   next_state = rnn_next_state(rnn, z, action, prev_state)
   h = rnn_output(prev_state, z, 4)
   return h, next_state
Code example #10
File: interface.py  Project: Centauria/WorldMCTS
    def get_action(self, z):
        a = random_linear_sample(-1, 1)
        b = random_linear_sample(0, 1)
        c = random_linear_sample(0, 1)
        actions = dp(a, b, c)
        action, self.mct = mcts.mcts(z,
                                     self.env,
                                     actions,
                                     old_tree=self.mct,
                                     tree_depth=6,
                                     simulate_depth=200)

        self.state = rnn_next_state(self.rnn, z, action, self.state)

        return action
Code example #11
 def _step(self, action):
   obs, reward, done, _ = super(CarRacingMDNRNN, self)._step(action)
   z = tf.squeeze(self.encode_obs(obs))
   h = tf.squeeze(self.rnn_states[0])
   c = tf.squeeze(self.rnn_states[1])
   if self.rnn.args.state_space == 2:
       z_state = tf.concat([z, c, h], axis=-1)
   else:
       z_state = tf.concat([z, h], axis=-1)
   if action is not None: # don't compute state on reset
       self.rnn_states = rnn_next_state(self.rnn, z, action, self.rnn_states)
   if self.with_obs:
       return [z_state, obs], reward, done, {}
   else:
       return z_state, reward, done, {}
Code example #12
    def step(self, action):
        # Advance RNN
        # NOTE: Uses z from previous frame, which makes sense as this is the frame where the action was input.
        #       The original code advanced the RNN with the z generated from the next frame (after this action was
        #       input), but only in the CarRacing example, not in the Doom example.
        if action is not None:  # no action is given on reset
            self.rnn_state = rnn_next_state(self.rnn, self.z, action, self.rnn_state)

        # Advance simulation
        obs, reward, done, info = self.env.step(action)

        # Encode pixel observation
        self.z = self.encode_image_to_z(obs)

        obs = self.modify_observation(obs)

        return obs, reward, done, info
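
The NOTE in code example #12 points at an ordering choice that differs between implementations: the RNN can be advanced with the z of the frame on which the action was chosen, or with the z encoded from the frame returned after taking the action. Below is a minimal sketch contrasting the two orderings under the rnn_next_state(rnn, z, action, state) signature used throughout; the attributes self.rnn, self.rnn_state, self.z, self.env and self.encode_image_to_z are assumed to be set up as in code example #12, and the method names are hypothetical.

    # Sketch only: the two RNN-advance orderings described in the NOTE of
    # code example #12. Assumes self.rnn, self.rnn_state, self.z, self.env
    # and self.encode_image_to_z exist as in that example.

    def step_with_previous_z(self, action):
        # Advance the MDN-RNN with the z of the frame the action was chosen on
        # (the ordering used in code examples #1 and #12).
        if action is not None:  # no action is given on reset
            self.rnn_state = rnn_next_state(self.rnn, self.z, action, self.rnn_state)
        obs, reward, done, info = self.env.step(action)
        self.z = self.encode_image_to_z(obs)
        return obs, reward, done, info

    def step_with_next_z(self, action):
        # Advance the MDN-RNN with the z encoded from the frame returned after
        # the action, as in the CarRacing-style examples (#6 and #11).
        obs, reward, done, info = self.env.step(action)
        self.z = self.encode_image_to_z(obs)
        if action is not None:
            self.rnn_state = rnn_next_state(self.rnn, self.z, action, self.rnn_state)
        return obs, reward, done, info
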
Code example #13
File: model.py  Project: hdilab/pushblock
  def get_action(self, z):
    h = rnn_output(self.state, z, EXP_MODE)

    '''
    action = np.dot(h, self.weight) + self.bias
    action[0] = np.tanh(action[0])
    action[1] = sigmoid(action[1])
    action[2] = clip(np.tanh(action[2]))
    '''
    if EXP_MODE == MODE_Z_HIDDEN: # one hidden layer
      h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
      action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
    else:
      action = np.tanh(np.dot(h, self.weight) + self.bias)

    self.state = rnn_next_state(self.rnn, z, action, self.state)

    return action
Code example #14
File: model.py  Project: hcch0912/ma_world_model
    def get_action(self, z):
        h = rnn_output(self.state, z, EXP_MODE)

        if self.arglist.inference:
            oppo_intents = []
            for i in range(self.arglist.agent_num - 1):
                act_traj = self.act_traj[i]
                intent = self.oppo_model.get_inference(act_traj)
                oppo_intents.append(intent)
            oppo_intents = np.reshape(
                oppo_intents,
                ((self.arglist.agent_num - 1) * self.arglist.action_space))
            '''
      action = np.dot(h, self.weight) + self.bias
      action[0] = np.tanh(action[0])
      action[1] = sigmoid(action[1])
      action[2] = clip(np.tanh(action[2]))
      '''
            #Oppo intent shape (batch_size, agent_num, action_space)
            # reshape oppo_intent  agent_num * batch_size * action_space

            controller_input = np.concatenate((h, oppo_intents))
        else:
            controller_input = h

        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            x = np.tanh(
                np.dot(controller_input, self.weight_hidden) +
                self.bias_hidden)
            action = np.tanh(np.dot(x, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(controller_input, self.weight) + self.bias)
        for i in range(self.action_space):
            action[i] = clip(action[i])

        self.state = rnn_next_state(self.rnn, z, action, self.act_traj,
                                    self.state)
        # self.oppo_state = oppo_next_state(self.oppo_model, action, self.act_traj, self.oppo_state)

        # epsilon exploration
        if np.random.uniform(0, 1) < 0.2:
            action = [np.random.uniform(-3, 3)] * len(action)
        return action
Code example #15
 def get_action(self, z):
     h = rnn_output(self.state, z, EXP_MODE)
     #print('h', h.shape, h)
     '''
 action = np.dot(h, self.weight) + self.bias
 action[0] = np.tanh(action[0])
 action[1] = sigmoid(action[1])
 action[2] = clip(np.tanh(action[2]))
 '''
     if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
         h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
         action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
     else:
         '''print(h.shape)
   print(self.weight.shape)
   print(self.bias.shape)'''
         action = np.tanh(np.dot(h, self.weight) + self.bias)
     '''for i in range(ACTION_SIZE):
   action[i] = (action[i]+1.0) / 2.0 #all actions value are in range 0 to 1'''
     #action[2] = clip(action[2])
     self.state = rnn_next_state(self.rnn, z, action,
                                 self.state)  # update hidden state of the MDN-RNN
     return action
Code example #16
    def get_action(self, z, arglist):
        h = rnn_output(self.state, z, EXP_MODE)
        '''
    action = np.dot(h, self.weight) + self.bias
    action[0] = np.tanh(action[0])
    action[1] = sigmoid(action[1])
    action[2] = clip(np.tanh(action[2]))
    '''
        if EXP_MODE == MODE_Z_HIDDEN:  # one hidden layer
            h = np.tanh(np.dot(h, self.weight_hidden) + self.bias_hidden)
            action = np.tanh(np.dot(h, self.weight_output) + self.bias_output)
        else:
            action = np.tanh(np.dot(h, self.weight) + self.bias)

        if arglist.competitive:
            obs, rewards, done, win = self.env.step([action[0], 'script'])
        else:
            obs, rewards, done, win = self.env.step(action)

        extra_reward = 0.0  # penalize for turning too frequently
        if arglist.competitive:
            if arglist.train_mode and penalize_turning:
                extra_reward -= np.abs(action[0]) / 10.0
                rewards[0] += extra_reward
            reward = rewards[0]
        else:
            if arglist.train_mode and penalize_turning:
                reward = np.sum(rewards)
                extra_reward -= np.abs(action[0]) / 10.0
                reward += extra_reward

        # recording_reward.append(reward)
        # total_reward += reward

        self.state = rnn_next_state(self.rnn, z, action, self.state)

        return action
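
Across these examples the same rollout pattern recurs: encode the current observation into a latent z, build the controller input with rnn_output, map it to an action (tanh for continuous control, argmax or a one-hot for discrete actions), and then advance the hidden state with rnn_next_state. The following is a minimal sketch of that loop under the rnn_output(state, z, mode) and rnn_next_state(rnn, z, action, state) signatures used above; rnn_init_state, encode_obs, weight and bias are hypothetical stand-ins for the per-project setup.

import numpy as np

# Sketch only: the rollout pattern shared by the examples above.
# rnn, rnn_init_state, rnn_next_state, rnn_output and encode_obs are assumed
# to be provided by the surrounding project; weight and bias stand in for a
# trained linear controller.

def rollout(env, rnn, weight, bias, exp_mode, max_steps=1000):
    state = rnn_init_state(rnn)              # fresh hidden state for the episode
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        z = encode_obs(obs)                  # VAE latent of the current frame
        h = rnn_output(state, z, exp_mode)   # controller input ([z, h] or [z, c, h])
        action = np.tanh(np.dot(h, weight) + bias)     # continuous controller head
        state = rnn_next_state(rnn, z, action, state)  # advance the MDN-RNN
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
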