Example #1
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """
        Epsilon-greedy step. With probability epsilon, a random action is taken (exploration);
        otherwise the action is chosen to maximize the Q-value approximated by net (exploitation).
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=True)
            state_v = torch.FloatTensor(state_a).to(device)  # move input to the same device as the network
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        new_state = np.array(new_state, copy=True)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward
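
The snippet above appends `Experience` tuples to `self.exp_buffer`, but neither is defined in the excerpt. Below is a minimal sketch of what they commonly look like in this style of DQN agent; the field order follows the constructor call above, while the class name, capacity handling, and `sample` method are assumptions rather than part of the original code.

import collections
import numpy as np

# field order matches Experience(self.state, action, reward, is_done, new_state)
Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "done", "new_state"])

class ExperienceBuffer:
    """Fixed-size FIFO replay buffer (assumed implementation)."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # uniform random sampling without replacement, stacked into arrays
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, new_states = zip(
            *[self.buffer[idx] for idx in indices])
        return (np.array(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8),
                np.array(new_states))
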
Example #2
    def play_step(self, device="cpu", test=False):
        """ Play a single step """

        done_reward = None
        self.steps += 1

        ## action selection
        # play step with e-greedy exploration strategy
        # if not in test phase
        if np.random.random() < self.epsilon and not test:
            # takes a random action
            action = self.env.action_space.sample()
        else:
            # moves state into an array with 1 sample to pass through neural net
            state_a = np.array([self.state], copy=False)
            # creates tensor
            state_v = torch.tensor(state_a).to(device)
            # get q values with feed forward
            q_vals_v = self.net(state_v)
            # manually adding .cpu() to run in GPU mode
            self.latest_qvals = q_vals_v.detach().cpu().numpy()[0]  # store for bookkeeping
            # chooses greedy action and get its value
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # take action
        new_state, reward, is_done, _ = self.env.step(action)  # step of the environment is done here
        self.total_reward += reward

        # only add to experience buffer if not in test
        if not test:
            exp = Experience(self.state, action, reward, is_done, new_state)
            self.exp_buffer.append(exp)

        # change state to new state
        self.state = new_state

        # if complete, accrue total reward and reset
        if is_done:
            done_reward = self.total_reward
            self.done_reward = done_reward  # bookkeeping
            # add totals
            self.total_rewards.append(done_reward)
            self.total_steps.append(self.steps)
            # track episode
            self.record_episode()
            # reset environment
            self.reset()

        return is_done, done_reward
    def fill_buffer(self):

        # fill the buffer with random transitions before learning starts
        while len(self.exp_buffer) < self.params["REPLAY_START_SIZE"]:

            action = self.env.action_space.sample()
            new_state, reward, is_done, _ = self.env.step(action)
            exp = Experience(self.state, action, reward, is_done, new_state)
            self.exp_buffer.append(exp)

            # change state to new state
            self.state = new_state

            # if done, needs to reset
            if is_done: self.reset(count_episode=False)
     
        # reset so the agent starts learning from a fresh episode
        self.reset(count_episode=False)
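
For context, a sketch of how `fill_buffer` and `play_step` are typically driven from a training loop, with a linear epsilon decay. The `train` function, the `agent` variable, and the epsilon hyperparameter names are assumptions made for this sketch; only `fill_buffer`, `play_step`, and `epsilon` come from the snippets.

# assumed hyperparameters for the sketch
EPSILON_START = 1.0
EPSILON_FINAL = 0.02
EPSILON_DECAY_FRAMES = 100_000

def train(agent, max_frames, device="cpu"):
    # pre-fill the replay buffer with random transitions
    agent.fill_buffer()

    for frame_idx in range(max_frames):
        # linearly anneal exploration
        agent.epsilon = max(EPSILON_FINAL,
                            EPSILON_START - frame_idx / EPSILON_DECAY_FRAMES)

        # one environment step with the current policy
        is_done, done_reward = agent.play_step(device=device)

        if done_reward is not None:
            print(f"frame {frame_idx}: episode reward {done_reward:.1f}")

        # a gradient step on a batch sampled from agent.exp_buffer would go here
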
    def play_step(self, device="cpu", test=False):
        """ Play a single step """
        
        done_reward = None
        self.steps += 1

        ## action selection
        # play step with e-greedy exploration strategy
        # if not in test phase
        if np.random.random() < self.epsilon and not test:
            # takes a random action
            action = self.env.action_space.sample()
        else:
            # moves state into an array with 1 sample to pass through neural net
            state_a = np.array([self.state], copy=False)
            # creates tensor
            state_v = torch.tensor(state_a).to(device)
            # get q values with feed forward
            q_vals_v = self.net(state_v)
            # manually adding .cpu() to run in GPU mode
            self.latest_qvals = q_vals_v.detach().cpu().numpy()[0] # store for bookkeeping
            # chooses greedy action and get its value
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())
        
        # take action
        new_state, reward, is_done, _ = self.env.step(action) # step of the environment is done here
        reward *= REWARD_SCALING_FACTOR # scaling reward according to a predefined factor
        # self.env.render('human') # specific for minecraft
        self.total_reward += reward
        self.step_reward = reward # for bookkeeping purposes

        # only add to experience buffer if not in test
        # also do not add if state doesn't match expected size
        if not test:        
            # temporary sanity check while tracking down a shape-mismatch bug:
            # only store transitions whose observation shapes match the expected shape
            if self.state.shape != self.obs_shape:
                print("State shape is inconsistent with the expected observation shape")
            elif new_state.shape != self.obs_shape:
                print("New state shape is inconsistent with the expected observation shape")
            else:
                exp = Experience(self.state, action, reward, is_done, new_state)
                self.exp_buffer.append(exp)
        
        # change state to new state
        self.state = new_state
        
        # if complete, accrue total reward and reset
        if is_done:
            done_reward = self.total_reward
            self.done_reward = done_reward # bookkeeping
            # add totals
            self.total_rewards.append(done_reward)
            self.total_steps.append(self.steps)            
            # track episode
            self.record_episode()
            # reset environment
            self.reset()

        return is_done, done_reward
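
For completeness, a sketch of the learning step that usually consumes batches from such a buffer: the standard DQN loss with a separate target network. The `net`, `tgt_net`, and hyperparameter names are assumptions; only the batch layout (states, actions, rewards, dones, next states) follows the snippets above.

import numpy as np
import torch
import torch.nn as nn

def calc_dqn_loss(batch, net, tgt_net, gamma=0.99, device="cpu"):
    """Standard DQN loss: MSE between Q(s, a) and r + gamma * max_a' Q_target(s', a')."""
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states)).to(device)
    next_states_v = torch.tensor(np.array(next_states)).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q-values of the actions actually taken
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # bootstrap target from the target network; terminal states contribute 0
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)
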
Example #5
    def play_step(self, device="cpu", test=False):
        """ Play a single step """
        
        done_reward = None
        self.steps += 1

        # this seems to be the learning step done with multiple samples, right?
        # need to organize this code into a coherent piece,
        # otherwise I won't get anywhere
        # maybe it is better to recover what I already did in tf

        # moves state into an array with 1 sample to pass through neural net
        state_a = np.array([self.state], copy=False)
        # creates tensor
        state_v = torch.tensor(state_a).to(device)
        # get base value for actions
        mu_v = self.net(state_v)
        actions = mu_v.data.cpu().numpy()

        # check if OU exploration is enabled (action is deterministic)
        # what is agent states here?
        if self.ou_enabled and self.ou_epsilon > 0:
            new_a_states = []
            for a_state, action in zip(agent_states, actions):
                if a_state is None:
                    a_state = np.zeros(shape=action.shape, dtype=np.float32)



        # play step with e-greedy exploration strategy
        # if not in test phase
        if np.random.random() < self.epsilon and not test:
            # takes a random action
            action = self.env.action_space.sample()
        else:
            # get q values with feed forward
            q_vals_v = self.net(state_v)
            # manually adding .cpu() to run in GPU mode
            # self.latest_qvals = q_vals_v.detach().cpu().numpy()[0] # store for bookkeeping
            # chooses greedy action and get its value
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())
        
        # take action
        new_state, reward, is_done, _ = self.env.step(action) # step of the environment is done here
        self.total_reward += reward

        # only add to experience buffer if not in test
        if not test:        
            exp = Experience(self.state, action, reward, is_done, new_state)
            self.exp_buffer.append(exp)
        
        # change state to new state
        self.state = new_state
        
        # if complete, accrue total reward and reset
        if is_done:
            done_reward = self.total_reward
            self.done_reward = done_reward # book keeping
            # add totals
            self.total_rewards.append(done_reward)
            self.total_steps.append(self.steps)            
            # reset environment
            self.reset()
            
        return is_done, done_reward
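
The last snippet checks `self.ou_enabled` and `self.ou_epsilon`, but the Ornstein-Uhlenbeck update itself is cut off. Below is a self-contained sketch of OU exploration noise as it is commonly used with deterministic continuous-action policies; the `theta`, `mu`, and `sigma` values, the class name, and the usage lines are assumptions, not taken from the snippet.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise added to actions."""

    def __init__(self, action_shape, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.full(action_shape, mu, dtype=np.float32)

    def reset(self):
        self.state[...] = self.mu

    def sample(self):
        # drift back toward mu, plus a Gaussian perturbation
        self.state += self.theta * (self.mu - self.state)
        self.state += self.sigma * np.random.normal(size=self.state.shape)
        return self.state

# usage sketch (hypothetical names), in place of the unfinished block above:
# noise = OUNoise(action_shape=env.action_space.shape)
# action = np.clip(mu_action + ou_epsilon * noise.sample(), -1.0, 1.0)
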