def play_step(self, net, epsilon=0.0, device="cpu"):
    """
    Epsilon-greedy step. With probability epsilon, a random action is
    taken (exploration); otherwise the action is chosen to maximize the
    q-value as approximated by net (exploitation).
    """
    done_reward = None

    if np.random.random() < epsilon:
        action = self.env.action_space.sample()
    else:
        state_a = np.array([self.state], copy=True)
        state_v = torch.FloatTensor(state_a).to(device)  # move to the same device as net
        q_vals_v = net(state_v)
        _, act_v = torch.max(q_vals_v, dim=1)
        action = int(act_v.item())

    # do step in the environment
    new_state, reward, is_done, _ = self.env.step(action)
    new_state = np.array(new_state, copy=True)
    self.total_reward += reward

    exp = Experience(self.state, action, reward, is_done, new_state)
    self.exp_buffer.append(exp)
    self.state = new_state
    if is_done:
        done_reward = self.total_reward
        self._reset()
    return done_reward
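# The step above relies on an `Experience` container and an experience buffer
# that are not shown. A minimal sketch of what they could look like, assuming
# the usual namedtuple-plus-deque pattern; the class name `ExperienceBuffer`
# and the `sample` helper are illustrative, not taken from the original code.
import collections

import numpy as np

Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "done", "new_state"])


class ExperienceBuffer:
    """Fixed-size FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # pick distinct transitions and unpack them into per-field arrays
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(
            *[self.buffer[i] for i in indices])
        return (np.array(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8),
                np.array(next_states))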
def play_step(self, device="cpu", test=False): """ Play a single step """ done_reward = None self.steps += 1 ## action selection # play step with e-greedy exploration strategy # if not in test fase if np.random.random() < self.epsilon and not test: # takes a random action action = self.env.action_space.sample() else: # moves state into an array with 1 sample to pass through neural net state_a = np.array([self.state], copy=False) # creates tensor state_v = torch.tensor(state_a).to(device) # get q values with feed forward q_vals_v = self.net(state_v) # manually adding .cpu() to run in GPU mode self.latest_qvals = q_vals_v.detach().cpu().numpy()[ 0] # store for bookkeeping # chooses greedy action and get its value _, act_v = torch.max(q_vals_v, dim=1) action = int(act_v.item()) # take action new_state, reward, is_done, _ = self.env.step( action) # step of the environment is done here self.total_reward += reward # only add to experience buffer if not in test if not test: exp = Experience(self.state, action, reward, is_done, new_state) self.exp_buffer.append(exp) # change state to new state self.state = new_state # if complete, accrue total reward and reset if is_done: done_reward = self.total_reward self.done_reward = done_reward # book keeping # add totals self.total_rewards.append(done_reward) self.total_steps.append(self.steps) # track episode self.record_episode() # reset environment self.reset() return is_done, done_reward
def fill_buffer(self):
    # fill the buffer with random transitions prior to learning
    while len(self.exp_buffer) < self.params["REPLAY_START_SIZE"]:
        action = self.env.action_space.sample()
        new_state, reward, is_done, _ = self.env.step(action)
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        # change state to new state
        self.state = new_state
        # if done, the environment needs to be reset
        if is_done:
            self.reset(count_episode=False)
    # final reset so the agent starts learning from a fresh episode
    self.reset(count_episode=False)
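# `fill_buffer` is meant to run once, before training starts, so the replay
# buffer already holds REPLAY_START_SIZE random transitions. A usage sketch,
# assuming the agent is constructed with a params dict; the `Agent(...)`
# constructor and the numeric value are hypothetical.
params = {"REPLAY_START_SIZE": 10_000}
agent = Agent(env, net, params=params)  # hypothetical constructor
agent.fill_buffer()
assert len(agent.exp_buffer) >= params["REPLAY_START_SIZE"]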
def play_step(self, device="cpu", test=False): """ Play a single step """ done_reward = None self.steps += 1 ## action selection # play step with e-greedy exploration strategy # if not in test fase if np.random.random() < self.epsilon and not test: # takes a random action action = self.env.action_space.sample() else: # moves state into an array with 1 sample to pass through neural net state_a = np.array([self.state], copy=False) # creates tensor state_v = torch.tensor(state_a).to(device) # get q values with feed forward q_vals_v = self.net(state_v) # manually adding .cpu() to run in GPU mode self.latest_qvals = q_vals_v.detach().cpu().numpy()[0] # store for bookkeeping # chooses greedy action and get its value _, act_v = torch.max(q_vals_v, dim=1) action = int(act_v.item()) # take action new_state, reward, is_done, _ = self.env.step(action) # step of the environment is done here reward *= REWARD_SCALING_FACTOR # scaling reward according to a predefined factor # self.env.render('human') # specific for minecraft self.total_reward += reward self.step_reward = reward # for bookkeeping purposes # only add to experience buffer if not in test # also do not add if state doesn't match expected size if not test: # this is a temporary check to id if this is the problem # cannot id the issue at the moment # it does seem to be the problem if self.state.shape != self.obs_shape: print("State shape size is inconsistent") elif new_state.shape != self.obs_shape: print("New state shape size is inconsistent") else: exp = Experience(self.state, action, reward, is_done, new_state) self.exp_buffer.append(exp) # change state to new state self.state = new_state # if complete, accrue total reward and reset if is_done: done_reward = self.total_reward self.done_reward = done_reward # book keeping # add totals self.total_rewards.append(done_reward) self.total_steps.append(self.steps) # track episode self.record_episode() # reset environment self.reset() return is_done, done_reward
def play_step(self, device="cpu", test=False): """ Play a single step """ done_reward = None self.steps += 1 # this seems to be the learning done with multiple samples, right? # need to organize this code into a coherence piece of something # else I won't get anywhere # maybe it is better if I recover what I already did in tf # moves state into an array with 1 sample to pass through neural net state_a = np.array([self.state], copy=False) # creates tensor states_v = torch.tensor(state_a).to(device) # get base value for actions mu_v = self.net(state_v) actions = mu_v.data.cpu().numpy() # check if OU exploration is enabled (action is deterministic) # what is agent states here? if self.ou_enabled and self.ou_epsilon > 0: new_a_states = [] for a_state, action in zip(agent_states, actions): if a_state is None: a_state = np.zeros # play step with e-greedy exploration strategy # if not in test fase if np.random.random() < self.epsilon and not test: # takes a random action action = self.env.action_space.sample() else: # get q values with feed forward q_vals_v = self.net(state_v) # manually adding .cpu() to run in GPU mode # self.latest_qvals = q_vals_v.detach().cpu().numpy()[0] # store for bookkeeping # chooses greedy action and get its value _, act_v = torch.max(q_vals_v, dim=1) action = int(act_v.item()) # take action new_state, reward, is_done, _ = self.env.step(action) # step of the environment is done here self.total_reward += reward # only add to experience buffer if not in test if not test: exp = Experience(self.state, action, reward, is_done, new_state) self.exp_buffer.append(exp) # change state to new state self.state = new_state # if complete, accrue total reward and reset if is_done: done_reward = self.total_reward self.done_reward = done_reward # book keeping # add totals self.total_rewards.append(done_reward) self.total_steps.append(self.steps) # reset environment self.reset() return is_done, done_reward