def select_action(state, qnet, epsilon, action_size):
    """Epsilon-greedy action selection for a DQN-style agent.

    With probability ``1 - epsilon`` exploit: return the greedy action
    chosen by the Q-network via ``select_maxq_action``. Otherwise
    explore: return a uniformly random action index wrapped as a 1x1
    ``LongTensor`` (the ``[[a]]`` shape mirrors the ``.view(1, 1)``
    result the greedy path was originally written to produce).

    Args:
        state: current environment state, forwarded to ``select_maxq_action``.
        qnet: Q-value network used for the greedy choice.
        epsilon: exploration probability in [0, 1].
        action_size: number of discrete actions to sample from.

    Returns:
        A 1x1 ``LongTensor`` holding the chosen action index.
    """
    # Exploit with probability (1 - epsilon); random.random() is in [0, 1).
    if random.random() > epsilon:
        return select_maxq_action(state, qnet)
    # Explore: uniform random action index, shaped [[a]] to match the
    # greedy branch so callers can treat both branches identically.
    return LongTensor([[random.randrange(action_size)]])
# NOTE(review): this line is whitespace-mangled — many statements were
# collapsed onto one physical line, so everything after the first `#`
# below is currently dead text, not executed code. From the tokens it
# appears to be the tail of a training-episode loop: store a transition
# in the replay memory (memory.push), advance `state`, then run a
# separate greedy evaluation rollout (`while not done:` using
# select_maxq_action), append the episode length to `dp_scores`,
# declare the task solved once the mean score reaches 195 after 30+
# episodes (presumably the CartPole success threshold — confirm), and
# log progress every 100 episodes. The `break` implies an enclosing
# episode loop whose header is outside this view, and the original
# indentation (which statements sat inside the inner while loop vs. the
# episode loop) cannot be recovered from here — restore this section
# from version control rather than re-indenting by guess.
next_state = preprocess_state(next_state,state_dim) #print(state.shape, next_state.shape, reward.shape, done) # Store the transition in memory memory.push(state, action, next_state, reward, done, None) # Move to the next state state = next_state n_steps += 1 scores.append(n_steps) state = np.append(env.reset(), np.random.normal(size = noise_dim)) state = preprocess_state(state, state_dim) done = False n_steps = 0 while not done: # Select and perform an action action = select_maxq_action(state, qnet) next_state, reward, done, _ = env.step(action.item()) next_state = np.append(next_state, np.random.normal(size = noise_dim)) next_state = preprocess_state(next_state, state_dim) state = next_state n_steps += 1 dp_scores.append(n_steps) mean_score = np.mean(dp_scores) # Set the value we want to achieve if mean_score >= 195 and i_episode >= 30: print('Ran {} episodes. Solved after {} trials ✔'.format(i_episode, i_episode - 30)) break if i_episode % 100 == 0: print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks. Epsilon is {}' .format(i_episode, mean_score, epsilon))