def rollout(env: gym.Env, policies: policies.QPolicy, episodes: int, temp: float, render: bool = False):
    """
    Simulate trajectories for the given number of episodes.

    The input policy is queried at every time step to obtain an action
    distribution, and an action is sampled from it.

    :param env: the gym environment
    :param policies: the policy used to sample actions (Tabular/DQN);
        called as ``policies(state, temp)`` and expected to return a
        probability distribution over the discrete actions
    :param episodes: number of episodes to be simulated
    :param temp: temperature parameter passed to the policy (controls
        exploration, e.g. for a softmax policy)
    :param render: if True, render the environment at each step
    :return replay: list of (state, action, reward, next_state, done)
        tuples for every timestep of all simulated episodes
    :return scores: list of total reward for each simulated episode
    """
    replay = []
    scores = []
    for _ in range(episodes):
        state = env.reset()
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            # Probability distribution over actions for the current state.
            pi = policies(state, temp)
            # Sample from the full distribution; len(pi) generalizes the
            # previously hard-coded 3-action space to any discrete action count.
            action = np.random.choice(len(pi), p=pi)
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
        scores.append(score)
    # Close once after all episodes; the original closed inside the loop,
    # tearing the environment down after every episode.
    env.close()
    return replay, scores
def rollout(env: gym.Env, policies: policies.QPolicy, episodes: int, epsilon: float, render: bool = False):
    """
    Simulate trajectories for the given number of episodes.

    The input policy is queried at every time step; the greedy action is
    taken, with ties between equally-valued actions broken uniformly at
    random.

    :param env: the gym environment
    :param policies: the policy used to sample actions (Tabular/DQN);
        called as ``policies(state, epsilon)`` and expected to return a
        per-action value/preference vector
    :param episodes: number of episodes to be simulated
    :param epsilon: exploration parameter passed to the policy
        (epsilon-greedy)
    :param render: if True, render the environment at each step
    :return replay: list of (state, action, reward, next_state, done)
        tuples for every timestep of all simulated episodes
    :return scores: list of total reward for each simulated episode
    """
    replay = []
    scores = []
    for _ in range(episodes):
        state = env.reset()
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            pi = np.asarray(policies(state, epsilon))
            # Greedy action with random tie-breaking: pick uniformly among all
            # actions attaining the maximum value. For a 2-action space this
            # matches the original ``pi[0] == pi[1]`` check, but it also handles
            # larger action spaces and partial ties correctly.
            best = np.flatnonzero(pi == pi.max())
            action = int(np.random.choice(best))
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
        scores.append(score)
    # Close once after all episodes; the original closed inside the loop,
    # tearing the environment down after every episode.
    env.close()
    return replay, scores
def rollout(env: gym.Env, policies: policies.QPolicy, episodes: int, temp: float, render: bool = False):
    """
    Simulate trajectories for the given number of episodes.

    The input policy is queried at every time step to obtain an action
    distribution, and an action is sampled from it.

    :param env: the gym environment
    :param policies: the policy used to sample actions (Tabular/DQN);
        called as ``policies(state, temp)`` and expected to return a
        probability distribution over the discrete actions
    :param episodes: number of episodes to be simulated
    :param temp: temperature parameter passed to the policy (controls
        exploration, e.g. for a softmax policy)
    :param render: if True, render the environment at each step
    :return replay: list of (state, action, reward, next_state, done)
        tuples for every timestep of all simulated episodes
    :return scores: list of total reward for each simulated episode
    """
    replay = []
    scores = []
    for _ in range(episodes):
        state = env.reset()
        score = 0
        done = False
        while not done:
            if render:
                env.render()
            # Probability distribution over actions for the current state.
            pi = policies(state, temp)
            # Sample from the full distribution. len(pi) replaces the hard-coded
            # action counts (3 for tabular Q, 2 for DQN) that previously had to
            # be toggled by commenting/uncommenting a line.
            action = np.random.choice(len(pi), p=pi)
            next_state, reward, done, _ = env.step(action)
            score += reward
            replay.append((state, action, reward, next_state, done))
            state = next_state
        scores.append(score)
    # Close once after all episodes; the original closed inside the loop,
    # tearing the environment down after every episode.
    env.close()
    return replay, scores