def Monte_Carlo(self):
    """
    Monte_Carlo experiments
    :return: Q_table, policy
    """

    Q_table = self.table_init()  # Q_table initialization
    policy = {}  # policy table

    for k in range(self.M):  # iterations
        x = self.state_init()  # initial state
        u = self.epsilon_greedy(int(np.argmax(Q_table[x])), self.epsilon)

        while x != self.xG:  # stop condition
            x_next = self.move_next(x, self.u_set[u])  # next state
            reward = env.get_reward(x_next, self.lose)  # reward observed
            u_next = self.epsilon_greedy(int(np.argmax(Q_table[x_next])), self.epsilon)
            Q_table[x][u] = (1 - self.alpha) * Q_table[x][u] + \
                self.alpha * (reward + self.gamma * Q_table[x_next][u_next])
            x, u = x_next, u_next

    for x in Q_table:
        policy[x] = int(np.argmax(Q_table[x]))  # extract policy

    return Q_table, policy
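
# A minimal sketch of the epsilon_greedy helper assumed by Monte_Carlo above; this is
# an assumption about its behavior, not the project's actual implementation. With
# probability epsilon it explores by returning a random index into u_set, otherwise it
# keeps the greedy action index it was given. (Intended as a method of the same class.)
import numpy as np

def epsilon_greedy(self, u_greedy, epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(len(self.u_set))  # explore: random action index
    return u_greedy                                # exploit: keep the greedy action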
def cal_Q_value(self, x, p, table):
    """
    Calculate the Q-value.
    :param x: next state vector
    :param p: probability of each state
    :param table: value table
    :return: Q-value
    """

    value = 0
    reward = env.get_reward(x, self.xG, self.lose)  # get reward of next state

    for i in range(len(x)):
        value += p[i] * (reward[i] + self.gamma * max(table[x[i]]))

    return value
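
# Worked, self-contained illustration of the backup cal_Q_value computes (the numbers,
# gamma, and the value table below are made up for the example):
#   Q(s, a) = sum_i p[i] * (reward[i] + gamma * max_a' table[x[i]][a'])
gamma = 0.9
x = [0, 1]                               # indices of the possible successor states
p = [0.8, 0.2]                           # transition probability of each successor
reward = [1.0, -1.0]                     # reward observed in each successor state
table = {0: [0.5, 0.2], 1: [0.0, 0.3]}   # value table: state -> per-action values

value = sum(p[i] * (reward[i] + gamma * max(table[x[i]])) for i in range(len(x)))
print(value)  # 0.8 * (1.0 + 0.9 * 0.5) + 0.2 * (-1.0 + 0.9 * 0.3) = 1.014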
env.dump_image(os.path.join(img_dir, '%d.png' % t))

view_batches = get_view(env)  # s
actions, actions_batches = model.infer_actions(
    sess, view_batches, policy=argv.policy, epsilon=argv.epsilon)  # a

env.take_action(actions)
env.decrease_health()
env.update_pig_pos()
# env.update_rabbit_pos()

if video_flag:
    env.dump_image(os.path.join(img_dir, '%d.png' % (t + 1)))

rewards = get_reward(env)  # r, a dictionary
env.increase_health(rewards)

total_reward = 0
for k, v in rewards.items():
    total_reward += v

new_view_batches = get_view(env)  # s'
maxQ_batches = model.infer_max_action_values(sess, new_view_batches)

model.train(sess=sess,
            view_batches=view_batches,
            actions_batches=actions_batches,
            rewards=rewards,
            maxQ_batches=maxQ_batches,
            learning_rate=argv.learning_rate)
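
# Hedged aside on how the pieces above fit together: maxQ_batches holds
# max_a' Q(s', a') for each agent, which supports the standard one-step Q-learning
# target sketched below. gamma and the helper are assumptions for illustration;
# model.train may compute its loss differently internally.
gamma = 0.99

def q_target(reward, max_q_next):
    # r + gamma * max_a' Q(s', a')
    return reward + gamma * max_q_next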
predict = main_qn.predict(predict_arr)[0]
action = np.argmax(predict)
value = predict

do_action(action)
time.sleep(0.05)

state = get_state()
state_deque.append(state.astype('float32') / 255.0)

print('Episode: {}, Step: {}, Epsilon: {}, Action: {}, '.format(
    episode, step, epsilon, action_name[action]), end='')

if step > warmup_steps:
    reward = get_reward(state_deque)
    if is_scrolling(state_deque) and (action == 2 or action == 3):
        reward += 0.1  # additional reward for moving toward the right side
    if is_dead(state_deque):
        reward = -1
        dead = True
    print('Reward: {}'.format(reward))

    memory.add((current_state_arr, action, reward, get_state_arr(state_deque)))
    print(value)

    if do_learn is True:
        if len(memory) >= batch_size:
            pause_button()
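
# A minimal sketch of the replay-learning step that typically follows pause_button()
# in a DQN loop like the one above. memory.sample, target_qn, and gamma are
# assumptions for illustration; only main_qn, memory, and batch_size appear in the
# original snippet.
import numpy as np

gamma = 0.99
batch = memory.sample(batch_size)                  # list of (s, a, r, s') tuples
states = np.array([b[0] for b in batch])
next_states = np.array([b[3] for b in batch])

targets = main_qn.predict(states)                  # current Q(s, .)
next_q = target_qn.predict(next_states)            # Q(s', .) from a target network

for j, (s, a, r, s_next) in enumerate(batch):
    # Bellman target for the action that was taken
    # (terminal transitions would normally skip the bootstrap term)
    targets[j, a] = r + gamma * np.max(next_q[j])

main_qn.fit(states, targets, epochs=1, verbose=0)  # one gradient step on the batch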
    0, top_action.data[0]] + prob_choose_top
sampled_action = Categorical(action_probs).sample()
print(action_probs)
print(epsilon)

# Act and update the current observation
cur_observation, cur_frame, done = env.act(agent_host, sampled_action.data[0])
num_timesteps += 1
if done:
    break

# Calculate the reward based on the current task
reward = env.get_reward(task, cur_observation, prev_observation)

# Check for the episode's termination
if reward > 0:
    done = True
if num_timesteps >= max_EPISODE_LENGTH:
    done = True

episode_trajectory.append(
    Point(last_frames.numpy(), instruction.numpy(), sampled_action.data.numpy(),
          reward, action_probs.data.numpy()))

if done:
    break
def apply_action_in_env(states, actions):
    lats = env.convert_to_latents(states)                             # encode observations into latents
    new_lats = env.apply_actions(lats, actions, simplify=False)       # step the latent dynamics
    rewards = env.get_reward(lats, actions, new_lats, simplify=False) # reward for each transition
    new_states = env.convert_to_obs(lats)                             # decode latents back to observation space
    return [rewards, new_states]
# function and observe the reward value. We then update the record array with this new
# observation. We repeat this process a bunch of times, and it will continually update
# the record array. The arm with the highest reward probability should eventually get
# chosen most often, since it will give out the highest average reward.
# Excerpt from the book: Deep Reinforcement Learning in Action, A. Zai & B. Brown.

for i in range(500):
    if use_softmax:
        # use softmax probabilities of the values to choose an action
        p = softmax(record[:, 1])
        choice = np.random.choice(np.arange(n), p=p)
    else:
        # epsilon-greedy approach
        if random.random() > eps:
            choice = get_best_arm(record)
        else:
            choice = np.random.randint(10)

    # compute the reward for choosing the arm
    r = get_reward(probs[choice])

    # update the Q-table
    record = update_record(record, choice, r)
    # print(f"Updated record: {record}")

    # keep track of the running average of rewards
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)

# plot the cumulative rewards vs. iteration
ax.scatter(np.arange(len(rewards)), rewards)
plt.savefig("./cumulative_rewards.png")
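
# One plausible shape of the helpers used in the loop above, hedged: the book's exact
# implementations may differ. record is assumed to be an (n, 2) array holding
# [times_chosen, running_mean_reward] per arm, and probs the true arm probabilities.
import numpy as np

def get_best_arm(record):
    # greedy choice: the arm with the highest running mean reward
    return int(np.argmax(record[:, 1]))

def get_reward(prob, trials=10):
    # simulate a payout: count successes over a few Bernoulli draws with probability prob
    return int(np.sum(np.random.random(trials) < prob))

def update_record(record, action, r):
    # incremental mean update for the chosen arm, then bump its pull count
    new_mean = (record[action, 0] * record[action, 1] + r) / (record[action, 0] + 1)
    record[action, 0] += 1
    record[action, 1] = new_mean
    return record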