def respond(self, env):
    # Mask out actions that are illegal given the current hand and the last played cards.
    mask = get_mask(to_char(self.env.get_curr_cards()), self.action_space,
                    to_char(self.env.get_last_cards()))
    s = env.get_state()
    s = np.reshape(s, [1, -1])
    # Query the policy/value network; the mask restricts the policy to valid actions only.
    policy, val = self.sess.run(
        [self.agents[0].network.valid_policy, self.agents[0].network.val_pred],
        feed_dict={
            self.agents[0].network.input: s,
            self.agents[0].network.mask: np.reshape(mask, [1, -1])
        })
    policy = policy[0]
    # Map positions in the valid-policy vector back to indices in the full action space.
    valid_actions = np.take(np.arange(self.a_dim), mask.nonzero())
    valid_actions = valid_actions.reshape(-1)
    # a = np.random.choice(valid_actions, p=policy)
    # Greedy selection over the valid actions.
    a = valid_actions[np.argmax(policy)]
    # print("taking action: ", self.action_space[a])
    return env.step(self.action_space[a])
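# A minimal, self-contained sketch of the masked greedy selection used in respond()
# above: the network outputs a probability vector over the *valid* actions only, and
# mask.nonzero() maps those positions back into the full action space. The action
# dimension, mask and policy values below are made-up illustrations, not values from
# the project.
import numpy as np

a_dim = 6
mask = np.array([0, 1, 0, 1, 1, 0])        # 3 valid actions out of 6
valid_policy = np.array([0.2, 0.7, 0.1])   # network output over the valid actions only

valid_actions = np.take(np.arange(a_dim), mask.nonzero()).reshape(-1)
greedy_a = valid_actions[np.argmax(valid_policy)]        # -> 3 (the second valid slot)
# Stochastic alternative, as in the commented-out line in respond():
sampled_a = np.random.choice(valid_actions, p=valid_policy)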
from collections import deque

import numpy as np

from network import set_network, create_network
from process_image import img_width

# load_model, Adam, huber_loss, Memory, get_state, start_of_episode and the
# hyper-parameters (memory_size, num_episodes, max_steps, state_deque_size,
# epsilon_start, epsilon_stop, ...) are assumed to be imported or defined
# elsewhere in the project (load_model and Adam most likely from Keras).

if __name__ == '__main__':
    set_network()
    # Online network restored from disk; target network built fresh and synced
    # to it at the start of every episode.
    main_qn = load_model('./model/model.h5')
    main_qn.compile(loss=huber_loss, optimizer=Adam(lr=0.0005))
    target_qn = create_network()

    memory = Memory(memory_size)
    total_step = 0

    for episode in range(1, num_episodes + 1):
        # Wait until the game signals the start of a new episode.
        while True:
            state = get_state()
            if start_of_episode(state) == 1:
                break

        step, action, value = 0, 1, 0
        do_learn, dead = True, False
        state_deque = deque(maxlen=state_deque_size)

        # Sync the target network's weights with the online network.
        target_qn.model.set_weights(main_qn.model.get_weights())

        for _ in range(1, max_steps + 1):
            step += 1
            total_step += 1
            # epsilon decay
            epsilon = epsilon_stop + (epsilon_start - epsilon_stop) * np.exp(
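# huber_loss is referenced in the training script above but not defined in this
# snippet. A common Keras-backend definition is sketched below as an assumption
# (delta fixed to 1.0); the project's own version may differ.
from keras import backend as K

def huber_loss(y_true, y_pred, delta=1.0):
    # Quadratic inside the delta band, linear outside: robust to large TD errors.
    err = y_true - y_pred
    quadratic = K.minimum(K.abs(err), delta)
    linear = K.abs(err) - quadratic
    return K.mean(0.5 * K.square(quadratic) + delta * linear)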
import numpy as np
import pandas as pd

# env (providing get_state/get_result for the 3x3 board), choose_action, ALPHA,
# LAMBDA and MAX_EPISODES are assumed to be defined at module level.


def rl(q_table):
    for episode in range(MAX_EPISODES):
        print("episode", episode)
        environment = pd.DataFrame(np.zeros((3, 3)))
        result = "continue"

        # Player 1 moves first.
        state1 = env.get_state(environment)
        action1 = choose_action(state1, q_table, environment)
        env.get_result(environment, 1, action1)

        while True:
            # Player 2 moves.
            state2 = env.get_state(environment)
            action2 = choose_action(state2, q_table, environment)
            result = env.get_result(environment, 2, action2)
            # After player 2's move, this is player 1's successor state.
            state1_ = env.get_state(environment)

            if result == 'win':
                # Player 2 won: penalise player 1, reward player 2.
                R1, R2 = -1, 1
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() - q_table.loc[state1, action1])
                q_table.loc[state2, action2] += ALPHA * (R2 - q_table.loc[state2, action2])
                break
            elif result == 'continue':
                R1 = 0
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() - q_table.loc[state1, action1])
            else:
                # Draw: both players receive a small positive reward.
                R1, R2 = 0.1, 0.1
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() - q_table.loc[state1, action1])
                q_table.loc[state2, action2] += ALPHA * (R2 - q_table.loc[state2, action2])
                break

            # Player 1 moves.
            state1 = env.get_state(environment)
            action1 = choose_action(state1, q_table, environment)
            result = env.get_result(environment, 1, action1)
            # After player 1's move, this is player 2's successor state.
            state2_ = env.get_state(environment)

            if result == 'win':
                # Player 1 won: reward player 1, penalise player 2.
                R1, R2 = 1, -1
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() - q_table.loc[state2, action2])
                q_table.loc[state1, action1] += ALPHA * (R1 - q_table.loc[state1, action1])
                break
            elif result == 'continue':
                R2 = 0
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() - q_table.loc[state2, action2])
            else:
                # Draw: both players receive a small positive reward.
                R1, R2 = 0.1, 0.1
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() - q_table.loc[state2, action2])
                q_table.loc[state1, action1] += ALPHA * (R1 - q_table.loc[state1, action1])
                break
    return q_table
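# choose_action() is used by rl() above but not shown in this snippet. A typical
# epsilon-greedy implementation over the empty cells might look like the sketch
# below; EPSILON, the 0-8 flattened-cell action encoding and the lazy insertion of
# new state rows are assumptions for illustration, not the project's actual code.
import numpy as np

EPSILON = 0.9  # probability of acting greedily (assumed value)

def choose_action(state, q_table, environment):
    # Make sure the state has a row in the Q-table (columns assumed to be 0..8).
    if state not in q_table.index:
        q_table.loc[state] = np.zeros(9)
    # Candidate actions are the indices of empty cells on the flattened 3x3 board.
    empty_cells = [i for i, v in enumerate(environment.values.flatten()) if v == 0]
    state_q = q_table.loc[state, empty_cells]
    if np.random.uniform() > EPSILON or (state_q == 0).all():
        return np.random.choice(empty_cells)   # explore
    return state_q.idxmax()                    # exploit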