# One epoch of interaction: step the environment until a terminal state,
# training the agent from replayed experience after every step.
while not terminal:
    state_t = state_t_1

    # Act in the environment using the current exploration setting.
    action_t = agent.select_action(state_t, agent.exploration)
    env.execute_action(action_t)

    # Observe the resulting transition.
    state_t_1, reward_t, terminal = env.observe()

    # Remember the transition, then learn from a replayed minibatch.
    agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
    agent.experience_replay()

    # Logging accumulators (averaged over frames when printed below).
    frame += 1
    loss += agent.current_loss
    Q_max += np.max(agent.Q_values(state_t))
    if reward_t >= 1:
        win += 1

# Per-epoch summary (loss and Q_max are frame averages).
print(
    "EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}"
    .format(e, n_epochs - 1, win, loss / frame, Q_max / frame))

# Persist the learned weights.
agent.save_model()
# Prime the loop with an initial observation, then run one epoch of
# environment steps with per-step experience replay.
state_t_1, reward_t, terminal = env.observe()
while not terminal:
    state_t = state_t_1

    # Choose and apply an action under the current exploration policy.
    action_t = agent.select_action(state_t, agent.exploration)
    env.execute_action(action_t)

    # Read back the next state, reward, and terminal flag.
    state_t_1, reward_t, terminal = env.observe()

    # Store the transition and train on a replayed batch.
    agent.store_experience(state_t, action_t, reward_t, state_t_1, terminal)
    agent.experience_replay()

    # Accumulate logging statistics for the epoch summary.
    frame += 1
    loss += agent.current_loss
    Q_max += np.max(agent.Q_values(state_t))
    if reward_t == 1:
        win += 1

# Epoch summary: loss and Q_max reported as per-frame averages.
print("EPOCH: {:03d}/{:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
    e, n_epochs - 1, win, loss / frame, Q_max / frame))

# Persist the learned weights.
agent.save_model()
action_t = agent.select_action([state_t], agent.exploration) env.execute_action(action_t) # observe environment state_t_1, reward_t, terminal = env.observe() # store experience start_replay = False start_replay = agent.store_experience([state_t], action_t, reward_t, [state_t_1], terminal) # experience replay if start_replay: do_replay_count += 1 agent.update_exploration(e) if do_replay_count > 2: agent.experience_replay(e) do_replay_count = 0 # update target network if total_frame % 500 == 0 and start_replay: agent.update_target_model() # for log frame += 1 total_frame += 1 loss += agent.current_loss Q_max += np.max(agent.Q_values([state_t])) if reward_t == 1: win += 1 if start_replay:
# Keep asking the agent for a move until the environment accepts one
# ("ok") or the agent must pass. Illegal moves ("ng") are punished and
# trained on immediately, then the agent retries.
# (Fixes: `while True is True:` -> idiomatic `while True:`; Python 2
# `print` statements -> `print()` calls, consistent with the rest of
# the file.)
while True:
    env.is_available()
    # Have the agent choose a move; give it the board state and the
    # move randomness rate (exploration).
    # hand_result = env.random_play()
    action_t = agent.select_action(state_before, agent.exploration)
    hand_result = env.learning_play(action_t)
    if hand_result == "ok":
        break
    elif hand_result == "ng":
        # Illegal move: large negative reward, learn from it, retry.
        state_after = env.observe_ng(action_t)
        reward_t = -9999
        agent.store_experience(state_before, action_t, reward_t, state_after, env.is_playable())
        agent.experience_replay(n_epochs)
        n_epochs += 1
        frame += 1
        loss += agent.current_loss
        Q_max += np.max(agent.Q_values(state_before))
        print("EPOCH: {:03d} | WIN: {:03d} | LOSS: {:.4f} | Q_MAX: {:.4f}".format(
            n_epochs, win, loss / frame, Q_max / frame))
    elif hand_result == "pass":
        break
    else:
        print("Hung up")

# Advance the opponent's move (the opponent basically moves second).
env.learning_next()
# Process the result of each move.