# Training loop: each outer iteration runs one episode (epoch) to termination,
# accumulating experience in the agent's replay memory.
while e < n_epochs:
    frame = 0
    loss = 0.0
    Q_max = 0.0
    env.reset()
    state_t_1, reward_t, terminal = env.observe()
    # Act until the episode ends, accumulating experience;
    # once enough experience is stored, learning from replay begins.
    loops += 1
    while not terminal:
        state_t = state_t_1
        # When resuming from a saved model, pin exploration low (0.1) until
        # experience replay has restarted; otherwise use the agent's own
        # (presumably annealed -- confirm in Agent) exploration rate.
        exploration = 0.1 if args.load and not start_replay else agent.exploration
        action_t, is_random = agent.select_action([state_t], exploration)
        env.execute_action(action_t)
        state_t_1, reward_t, terminal = env.observe()
        # Optimistic initial values: for the first K loops (and only when not
        # resuming from a saved model), force an exploratory action and pretend
        # every action earned the same reward.
        if is_optimistic_epoch(loops, optimistic_num) and not args.load:
            action_t = optimistic_action(env, loops, optimistic)
            # Original read `1 if action_t == 0 else 1` -- both branches were 1,
            # matching the stated "same reward for every action" intent, so the
            # redundant conditional is dropped.
            reward_t = 1
        # store_experience() returns whether enough experience has accumulated
        # to start replay. (The original pre-set `start_replay = False` here,
        # which was immediately overwritten -- dead code, removed.)
        start_replay = agent.store_experience([state_t], action_t, reward_t,
                                              [state_t_1], terminal)
        if start_replay:
            do_replay_count += 1
# Evaluation run: replay one episode with the trained model, acting greedily
# (exploration rate 0.0), and report per-step progress plus the final score.
start_date = utils.format(args.start_date)
end_date = utils.format(args.end_date)
env = Env(start_date, end_date)
agent = Agent(env.actions, len(env.columns), env.state_size, args.memory_size)
agent.load_model()

total_frame = 0
max_step = 0
frame = 0
# observe() unconditionally rebinds `terminal` before the loop test, so the
# original's `terminal = False` seed was dead code and is removed.
state_t, reward_t, terminal = env.observe()
while not terminal:
    # exploration = 0.0 -> always the greedy action during evaluation
    action_t, is_random = agent.select_action([state_t], 0.0)
    env.execute_action(action_t)
    state_t, reward_t, terminal = env.observe()
    frame += 1
    total_frame += 1
    # Track the deepest step the environment reached during the episode.
    if max_step < env.step:
        max_step = env.step
    print("frame: %s, total_frame: %s, terminal: %s, action: %s, reward: %s"
          % (frame, total_frame, terminal, action_t, reward_t))
# Release the Keras/TensorFlow session resources before reporting results.
backend.clear_session()
print("max_step: %s, score: %s" % (max_step, env.score))