# Train a `PolicyGradient` agent against `env` for 500 episodes and print the
# largest `fid` value seen at the end of any of the last ten episodes.
#
# NOTE(review): the original statement sequence was collapsed onto a single
# line. This layout assumes RL.learn() runs once per episode (after the
# rollout) and that the final print runs once, after all episodes -- confirm.
RL = PolicyGradient(
    n_actions=env.n_actions,
    n_features=env.n_features,
    learning_rate=0.01,
    reward_decay=0.99,
)

fid_max = 0

for episode in range(500):
    observation = env.reset()

    # Roll out one episode of at most N steps, recording every transition.
    for step_index in range(N):
        action = RL.choose_action(observation)
        next_observation, reward, done, fid = env.step(action, step_index)
        RL.store_transition(observation, action, reward)
        observation = next_observation

        # Stop as soon as the environment reports completion or the step
        # budget has been used up.
        if done or step_index >= N - 1:
            break

    # Only episodes 490..499 contribute to the reported maximum; `fid` is the
    # value returned by the last env.step() call of this episode.
    if episode >= 490 and fid > fid_max:
        fid_max = np.copy(fid)

    # Update the agent from the transitions stored during this episode.
    RL.learn()

print('Final_fidelity=', fid_max)
# NOTE(review): this line starts in the middle of a call's keyword-argument
# list (learning_rate ... e_greedy_increment, then the closing parenthesis).
# The callee and any leading arguments are not visible here, so what is being
# constructed cannot be confirmed from this text.
#
# The statements after the closing ")" are a training loop collapsed onto one
# line; the nesting described below is inferred from statement order -- confirm
# against the properly indented original:
#   * `step` and `fid_max` both start at 0.
#   * 500 episodes are run; each starts from env.reset() and takes up to N
#     steps (loop index `i`).
#   * Per step: RL.choose_action(observation) picks the action,
#     env.step(action, i) returns (observation_, reward, done, fidelity), and
#     the transition (observation, action, reward, observation_) is stored.
#   * RL.learn() is called only once `step` exceeds 500, and then only when
#     `step` is a multiple of 5.
#   * The inner loop breaks when `done` is truthy; `step += 1` follows that
#     break check, so presumably the terminating step is not counted.
#   * For episodes 490 and later, `fid_max` is replaced by np.copy(fidelity)
#     whenever `fidelity` exceeds it.
learning_rate=0.01, reward_decay=0.9, e_greedy=0.99, replace_target_iter=200, memory_size=2000, e_greedy_increment=0.001, ) step = 0 fid_max = 0 for episode in range(500): observation = env.reset() for i in range(N): action = RL.choose_action(observation) observation_, reward, done, fidelity = env.step(action, i) RL.store_transition(observation, action, reward, observation_) if (step > 500) and (step % 5 == 0): RL.learn() observation = observation_ if done: break step += 1 if episode >= 490: if fidelity > fid_max: fid_max = np.copy(fidelity)