n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.995, # output_graph=True, ) for eposide_i in range(1000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) RL.store_transition(observation, action, reward) if done: ep_rs_sum = sum(RL.ep_rs) if "running_reward" not in globals(): running_reward = ep_rs_sum else: running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = False #True # rendering print("episode:", eposide_i, " reward:", int(running_reward)) vt = RL.learn() # train
reward_decay=0.995, # output_graph=True, ) for i_episode in range(1000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) # reward = -1 in all cases RL.store_transition(observation, action, reward) if done: # calculate running reward ep_rs_sum = sum(RL.ep_rs) if 'running_reward' not in globals(): running_reward = ep_rs_sum else: running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering print("episode:", i_episode, " reward:", int(running_reward)) vt = RL.learn() # train if i_episode == 30:
reward_decay=0.99, # output_graph=True, ) for i_episode in range(3000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) RL.store_transition(observation, action, reward) #存储这一回合的transition if done: ep_rs_sum = sum(RL.ep_rs) if 'running_reward' not in globals(): running_reward = ep_rs_sum else: running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # 判断是否显示模拟 print("episode:", i_episode, " reward:", int(running_reward)) vt = RL.learn() #学习,输出vt if i_episode == 0:
# Run up to `steps_per_episode` interactions with the environment, then update
# the smoothed reward and print a progress line.
# NOTE(review): this excerpt was whitespace-collapsed; the statements after the
# `break` are placed after the step loop on the assumption that they run once
# per episode -- confirm against the complete source.
for i in range(steps_per_episode):
    action_id = RL.choose_action(state)
    action_space = env.get_possible_action_space()
    action = action_space[action_id]
    if random_action:
        # NOTE(review): the executed action is re-drawn at random here, while
        # the policy's `action_id` is what gets stored below -- confirm that
        # this mismatch is intended.
        random_index = np.random.randint(len(action_space))
        action = action_space[random_index]

    env.apply_action(action)
    raw_next_state, reward = env.update_state()
    next_state = np.array(raw_next_state)
    reward = postprocessreward(reward, th)
    RL.store_transition(state, action_id, reward)
    state = next_state

    # A post-processed reward above 1 ends the step loop early.
    if reward > 1:
        break

ep_rs_sum = sum(RL.ep_rs)
# First pass seeds the running reward; later passes blend 99% old / 1% new.
running_reward = (
    ep_rs_sum
    if 'running_reward' not in globals()
    else running_reward * 0.99 + ep_rs_sum * 0.01
)
if running_reward > DISPLAY_REWARD_THRESHOLD:
    RENDER = True  # rendering

# Prints the last step's post-processed reward, not the running reward.
print("episode:", i_episode, " reward:", reward)
# NOTE(review): whitespace-collapsed, truncated excerpt. The original line
# breaks and indentation are lost and the final `if` has no body here, so the
# nesting of the statements below must be confirmed against the complete
# source before this code is reformatted.
# Visible statements, in order:
#   - `i_episode` starts at 0 and is incremented after a `while True:` header;
#     a bounded `for i_episode in range(60000):` header is commented out.
#   - Per-episode setup: `state = env.reset()`, `done = False`, `user = 0`,
#     `reward1 = reward2 = 0`, then a `while not done:` loop.
#   - `user == 0`: the action from `RL.choose_action(state)` is passed to
#     `env.step(action1, -1)`; `RL.store_transition(state, action1, reward1)`
#     follows an `if done:` test, then `state = state1` and both rewards are
#     reset to 0.
#   - `user == 1`: `env.action_space.sample()` gives an index that is split
#     into `x = index % 3` and `y = index // 3`; `state1` is scanned in slices
#     of three (offsets 0, 3, ..., 24) and each slice is compared element-wise
#     with `[x, y, 1]` (sets `found = True` and breaks) and `[x, y, -1]`.
i_episode = 0 # for i_episode in range(60000): while True: i_episode += 1 state = env.reset() done = False user = 0 reward1 = reward2 = 0 while not done: if user == 0: action1 = RL.choose_action(state) state1, reward1, done, infos = env.step(action1, -1) if done: RL.store_transition(state, action1, reward1) state = state1 reward1 = reward2 = 0 elif user == 1: while True: random_act = env.action_space.sample() x = random_act % 3 y = random_act // 3 found = False for i in range(0, 27, 3): chunk = state1[i : i + 3] # print("chunk=",chunk) if ([x,y,1] == chunk).all(): found = True break if ([x,y,-1] == chunk).all():
# output_graph=True, ) for i_episode in range(3000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action( observation ) #agent根据策略\pi进行探索,直到探索结束. 一轮探索的所有结果<observation, action, reward>存储在记忆库中,用于训练 observation_, reward, done, info = env.step( action) #所以policy gradient是非常耗时的,大多数时间都花费在与环境交互上 RL.store_transition(observation, action, reward) #每一轮探索都会将相关的东西存储到replay if done: ep_rs_sum = sum(RL.ep_rs) #所有奖励值之和 if 'running_reward' not in globals(): running_reward = ep_rs_sum else: running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering print("episode:", i_episode, " reward:", int(running_reward)) vt = RL.learn() # if i_episode == 0: # plt.plot(vt) # plot the episode vt