Example #1
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for episode_i in range(1000):
    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if "running_reward" not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  #True  # rendering
            print("episode:", eposide_i, "  reward:", int(running_reward))

            vt = RL.learn()  # train
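All of these snippets rely on the agent buffering one episode at a time: store_transition appends an <observation, action, reward> triple, and ep_rs exposes the rewards so the loop can sum them. A minimal sketch of that buffering, assuming a Morvan-style PolicyGradient agent (the class itself is not shown in any of these examples):

class EpisodeBufferSketch:
    """Illustrative only: the per-episode memory the examples read via RL.ep_rs."""

    def __init__(self):
        # One list per field, cleared after each learn() call.
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

    def store_transition(self, s, a, r):
        # Append a single <observation, action, reward> triple for this episode.
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def clear(self):
        # Called at the end of learn() so the next episode starts empty.
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []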
Example #2
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train

            if i_episode == 30:
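Each example smooths the raw episode return with the same exponential moving average before comparing it to DISPLAY_REWARD_THRESHOLD. Pulled out as a standalone helper (a sketch: the 0.99/0.01 weights are taken from the loops above, and a None check stands in for the `'running_reward' not in globals()` test):

def update_running_reward(running_reward, ep_rs_sum, decay=0.99):
    """Exponential moving average of episode returns, as in the loops above."""
    if running_reward is None:      # first episode: start from the raw episode sum
        return ep_rs_sum
    return running_reward * decay + ep_rs_sum * (1.0 - decay)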
Example #3
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)  # store this episode's transitions

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # decide whether to show the simulation
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # learn and return vt

            if i_episode == 0:
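RL.learn() returns vt, which several of these snippets plot on an early episode. Assuming the usual policy-gradient recipe, vt would be the discounted cumulative reward per step, normalised to zero mean and unit variance; a sketch of that computation (not taken from these examples):

import numpy as np

def discount_and_norm_rewards(ep_rs, gamma=0.99):
    """Discounted cumulative rewards for one episode, then z-score normalisation."""
    discounted = np.zeros(len(ep_rs), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    discounted -= discounted.mean()
    std = discounted.std()
    if std > 0:                     # constant rewards would otherwise divide by zero
        discounted /= std
    return discounted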
Example #4
    for i in range(steps_per_episode):

        action_id = RL.choose_action(state)
        action_space = env.get_possible_action_space()
        action = action_space[action_id]
        if random_action:
            action = action_space[np.random.randint(len(action_space))]

        env.apply_action(action)

        next_state, reward = env.update_state()
        next_state = np.array(next_state)
        reward = postprocessreward(reward, th)

        RL.store_transition(state, action_id, reward)
        state = next_state

        if reward > 1:
            break
        #print(state)

    ep_rs_sum = sum(RL.ep_rs)

    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
    print("episode:", i_episode, "  reward:", reward)
Example #5
i_episode = 0
# for i_episode in range(60000):
while True:
	i_episode += 1
	state = env.reset()

	done = False
	user = 0
	reward1 = reward2 = 0
	while not done:

		if user == 0:
			action1 = RL.choose_action(state)
			state1, reward1, done, infos = env.step(action1, -1)
			if done:
				RL.store_transition(state, action1, reward1)
				state = state1
				reward1 = reward2 = 0
		elif user == 1:
			while True:
				random_act = env.action_space.sample()
				x = random_act % 3
				y = random_act // 3
				found = False
				for i in range(0, 27, 3):
					chunk = state1[i : i + 3]
					# print("chunk=",chunk)
					if ([x,y,1] == chunk).all():
						found = True
						break
					if ([x,y,-1] == chunk).all():
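The inner while loop above redraws a random move and scans the flattened board state in [x, y, occupancy] chunks of three to see whether the chosen cell is already taken. The same check as a small helper, assuming state1 is a flat array laid out that way (a sketch, not the original code):

import numpy as np

def cell_is_occupied(state1, x, y):
    """True if cell (x, y) appears in the state with occupancy +1 or -1."""
    board = np.asarray(state1).reshape(-1, 3)     # rows of [x, y, occupancy]
    for cx, cy, occ in board:
        if cx == x and cy == y and occ in (1, -1):
            return True
    return False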
Example #6
    # output_graph=True,
)

for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        # The agent explores with its policy π until the episode ends; every
        # <observation, action, reward> from the rollout is stored in memory for training.
        action = RL.choose_action(observation)
        # Policy gradient is therefore time-consuming: most of the time goes into
        # interacting with the environment.
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)  # each rollout is appended to the replay memory

        if done:
            ep_rs_sum = sum(RL.ep_rs)  # sum of all rewards in the episode

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()
            # if i_episode == 0:
            #     plt.plot(vt)    # plot the episode vt
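The commented-out lines above suggest plotting vt from the first episode; a self-contained version of that plot, assuming matplotlib is available:

import matplotlib.pyplot as plt

def plot_episode_vt(vt):
    """Plot the per-step values returned by RL.learn() for one episode."""
    plt.plot(vt)
    plt.xlabel("episode step")
    plt.ylabel("normalised discounted return (vt)")
    plt.show()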