Python PolicyGradient.store_transition 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: RL_brain

클래스/타입: PolicyGradient

메소드/함수: store_transition

hotexamples.com에서의 예제들: 6

Python PolicyGradient.store_transition - 6개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 RL_brain.PolicyGradient.store_transition(Python)의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

PolicyGradient(19)

learn(12)

store_transition(5)

choose_action(4)

save(1)

test_action(1)

select_action(1)

save_net(1)

save_model(1)

random_choose_action(1)

restore_model(1)

SaveNet(1)

plot_cost(1)

max_choose_action(1)

episode_reward_decay(1)

discount_rewards(1)

update_parameters(1)

예제 #1

파일 보기

    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

# Training loop: run 1000 episodes, handing every step's transition to RL and
# calling RL.learn() once the environment reports `done`.
# NOTE(review): no `break` or `observation = observation_` is visible in this
# excerpt; presumably the listing truncated the loop tail — confirm against the
# full script.
for eposide_i in range(1000):
    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # Pass this step's observation, action and reward to the agent.
        RL.store_transition(observation, action, reward)

        if done:
            # Sum of RL.ep_rs (presumably the rewards stored for this episode).
            ep_rs_sum = sum(RL.ep_rs)
            # Seed the running reward the first time through, i.e. while no
            # `running_reward` global has been defined yet.
            if "running_reward" not in globals():
                running_reward = ep_rs_sum

            else:
                # Exponential moving average: 99% previous value, 1% this episode.
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            # Rendering stays off here even above the threshold (True is commented out).
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = False  #True  # rendering
            print("episode:", eposide_i, "  reward:", int(running_reward))

            vt = RL.learn()  # train

예제 #2

파일 보기

파일: run_MountainCar.py 프로젝트: Emrys-Hong/Reinforcement-learning-with-tensorflow

    reward_decay=0.995,
    # output_graph=True,
)

# Training loop over 1000 episodes: choose an action, step the environment,
# store the transition, and train once the environment reports `done`.
# NOTE(review): the excerpt is cut off after this span, and no `break` or
# observation update is visible — confirm against the full script.
for i_episode in range(1000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        # Pass this step's observation, action and reward to the agent.
        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            # Seed the running reward while no `running_reward` global exists yet;
            # afterwards blend 99% of the old value with 1% of this episode's sum.
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            # Switch rendering on once the running reward passes the threshold.
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train

            if i_episode == 30:

예제 #3

파일 보기

파일: run_CartPole.py 프로젝트: ydlu/RL_learning

    reward_decay=0.99,
    # output_graph=True,
)

# Training loop over 3000 episodes: choose an action, step the environment,
# store the transition, and train once the environment reports `done`.
# NOTE(review): the excerpt is cut off after this span, and no `break` or
# observation update is visible — confirm against the full script.
for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)  # store this episode's transition

        if done:
            # Sum of RL.ep_rs (presumably the rewards stored for this episode).
            ep_rs_sum = sum(RL.ep_rs)

            # Seed the running reward while no `running_reward` global exists yet;
            # afterwards blend 99% of the old value with 1% of this episode's sum.
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # decide whether to display the simulation
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # learn; returns vt

            if i_episode == 0:

예제 #4

파일 보기

    # Run at most `steps_per_episode` steps of one episode, storing each
    # transition, then update the running reward. `state`, `i_episode`, `th`
    # and `random_action` come from code outside this excerpt.
    for i in range(steps_per_episode):

        # The policy returns an index into the environment's currently
        # possible actions.
        action_id = RL.choose_action(state)
        action_space = env.get_possible_action_space()
        action = action_space[action_id]
        if random_action:
            # Override the policy's choice with a uniformly random action.
            action = action_space[np.random.randint(len(action_space))]

        env.apply_action(action)

        next_state, reward = env.update_state()
        next_state = np.array(next_state)
        # Post-process the raw reward with the helper defined elsewhere.
        reward = postprocessreward(reward, th)

        # NOTE(review): when random_action is set, the applied action may differ
        # from the `action_id` stored here — confirm this is intended.
        RL.store_transition(state, action_id, reward)
        state = next_state

        # End the episode early once the post-processed reward exceeds 1.
        if reward > 1:
            break
        #print(state)

    # Sum of RL.ep_rs (presumably the rewards stored for this episode).
    ep_rs_sum = sum(RL.ep_rs)

    # Seed the running reward while no `running_reward` global exists yet;
    # afterwards blend 99% of the old value with 1% of this episode's sum.
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
    # NOTE(review): this prints the last step's `reward`, not `running_reward`
    # — confirm which value is meant to be reported.
    print("episode:", i_episode, "  reward:", reward)

예제 #5

파일 보기

파일: run_TicTacToe.py 프로젝트: Cybernetic1/policy-gradient

i_episode = 0
# for i_episode in range(60000):
while True:
	i_episode += 1
	state = env.reset()

	done = False
	user = 0
	reward1 = reward2 = 0
	while not done:

		if user == 0:
			action1 = RL.choose_action(state)
			state1, reward1, done, infos = env.step(action1, -1)
			if done:
				RL.store_transition(state, action1, reward1)
				state = state1
				reward1 = reward2 = 0
		elif user == 1:
			while True:
				random_act = env.action_space.sample()
				x = random_act % 3
				y = random_act // 3
				found = False
				for i in range(0, 27, 3):
					chunk = state1[i : i + 3]
					# print("chunk=",chunk)
					if ([x,y,1] == chunk).all():
						found = True
						break
					if ([x,y,-1] == chunk).all():

예제 #6

파일 보기

    # output_graph=True,
)

# Training loop over 3000 episodes: choose an action, step the environment,
# store the transition, and train once the environment reports `done`.
# NOTE(review): no `break` or observation update is visible in this excerpt;
# presumably the listing truncated the loop tail — confirm against the full script.
for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(
            observation
        )  # The agent explores according to policy \pi until the episode ends. All <observation, action, reward> results of one episode are stored in memory and used for training
        observation_, reward, done, info = env.step(
            action)  # so policy gradient is very time-consuming; most of the time is spent interacting with the environment
        RL.store_transition(observation, action,
                            reward)  # each round of exploration stores the relevant data in the replay memory

        if done:
            ep_rs_sum = sum(RL.ep_rs)  # sum of all reward values

            # Seed the running reward while no `running_reward` global exists yet;
            # afterwards blend 99% of the old value with 1% of this episode's sum.
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()
            # if i_episode == 0:
            #     plt.plot(vt)    # plot the episode vt