# Code example #1 (score: 0)
# --- TD3-style agent setup: one actor and twin critics, each paired with a
# frozen target copy that is initialized from the online network's weights. ---
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# Small epsilon tensor, presumably used downstream to avoid log(0) /
# division by zero — TODO confirm against the training loop.
min_val = paddle.to_tensor(1e-7).astype('float32')


def _sync_target(target, source):
    """Put *target* in eval mode, copy *source*'s weights into it, return it.

    Factors out the construct -> eval() -> load_dict(state_dict()) pattern
    that was previously repeated for the actor and both critics.
    """
    target.eval()
    target.load_dict(source.state_dict())
    return target


actor = Actor(state_dim, action_dim, max_action)
target_actor = _sync_target(Actor(state_dim, action_dim, max_action), actor)
actor_optimizer = paddle.optimizer.RMSProp(parameters=actor.parameters(),
                                           learning_rate=learning_rate)

# Twin critics (clipped double-Q style): two independent networks, each with
# its own target copy and optimizer.
critic_1 = Critic(state_dim, action_dim)
target_critic_1 = _sync_target(Critic(state_dim, action_dim), critic_1)
critic_2 = Critic(state_dim, action_dim)
target_critic_2 = _sync_target(Critic(state_dim, action_dim), critic_2)
critic_1_optimizer = paddle.optimizer.RMSProp(parameters=critic_1.parameters(),
                                              learning_rate=learning_rate)
critic_2_optimizer = paddle.optimizer.RMSProp(parameters=critic_2.parameters(),
                                              learning_rate=learning_rate)

# Experience replay buffer.
rpm = ReplayMemory(memory_size)


def train():
    global epoch
    total_reward = 0
# Code example #2 (score: 0) — file: train.py, project: wobushihuair/Paddle-RLBooks
# Environment action bounds and a small epsilon tensor (presumably for
# numerically stable log/division downstream — TODO confirm).
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
min_val = paddle.to_tensor(1e-7).astype('float32')

# Policy network and its optimizer.
actor = Actor(state_dim, action_dim, max_action)
actor_optimizer = paddle.optimizer.RMSProp(
    parameters=actor.parameters(), learning_rate=learning_rate)

# Action-value network and its optimizer.
Q_net = Q(state_dim, action_dim)
Q_optimizer = paddle.optimizer.RMSProp(
    parameters=Q_net.parameters(), learning_rate=learning_rate)

# State-value network plus a frozen target copy seeded with its weights.
critic = Critic(state_dim)
target_critic = Critic(state_dim)
target_critic.eval()
target_critic.load_dict(critic.state_dict())
critic_optimizer = paddle.optimizer.RMSProp(
    parameters=critic.parameters(), learning_rate=learning_rate)

# Experience replay buffer.
rpm = ReplayMemory(memory_size)

def train():
    global epoch
    total_reward = 0
    # 重置游戏状态
    state = env.reset()
    while True:
        action = actor.select_action(state)

        next_state, reward, done, info = env.step(action)
        env.render()