def main():
    """Run a single evaluation episode of Flappy Bird with the saved model.

    Loads the trained parameters, plays greedily (argmax over Q-values)
    until the episode terminates, and prints the final score.
    """
    # Create the game environment.
    env = flappyBird.GameState()
    # Build the network from the image-channel count and the action count,
    # restore the trained weights, and switch to inference mode.
    model = Model(resize_shape[0], env.action_dim)
    model.load_dict(paddle.load(save_model_path))
    model.eval()
    # Start a fresh episode.
    observation = env.reset()
    total_reward = 0
    finished = False
    # Greedy rollout: pick the argmax action each step until the game ends.
    while not finished:
        frame = preprocess(observation)
        batch = paddle.to_tensor(np.expand_dims(frame, axis=0), dtype='float32')
        q_values = model(batch)
        chosen = paddle.argmax(q_values).numpy()[0]
        observation, reward, finished, info = env.step(chosen, is_train=False)
        total_reward += reward
    print("最终得分为:{:.2f}".format(total_reward))
def main():
    """Drive Flappy Bird with random actions forever, logging every transition."""
    # Create the game environment.
    env = flappyBird.GameState()
    observation = env.reset()
    # Endless demo loop: sample an action, apply it, and dump the results.
    while True:
        # env.action_space() yields a random integer action.
        sampled = env.action_space()
        # Advance the game by one step.
        observation, reward, terminal, info = env.step(sampled, is_train=False)
        print("=" * 50)
        print("action:", sampled)
        print("obs shape:", observation.shape)
        print("reward:", reward)
        print("terminal:", terminal)
        print("info:", info)
        # Restart the episode once the game signals it is over.
        if terminal:
            observation = env.reset()
from replay_memory import ReplayMemory # 定义训练的参数 batch_size = 256 # batch大小 num_episodes = 10000 # 训练次数 memory_size = 20000 # 内存记忆 n_step = 3 # 往后的步数 learning_rate = 1e-4 # 学习率大小 e_greed = 0.1 # 探索初始概率 gamma = 0.99 # 奖励系数 e_greed_decrement = 1e-6 # 在训练过程中,降低探索的概率 update_num = 0 # 用于计算目标模型更新次数 resize_shape = (1, 36, 52) # 训练缩放的大小,减少模型计算,原大小(288, 512) save_model_path = "models/model.pdparams" # 保存模型路径 env = flappyBird.GameState() obs_dim = resize_shape[0] action_dim = env.action_dim policyQ = Model(obs_dim, action_dim) targetQ = Model(obs_dim, action_dim) targetQ.eval() n_step_buffer = [] rpm = ReplayMemory(memory_size) optimizer = paddle.optimizer.Adam(parameters=policyQ.parameters(), learning_rate=learning_rate) def preprocess(observation): observation = observation[:observation.shape[0] - 100, :]