def main():
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load a previously saved model (optional)
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
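
# A minimal sketch of the calc_reward_to_go helper referenced above; the actual
# implementation is not shown in these excerpts. This version assumes a discount
# factor (gamma=0.99 is an assumed default) and ends with the same mean/std
# normalization that appears in the later fragments.
import numpy as np

def calc_reward_to_go(reward_list, gamma=0.99):
    """Compute the discounted reward-to-go G_t for each step, then normalize."""
    reward_arr = np.array(reward_list, dtype='float32')
    for i in range(len(reward_arr) - 2, -1, -1):
        # G_t = r_t + gamma * G_{t+1}
        reward_arr[i] += gamma * reward_arr[i + 1]
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr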
def main():
    # Create the environment
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the saved model
    if os.path.exists('model.ckpt'):
        agent.restore('model.ckpt')
        print("restore_succeed")

    eval_reward = evaluate(env, agent, render=True)
    return eval_reward
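
# The Pong scripts above feed the agent a flat 80*80 = 6400 vector, but the
# preprocess step itself is not shown in these excerpts. A sketch of the
# commonly used Atari-Pong preprocessing (crop, downsample, binarize) that is
# consistent with obs_dim = 80 * 80; the exact thresholds are assumptions.
import numpy as np

def preprocess(image):
    """Turn a 210x160x3 uint8 Pong frame into a flat 6400 (80x80) float vector."""
    image = image[35:195]        # crop out the scoreboard and borders
    image = image[::2, ::2, 0]   # downsample by a factor of 2, keep one channel
    image[image == 144] = 0      # erase background (type 1)
    image[image == 109] = 0      # erase background (type 2)
    image[image != 0] = 1        # paddles and ball become 1
    return image.astype('float32').ravel()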
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the saved model
    # model_name = './model_episode_100.ckpt'
    model_name = MODEL_NAME
    if os.path.exists(model_name):
        agent.restore(model_name)

    step_cnt = 0
    for i in range(1):
        obs = env.reset()
        warm_up(env)
        episode_reward = 0
        while True:
            obs = preprocess(obs)  # from shape (96, 96, 3) to (896,)
            action = agent.predict(obs)
            action = convert_action(action)
            obs, reward, isOver, _ = env.step(action)
            add_to_frames(obs)
            if ENABLE_RENDER:
                env.render()
            episode_reward += reward
            if isOver:
                print(isOver, episode_reward, MODEL_NAME)
                # if ENABLE_RENDER:
                #     input()
                break
    env.close()

    if ENABLE_GIF_OUTPUT:
        print("gif writing")
        outfilename = MODEL_NAME + ".gif"
        imageio.mimsave(outfilename, frames, 'GIF', duration=0.02)
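
# convert_action above maps the 3 discrete policy outputs onto CarRacing-v0's
# continuous [steering, gas, brake] action. The helper itself is not shown in
# these excerpts; the mapping below is a hypothetical sketch and the throttle /
# steering values are assumptions, not the original script's values.
import numpy as np

def convert_action(action):
    """Map a discrete action (0: straight, 1: left, 2: right) to [steer, gas, brake]."""
    if action == 0:
        return np.array([0.0, 0.3, 0.0])   # drive straight with light throttle
    elif action == 1:
        return np.array([-1.0, 0.1, 0.0])  # steer left
    else:
        return np.array([1.0, 0.1, 0.0])   # steer right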
def main():
    env = JumpGame()
    np.random.seed(0)
    action_dim = 2
    obs_shape = 13

    model = Model(act_dim=action_dim)
    algorithm = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape, act_dim=action_dim)

    # Load the saved model
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            # render=True to watch the rendering; this must be run locally,
            # AI Studio cannot display it
            total_reward = evaluate(env, agent, render=False)
            logger.info('Test reward: {}'.format(total_reward))
            save_path = './model/dqn_model_{}_{}.ckpt'.format(i, total_reward)
            agent.save(save_path)

    # Save the model to ./model.ckpt
    agent.save('./model.ckpt')
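
# run_episode, used by all of the training loops in these excerpts, collects one
# full episode with the stochastic policy. A sketch following the standard PARL
# policy-gradient example; it assumes the agent exposes a sample() method, and
# any environment-specific preprocessing (e.g. preprocess(obs) for Pong) is
# omitted here.
def run_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        action = agent.sample(obs)          # sample an action from the policy
        action_list.append(action)
        obs, reward, done, _ = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list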
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3  # simply: straight, left and right
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the saved model (optional)
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))
            ckpt = './model_episode_{}.ckpt'.format(i + 1)
            agent.save(ckpt)  # save the parameters to ckpt

    agent.save('./final.ckpt')
    env.close()
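
# evaluate, called every 100 episodes in the training loops, runs the greedy
# policy and averages the total reward. The helper is not shown in these
# excerpts; this is a sketch under the same assumptions, with any
# environment-specific preprocessing or action conversion omitted, and the
# number of evaluation episodes (5) chosen arbitrarily.
import numpy as np

def evaluate(env, agent, render=False):
    eval_rewards = []
    for _ in range(5):                      # average over 5 evaluation episodes
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)     # greedy action, no sampling
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_rewards.append(episode_reward)
    return np.mean(eval_rewards)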
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr


# Create the environment
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

# Build the agent with the PARL framework
# 4. Following the in-class demo, build the agent by nesting Model, PolicyGradient and Agent
model = Model(act_dim=act_dim)
alg = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

# Load the saved model (optional)
# if os.path.exists('./model.ckpt'):
#     agent.restore('./model.ckpt')

for i in range(20000):
    obs_list, action_list, reward_list = run_episode(env, agent)
    # if i % 10 == 0:
    #     logger.info("Train Episode {}, Reward Sum {}.".format(i, sum(reward_list)))

    batch_obs = np.array(obs_list)
    batch_action = np.array(action_list)
    batch_reward = calc_reward_to_go(reward_list)
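
# All of these scripts instantiate the policy as Model(act_dim=...), but the
# model definition itself is not included in the excerpts. A sketch of what it
# might look like with the fluid-based PARL 1.x API used in these course
# examples; the hidden-layer size is an assumption.
import parl
from parl import layers   # fluid-style layer wrappers in PARL 1.x

class Model(parl.Model):
    def __init__(self, act_dim):
        hid1_size = act_dim * 10
        self.fc1 = layers.fc(size=hid1_size, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='softmax')  # action probabilities

    def forward(self, obs):
        out = self.fc1(obs)
        out = self.fc2(out)
        return out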
    reward_arr /= np.std(reward_arr)
    return reward_arr


# Create the environment
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

######################################################################
# Build the agent with the PARL framework
######################################################################
model = Model(act_dim=act_dim)
algorithm = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim)
######################################################################
######################################################################

# Load the saved model
if os.path.exists('./model_11.ckpt'):
    agent.restore('./model_11.ckpt')

init_level = -7
np.random.seed(1024)

for i in range(1000):
    obs_list, action_list, reward_list = run_episode(env, agent)
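
# Conceptually, each agent.learn(batch_obs, batch_action, batch_reward) call in
# these loops performs one REINFORCE update: maximize the mean of
# log pi(a_t | s_t) * G_t over the sampled episode. A framework-free sketch of
# that loss; policy_gradient_loss is an illustrative name, not part of PARL.
import numpy as np

def policy_gradient_loss(act_prob, actions, returns):
    """act_prob: (N, act_dim) policy probabilities; actions: (N,) int indices;
    returns: (N,) normalized reward-to-go values. Returns the scalar loss to minimize."""
    n = act_prob.shape[0]
    log_prob = np.log(act_prob[np.arange(n), actions] + 1e-8)
    return -np.mean(log_prob * returns)   # minimizing this maximizes expected return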
def trainTest():
    env = Maze()
    model = MazeModel(act_dim=ACT_DIM)
    alg = PolicyGradient(model, hyperparas={'lr': LEARNING_RATE})
    agent = MazeAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)

    beforeTrainReward = []
    global MeanReward
    print('BeforeTrain:')
    for i in range(1, 129):
        obs_list, action_list, reward_list = run_train_episode(env, agent)
        MeanReward = MeanReward + (sum(reward_list) - MeanReward) / i
        beforeTrainReward.append(MeanReward)
        logger.info('Episode:{}, nowReward:{:.2f}, avgReward:{:.2f}'.format(
            i, np.sum(reward_list), MeanReward))

    global ErrorCount
    ErrorCountBeforeTrain = ErrorCount

    print('TrainStart!')
    trainReward = []
    MeanReward = 0
    # Monte Carlo policy gradient (REINFORCE)
    # Iterate over up to 100,000 episodes
    for i in range(1, 100001):
        # Sample one episode
        obs_list, action_list, reward_list = run_train_episode(env, agent)
        # Track the expected reward with a running (moving) average
        MeanReward = MeanReward + (sum(reward_list) - MeanReward) / i
        if i % 10 == 0:
            trainReward.append(MeanReward)
            logger.info("Episode:{}, nowReward:{:.2f}, avgReward:{:.2f}.".format(
                i, sum(reward_list), MeanReward))
        if MeanReward >= 0 and i >= 256:
            break
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        # Compute the return G(t) by backing up rewards, then normalize it
        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
        # Learn from the sampled batch
        agent.learn(batch_obs, batch_action, batch_reward)

    print('TrainEnd!')
    input()  # pause before evaluation

    MeanReward = 0
    testReward = []
    ErrorCount = 0
    for i in range(1, 129):
        all_reward = run_evaluate_episode(env, agent)
        logger.info('Test reward: {:.3f}'.format(all_reward))
        MeanReward = MeanReward + (all_reward - MeanReward) / i
        testReward.append(MeanReward)
        logger.info('Episode:{}, nowReward:{:.2f}, avgReward:{:.2f}'.format(
            i, all_reward, MeanReward))
    ErrorCountAfterTrain = ErrorCount

    print()
    print('ErrorCountBeforeTrain:{}, ErrorCountAfterTrain:{}'.format(
        ErrorCountBeforeTrain, ErrorCountAfterTrain))

    plt.title('BeforeTrain')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(beforeTrainReward)

    plt.figure()
    X = np.arange(0, len(trainReward))
    X *= 10
    plt.title('Training')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(X, trainReward)

    plt.figure()
    plt.title('AfterTrain')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(testReward)
    plt.show()
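
# calc_discount_norm_reward(reward_list, GAMMA) above plays the same role as
# calc_reward_to_go in the earlier snippets, with the discount factor passed in
# explicitly. A sketch that backs up the rewards with a running sum and then
# normalizes, matching the comments in the training loop; the real helper is
# not shown in these excerpts.
import numpy as np

def calc_discount_norm_reward(reward_list, gamma):
    reward_arr = np.zeros(len(reward_list), dtype='float32')
    running_add = 0.0
    for i in reversed(range(len(reward_list))):
        running_add = reward_list[i] + gamma * running_add  # G_t = r_t + gamma * G_{t+1}
        reward_arr[i] = running_add
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr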