def main():
    # Load the data
    df = pd.read_csv('TD3gupiao/DATA/AAPL.csv')
    df = df.sort_values('Date')

    # Create the environment
    env = StockTradingEnv(df)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
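# run_train_episode and run_evaluate_episode are called above but not shown in
# this snippet. Below is a minimal sketch of what they typically look like for
# an off-policy PARL agent (TD3/SAC/DDPG). WARMUP_STEPS and BATCH_SIZE are
# assumed hyperparameters, and the exact agent/rpm method signatures are an
# assumption modeled on common PARL examples, not this repo's implementation.
def run_train_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        action = agent.sample(obs)  # exploration action (with noise)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        # start learning once the replay memory is warmed up
        if rpm.size() > WARMUP_STEPS:
            batch = rpm.sample_batch(BATCH_SIZE)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward, steps


def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.predict(obs)  # greedy action, no exploration noise
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward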
def run_episode(envs, agent, rpm, episode):
    total_reward = 0
    Actions = [int(action_dim / 2) for _ in range(process_num)]
    Obses, _, _ = envs.next(Actions)
    step = 0
    for _ in range(1000):
        step += 1
        Actions = []
        for obs in Obses:
            # sample actions so that every action has some probability of being tried
            action = agent.sample(obs)
            Actions.append(action)
        Next_obses, Rewards, Dones = envs.next(Actions)
        # could this be split into a separate process from training?
        for i in range(len(Actions)):
            rpm.append(
                (Obses[i], Actions[i], Rewards[i], Next_obses[i], Dones[i]))

        # train the model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs,
                                     batch_done)  # s, a, r, s', done

        total_reward += np.sum(Rewards)
        Obses = Next_obses

        if not step % 20:
            logger.info('step:{} e_greed:{} reward:{}'.format(
                step, agent.e_greed, np.sum(Rewards)))
        if not step % 500:
            image = pygame.surfarray.array3d(
                pygame.display.get_surface()).copy()
            image = np.flip(image[:, :, [2, 1, 0]], 0)
            image = np.rot90(image, 3)
            img_pt = os.path.join(
                'outputs', 'snapshot_{}_{}.jpg'.format(episode, step))
            cv2.imwrite(img_pt, image)
    return total_reward
def run_train_step(agent, rpm):
    for step in range(args.train_total_steps):
        # use the first 80% of the data to train
        batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
            args.batch_size * gpu_num)
        batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
        batch_next_obs = batch_all_obs[:, 1:, :, :]
        cost = agent.learn(batch_obs, batch_action, batch_reward,
                           batch_next_obs, batch_isOver)

        if step % 100 == 0:
            # use the last 20% of the data to evaluate
            batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch(
                args.batch_size)
            batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
            batch_next_obs = batch_all_obs[:, 1:, :, :]
            eval_cost = agent.supervised_eval(batch_obs, batch_action,
                                              batch_reward, batch_next_obs,
                                              batch_isOver)
            logger.info(
                "train step {}, train cost is {}, eval cost is {}.".format(
                    step, cost, eval_cost))
def main():
    env = JumpGame()
    np.random.seed(0)
    action_dim = 2
    obs_shape = 13

    model = Model(act_dim=action_dim)
    algorithm = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape, act_dim=action_dim)

    # Load the model if a checkpoint exists
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            # render=True shows the rendered game; run locally, since AI Studio
            # cannot display it
            total_reward = evaluate(env, agent, render=False)
            logger.info('Test reward: {}'.format(total_reward))
            save_path = './model/dqn_model_{}_{}.ckpt'.format(i, total_reward)
            agent.save(save_path)

    # save the model parameters to ./model.ckpt
    agent.save('./model.ckpt')
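# calc_reward_to_go is used above but not defined in this snippet. A minimal
# sketch of the usual policy-gradient implementation (discounted reward-to-go
# with an assumed discount factor `gamma`); the normalization step is a common
# choice, not necessarily what this repo does.
import numpy as np


def calc_reward_to_go(reward_list, gamma=0.99):
    """G_t = r_t + gamma * G_{t+1}, computed backwards over one episode."""
    reward_arr = np.array(reward_list, dtype='float32')
    for i in range(len(reward_arr) - 2, -1, -1):
        reward_arr[i] += gamma * reward_arr[i + 1]
    # normalizing the returns usually stabilizes policy-gradient training
    reward_arr -= np.mean(reward_arr)
    reward_arr /= (np.std(reward_arr) + 1e-8)
    return reward_arr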
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = ActorModel(act_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(
        actor,
        critic,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)

    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)

    # if os.path.exists('model_dir/steps_140848.ckpt'):
    #     agent.restore('model_dir/steps_140848.ckpt')
    #     print("Restore succeed")

    # parl also ships a built-in ReplayMemory for DDPG, importable from parl.utils
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim + 1)

    # Start training
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Train reward: {}'.format(total_steps, train_reward))  # log the training reward

        if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate the model every TEST_EVERY_STEPS steps
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1

            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))  # log the evaluation reward

            # Save the model after every evaluation, named by the training step count
            ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
def _parse_memory(self, client, ident, last_obs):
    mem = client.memory
    n = len(mem)

    # debug info
    if ident == 1:
        for i, exp in enumerate(mem):
            logger.info(
                "[step:{}] obs:{} action:{} reward:{} shaping_reward:{}".
                format(i, np.sum(mem[i].obs), np.sum(mem[i].action),
                       mem[i].reward, mem[i].info['shaping_reward']))

    episode_rpm = []
    for i in range(n - 1):
        if not mem[i].info['target_changed']:
            episode_rpm.append([
                mem[i].obs, mem[i].action, mem[i].info['shaping_reward'],
                mem[i + 1].obs, False, mem[i].info['target_change_times']
            ])
    if not mem[-1].info['target_changed']:
        # use the last transition's own target_change_times (the loop index i
        # above would be stale here, and undefined when n == 1)
        episode_rpm.append([
            mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'],
            last_obs, not mem[-1].info['timeout'],
            mem[-1].info['target_change_times']
        ])

    indicators_dict = calc_indicators(mem)
    indicators_dict['free_client_num'] = self.ready_client_queue.qsize()
    indicators_dict['noiselevel'] = self.noiselevel

    with self.MEMORY_LOCK:
        self.add_episode_rpm(episode_rpm)
        self.scalars_manager.record(indicators_dict, self.global_step)
        self.global_step += 1
        if self.global_step >= 50:
            self.noiselevel = self.noiselevel * NOISE_DECAY

    client.reset()
def main():
    env = Paddle()
    np.random.seed(0)

    action_dim = 3
    obs_shape = 5

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring decreases during training

    save_path = './dqn_model.ckpt'
    agent.restore(save_path)
    print("Model loaded successfully")

    eval_reward = evaluate(agent, env)
    logger.info('test_reward:{}'.format(eval_reward))
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    # obs = rgb2gray(obs)
    # print(obs.shape)
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()
        context.append(obs)
        # print(obs.shape)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # start training
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                    args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        # obs = rgb2gray(next_obs)
        obs = next_obs
        # print(obs.shape)
        if isOver:
            break
    if all_cost:
        logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
            total_reward, np.mean(all_cost)))
    return total_reward, steps, np.mean(all_cost)
def main():
    # Create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=False)
    p.reset_game()

    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    print("act_dim:", act_dim)

    obs_dim = 200 * 200

    # Build the agent with parl: PongModel, DDPG and PongAgent nested together
    model = PongModel(act_dim)
    algorithm = DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000

    # Start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=True)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
def main():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)

    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 2000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env)
        logger.info('episode:{} test_reward:{}'.format(
            episode, eval_reward))
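# run_episode and evaluate are called above but not defined in this snippet.
# A minimal sketch of the usual DQN versions follows; it assumes the Agent
# exposes sample()/predict()/learn(), the ReplayMemory exposes append()/sample()
# with the tuple layout used elsewhere in this file, and LEARN_FREQ/BATCH_SIZE
# are hyperparameters defined at module level.
def run_episode(agent, env, rpm):
    total_reward, step = 0, 0
    obs = env.reset()
    while True:
        step += 1
        action = agent.sample(obs)  # epsilon-greedy action
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        # learn only after the memory is warmed up, every LEARN_FREQ steps
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward


def evaluate(agent, env, render=False):
    # average the greedy policy's return over a few episodes
    eval_rewards = []
    for _ in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # greedy action
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_rewards.append(episode_reward)
    return np.mean(eval_rewards)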
def main():
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)

    file_path = "memory.npz"
    rpm = ReplayMemory(
        MEMORY_SIZE,
        IMAGE_SIZE,
        CONTEXT_LEN,
        load_file=True,  # load replay memory data from file
        file_path=file_path)
    act_dim = env.action_space.n

    model = AtariModel(act_dim)
    algorithm = DQN(
        model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE * gpu_num)
    agent = AtariAgent(
        algorithm, act_dim=act_dim, total_step=args.train_total_steps)

    if os.path.isfile('./model_dir'):
        logger.info("load model from file")
        agent.restore('./model_dir')

    if args.train:
        logger.info("train with memory data")
        run_train_step(agent, rpm)
        logger.info("finish training. Save the model.")
        agent.save('./model_dir')
    else:
        logger.info("collect experience")
        collect_exp(env, rpm, agent)
        rpm.save_memory()
        logger.info("finish collecting, save successfully")
def __init__(self,
             max_size,
             obs_shape,
             context_len,
             load_file=False,
             file_path=None):
    self.max_size = int(max_size)
    self.obs_shape = obs_shape
    self.context_len = int(context_len)
    self.file_path = file_path

    if load_file and os.path.isfile(file_path):
        logger.info("load memory from file " + self.file_path)
        self.load_memory()
        logger.info("memory size is {}".format(self._curr_size))
    else:
        self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
        self.action = np.zeros((self.max_size, ), dtype='int32')
        self.reward = np.zeros((self.max_size, ), dtype='float32')
        self.isOver = np.zeros((self.max_size, ), dtype='bool')
        self._curr_size = 0
        self._curr_pos = 0
        self._context = deque(maxlen=context_len - 1)
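# save_memory and load_memory are referenced by this ReplayMemory but not shown.
# A minimal sketch of how such methods might persist the buffer with numpy's
# .npz format, matching the "memory.npz" file_path used above; the exact archive
# keys and the re-creation of the context deque are assumptions, not this repo's
# actual implementation.
def save_memory(self):
    np.savez(
        self.file_path,
        obs=self.obs,
        action=self.action,
        reward=self.reward,
        isOver=self.isOver,
        curr_size=self._curr_size,
        curr_pos=self._curr_pos)


def load_memory(self):
    data = np.load(self.file_path)
    self.obs = data['obs']
    self.action = data['action']
    self.reward = data['reward']
    self.isOver = data['isOver']
    self._curr_size = int(data['curr_size'])
    self._curr_pos = int(data['curr_pos'])
    # rebuild the rolling context window after loading
    self._context = deque(maxlen=self.context_len - 1)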
def test_reset_actor(self):
    logger.info("running: test_reset_actor")
    # start the master
    master = Master(port=8237)
    th = threading.Thread(target=master.run)
    th.start()
    time.sleep(3)
    worker1 = Worker('localhost:8237', 4)
    parl.connect('localhost:8237')
    for _ in range(10):
        actor = Actor()
        ret = actor.add_one(1)
        self.assertEqual(ret, 2)
        del actor

    for _ in range(10):
        if master.cpu_num == 4:
            break
        time.sleep(10)
    self.assertEqual(master.cpu_num, 4)

    worker1.exit()
    master.exit()
def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg, OBS_DIM, ACT_DIM)
    with fluid.dygraph.guard():
        for i in range(1000):  # 1000 episodes
            obs_list, action_list, reward_list = run_episode(env, agent)
            if i % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    i, sum(reward_list)))

            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)
            if (i + 1) % 100 == 0:
                _, _, reward_list = run_episode(
                    env, agent, train_or_test='test')
                total_reward = np.sum(reward_list)
                logger.info('Test reward: {}'.format(total_reward))
def main():
    env = gym.make(args.env)
    env.seed(seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = float(env.action_space.high[0])

    algorithm = SAC(
        ActorModel(act_dim),
        CriticModel(),
        max_action=act_limit,
        gamma=gamma,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(replay_size, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            ckpt = 'star2_model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
def watch():
    import torch
    game = FlappyBird()
    env = PLE(
        game,
        fps=30,
        frame_skip=4,
        display_screen=True,
        force_fps=False,
        reward_values={"tick": 0.00},
        state_preprocessor=None)
    env.init()
    model = Model(obs_dim=OBS_DIM, act_dim=ACT_DIM)
    if torch.cuda.is_available():
        model = model.cuda()
    model.load_state_dict(torch.load('checkpoint.pt'))
    from parl.algorithms.torch import PolicyGradient
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = Agent(alg)
    for i in range(10000):  # 10000 episodes
        obs_list, action_list, reward_list = run_episode(env, agent)

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        # agent.learn(batch_obs, batch_action, batch_reward)

        _, _, reward_list = run_episode(env, agent, train_or_test='test')
        total_reward = np.sum(reward_list)
        logger.info('Test reward: {}'.format(total_reward))
def _create_client_monitor(self, client_heartbeat_address):
    """When a new client connects to the master, a socket is created to
    send heartbeat signals to the client.
    """
    client_heartbeat_socket = self.ctx.socket(zmq.REQ)
    client_heartbeat_socket.linger = 0
    client_heartbeat_socket.setsockopt(
        zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)
    client_heartbeat_socket.connect("tcp://" + client_heartbeat_address)

    client_is_alive = True
    while client_is_alive and self.master_is_alive:
        try:
            client_heartbeat_socket.send_multipart(
                [remote_constants.HEARTBEAT_TAG])
            client_status = client_heartbeat_socket.recv_multipart()

            self.cluster_monitor.update_client_status(
                client_status, client_heartbeat_address,
                self.client_hostname[client_heartbeat_address])

        except zmq.error.Again as e:
            client_is_alive = False

            self.cluster_monitor.drop_client_status(
                client_heartbeat_address)
            logger.warning("[Master] cannot connect to the client " +
                           "{}. ".format(client_heartbeat_address) +
                           "Please check if it is still alive.")
        time.sleep(remote_constants.HEARTBEAT_INTERVAL_S)
    logger.warning("Master exits client monitor for {}.\n".format(
        client_heartbeat_address))
    logger.info(
        "Master connects to {} workers and has {} vacant CPUs.\n".format(
            self.worker_num, self.cpu_num))
    client_heartbeat_socket.close(0)
def main():
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped  # Cancel the minimum score limit
    # env = VideoOffloadEnv()
    env = TrainEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the parl framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model if a checkpoint exists
    if os.path.exists('./policygradient_model'):
        agent.restore('./policygradient_model')
        print("Model loaded successfully, starting evaluation:")
        evaluate(env, agent)

    max_episode = 20000
    log_list = []
    fo = open(
        "log/" + str(math.floor(time.time() * 1000.0)) +
        "policygradient.txt", "w")

    train_episode = 0
    test_episode = 0
    while train_episode < max_episode:
        for i in range(0, 10):
            obs_list, action_list, reward_list = run_episode(env, agent)
            log_list.append("Train " + str(train_episode) + " " +
                            str(sum(reward_list)) + "\n")
            logger.info("train_episode:{} train_reward:{}.".format(
                train_episode, sum(reward_list)))
            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)
            agent.learn(batch_obs, batch_action, batch_reward)
            train_episode += 1

        total_reward = evaluate(env, agent)
        log_list.append("Test " + str(test_episode) + " " +
                        str(total_reward) + "\n")
        logger.info('test_episode:{} test_reward:{}'.format(
            test_episode, total_reward))
        test_episode += 1

    fo.writelines(log_list)
    fo.close()

    # save the parameters to ./policygradient_model
    agent.save('./policygradient_model')
    print("Model saved successfully")
def test():
    # Create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Create the replay memory (DQN experience replay)
    rpm = ReplayMemory(MEMORY_SIZE)

    # Build the agent with the parl framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=act_dim,
        e_greed=0.3,
        e_greed_decrement=1e-6)

    # Load the model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if steps == 1:
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # predict the action; always pick the greedy one
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0),
                 (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
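# drawText is used above to overlay the running score on the game screen but is
# not defined in this snippet. A minimal pygame-based sketch; the font choice
# and parameter order follow the call above, and the details are an assumption
# rather than this repo's exact helper.
import pygame


def drawText(screen, text, x, y, size, fg_color, bg_color):
    pygame.font.init()
    font = pygame.font.SysFont(None, size)  # default system font at the requested size
    surface = font.render(text, True, fg_color, bg_color)
    screen.blit(surface, (x, y))
    pygame.display.update()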
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # obs_dim += 1  # add 1 to obs dim for time step feature
    logger.info('observation_dim {}, action_dim {}'.format(obs_dim, act_dim))

    scaler = Scaler(obs_dim)

    model = Model(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = Agent(
        alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum(
            [np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps,
                    total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))
        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(
                env, agent, scaler, render=True)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
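# build_train_data above turns trajectories into discounted returns and
# advantages for PPO. A minimal sketch of the discounted-sum helper such code
# typically relies on; this is an illustrative assumption, not necessarily the
# repo's exact implementation.
import numpy as np


def discount_sum(x, gamma):
    """y[t] = x[t] + gamma * y[t+1], computed backwards over one trajectory."""
    y = np.zeros_like(x, dtype='float32')
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y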
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR,
        kappa=args.kappa,
        epoch=args.epoch,
        alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))
            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)
def main():
    # create the environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])
    env = Environment(
        total_bandwidth=10,
        distribution_list=[dist1, dist2, dist3],
        mu_list=[1, 2, 3],
        lambda_list=[3, 2, 1],
        num_of_each_type_distribution_list=[300, 300, 300])
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped  # Cancel the minimum score limit
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    obs_dim = 6
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the parl framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model if a checkpoint exists
    if os.path.exists('./policy_grad_model.ckpt'):
        agent.restore('./policy_grad_model.ckpt')
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list, gamma=0.9)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./policy_grad_model.ckpt
    agent.save('./policy_grad_model.ckpt')
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
    np.random.seed(seed)
    env = ProstheticsEnv(visualize=vis)
    env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = PelvisBasedObs(env)
    all_reward = []
    all_shaping_reward = 0
    last_frames_count = 0

    for e in range(episode_num):
        t = time.time()
        episode_reward = 0.0
        episode_shaping_reward = 0.0
        observation = env.reset(project=False)
        target_change_times = 0
        step = 0
        loss = []
        while True:
            step += 1
            action = submit_model.pred_batch(observation,
                                             target_change_times)
            observation, reward, done, info = env.step(action, project=False)
            step_frames = info['frame_count'] - last_frames_count
            last_frames_count = info['frame_count']
            episode_reward += reward
            # we place this check here to drop the first step after a target change
            if target_change_times >= 1:
                loss.append(10 * step_frames - reward)
            if info['target_changed']:
                target_change_times = min(target_change_times + 1, 3)
            logger.info("[step/{}]reward:{} info:{}".format(
                step, reward, info))
            episode_shaping_reward += info['shaping_reward']
            if done:
                break
        all_reward.append(episode_reward)
        all_shaping_reward += episode_shaping_reward
        t = time.time() - t
        logger.info(
            "[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}"
            .format(e, t, episode_reward, np.sum(loss[:15]),
                    np.sum(loss[15:]), np.mean(all_reward)))
    logger.info("Mean reward:{}".format(np.mean(all_reward)))
def load_params(self, dirname, from_one_head):
    if from_one_head:
        logger.info('[From one head, extend to multi head:]')
        # load model 0
        fluid.io.load_params(
            executor=self.fluid_executor,
            dirname=dirname,
            main_program=self.learn_programs[0])

        # sync identity params of model/target_model 0 to other models/target_models
        for i in range(1, self.ensemble_num):
            params = list(
                filter(
                    lambda x: 'identity' in x.name and '@GRAD' not in x.name,
                    self.learn_programs[i].list_vars()))
            for param in params:
                param_var = _fetch_var(param.name, return_numpy=False)
                model0_name = re.sub(r"identity_\d+", "identity_0",
                                     param.name)
                model0_value = _fetch_var(model0_name, return_numpy=True)
                logger.info('{} -> {}'.format(model0_name, param.name))
                param_var.set(model0_value, self.place)

        # sync shared params of target_model 0 to other target models
        # (after deepcopy, the shared params between target models differ)
        for i in range(1, self.ensemble_num):
            params = list(
                filter(
                    lambda x: 'shared' in x.name and 'PARL_target' in x.name
                    and '@GRAD' not in x.name,
                    self.learn_programs[i].list_vars()))
            for param in params:
                param_var = _fetch_var(param.name, return_numpy=False)
                model0_name = re.sub(r"_\d+$", "_0", param.name)
                model0_value = _fetch_var(model0_name, return_numpy=True)
                logger.info('{} -> {}'.format(model0_name, param.name))
                param_var.set(model0_value, self.place)
    else:
        for i in range(self.ensemble_num):
            fluid.io.load_params(
                executor=self.fluid_executor,
                dirname=dirname,
                main_program=self.learn_programs[i])
def keep_training(self):
    episode_count = 1000000
    for T in range(episode_count):
        if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs:
            self.learn()
            logger.info(
                "[keep_training/{}] trying to acquire a new env".format(T))

        # Keep training and prediction in balance:
        # after training, wait for a ready actor and let it start a new episode
        ready_actor_event = self.ready_actor_queue.get()
        ready_actor_event.set()

        if np.mod(T, 100) == 0:
            logger.info("saving models")
            self.save(T)
        if np.mod(T, 10000) == 0:
            logger.info("saving rpm")
            self.save_rpm()
def create_actors(self):
    parl.connect(self.config['master_address'])
    logger.info('Waiting for {} remote actors to connect.'.format(
        self.config['actor_num']))

    for i in six.moves.range(self.config['actor_num']):
        params_queue = queue.Queue()
        self.params_queues.append(params_queue)

        self.remote_count += 1
        logger.info('Remote actor count: {}'.format(self.remote_count))

        remote_thread = threading.Thread(
            target=self.run_remote_sample, args=(params_queue, ))
        remote_thread.setDaemon(True)
        remote_thread.start()

    logger.info('All remote actors are ready, begin to learn.')
    self.start_time = time.time()
def main():
    # Create the environment
    game = Snake(width=256, height=256, init_length=10)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.reset_game()
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs_dim = 256 * 256
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the parl framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # # Load the model
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')

    best_total_reward = -float('inf')
    for i in range(50000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 50 == 0:
            total_reward = evaluate(p, agent, render=True)
            if total_reward > best_total_reward:
                best_total_reward = total_reward
                agent.save(
                    'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                        i, total_reward))
            logger.info('Test reward: {}'.format(total_reward))
def main():
    # Create the environment
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model if a checkpoint exists
    if os.path.exists('model.ckpt'):
        agent.restore('model.ckpt')
        print("restore_succeed")

    best_reward = -float('inf')
    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            if total_reward > best_reward:
                best_reward = total_reward
                agent.save('model_dir/pg_pong_episode_{}_reward_{}'.format(
                    i, best_reward))
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3  # simply straight, left and right
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the parl framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))
            ckpt = './model_episode_{}.ckpt'.format(i + 1)
            agent.save(ckpt)  # save the parameters to ckpt

    agent.save('./final.ckpt')
    env.close()