def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
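# --- A minimal sketch (not part of the original file) of the episode helpers
# that the mains here call but do not define. Method names (agent.sample /
# agent.predict / agent.learn, rpm.append / rpm.sample_batch / rpm.size)
# follow the usual PARL example layout, and the two constants below are
# assumed values, not taken from the original code.
WARMUP_SIZE = 10000  # assumed: start learning after this many transitions
BATCH_SIZE = 256  # assumed


def run_train_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps, done = 0, 0, False
    while not done:
        steps += 1
        action = agent.sample(obs)  # action with exploration noise
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        # learn only once the replay memory holds enough transitions
        if rpm.size() > WARMUP_SIZE:
            batch = rpm.sample_batch(BATCH_SIZE)
            agent.learn(*batch)
        obs = next_obs
        total_reward += reward
    return total_reward, steps


def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward, done = 0, False
    while not done:
        action = agent.predict(obs)  # greedy action, no exploration noise
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward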
def log_metrics(self): """ Log metrics of learner and actors """ if self.start_time is None: return metrics = [] while True: try: metric = self.remote_metrics_queue.get_nowait() metrics.append(metric) except queue.Empty: break episode_rewards, episode_steps = [], [] for x in metrics: episode_rewards.extend(x['episode_rewards']) episode_steps.extend(x['episode_steps']) max_episode_rewards, mean_episode_rewards, min_episode_rewards, \ max_episode_steps, mean_episode_steps, min_episode_steps =\ None, None, None, None, None, None if episode_rewards: mean_episode_rewards = np.mean(np.array(episode_rewards).flatten()) max_episode_rewards = np.max(np.array(episode_rewards).flatten()) min_episode_rewards = np.min(np.array(episode_rewards).flatten()) mean_episode_steps = np.mean(np.array(episode_steps).flatten()) max_episode_steps = np.max(np.array(episode_steps).flatten()) min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, 'max_episode_steps': max_episode_steps, 'mean_episode_steps': mean_episode_steps, 'min_episode_steps': min_episode_steps, 'sample_queue_size': self.sample_data_queue.qsize(), 'total_params_sync': self.total_params_sync, 'cache_params_sent_cnt': self.cache_params_sent_cnt, 'total_loss': self.total_loss_stat.mean, 'pi_loss': self.pi_loss_stat.mean, 'vf_loss': self.vf_loss_stat.mean, 'entropy': self.entropy_stat.mean, 'kl': self.kl_stat.mean, 'learn_time_s': self.learn_time_stat.mean, 'elapsed_time_s': int(time.time() - self.start_time), 'lr': self.lr, 'entropy_coeff': self.entropy_coeff, } for key, value in metric.items(): if value is not None: summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric)
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = ActorModel(act_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(
        actor,
        critic,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)

    # resume from the phase-1 checkpoint if it is present
    if os.path.exists(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt'):
        agent.restore(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt')
        print("restore succeed")

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    best_reward = -float('inf')
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            if evaluate_reward >= best_reward:
                best_reward = evaluate_reward
                # save the best model so far
                ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
                    total_steps, best_reward)
                agent.save(ckpt)
            # also save a checkpoint for every evaluation
            ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
                total_steps, evaluate_reward)
            agent.save(ckpt)
def main(is_train=True):
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    # print(max_action)  # debugging leftover; the exit() that followed it
    # made the rest of the function unreachable, so both are removed

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    if is_train:
        test_flag = 0
        total_steps = 0
        while total_steps < args.train_total_steps:
            train_reward, steps = run_train_episode(env, agent, rpm)
            total_steps += steps
            logger.info('Steps: {} Reward: {}'.format(total_steps,
                                                      train_reward))
            summary.add_scalar('train/episode_reward', train_reward,
                               total_steps)

            if total_steps // args.test_every_steps >= test_flag:
                while total_steps // args.test_every_steps >= test_flag:
                    test_flag += 1
                evaluate_reward = run_evaluate_episode(
                    env, agent, is_render=False)
                logger.info('Steps {}, Evaluate reward: {}'.format(
                    total_steps, evaluate_reward))
                summary.add_scalar('eval/episode_reward', evaluate_reward,
                                   total_steps)
        agent.save_actor('actor.ckpt')
        agent.save_critic('critic.ckpt')
    else:
        agent.restore_critic('critic.ckpt')
        agent.restore_actor('actor.ckpt')
        for episode in range(1, 5):
            evaluate_reward = run_evaluate_episode(env, agent, is_render=True)
            print("evaluate_reward: ", evaluate_reward)
def main():
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    test_env = get_player(
        args.rom,
        image_size=IMAGE_SIZE,
        frame_skip=FRAME_SKIP,
        context_len=CONTEXT_LEN)
    rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN)
    act_dim = env.action_space.n

    model = AtariModel(act_dim, args.algo)
    if args.algo == 'DDQN':
        algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA)
    elif args.algo in ['DQN', 'Dueling']:
        algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA)
    agent = AtariAgent(
        algorithm,
        act_dim=act_dim,
        start_lr=LEARNING_RATE,
        total_step=args.train_total_steps,
        update_freq=UPDATE_FREQ)

    with tqdm(
            total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while rpm.size() < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # train
    test_flag = 0
    pbar = tqdm(total=args.train_total_steps)
    total_steps = 0
    max_reward = None
    while total_steps < args.train_total_steps:
        # start epoch
        total_reward, steps, loss = run_train_episode(env, agent, rpm)
        total_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.exploration))
        summary.add_scalar('dqn/score', total_reward, total_steps)
        summary.add_scalar('dqn/loss', loss, total_steps)  # mean of total loss
        summary.add_scalar('dqn/exploration', agent.exploration, total_steps)
        pbar.update(steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            eval_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(test_env, agent)
                eval_rewards.append(eval_reward)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, np.mean(eval_rewards)))
            eval_test = np.mean(eval_rewards)
            summary.add_scalar('dqn/eval', eval_test, total_steps)

    pbar.close()
def run_episode(env, agents):
    obs_n = env.reset()
    total_reward = 0
    agents_reward = [0 for _ in range(env.n)]
    steps = 0
    while True:
        steps += 1
        action_n = [agent.predict(obs) for agent, obs in zip(agents, obs_n)]
        next_obs_n, reward_n, done_n, _ = env.step(action_n)
        done = all(done_n)
        terminal = (steps >= args.max_step_per_episode)

        # store experience
        for i, agent in enumerate(agents):
            agent.add_experience(obs_n[i], action_n[i], reward_n[i],
                                 next_obs_n[i], done_n[i])

        # compute reward of every agent
        obs_n = next_obs_n
        for i, reward in enumerate(reward_n):
            total_reward += reward
            agents_reward[i] += reward

        # check the end of an episode
        if done or terminal:
            break

        # show animation
        if args.show:
            time.sleep(0.1)
            env.render()

        # show model effect without training
        if args.restore and args.show:
            continue

        # learn policy
        for i, agent in enumerate(agents):
            critic_loss = agent.learn(agents)
            summary.add_scalar('critic_loss_%d' % i, critic_loss,
                               agent.global_train_step)

    return total_reward, agents_reward, steps
def main():
    # load the data
    df = pd.read_csv('TD3gupiao/DATA/AAPL.csv')
    df = df.sort_values('Date')

    # create the environment
    env = StockTradingEnv(df)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
def main():
    env = gym.make(args.env)
    env.seed(seed)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = float(env.action_space.high[0])

    algorithm = SAC(
        ActorModel(act_dim),
        CriticModel(),
        max_action=act_limit,
        gamma=gamma,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(replay_size, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            ckpt = 'star2_model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
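# --- Several mains above construct ReplayMemory(max_size, obs_dim, act_dim)
# without showing it. Below is a minimal ring-buffer sketch of the
# append/sample_batch/size interface those loops rely on; the field layout is
# an assumption modeled on parl.utils.ReplayMemory, not code from the
# original file.
import numpy as np


class ReplayMemory(object):
    def __init__(self, max_size, obs_dim, act_dim):
        self.max_size = max_size
        self.obs = np.zeros((max_size, obs_dim), dtype='float32')
        self.action = np.zeros((max_size, act_dim), dtype='float32')
        self.reward = np.zeros((max_size, ), dtype='float32')
        self.next_obs = np.zeros((max_size, obs_dim), dtype='float32')
        self.terminal = np.zeros((max_size, ), dtype='bool')
        self._curr_pos = 0
        self._curr_size = 0

    def append(self, obs, act, reward, next_obs, terminal):
        # overwrite the oldest transition once the buffer is full
        p = self._curr_pos
        self.obs[p], self.action[p] = obs, act
        self.reward[p], self.next_obs[p], self.terminal[p] = \
            reward, next_obs, terminal
        self._curr_pos = (p + 1) % self.max_size
        self._curr_size = min(self._curr_size + 1, self.max_size)

    def sample_batch(self, batch_size):
        # uniform sampling with replacement over the filled portion
        idx = np.random.randint(self._curr_size, size=batch_size)
        return (self.obs[idx], self.action[idx], self.reward[idx],
                self.next_obs[idx], self.terminal[idx])

    def size(self):
        return self._curr_size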
def train_agent():
    env = MAenv(args.env)
    logger.info('agent num: {}'.format(env.n))
    logger.info('observation_space: {}'.format(env.observation_space))
    logger.info('action_space: {}'.format(env.action_space))
    logger.info('obs_shape_n: {}'.format(env.obs_shape_n))
    logger.info('act_shape_n: {}'.format(env.act_shape_n))
    for i in range(env.n):
        logger.info('agent {} obs_low:{} obs_high:{}'.format(
            i, env.observation_space[i].low, env.observation_space[i].high))
        logger.info('agent {} act_n:{}'.format(i, env.act_shape_n[i]))
        if 'low' in dir(env.action_space[i]):
            logger.info('agent {} act_low:{} act_high:{} act_shape:{}'.format(
                i, env.action_space[i].low, env.action_space[i].high,
                env.action_space[i].shape))
            logger.info('num_discrete_space:{}'.format(
                env.action_space[i].num_discrete_space))

    from gym import spaces
    from multiagent.multi_discrete import MultiDiscrete
    for space in env.action_space:
        assert (isinstance(space, spaces.Discrete)
                or isinstance(space, MultiDiscrete))

    agents = []
    for i in range(env.n):
        model = MAModel(env.act_shape_n[i])
        algorithm = parl.algorithms.MADDPG(
            model,
            agent_index=i,
            act_space=env.action_space,
            gamma=args.gamma,
            tau=args.tau,
            lr=args.lr)
        agent = MAAgent(
            algorithm,
            agent_index=i,
            obs_dim_n=env.obs_shape_n,
            act_dim_n=env.act_shape_n,
            batch_size=args.batch_size,
            speedup=(not args.restore))
        agents.append(agent)

    total_steps = 0
    total_episodes = 0
    episode_rewards = []  # sum of rewards for all agents
    agent_rewards = [[] for _ in range(env.n)]  # individual agent reward
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_ag_rewards = []  # agent rewards for training curve

    if args.restore:
        # restore model
        for i in range(len(agents)):
            model_file = args.model_dir + '/agent_' + str(i) + '.ckpt'
            if not os.path.exists(model_file):
                logger.info('model file {} does not exist'.format(model_file))
                raise Exception
            agents[i].restore(model_file)

    t_start = time.time()
    logger.info('Starting...')
    while total_episodes <= args.max_episodes:
        # run an episode
        ep_reward, ep_agent_rewards, steps = run_episode(env, agents)
        if args.show:
            print('episode {}, reward {}, steps {}'.format(
                total_episodes, ep_reward, steps))

        # Record reward
        total_steps += steps
        total_episodes += 1
        episode_rewards.append(ep_reward)
        for i in range(env.n):
            agent_rewards[i].append(ep_agent_rewards[i])

        # Keep track of final episode reward
        if total_episodes % args.stat_rate == 0:
            mean_episode_reward = np.mean(episode_rewards[-args.stat_rate:])
            final_ep_rewards.append(mean_episode_reward)
            for rew in agent_rewards:
                final_ep_ag_rewards.append(np.mean(rew[-args.stat_rate:]))
            use_time = round(time.time() - t_start, 3)
            logger.info(
                'Steps: {}, Episodes: {}, Mean episode reward: {}, Time: {}'.
                format(total_steps, total_episodes, mean_episode_reward,
                       use_time))
            t_start = time.time()
            summary.add_scalar('mean_episode_reward/episode',
                               mean_episode_reward, total_episodes)
            summary.add_scalar('mean_episode_reward/steps',
                               mean_episode_reward, total_steps)
            summary.add_scalar('use_time/1000episode', use_time,
                               total_episodes)

            # save model
            if not args.restore:
                os.makedirs(os.path.dirname(args.model_dir), exist_ok=True)
                for i in range(len(agents)):
                    model_name = '/agent_' + str(i) + '.ckpt'
                    agents[i].save(args.model_dir + model_name)
def main():
    # Build the game env. skill_frame: how many times each action is
    # repeated; resize_shape: size of the preprocessed frames;
    # render_preprocess: whether to display the preprocessed frames.
    env = retro_util.RetroEnv(
        game=args.env,
        resize_shape=RESIZE_SHAPE,
        skill_frame=SKILL_FRAME,
        render_preprocess=args.show_play,
        is_train=True)
    env.seed(1)
    # shape of the game frames
    # obs_dim = env.observation_space.shape
    obs_dim = RESIZE_SHAPE
    # action dimension
    action_dim = env.action_space.n
    # maximum absolute value of an action
    max_action = 1

    # build the model
    actor = ActorModel(action_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(
        actor=actor,
        critic=critic,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, action_dim)

    # load a pre-trained model if one exists
    if os.path.exists(args.model_path):
        logger.info("loading pre-trained model...")
        agent.restore(args.model_path)

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)

    total_steps = 0
    step_train = 0
    print("start training...")
    while total_steps < args.train_total_steps:
        # train (total_steps is updated before logging so the reported
        # step count is current)
        train_reward, steps = run_train_episode(
            env, agent, rpm, render=args.show_play)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        # evaluate
        if step_train % 100 == 0:
            evaluate_reward = run_evaluate_episode(
                env, agent, render=args.show_play)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
        step_train += 1

        # save the model
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)
        agent.save(args.model_path)
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    # agent.restore('./model_dir/Pre_Training.ckpt')
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    # start training
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        summary.add_scalar('train/episode_reward', train_reward, total_steps)
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))  # log the training reward

        # evaluate the model every TEST_EVERY_STEPS steps
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1
            evaluate_reward = evaluate(env, agent)
            # log the evaluation reward
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)

            # after each evaluation, save a checkpoint named by the
            # number of training steps
            ckpt = 'star3_model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
def main():
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    test_env = get_player(
        args.rom,
        image_size=IMAGE_SIZE,
        frame_skip=FRAME_SKIP,
        context_len=CONTEXT_LEN)
    rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN)
    act_dim = env.action_space.n
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = AtariModel(CONTEXT_LEN, act_dim, args.algo)
    if args.algo in ['DQN', 'Dueling']:
        algorithm = DQN(model, gamma=GAMMA, lr=args.lr)
    elif args.algo == 'Double':
        algorithm = DDQN(model, gamma=GAMMA, lr=args.lr)
    agent = AtariAgent(algorithm, act_dim=act_dim)

    with tqdm(
            total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while rpm.size() < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # Get fixed obs to check value function.
    fixed_obs = get_fixed_obs(rpm, args.batch_size)
    fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)

    # train
    test_flag = 0
    total_steps = 0
    with tqdm(total=args.train_total_steps, desc='[Training Model]') as pbar:
        while total_steps < args.train_total_steps:
            total_reward, steps, loss = run_train_episode(env, agent, rpm)
            total_steps += steps
            pbar.update(steps)

            if total_steps // args.test_every_steps >= test_flag:
                while total_steps // args.test_every_steps >= test_flag:
                    test_flag += 1
                eval_rewards = []
                for _ in range(3):
                    eval_rewards.append(run_evaluate_episode(test_env, agent))

                summary.add_scalar('dqn/eval', np.mean(eval_rewards),
                                   total_steps)
                summary.add_scalar('dqn/score', total_reward, total_steps)
                summary.add_scalar('dqn/loss', loss, total_steps)
                summary.add_scalar('dqn/exploration', agent.exploration,
                                   total_steps)
                summary.add_scalar('dqn/Q value',
                                   evaluate_fixed_Q(agent, fixed_obs),
                                   total_steps)
                summary.add_scalar('dqn/grad_norm',
                                   get_grad_norm(agent.alg.model),
                                   total_steps)
def test_add_scalar(self):
    x = range(100)
    for i in x:
        summary.add_scalar('y=2x', i * 2, i)
    self.assertTrue(os.path.exists('./train_log/summary_test'))
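# --- The assertion above relies on parl.utils.summary writing its event
# files into the logger's output directory (./train_log/<entry-script name>
# by default, hence ./train_log/summary_test for this test). A small sketch
# of pointing both at a custom directory; logger.set_dir is the same call the
# pseudoslam main() below uses, and the path here is only an example.
from parl.utils import logger, summary

logger.set_dir('./train_log/my_experiment')  # summary follows logger's dir
summary.add_scalar('demo/step', 1.0, 0)  # tensorboard-style (tag, value, step)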
def main():
    # Prepare environments
    # env = get_player(
    #     args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    # test_env = get_player(
    #     args.rom,
    #     image_size=IMAGE_SIZE,
    #     frame_skip=FRAME_SKIP,
    #     context_len=CONTEXT_LEN)
    env = gym.make("pseudoslam:RobotExploration-v0")
    env = MonitorEnv(env, param={'goal': args.goal, 'obs': args.obs})
    # obs = env.reset()
    # print(obs.shape)
    # raise NotImplementedError

    # Init Prioritized Replay Memory
    per = ProportionalPER(alpha=0.6, seg_num=args.batch_size, size=MEMORY_SIZE)
    suffix = args.suffix + "_Rp{}_Goal{}_Obs{}".format(args.Rp, args.goal,
                                                       args.obs)
    logdir = os.path.join(args.logdir, suffix)
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    logger.set_dir(logdir)
    modeldir = os.path.join(args.modeldir, suffix)
    if not os.path.exists(modeldir):
        os.mkdir(modeldir)

    # Prepare PARL agent
    act_dim = env.action_space.n
    model = AtariModel(act_dim)
    if args.alg == 'ddqn':
        algorithm = PrioritizedDoubleDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    elif args.alg == 'dqn':
        algorithm = PrioritizedDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = AtariAgent(algorithm, act_dim=act_dim, update_freq=UPDATE_FREQ)
    if os.path.exists(args.load):
        agent.restore(args.load)

    # Replay memory warmup
    total_step = 0
    with tqdm(total=MEMORY_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        mem = []
        while total_step < MEMORY_WARMUP_SIZE:
            total_reward, steps, _, _ = run_episode(
                env, agent, per, mem=mem, warmup=True)
            total_step += steps
            pbar.update(steps)
    per.elements.from_list(mem[:int(MEMORY_WARMUP_SIZE)])

    # env_name = args.rom.split('/')[-1].split('.')[0]
    test_flag = 0
    total_steps = 0
    pbar = tqdm(total=args.train_total_steps)
    save_steps = 0
    while total_steps < args.train_total_steps:
        # start epoch
        total_reward, steps, loss, info = run_episode(
            env, agent, per, train=True)
        total_steps += steps
        save_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.exploration))
        summary.add_scalar('train/score', total_reward, total_steps)
        summary.add_scalar('train/loss', loss, total_steps)  # mean of total loss
        summary.add_scalar('train/exploration', agent.exploration, total_steps)
        summary.add_scalar('train/steps', steps, total_steps)
        for key in info.keys():
            summary.add_scalar('train/' + key, info[key], total_steps)
        pbar.update(steps)

        if total_steps // args.test_every_steps >= test_flag:
            print('start test!')
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            test_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(env, agent)
                test_rewards.append(eval_reward)
            eval_reward = np.mean(test_rewards)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, eval_reward))
            summary.add_scalar('eval/reward', eval_reward, total_steps)

        if save_steps >= 100000:
            modeldir_ = os.path.join(modeldir, 'itr_{}'.format(total_steps))
            if not os.path.exists(modeldir_):
                os.mkdir(modeldir_)
            print('save model!', modeldir_)
            agent.save(modeldir_)
            save_steps = 0

    pbar.close()
def log_metrics(self):
    # skip if training has not started yet
    if self.start_time is None:
        return

    # keep track of the best model
    if self.best_loss is None:
        self.best_loss = self.total_loss_stat.mean
    else:
        if self.best_loss > self.total_loss_stat.mean:
            self.best_loss = self.total_loss_stat.mean
            self.save_model("model_best")

    # write training metrics to the log
    summary.add_scalar('total_loss', self.total_loss_stat.mean,
                       self.sample_total_steps)
    summary.add_scalar('pi_loss', self.pi_loss_stat.mean,
                       self.sample_total_steps)
    summary.add_scalar('vf_loss', self.vf_loss_stat.mean,
                       self.sample_total_steps)
    summary.add_scalar('entropy', self.entropy_stat.mean,
                       self.sample_total_steps)
    summary.add_scalar('lr', self.lr, self.sample_total_steps)
    summary.add_scalar('entropy_coeff', self.entropy_coeff,
                       self.sample_total_steps)
    logger.info('total_loss: {}'.format(self.total_loss_stat.mean))
def main():
    env = gym.make('FlappyBird-v0')
    test_env = Monitor(
        env, directory='test', video_callable=lambda x: True, force=True)
    rpm = ReplayMemory(MEMORY_SIZE)
    act_dim = env.action_space.n

    model = Model(act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        act_dim=act_dim,
        e_greed=E_GREED,
        e_greed_decrement=E_GREED_DECREMENT)

    # load a saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    with tqdm(
            total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while len(rpm) < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # train
    best_reward = -5
    pbar = tqdm(total=train_total_steps)
    test_flag = 0
    total_steps = 0
    while total_steps < train_total_steps:
        # start epoch
        total_reward, steps, loss = run_train_episode(env, agent, rpm)
        total_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.e_greed))
        summary.add_scalar('dqn/score', total_reward, total_steps)
        summary.add_scalar('dqn/loss', loss, total_steps)  # mean of total loss
        summary.add_scalar('dqn/exploration', agent.e_greed, total_steps)
        pbar.update(steps)

        if total_steps // test_every_steps >= test_flag:
            while total_steps // test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            eval_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(test_env, agent)
                eval_rewards.append(eval_reward)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, np.mean(eval_rewards)))
            eval_test = np.mean(eval_rewards)
            summary.add_scalar('dqn/eval', eval_test, total_steps)
            if eval_test > best_reward:
                agent.save('./best_dqn_model.ckpt')
                best_reward = eval_test

    pbar.close()

    # training finished; save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def log_metrics(self, metrics):
    logger.info(metrics)
    for k, v in metrics.items():
        if v is not None:
            summary.add_scalar(k, v, self.sample_total_steps)
def log_metrics(self): """ Log metrics of learner and actors """ if self.start_time is None: return metrics = [] while True: try: metric = self.remote_metrics_queue.get_nowait() metrics.append(metric) except queue.Empty: break episode_rewards, episode_steps = [], [] for x in metrics: episode_rewards.extend(x['episode_rewards']) episode_steps.extend(x['episode_steps']) max_episode_rewards, mean_episode_rewards, min_episode_rewards, \ max_episode_steps, mean_episode_steps, min_episode_steps =\ None, None, None, None, None, None if episode_rewards: mean_episode_rewards = np.mean(np.array(episode_rewards).flatten()) max_episode_rewards = np.max(np.array(episode_rewards).flatten()) min_episode_rewards = np.min(np.array(episode_rewards).flatten()) mean_episode_steps = np.mean(np.array(episode_steps).flatten()) max_episode_steps = np.max(np.array(episode_steps).flatten()) min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { 'Sample steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, 'max_episode_steps': max_episode_steps, 'mean_episode_steps': mean_episode_steps, 'min_episode_steps': min_episode_steps, 'total_loss': self.total_loss_stat.mean, 'pi_loss': self.pi_loss_stat.mean, 'vf_loss': self.vf_loss_stat.mean, 'entropy': self.entropy_stat.mean, 'learn_time_s': self.learn_time_stat.mean, 'elapsed_time_s': int(time.time() - self.start_time), 'lr': self.lr, 'entropy_coeff': self.entropy_coeff, } if metric['mean_episode_rewards'] is not None: summary.add_scalar('train/mean_reward', metric['mean_episode_rewards'], self.sample_total_steps) summary.add_scalar('train/total_loss', metric['total_loss'], self.sample_total_steps) summary.add_scalar('train/pi_loss', metric['pi_loss'], self.sample_total_steps) summary.add_scalar('train/vf_loss', metric['vf_loss'], self.sample_total_steps) summary.add_scalar('train/entropy', metric['entropy'], self.sample_total_steps) summary.add_scalar('train/learn_rate', metric['lr'], self.sample_total_steps) logger.info(metric)
def _parse_memory(self, actor_state, last_obs):
    mem = actor_state.memory
    n = len(mem)

    episode_shaping_reward = np.sum(
        [exp.info['shaping_reward'] for exp in mem])
    episode_env_reward = np.sum([exp.info['env_reward'] for exp in mem])
    episode_time = time.time() - mem[0].timestamp

    episode_rpm = []
    for i in range(n - 1):
        episode_rpm.append([
            mem[i].obs, mem[i].action, mem[i].info['shaping_reward'],
            mem[i + 1].obs, False
        ])
    episode_rpm.append([
        mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'],
        last_obs, not mem[-1].info['timeout']
    ])

    with self.memory_lock:
        self.total_steps += n
        self.add_episode_rpm(episode_rpm)

        if actor_state.ident % 3 == 2:  # trajectory without noise
            self.env_reward_stat.add(episode_env_reward)
            self.shaping_reward_stat.add(episode_shaping_reward)
            self.max_env_reward = max(self.max_env_reward,
                                      episode_env_reward)

            if self.env_reward_stat.count > 500:
                summary.add_scalar('recent_env_reward',
                                   self.env_reward_stat.mean,
                                   self.total_steps)
                summary.add_scalar('recent_shaping_reward',
                                   self.shaping_reward_stat.mean,
                                   self.total_steps)
            if self.critic_loss_stat.count > 500:
                summary.add_scalar('recent_critic_loss',
                                   self.critic_loss_stat.mean,
                                   self.total_steps)
            summary.add_scalar('episode_length', n, self.total_steps)
            summary.add_scalar('max_env_reward', self.max_env_reward,
                               self.total_steps)
            summary.add_scalar('ready_actor_num',
                               self.ready_actor_queue.qsize(),
                               self.total_steps)
            summary.add_scalar('episode_time', episode_time,
                               self.total_steps)

    self.noiselevel = self.noiselevel * NOISE_DECAY
def train(self, total_train_step=100000, model_name=None, isco=''):
    time_start = time.time()
    epsilons = np.linspace(self.epsilon_start, self.epsilon_end,
                           self.epsilon_decay_steps)
    count_simulate = [0] * self.n_episode
    count_action = {}

    # session config
    config = tf.ConfigProto(
        log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=config, graph=self.graph) as sess:
        sess.run(tf.global_variables_initializer())
        # self.dqn_main.summary_writer.add_graph(sess.graph)

        # load model
        if model_name:
            self.saver.restore(sess,
                               self.model_dir + '/{}.ckpt'.format(model_name))
            print('Model {} Loaded!'.format(model_name))

        step_tot = 0
        loss = 0
        print("Start training...")
        sys.stdout.flush()
        done = False
        flag_next_chronic = False
        for episode in range(self.n_episode):
            # if done or flag_next_chronic:
            state = self.environment.reset(
            ) if episode != 0 else self.environment._get_obs().as_array()
            state = np.array(state).reshape((-1, self.n_features))
            episode_reward = []
            for step in itertools.count():
                reward_tot = 0
                # update the target estimator
                if step_tot % self.replace_target_iter == 0:
                    sess.run([self.params_copy_hard])
                    # print("\nCopied model parameters to target network.")

                # choose action
                action, q_predictions = self.dqn_main.act(
                    sess, state,
                    epsilons[min(episode, self.epsilon_decay_steps - 1)])

                # check loadflow
                grid = self.environment.game.grid
                thermal_limits = grid.get_thermal_limits()

                # check overflow
                has_overflow = \
                    self.environment.game.n_timesteps_soft_overflowed_lines
                print('overflow lines: ', has_overflow)
                has_overflow = any(has_overflow)

                action_is_valid = True
                has_danger = False
                # if the action would terminate the episode, try others
                obs_simulate, reward_simulate, done_simulate, _, \
                    score_simulate_pre = self.environment.simulate(
                        self.action_space[action])
                if obs_simulate is None:
                    has_danger = True
                else:
                    lineflow_simulate = self.get_lineflow(obs_simulate)
                    lineflow_simulate_ratio = \
                        lineflow_simulate / thermal_limits
                    lineflow_simulate_ratio = [
                        round(x, 4) for x in lineflow_simulate_ratio
                    ]
                    for ratio, limit in zip(lineflow_simulate_ratio,
                                            thermal_limits):
                        if (limit < 400.00 and ratio > 0.90) or \
                                (limit >= 400.00 and ratio > 0.95):
                            has_danger = True
                    # only defined when the simulation returned an obs
                    print('lineflow: ', lineflow_simulate_ratio)
                print(action, score_simulate_pre, done_simulate,
                      action_is_valid)

                if done_simulate or not action_is_valid or \
                        score_simulate_pre < 13.50 or has_overflow or \
                        has_danger:
                    # if it has overflow, try all actions
                    if has_overflow:
                        print('has overflow !!!!!!!!!!!!!!!!!!!!!!!!!')
                    if has_danger:
                        print('has danger !!!!!!!!!!!!!!!!!!!!!!!!!!!')
                    top_actions = np.argsort(
                        q_predictions)[-1:-41:-1].tolist()
                    chosen_action = 0
                    max_score = float('-inf')
                    for action in top_actions:
                        action_class = \
                            self.environment.action_space.array_to_action(
                                self.action_space[action])
                        action_is_valid = \
                            self.environment.game.is_action_valid(
                                action_class)
                        if not action_is_valid:
                            continue
                        obs_simulate, reward_simulate, done_simulate, _, \
                            score_simulate = self.environment.simulate(
                                self.action_space[action])
                        if obs_simulate is None:
                            continue
                        lineflow_simulate = self.get_lineflow(obs_simulate)
                        lineflow_simulate_ratio = \
                            lineflow_simulate / thermal_limits
                        lineflow_simulate_ratio = [
                            round(x, 4) for x in lineflow_simulate_ratio
                        ]
                        # has_danger = any(
                        #     [x > 0.92 for x in lineflow_simulate_ratio])
                        # separate big lines and small lines
                        has_danger = False
                        for ratio, limit in zip(lineflow_simulate_ratio,
                                                thermal_limits):
                            if (limit < 400.00 and ratio > 0.90) or \
                                    (limit >= 400.00 and ratio > 0.95):
                                has_danger = True
                        if not done_simulate and \
                                score_simulate > max_score and \
                                not has_danger:
                            max_score = score_simulate
                            chosen_action = action
                            print('lineflow: ', lineflow_simulate_ratio)
                            print('current best action: {}, score: {:.4f}'
                                  .format(chosen_action, score_simulate))
                    # chosen action
                    action = chosen_action

                # count action
                count_action[action] = count_action.get(action, 0) + 1

                # take a step
                next_state, reward, done, info, flag_next_chronic = \
                    self.environment.step(self.action_space[action])
                episode_reward.append(reward)
                score = self.get_score()
                if done:
                    next_state = state
                    score = -15
                else:
                    next_state = np.array(next_state).reshape(
                        (-1, self.n_features))
                reward_tot = score / 15.0

                # record
                self.episode_score_history[episode] += score

                # Save transition to replay memory
                if done:
                    # if done: store the terminal transition several times
                    for i in range(5):
                        self.replay_memory.store(
                            [state, action, reward_tot, next_state, done])
                else:
                    self.replay_memory.store(
                        [state, action, reward_tot, next_state, done])

                # learn
                if step_tot > self.replay_memory_size and step_tot % 5 == 0:
                    # Sample a minibatch from the replay memory
                    tree_idx, batch_samples, IS_weights = \
                        self.replay_memory.sample(self.batch_size)
                    states_batch, action_batch, reward_batch, \
                        next_states_batch, done_batch = map(
                            np.array, zip(*batch_samples))
                    states_batch = states_batch.reshape(
                        (-1, self.n_features))
                    next_states_batch = next_states_batch.reshape(
                        (-1, self.n_features))

                    # Calculate q targets
                    q_values_next = self.dqn_target.predict(
                        sess, next_states_batch)
                    q_values_next = np.array(q_values_next[0])
                    targets_batch = reward_batch + \
                        np.invert(done_batch).astype(np.float32) * \
                        self.gamma * np.amax(q_values_next, axis=1)

                    # Perform gradient descent update
                    loss, abs_TD_errors = self.dqn_main.update(
                        sess, states_batch, targets_batch,
                        action_batch.reshape((-1, 1)), IS_weights)

                    # Update priority
                    self.replay_memory.batch_update(tree_idx, abs_TD_errors)

                # verbose step summary
                if episode % self.verbose_per_episode == 0 and \
                        (step_tot + 1) % 1 == 0:
                    print(
                        "episode: {}, step: {}, action: {}, loss: {:4f}, "
                        "reward: {:4f}, score: {:.4f}, tot: {:.4f}\n".format(
                            episode + 1, step + 1, action, loss, reward,
                            score, reward_tot))
                    sys.stdout.flush()

                # update state
                state = next_state
                step_tot += 1
                if done or step_tot > total_train_step or flag_next_chronic:
                    break

            summary.add_scalar('train/episode_reward',
                               np.mean(episode_reward), episode)

            # save model per episode
            self.saver.save(
                sess, self.model_dir +
                '/{}_model_251_step_{}_{}.ckpt'.format(
                    isco, total_train_step, self.timestamp))
            print('Model Saved!')

            # verbose episode summary
            print("\nepisode: {}, mean_score: {:4f}, sum_score: {:4f}\n".
                  format(episode + 1,
                         self.episode_score_history[episode] / (step + 1),
                         self.episode_score_history[episode]))
            print("simulate used count: {}\naction count: {}\n".format(
                count_simulate[episode], sorted(count_action.items())))
            if step_tot > total_train_step:
                break

    time_end = time.time()
    print("\nFinished, Total time used: {}s".format(time_end - time_start))
def main():
    # Prepare environments
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    test_env = get_player(
        args.rom,
        image_size=IMAGE_SIZE,
        frame_skip=FRAME_SKIP,
        context_len=CONTEXT_LEN)

    # Init Prioritized Replay Memory
    per = ProportionalPER(alpha=0.6, seg_num=args.batch_size, size=MEMORY_SIZE)

    # Prepare PARL agent
    act_dim = env.action_space.n
    model = AtariModel(act_dim)
    if args.alg == 'ddqn':
        algorithm = PrioritizedDoubleDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    elif args.alg == 'dqn':
        algorithm = PrioritizedDQN(
            model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = AtariAgent(algorithm, act_dim=act_dim, update_freq=UPDATE_FREQ)

    # Replay memory warmup
    total_step = 0
    with tqdm(total=MEMORY_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        mem = []
        while total_step < MEMORY_WARMUP_SIZE:
            total_reward, steps, _ = run_episode(
                env, agent, per, mem=mem, warmup=True)
            total_step += steps
            pbar.update(steps)
    per.elements.from_list(mem[:int(MEMORY_WARMUP_SIZE)])

    env_name = args.rom.split('/')[-1].split('.')[0]
    test_flag = 0
    total_steps = 0
    pbar = tqdm(total=args.train_total_steps)
    while total_steps < args.train_total_steps:
        # start epoch
        total_reward, steps, loss = run_episode(env, agent, per, train=True)
        total_steps += steps
        pbar.set_description('[train]exploration:{}'.format(agent.exploration))
        summary.add_scalar('{}/score'.format(env_name), total_reward,
                           total_steps)
        summary.add_scalar('{}/loss'.format(env_name), loss,
                           total_steps)  # mean of total loss
        summary.add_scalar('{}/exploration'.format(env_name),
                           agent.exploration, total_steps)
        pbar.update(steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            pbar.write("testing")
            test_rewards = []
            for _ in tqdm(range(3), desc='eval agent'):
                eval_reward = run_evaluate_episode(test_env, agent)
                test_rewards.append(eval_reward)
            eval_reward = np.mean(test_rewards)
            logger.info(
                "eval_agent done, (steps, eval_reward): ({}, {})".format(
                    total_steps, eval_reward))
            summary.add_scalar('{}/eval'.format(env_name), eval_reward,
                               total_steps)

    pbar.close()
def main():
    config = deepcopy(QMixConfig)

    env = StarCraft2Env(
        map_name=config['scenario'], difficulty=config['difficulty'])
    env = SC2EnvWrapper(env)
    config['episode_limit'] = env.episode_limit
    config['obs_shape'] = env.obs_shape
    config['state_shape'] = env.state_shape
    config['n_agents'] = env.n_agents
    config['n_actions'] = env.n_actions

    rpm = EpisodeReplayBuffer(config['replay_buffer_size'])
    agent_model = RNNModel(config)
    qmixer_model = QMixerModel(config)
    algorithm = QMIX(agent_model, qmixer_model, config)
    qmix_agent = QMixAgent(algorithm, config)

    while rpm.count < config['memory_warmup_size']:
        train_reward, train_step, train_is_win, train_loss, train_td_error \
            = run_train_episode(env, qmix_agent, rpm, config)

    total_steps = 0
    last_test_step = -1e10
    while total_steps < config['training_steps']:
        train_reward, train_step, train_is_win, train_loss, train_td_error \
            = run_train_episode(env, qmix_agent, rpm, config)
        total_steps += train_step

        if total_steps - last_test_step >= config['test_steps']:
            last_test_step = total_steps
            eval_is_win_buffer = []
            eval_reward_buffer = []
            eval_steps_buffer = []
            for _ in range(3):
                eval_reward, eval_step, eval_is_win = run_evaluate_episode(
                    env, qmix_agent)
                eval_reward_buffer.append(eval_reward)
                eval_steps_buffer.append(eval_step)
                eval_is_win_buffer.append(eval_is_win)

            summary.add_scalar('train_loss', train_loss, total_steps)
            summary.add_scalar('eval_reward', np.mean(eval_reward_buffer),
                               total_steps)
            summary.add_scalar('eval_steps', np.mean(eval_steps_buffer),
                               total_steps)
            summary.add_scalar('eval_win_rate', np.mean(eval_is_win_buffer),
                               total_steps)
            summary.add_scalar('exploration', qmix_agent.exploration,
                               total_steps)
            summary.add_scalar('replay_buffer_size', rpm.count, total_steps)
            summary.add_scalar('target_update_count',
                               qmix_agent.target_update_count, total_steps)
            # tag had a stray trailing colon ('train_td_error:') in the
            # original; dropped so it matches the other tags
            summary.add_scalar('train_td_error', train_td_error, total_steps)