def __init__(self, state_shape, action_shape, device, ensemble_models=None, seed=0,
             batch_size=256, gamma=0.99, lr=3e-4, alpha=0.2, buff_size=10**6,
             start_steps=2 * 10**3, tau=5e-3, reward_scale=1.0):
    super().__init__()

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    self.replay_buffer = buffer.ReplayBuffer(buff_size=buff_size,
                                             state_shape=state_shape,
                                             action_shape=action_shape,
                                             device=device)

    self.actor = Actor_network(state_shape=state_shape,
                               action_shape=action_shape).to(device)
    self.critic = Critic_network(state_shape=state_shape,
                                 action_shape=action_shape).to(device)
    self.critic_target = Critic_network(state_shape=state_shape,
                                        action_shape=action_shape).to(device).eval()

    self.critic_target.load_state_dict(self.critic.state_dict())
    for param in self.critic_target.parameters():
        param.requires_grad = False

    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

    self.batch_size = batch_size
    self.learning_steps = 0
    self.device = device
    self.gamma = gamma
    self.lr = lr
    self.buff_size = buff_size
    self.start_steps = start_steps
    self.tau = tau
    self.alpha = alpha
    self.reward_scale = reward_scale
    self.ensemble_models = ensemble_models

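# Added sketch (not from the original code): the constructor above freezes
# critic_target and stores tau, which implies a Polyak soft update somewhere in
# the training loop. A minimal sketch of that update follows; the method name
# `update_target` is an assumption, not the original author's API.
def update_target(self):
    with torch.no_grad():
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            # theta_target <- (1 - tau) * theta_target + tau * theta
            target_param.mul_(1.0 - self.tau)
            target_param.add_(self.tau * param)
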
def __init__(self, action_size=2, buffer_size=buffer_size, n_agents=2,
             batch_size=batch_size, seed=2, update_every=1, gamma=1):
    self.madagents = [ddpg.ddpg(24, 2, 256, 128, 64),
                      ddpg.ddpg(24, 2, 256, 128, 64)]
    self.update_every = update_every
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    # Pass the seed argument through instead of hard-coding it.
    self.memory = buffer.ReplayBuffer(action_size, buffer_size, batch_size, seed=seed)
    # self.t_step = 0
    self.n_agents = n_agents
    self.gamma = gamma

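# Added sketch (not in the original wrapper): how such a two-agent wrapper is
# typically queried for joint actions. The `act(state, add_noise)` method on the
# underlying ddpg.ddpg objects is an assumed interface, not verified against
# that class.
def act(self, states, add_noise=True):
    # states: array-like of shape (n_agents, obs_dim); each sub-agent sees only its own row.
    return [agent.act(state, add_noise)
            for agent, state in zip(self.madagents, states)]
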
def generate_data(self, replay_buffer):
    # Define D_model: a separate buffer for model-generated transitions.
    model_buffer = buffer.ReplayBuffer(replay_buffer.buff_size,
                                       replay_buffer.state_shape,
                                       replay_buffer.action_shape,
                                       device=self.device)

    # Sample a starting batch of states from the real replay buffer.
    states, *_ = replay_buffer.sample_buffer(100)
    states = states.cpu().numpy()

    # Roll the states forward using the learned model.
    for h in range(H_steps):
        for b in range(100):
            action, _ = self.explore(states[b])
            next_state, reward = predict_next_state_and_reward(
                states[b], action, self.ensemble_models, self.device)
            next_state = next_state.cpu().numpy()[0]
            reward = reward.cpu().numpy()
            model_buffer.add(states[b], action, next_state, reward, 0.)
            states[b] = next_state

    return model_buffer

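# Added sketch of the `predict_next_state_and_reward` helper used above; its real
# definition is not shown here. It assumes each ensemble member maps a
# concatenated (state, action) tensor to a (next_state, reward) pair, and that
# one member is sampled per step, as in MBPO-style short rollouts.
def predict_next_state_and_reward(state, action, ensemble_models, device):
    x = torch.as_tensor(np.concatenate([state, action]),
                        dtype=torch.float32, device=device).unsqueeze(0)
    model = ensemble_models[np.random.randint(len(ensemble_models))]
    with torch.no_grad():
        next_state, reward = model(x)
    return next_state, reward
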
def __init__(
    self,
    state_size,
    action_size,
    random_seed,
    warm_up=BATCH_SIZE,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    num_agents=2,
):
    super(MADDPG, self).__init__()

    # critic input = obs_full + actions = 14+2+2+2=20
    self.shared_critic_local = model.Critic(state_size, action_size, random_seed).to(device)
    self.shared_critic_target = model.Critic(state_size, action_size, random_seed).to(device)
    self.shared_critic_optimizer = Adam(self.shared_critic_local.parameters(),
                                        lr=lr_critic, weight_decay=0)

    self.maddpg_agent = [
        ddpg.DDPGAgent(state_size, action_size, 12, warm_up, lr_actor, lr_critic,
                       self.shared_critic_local, self.shared_critic_target,
                       self.shared_critic_optimizer),
        ddpg.DDPGAgent(state_size, action_size, 0, warm_up, lr_actor, lr_critic,
                       self.shared_critic_local, self.shared_critic_target,
                       self.shared_critic_optimizer)
    ]

    self.discount_factor = GAMMA
    self.tau = TAU
    self.iter = 0
    self.num_agents = num_agents
    self.memory = buffer.ReplayBuffer(action_size, BUFFER_SIZE, random_seed)

def run(config):
    model_dir = Path('./MAAC/')
    if not model_dir.exists():
        current_run = 'run1'
    else:
        run_nums = [int(str(folder.name).split('run')[1])
                    for folder in model_dir.iterdir()
                    if str(folder.name).startswith('run')]
        if len(run_nums) == 0:
            current_run = 'run1'
        else:
            current_run = 'run%i' % (max(run_nums) + 1)
    run_dir = model_dir / current_run
    logs_dir = run_dir / 'logs'
    os.makedirs(logs_dir)
    writer = SummaryWriter(str(logs_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    cuda = bool(torch.cuda.is_available() and config.cuda)

    env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)

    maac = agent.AttentionAC.init_from_env(env_info, brain, norm=config.norm,
                                           gamma=config.gamma, tau=config.tau,
                                           lra=config.lra, lrc=config.lrc,
                                           hid1=config.hid1, hid2=config.hid2,
                                           hidc=config.hidc, att_heads=config.att_heads)
    repbuffer = buffer.ReplayBuffer(config.capacity, maac.n_agents,
                                    [brain.vector_observation_space_size for _ in range(maac.n_agents)],
                                    [brain.vector_action_space_size for _ in range(maac.n_agents)])

    for i, ag in enumerate(maac.agents):
        print('\nAgent %i:\n' % i)
        print(ag.actor)
    print('\n', maac.critic)

    episode = 0
    rewards_100 = deque(maxlen=100)
    while True:
        t = time.time()
        total_rewards = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        maac.prep_rollouts(device='cpu')
        while True:
            obs_v = [Variable(torch.Tensor(obs[agent_i, :]), requires_grad=False)
                     for agent_i in range(maac.n_agents)]
            # double check DDPG.step FloatTensor part instead of Variable
            actions, regularizer = maac.step(obs_v, explore=True)
            env_info = env.step(actions)[brain_name]
            next_obs = env_info.vector_observations
            rewards = env_info.rewards
            total_rewards += rewards
            dones = env_info.local_done
            repbuffer.add(obs, actions, rewards, next_obs, dones)
            if np.any(dones):
                episode_reward = np.max(total_rewards)
                rewards_100.append(episode_reward)
                writer.add_scalar('episode_reward', episode_reward, episode)
                print("\n\nDone episode %d for an episode reward of %.3f in %.2f seconds."
                      % (episode, episode_reward, (time.time() - t)))
                t = time.time()
                break
            obs = next_obs
            if repbuffer.filled > config.batch_size:
                maac.prep_training(device='gpu' if cuda else 'cpu')
                sample = repbuffer.sample(config.batch_size, to_gpu=cuda)
                maac.update_critic(sample, writer=writer)
                maac.update_actors(sample, writer=writer)
                maac.update_all_targets()
                maac.prep_rollouts(device='cpu')
        episode += 1
        for agent_i, r in enumerate(total_rewards):
            writer.add_scalar('agent%i-episode_rewards' % agent_i, r, episode)
            print('Agent %i: episode reward of %.2f.' % (agent_i, r))
        if np.mean(rewards_100) > 0.5:
            print("Solved the environment in %i episodes!" % episode)
            break

    maac.save(run_dir / 'tennisMAAC.pt')
    env.close()
    writer.export_scalars_to_json(str(logs_dir / 'summary.json'))
    writer.close()

ENV = 'MountainCar-v0'  # 'CartPole-v0', 'MountainCar-v0', 'BipedalWalker-v2'
env = gym.make(ENV)
env = env.unwrapped  # Restore the env's original settings; gym wraps the env in an anti-cheating layer.

MAX_EPISODES = 201
MAX_BUFFER = 10000
S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.n

print(' Env: ', ENV)
print(' State Dimension: ', S_DIM)
print(' Number of Action(discrete) : ', A_DIM)

ram = buffer.ReplayBuffer(MAX_BUFFER)
trainer = train.Trainer(S_DIM, A_DIM, ram)

RENDER = False
total_reward = []
total_step = []  # Record how many steps each episode needs to finish.

for ep in range(MAX_EPISODES):
    ep_r = 0
    ep_steps = 0
    s = env.reset()
    if ep > MAX_EPISODES - 10:
        RENDER = True
    while 1:
        if RENDER:
            env.render()
        s = np.float32(s)
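        # Added sketch: the loop body is truncated above. A typical continuation,
        # assuming Trainer exposes get_exploration_action()/optimize() and the
        # buffer's add() takes (s, a, r, s2) -- these names are assumptions, not
        # the original code.
        a = trainer.get_exploration_action(s)
        s2, r, done, _ = env.step(a)
        ram.add(s, a, r, np.float32(s2))
        trainer.optimize()
        ep_r += r
        ep_steps += 1
        s = s2
        if done:
            total_reward.append(ep_r)
            total_step.append(ep_steps)
            break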