Example #1
    def __init__(self,
                 state_shape,
                 action_shape,
                 device,
                 ensemble_models=None,
                 seed=0,
                 batch_size=256,
                 gamma=0.99,
                 lr=3e-4,
                 alpha=0.2,
                 buff_size=10**6,
                 start_steps=2 * 10**3,
                 tau=5e-3,
                 reward_scale=1.0):
        super().__init__()

        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.replay_buffer = buffer.ReplayBuffer(buff_size=buff_size,
                                                 state_shape=state_shape,
                                                 action_shape=action_shape,
                                                 device=device)
        self.actor = Actor_network(state_shape=state_shape,
                                   action_shape=action_shape).to(device)
        self.critic = Critic_network(state_shape=state_shape,
                                     action_shape=action_shape).to(device)
        self.critic_target = Critic_network(
            state_shape=state_shape,
            action_shape=action_shape).to(device).eval()

        self.critic_target.load_state_dict(self.critic.state_dict())
        for param in self.critic_target.parameters():
            param.requires_grad = False

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=lr)

        self.batch_size = batch_size
        self.learning_steps = 0
        self.device = device
        self.gamma = gamma
        self.lr = lr
        self.buff_size = buff_size
        self.start_steps = start_steps
        self.tau = tau
        self.alpha = alpha
        self.reward_scale = reward_scale
        self.ensemble_models = ensemble_models
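The frozen critic_target above is typically kept in sync with critic via Polyak averaging controlled by tau. Below is a minimal, self-contained sketch of that soft update; the nn.Linear modules are placeholders standing in for the Critic_network instances and are not part of the original example.

import torch
import torch.nn as nn

def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)

# Toy usage with placeholder networks.
critic = nn.Linear(4, 1)
critic_target = nn.Linear(4, 1)
critic_target.load_state_dict(critic.state_dict())
soft_update(critic_target, critic, tau=5e-3)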
Example #2
    def __init__(self, action_size=2, buffer_size=buffer_size, n_agents=2,
                 batch_size=batch_size, seed=2, update_every=1, gamma=1):

        self.madagents = [
            ddpg.ddpg(24, 2, 256, 128, 64),
            ddpg.ddpg(24, 2, 256, 128, 64)
        ]

        self.update_every = update_every
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.memory = buffer.ReplayBuffer(action_size,
                                          buffer_size,
                                          batch_size,
                                          seed=2)
        #self.t_step = 0
        self.n_agents = n_agents
        self.gamma = gamma
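The wrapper above only constructs buffer.ReplayBuffer(action_size, buffer_size, batch_size, seed); the add/sample interface it relies on is not shown. The stand-in below sketches one plausible implementation of that interface; the method names and return layout are assumptions, not taken from the original buffer module.

import random
from collections import deque, namedtuple

import numpy as np

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class MinimalReplayBuffer:
    # Minimal stand-in with the interface assumed by the multi-agent wrapper above.
    def __init__(self, buffer_size, batch_size, seed=2):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        # Return one stacked array per field: states, actions, rewards, next_states, dones.
        return tuple(np.asarray(field) for field in zip(*batch))

    def __len__(self):
        return len(self.memory)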
Example #3
    def generate_data(self, replay_buffer):
        # define the model-generated buffer (D_model)
        model_buffer = buffer.ReplayBuffer(replay_buffer.buff_size,
                                           replay_buffer.state_shape,
                                           replay_buffer.action_shape,
                                           device=self.device)
        # sample a batch of 100 starting states from the real replay buffer
        states, *_ = replay_buffer.sample_buffer(100)
        states = states.cpu().numpy()
        # roll the states forward for H_steps using the learned model ensemble
        for h in range(H_steps):
            for b in range(100):
                action, _ = self.explore(states[b])
                next_state, reward = predict_next_state_and_reward(
                    states[b], action, self.ensemble_models, self.device)
                next_state = next_state.cpu().numpy()[0]
                reward = reward.cpu().numpy()
                model_buffer.add(states[b], action, next_state, reward, 0.)
                states[b] = next_state
        return model_buffer
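generate_data rolls the sampled real states forward for H_steps with the learned ensemble and stores the imagined transitions in model_buffer. During updates, model-generated and real transitions are commonly mixed at a fixed ratio (MBPO-style); the helper below is a self-contained sketch of that mixing, with NumPy arrays standing in for the two buffers, and the ratio is an illustrative assumption.

import numpy as np

def mixed_batch(real, model, batch_size, real_ratio=0.05, rng=None):
    # Draw real_ratio * batch_size transitions from the real buffer, the rest from the model buffer.
    rng = rng or np.random.default_rng(0)
    n_real = int(batch_size * real_ratio)
    real_idx = rng.integers(0, len(real), size=n_real)
    model_idx = rng.integers(0, len(model), size=batch_size - n_real)
    return np.concatenate([real[real_idx], model[model_idx]], axis=0)

# Toy usage: each row is a flattened (state, action, reward, next_state, done) transition.
real_transitions = np.zeros((1000, 10), dtype=np.float32)
model_transitions = np.ones((5000, 10), dtype=np.float32)
batch = mixed_batch(real_transitions, model_transitions, batch_size=256)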
Example #4
    def __init__(
        self,
        state_size,
        action_size,
        random_seed,
        warm_up=BATCH_SIZE,
        lr_actor=LR_ACTOR,
        lr_critic=LR_CRITIC,
        num_agents=2,
    ):
        super(MADDPG, self).__init__()
        # critic input = obs_full + actions = 14+2+2+2=20
        self.shared_critic_local = model.Critic(state_size, action_size,
                                                random_seed).to(device)
        self.shared_critic_target = model.Critic(state_size, action_size,
                                                 random_seed).to(device)
        self.shared_critic_optimizer = Adam(
            self.shared_critic_local.parameters(),
            lr=lr_critic,
            weight_decay=0)
        self.maddpg_agent = [
            ddpg.DDPGAgent(state_size, action_size, 12, warm_up, lr_actor,
                           lr_critic, self.shared_critic_local,
                           self.shared_critic_target,
                           self.shared_critic_optimizer),
            ddpg.DDPGAgent(state_size, action_size, 0, warm_up, lr_actor,
                           lr_critic, self.shared_critic_local,
                           self.shared_critic_target,
                           self.shared_critic_optimizer)
        ]

        self.discount_factor = GAMMA
        self.tau = TAU
        self.iter = 0
        self.num_agents = num_agents
        self.memory = buffer.ReplayBuffer(action_size, BUFFER_SIZE,
                                          random_seed)
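Per the comment above, the shared critic consumes the full observation concatenated with every agent's action. The snippet below sketches assembling that centralized input with torch.cat, taking the 14 + 2 + 2 + 2 = 20 breakdown at face value; the tensors are random placeholders, not data from the environment.

import torch

batch = 32
obs_full = torch.randn(batch, 14)                      # joint observation
actions = [torch.randn(batch, 2) for _ in range(3)]    # three 2-dimensional action blocks

critic_input = torch.cat([obs_full] + actions, dim=1)
assert critic_input.shape == (batch, 20)               # 14 + 2 + 2 + 2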
Example #5
def run(config):
    model_dir = Path('./MAAC/')
    if not model_dir.exists():
        current_run = 'run1'
    else:
        run_nums = [int(str(folder.name).split('run')[1])
                    for folder in model_dir.iterdir() if str(folder.name).startswith('run')]
        if len(run_nums) == 0:
            current_run = 'run1'
        else:
            current_run = 'run%i' % (max(run_nums) + 1)
            
    run_dir = model_dir / current_run
    logs_dir = run_dir / 'logs'
    os.makedirs(logs_dir)
    
    writer = SummaryWriter(str(logs_dir))
    
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    
    cuda = torch.cuda.is_available() and config.cuda
    env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    
    maac = agent.AttentionAC.init_from_env(env_info, brain, norm=config.norm, gamma=config.gamma, tau=config.tau,
                                           lra=config.lra, lrc=config.lrc, hid1=config.hid1, hid2=config.hid2,
                                           hidc=config.hidc, att_heads=config.att_heads)
    
    repbuffer = buffer.ReplayBuffer(config.capacity, maac.n_agents,
                                 [brain.vector_observation_space_size for _ in range(maac.n_agents)],
                                 [brain.vector_action_space_size for _ in range(maac.n_agents)])
    
    for i, ag in enumerate(maac.agents):  # avoid shadowing the imported agent module
        print('\nAgent %i:\n' % i)
        print(ag.actor)
    print('\n', maac.critic)
    
    episode = 0
    rewards_100 = deque(maxlen=100)
    while True:
        t = time.time()
        total_rewards = np.zeros(num_agents)
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations
        maac.prep_rollouts(device='cpu')
        
        while True:
            obs_v = [Variable(torch.Tensor(obs[agent_i, :]), requires_grad=False) 
                     for agent_i in range(maac.n_agents)]
            actions, regularizer = maac.step(obs_v, explore=True) #double check DDPG.step FloatTensor part instead of Variable
            env_info = env.step(actions)[brain_name]
            next_obs = env_info.vector_observations
            rewards = env_info.rewards
            total_rewards += rewards
            dones = env_info.local_done

            repbuffer.add(obs, actions, rewards, next_obs, dones)
            if np.any(dones):
                episode_reward = np.max(total_rewards)
                rewards_100.append(episode_reward)
                writer.add_scalar('episode_reward', episode_reward, episode)
                print("\n\nDone episode %d for an episode reward of %.3f in %.2f seconds."
                      % (episode, episode_reward, (time.time() - t)))
                t = time.time()
                break
            
            obs = next_obs
            if repbuffer.filled > config.batch_size:
                if cuda:
                    maac.prep_training(device='gpu')
                else:
                    maac.prep_training(device='cpu')
                
                sample = repbuffer.sample(config.batch_size, to_gpu=cuda)
                maac.update_critic(sample, writer=writer)
                maac.update_actors(sample, writer=writer)
                maac.update_all_targets()
                maac.prep_rollouts(device='cpu')
                
        episode += 1
        for agent_i, r in enumerate(total_rewards):
            writer.add_scalar('agent%i-episode_rewards' % agent_i, r, episode)
            print('Agent %i: episode reward of %.2f.' % (agent_i, r))
            
        if len(rewards_100) == 100 and np.mean(rewards_100) > 0.5:
            print("Solved the environment in %i episodes!" % episode)
            break
            
    maac.save(run_dir / 'tennisMAAC.pt')
    env.close()
    writer.export_scalars_to_json(str(logs_dir / 'summary.json'))
    writer.close()    
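run() reads a number of attributes off config: seed, cuda, norm, gamma, tau, lra, lrc, hid1, hid2, hidc, att_heads, capacity, and batch_size. The argparse sketch below supplies exactly those fields; the default values and the boolean treatment of cuda and norm are assumptions, not taken from the original training script.

import argparse

def parse_config():
    # Field names mirror the attributes accessed in run(); defaults are illustrative only.
    parser = argparse.ArgumentParser(description='MAAC training on the Unity Tennis environment')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--norm', action='store_true')
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=1e-3)
    parser.add_argument('--lra', type=float, default=1e-3)     # actor learning rate
    parser.add_argument('--lrc', type=float, default=1e-3)     # critic learning rate
    parser.add_argument('--hid1', type=int, default=128)
    parser.add_argument('--hid2', type=int, default=128)
    parser.add_argument('--hidc', type=int, default=128)
    parser.add_argument('--att_heads', type=int, default=4)
    parser.add_argument('--capacity', type=int, default=int(1e6))
    parser.add_argument('--batch_size', type=int, default=256)
    return parser.parse_args()

if __name__ == '__main__':
    run(parse_config())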
Example #6
ENV = 'MountainCar-v0'  # 'CartPole-v0', 'MountainCar-v0', 'BipedalWalker-v2'
env = gym.make(ENV)
env = env.unwrapped  # restore the raw env settings; gym wraps the env in an outer "anti-cheating" layer (e.g. step limits)

MAX_EPISODES = 201
MAX_BUFFER = 10000

S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.n

print(' Env: ', ENV)
print(' State Dimension: ', S_DIM)
print(' Number of Actions (discrete): ', A_DIM)

ram = buffer.ReplayBuffer(MAX_BUFFER)
trainer = train.Trainer(S_DIM, A_DIM, ram)

RENDER = False

total_reward = []
total_step = []  # record how many steps each episode takes to finish
for ep in range(MAX_EPISODES):
    ep_r = 0
    ep_steps = 0
    s = env.reset()
    if ep > MAX_EPISODES - 10: RENDER = True

    while True:
        if RENDER: env.render()
        s = np.float32(s)