Code Example #1
File: ddpg_v3.py Project: marsXyr/GESRL
    def __init__(self, state_dim, action_dim, max_action, args):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

        self._init_parameters(args)
        self._init_nets(args)

        self.replay_buffer = ReplayBuffer(self.buffer_size, self.state_dim,
                                          self.action_dim)
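
The ReplayBuffer class itself is not part of this excerpt. As a reference, a minimal NumPy sketch with the same constructor signature, and a sample() returning the (states, next_states, actions, rewards, dones) tuple consumed by train() in Code Example #11, might look like the following; the add() method name and the array layout are assumptions, and the project's real buffer presumably returns torch tensors (note the dones.float() call in train()).

import numpy as np

class MinimalReplayBuffer:
    """Fixed-size ring buffer of (s, s', a, r, done) transitions (sketch)."""

    def __init__(self, buffer_size, state_dim, action_dim):
        self.buffer_size = buffer_size
        self.ptr = 0    # next slot to overwrite
        self.size = 0   # number of stored transitions
        self.states = np.zeros((buffer_size, state_dim), dtype=np.float32)
        self.next_states = np.zeros((buffer_size, state_dim), dtype=np.float32)
        self.actions = np.zeros((buffer_size, action_dim), dtype=np.float32)
        self.rewards = np.zeros((buffer_size, 1), dtype=np.float32)
        self.dones = np.zeros((buffer_size, 1), dtype=np.float32)

    def add(self, state, next_state, action, reward, done):
        i = self.ptr
        self.states[i], self.next_states[i] = state, next_state
        self.actions[i], self.rewards[i], self.dones[i] = action, reward, done
        self.ptr = (self.ptr + 1) % self.buffer_size   # wrap and overwrite oldest
        self.size = min(self.size + 1, self.buffer_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.states[idx], self.next_states[idx], self.actions[idx],
                self.rewards[idx], self.dones[idx])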
Code Example #2
def run(config):
    model_dir = Path('./results') / config.env_id
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    fig_dir = run_dir / 'figures'
    os.makedirs(str(fig_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    assert config.n_rollout_threads == 1, "For this simple test, we assume the number of environments is 1"
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed)

    controller = Controller.init_from_env(env=env, config=config)
    obs_shape, n_actions = controller.obs_shape, controller.n_actions
    buffer = ReplayBuffer(controller.n_agents, obs_shape, n_actions, config.episode_limit, config.buffer_size)
    rolloutworker = RolloutWorker(env, controller, config)

    train_step = 0
    mean_episode_rewards = []
    for ep_i in range(config.n_episodes):
        episode, ep_rew, mean_ep_rew = rolloutworker.generate_episode()
        buffer.push(episode)
        for step in range(config.n_train_steps):
            mini_batch = buffer.sample(min(len(buffer), config.batch_size))
            controller.update(mini_batch, train_step)
            train_step += 1
        # ep_rew = buffer.get_average_rewards(config.episode_limit * config.n_rollout_threads)
        mean_episode_rewards.append(mean_ep_rew)
        print("Episode {} : Total reward {} , Mean reward {}" .format(ep_i + 1, ep_rew, mean_ep_rew))

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            controller.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            controller.save(str(run_dir / 'model.pt'))

    controller.save(str(run_dir / 'model.pt'))
    env.close()

    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Reward")
    plt.savefig(str(fig_dir) + '/mean_episode_reward.jpg')
    # plt.show()
    plt.close()
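
The buffer in this runner stores whole episodes (as produced by RolloutWorker.generate_episode()) rather than single transitions, the usual pattern for recurrent QMIX-style controllers. The class is not shown; a minimal sketch supporting the same calls, push(episode), len(buffer) and sample(n), is given below. The real implementation presumably pre-allocates arrays of shape (buffer_size, episode_limit, n_agents, ...) and returns stacked batches instead of a plain list.

import random
from collections import deque

class EpisodeReplayBufferSketch:
    """Keeps the most recent buffer_size episodes; sample() returns a list of them."""

    def __init__(self, n_agents, obs_shape, n_actions, episode_limit, buffer_size):
        # The shape arguments would size pre-allocated arrays in a real buffer;
        # this sketch only records them.
        self.n_agents, self.obs_shape = n_agents, obs_shape
        self.n_actions, self.episode_limit = n_actions, episode_limit
        self.episodes = deque(maxlen=buffer_size)

    def push(self, episode):
        # `episode` is whatever generate_episode() returns, e.g. a dict of
        # per-step arrays padded to episode_limit
        self.episodes.append(episode)

    def __len__(self):
        return len(self.episodes)

    def sample(self, batch_size):
        return random.sample(list(self.episodes), batch_size)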
Code Example #3
File: dqn.py Project: sumeetpathania/DDQN
    def __init__(self, state_size, action_size, action_space, args):
        self.device = torch.device("cuda" if args.cuda else "cpu")
        self.buffer = ReplayBuffer(args.buffer_size, args.batch_size, self.device)

        self.action_size = action_size
        self.gamma = args.gamma
        self.tau = args.tau

        self.eps = EpsilonController(e_decays=args.eps_decays, e_min=args.eps_min)

        self.q_local = QNetwork(state_size, action_size, args.hidden_size).to(self.device)
        self.q_optimizer = optim.Adam(self.q_local.parameters(), lr=args.lr)
        self.q_target = copy.deepcopy(self.q_local)
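
EpsilonController is only constructed here; its definition lives elsewhere in the project. Judging from its arguments, it most likely decays the epsilon-greedy exploration rate from 1.0 down to e_min over e_decays steps; a linear-decay sketch (the method names are assumptions) would be:

class EpsilonControllerSketch:
    """Linearly decays epsilon from e_start to e_min over e_decays calls (sketch)."""

    def __init__(self, e_decays, e_min, e_start=1.0):
        self.eps = e_start
        self.e_min = e_min
        self.step = (e_start - e_min) / float(e_decays)

    def value(self):
        return self.eps

    def decay(self):
        # called once per environment step or per episode, depending on the agent
        self.eps = max(self.e_min, self.eps - self.step)
        return self.eps

The agent would then act greedily with respect to q_local with probability 1 - eps and uniformly at random otherwise.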
Code Example #4
    def __init__(self):
        super(DDPG, self).__init__(
            actor=Actor(),
            critic=Critic(),
        )
        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)
        disable_train(self.target_actor)
        disable_train(self.target_critic)

        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(A_DIM))
        self.buffer = ReplayBuffer(BUFFER_SIZE)
        self.time = 0
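
OrnsteinUhlenbeckActionNoise produces temporally correlated exploration noise for the deterministic actor, as in the original DDPG paper. Its definition is not included in this excerpt; a standard sketch of the process x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1) follows (the default theta, sigma and dt values are assumptions), and the agent would typically compute action = actor(state) + noise():

import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process for action-space exploration (sketch)."""

    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.reset()

    def reset(self):
        # start the process at its long-run mean
        self.x = np.copy(self.mu)

    def __call__(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x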
Code Example #5
File: base_env.py Project: Makiah/MAAC
    def push_to_replay_buffer(self, buffer: ReplayBuffer):
        for i in range(len(self.frames)):
            frame = self.frames[i]
            next_frame = self.frames[i + 1] if i < len(self.frames) - 1 else None
            for k in frame.keys():
                if next_frame is None:
                    frame[k].next_obs = [0] * len(frame[k].obs)
                elif k not in next_frame:
                    assert frame[k].done
                    frame[k].next_obs = [0] * len(frame[k].obs)
                else:
                    frame[k].next_obs = next_frame[k].obs
            buffer.push({k: v.build() for k, v in frame.items()})
Code Example #6
File: buffer_test.py Project: Makiah/MAAC
    def test_buffer(self):
        data = \
            {AgentKey(0, '0-1'): AgentReplayFrame([2, 1, 2, 2, 3], [0, 1, 0], 3, False, [3, 1, 1, 2, 3]),
             AgentKey(0, '0-2'): AgentReplayFrame([1, 1, 3, 2, 1], [0, 1, 0], 4, False, [2, 1, 1, 2, 2]),
             AgentKey(1, '0-1'): AgentReplayFrame([2, 0, 3, 1, 2], [0, 1], 5, False, [3, 0, 1, 3, 4])}

        max_steps = 4
        buffer = ReplayBuffer(max_steps)
        for i in range(5):
            buffer.push(data)
            self.assertEqual(buffer.length(), min(i + 1, max_steps))

        sample: List[Dict[AgentKey,
                          AgentReplayFrame]] = buffer.sample(2,
                                                             norm_rews=False)
        for s in sample:
            for k, v in s.items():
                self.assertEqual(v.reward, data[k].reward)

        sample: List[Dict[AgentKey,
                          AgentReplayFrame]] = buffer.sample(2, norm_rews=True)
        for s in sample:
            for k, v in s.items():
                self.assertEqual(v.reward, 0)

        avg_rewards = buffer.get_average_rewards(3)
        for k, v in avg_rewards.items():
            self.assertEqual(v, data[k].reward)
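
This test pins down the buffer interface used in Code Examples #5 and #6: length(), push() of a dict keyed by agent, sample(n, norm_rews=...) and get_average_rewards(n). A deque-based sketch that would satisfy these assertions is shown below; FrameSketch stands in for AgentReplayFrame, the real classes carry more structure, and reward normalization only returns exactly 0 here because every stored reward per key is identical (zero numerator).

import copy
import random
from collections import deque
from dataclasses import dataclass
from typing import Dict, Hashable, List

@dataclass
class FrameSketch:
    """Stand-in for AgentReplayFrame: (obs, action, reward, done, next_obs)."""
    obs: list
    action: list
    reward: float
    done: bool
    next_obs: list

class DictReplayBufferSketch:
    """Keeps the most recent max_steps frame-dicts, one FrameSketch per agent key."""

    def __init__(self, max_steps):
        self.frames = deque(maxlen=max_steps)

    def length(self):
        return len(self.frames)

    def push(self, frame: Dict[Hashable, FrameSketch]):
        self.frames.append(copy.deepcopy(frame))

    def sample(self, n, norm_rews=False) -> List[Dict[Hashable, FrameSketch]]:
        batch = [copy.deepcopy(f) for f in random.sample(list(self.frames), n)]
        if norm_rews:
            # standardize each agent's reward over the whole buffer
            for key in batch[0]:
                rews = [f[key].reward for f in self.frames]
                mean = sum(rews) / len(rews)
                std = (sum((r - mean) ** 2 for r in rews) / len(rews)) ** 0.5
                for f in batch:
                    f[key].reward = (f[key].reward - mean) / (std + 1e-8)
        return batch

    def get_average_rewards(self, n) -> Dict[Hashable, float]:
        recent = list(self.frames)[-n:]
        return {key: sum(f[key].reward for f in recent) / len(recent)
                for key in recent[0]}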
Code Example #7
    def __init__(self, env, config, logger=None):
        """
    Initialize Policy Gradient Class

    Args:
            env: an OpenAI Gym environment
            config: class with hyperparameters
            use_mask: train time, omit velocity features in state
            logger: logger instance from the logging module

    You do not need to implement anything in this function. However,
    you will need to use self.discrete, self.observation_dim,
    self.action_dim, and self.lr in other methods.

    """
        # directory for training outputs
        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)

        # store hyperparameters
        self.config = config
        if self.config.use_mask:
            print('Using mask...')
        self.logger = logger
        if logger is None:
            self.logger = get_logger(config.log_path)
        self.env = env

        # discrete vs continuous action space
        self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.observation_dim = get_obs_dims(self.config.env_name,
                                            self.config.use_mask)
        self.action_dim = (self.env.action_space.n if self.discrete
                           else self.env.action_space.shape[0])
        self.lr = self.config.learning_rate

        # for milestone: capture raw tuple embedding
        self.memory_dim = 6  #self.observation_dim * 2 + self.action_dim + 1 + 1 # (s, a, r, s', done_mask)
        self.replay_buffer = ReplayBuffer(self.config.memory_len + 1,
                                          1,
                                          action_dim=self.action_dim)
        self.percolated_buffer = ReplayBuffer(self.config.percolate_len + 1,
                                              1,
                                              action_dim=self.action_dim)

        # build model
        self.build()
Code Example #8
    def __init__(self, num_agents, state_size, action_size, opts):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.opts = opts
        self.closs = np.inf
        self.aloss = np.inf

        self.eps = 1
        self.eps_decay = 0.998
        self.min_eps = 0.01

        # Actor Network
        self.actor_local = ActorNet(state_size,
                                    action_size,
                                    fc1_units=opts.a_fc1,
                                    fc2_units=opts.a_fc2).to(opts.device)
        self.actor_target = ActorNet(state_size,
                                     action_size,
                                     fc1_units=opts.a_fc1,
                                     fc2_units=opts.a_fc2).to(opts.device)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=opts.actor_lr)

        # Critic Network
        self.critic_local = CriticNet(state_size,
                                      action_size,
                                      fc1_units=opts.c_fc1,
                                      fc2_units=opts.c_fc2).to(opts.device)
        self.critic_target = CriticNet(state_size,
                                       action_size,
                                       fc1_units=opts.c_fc1,
                                       fc2_units=opts.c_fc2).to(opts.device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(),
            lr=opts.critic_lr,
            weight_decay=opts.critic_weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), opts.random_seed)
        self.step_idx = 0

        # Replay memory
        self.memory = ReplayBuffer(action_size, opts.buffer_size,
                                   opts.batch_size, opts.random_seed,
                                   opts.device)
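
The acting path for this agent is not part of the excerpt. With the fields defined above (eps, eps_decay, min_eps and the OU noise process), a typical DDPG act step scales the noise by the decaying epsilon and clips the result; the sketch below assumes the project's OUNoise exposes a sample() method and that actions live in [-1, 1].

import numpy as np
import torch

def act_sketch(agent, states, add_noise=True):
    """One acting step for the agent constructed above (sketch, not project code)."""
    states_t = torch.from_numpy(states).float().to(agent.opts.device)
    agent.actor_local.eval()
    with torch.no_grad():
        actions = agent.actor_local(states_t).cpu().numpy()
    agent.actor_local.train()
    if add_noise:
        actions += agent.eps * agent.noise.sample()                 # epsilon-scaled OU noise
        agent.eps = max(agent.min_eps, agent.eps * agent.eps_decay)
    return np.clip(actions, -1.0, 1.0)                              # assumed action range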
Code Example #9
def run(config):
    """

    :param config:
    """
    # model_dir = Path('./models') / config.env_id / config.model_name
    env = make_env(config.env_id)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if all([hasattr(a, 'adversary') for a in env.agents]):
        agent_types = [
            'adversary' if a.adversary else 'agent' for a in env.agents
        ]
    else:
        agent_types = ['agent' for _ in env.agents]

    maddpg = MADDPG.init_from_env(env,
                                  agent_types,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.num_agent)

    for ep_i in range(config.n_episodes):
        print("Episodes %i of %i" % (ep_i + 1, config.n_episodes))
        observations = env.reset()

        for et_i in range(config.episode_length):
            torch_observations = [
                torch.from_numpy(observations[i]).float()
                for i in range(maddpg.num_agent)
            ]
            torch_agent_actions = maddpg.step(torch_observations)
            agent_actions = [
                action.data.numpy() for action in torch_agent_actions
            ]
            next_observations, rewards, dones, infos = env.step(agent_actions)

            replay_buffer.push_data(observations, agent_actions, rewards,
                                    next_observations, dones)

            observations = next_observations

            if replay_buffer.get_size() >= config.batch_size:
                for a_i in range(maddpg.num_agent):
                    sample = replay_buffer.sample(config.batch_size)
                    maddpg.update(sample, agent_i=a_i)
                maddpg.update_all_agent()
        print("Episode rewards ")
        print(replay_buffer.get_episode_rewards(config.episode_length))

    env.close()
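
push_data, get_size and get_episode_rewards suggest a flat transition buffer with a per-agent reward summary over the most recent episode. Purely as an illustration (the layout is an assumption, not this project's code), get_episode_rewards could be implemented as:

import numpy as np

def get_episode_rewards_sketch(stored_rewards, episode_length):
    """Mean reward per agent over the last `episode_length` stored steps.

    stored_rewards: list of per-step reward lists, one float per agent.
    """
    recent = np.asarray(stored_rewards[-episode_length:], dtype=np.float32)
    return recent.mean(axis=0)   # shape (num_agents,)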
Code Example #10
    def __init__(self,
                 state_size,
                 action_size,
                 action_space,
                 args,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2):
        self.device = torch.device("cuda" if args.cuda else "cpu")
        self.buffer = ReplayBuffer(args.buffer_size, args.batch_size,
                                   self.device)

        self.action_size = action_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.start_steps = args.start_steps

        self.total_it = 0
        self.max_action = float(action_space.high[0])
        self.policy_noise = policy_noise * self.max_action  # Target policy smoothing is scaled wrt the action scale
        self.noise_clip = noise_clip * self.max_action
        self.policy_freq = policy_freq
        self.expl_noise = args.expl_noise

        self.ce = nn.CrossEntropyLoss()
        self.mse = nn.MSELoss()

        self.policy = Actor(state_size, action_space.shape[0],
                            args.hidden_size, action_space).to(self.device)
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=args.lr)
        self.policy_target = copy.deepcopy(self.policy)

        self.critic_local = QNetwork(state_size, action_space.shape[0],
                                     args.hidden_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=args.lr)
        self.critic_target = copy.deepcopy(self.critic_local)
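
policy_noise, noise_clip and policy_freq are the three TD3-specific ingredients: target policy smoothing, clipped double-Q learning and delayed policy updates. The update method is not included in this excerpt; a condensed sketch of how these fields enter an update, assuming the critic returns two Q heads as in standard TD3, is:

import torch

def td3_update_sketch(self, states, actions, rewards, next_states, dones):
    """One TD3 update step using the fields set in __init__ above (sketch)."""
    self.total_it += 1
    with torch.no_grad():
        # target policy smoothing: perturb target actions with clipped noise
        noise = (torch.randn_like(actions) * self.policy_noise
                 ).clamp(-self.noise_clip, self.noise_clip)
        next_actions = (self.policy_target(next_states) + noise
                        ).clamp(-self.max_action, self.max_action)
        # clipped double-Q target
        q1_next, q2_next = self.critic_target(next_states, next_actions)
        q_target = rewards + self.gamma * (1 - dones) * torch.min(q1_next, q2_next)

    q1, q2 = self.critic_local(states, actions)
    critic_loss = self.mse(q1, q_target) + self.mse(q2, q_target)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # delayed policy update every policy_freq critic steps
    if self.total_it % self.policy_freq == 0:
        q1_pi, _ = self.critic_local(states, self.policy(states))
        policy_loss = -q1_pi.mean()
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
        # Polyak averaging of the target networks (see the soft_update sketch
        # after Code Example #11) would follow here.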
Code Example #11
File: ddpg_v3.py Project: marsXyr/GESRL
class DDPG:
    def __init__(self, state_dim, action_dim, max_action, args):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

        self._init_parameters(args)
        self._init_nets(args)

        self.replay_buffer = ReplayBuffer(self.buffer_size, self.state_dim,
                                          self.action_dim)

    def _init_parameters(self, args):
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.discount = args.discount
        self.tau = args.tau
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size

    def _init_nets(self, args):
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action,
                           args)
        self.actor_t = Actor(self.state_dim, self.action_dim, self.max_action,
                             args)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_lr)

        self.critic = Critic(self.state_dim, self.action_dim, args)
        self.critic_t = Critic(self.state_dim, self.action_dim, args)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_lr)

        self.loss = nn.MSELoss()

        hard_update(self.actor_t, self.actor)
        hard_update(self.critic_t, self.critic)

    def train(self):
        states, n_states, actions, rewards, dones = self.replay_buffer.sample(
            self.batch_size)
        # Compute q target
        next_q = self.critic_t(n_states, self.actor_t(n_states))
        q_target = (rewards + self.discount *
                    (1 - dones.float()) * next_q).detach()
        # Compute q predict
        q_predict = self.critic(states, actions)

        # Critic update
        critic_loss = self.loss(q_predict, q_target)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # Actor update
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        actor_grad = self.actor.get_grads()
        self.actor_optim.step()

        soft_update(self.actor_t, self.actor, self.tau)
        soft_update(self.critic_t, self.critic, self.tau)

        return actor_grad
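
hard_update and soft_update are imported from elsewhere in the project; they are the usual target-network helpers, sketched here for reference (Polyak averaging with coefficient tau):

def hard_update(target, source):
    """Copy the source network's parameters into the target network."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)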
Code Example #12
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    envActionSpace = env.action_space
    envObservationSpace = env.observation_space

    model = AttentionSAC.init_from_env(
        envActionSpace,
        envObservationSpace,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,  #128
        critic_hidden_dim=config.critic_hidden_dim,  #128
        attend_heads=config.attend_heads,  #4
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):  #12
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):  #25
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads
                ):  # 100 steps across rollouts -> 4 updates
                model.prep_training(device='cpu')

                for u_i in range(config.num_updates):  #4
                    sample = replay_buffer.sample(config.batch_size)
                    model.update_critic(sample)
                    model.update_policies(sample)
                    model.update_all_targets()

                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
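
The MAAC-style ReplayBuffer above keeps one set of circular arrays per agent, sized by the per-agent observation and action dimensions passed to the constructor. The condensed NumPy sketch below shows that layout for a single rollout thread, with the next-observation and done arrays omitted for brevity; the real buffer pushes a whole batch of threads per call and also supports reward normalization and GPU transfer.

import numpy as np

class PerAgentReplayBufferSketch:
    """One circular array per agent and per field; transitions written in lock-step."""

    def __init__(self, max_steps, num_agents, obs_dims, ac_dims):
        self.max_steps, self.num_agents = max_steps, num_agents
        self.obs_buffs = [np.zeros((max_steps, d), dtype=np.float32) for d in obs_dims]
        self.ac_buffs = [np.zeros((max_steps, d), dtype=np.float32) for d in ac_dims]
        self.rew_buffs = [np.zeros(max_steps, dtype=np.float32) for _ in range(num_agents)]
        self.ptr, self.filled = 0, 0

    def __len__(self):
        return self.filled

    def push(self, obs, actions, rewards):
        # obs / actions / rewards are indexed [agent]; one env step per call
        for a in range(self.num_agents):
            self.obs_buffs[a][self.ptr] = obs[a]
            self.ac_buffs[a][self.ptr] = actions[a]
            self.rew_buffs[a][self.ptr] = rewards[a]
        self.ptr = (self.ptr + 1) % self.max_steps
        self.filled = min(self.filled + 1, self.max_steps)

    def get_average_rewards(self, n):
        # mean reward per agent over the n most recently written steps
        m = min(n, self.filled)
        recent = np.arange(self.ptr - m, self.ptr) % self.max_steps
        return [self.rew_buffs[a][recent].mean() for a in range(self.num_agents)]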
Code Example #13
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(1804)
    np.random.seed(1804)
    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # initialize replay buffer D
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # T_update
    t = 0
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        success = np.zeros((config.n_rollout_threads, model.nagents),
                           dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # calculate steps
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost

            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs

            # T_update = T_update + E
            t += config.n_rollout_threads

            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #     (t % config.steps_per_update) < config.n_rollout_threads):
            #     if config.use_gpu:
            #         model.prep_training(device='gpu')
            #     else:
            #         model.prep_training(device='cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            #     # for u_i in range(config.num_updates):
            #     #     sample = replay_buffer.sample(config.batch_size,
            #     #                                   to_gpu=config.use_gpu)
            #     #     model.update_critic(sample, logger=logger)
            #     #     model.update_policies(sample, logger=logger)
            #     #     model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)
        # ep_mean_step

        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # for a_i, a_ep_done in enumerate(ep_dones):
        # logger.add_scalar('agent%i/mean_episode_dones' % a_i, a_ep_done, ep_i)
        # for a_i, a_ep_step in enumerate(ep_steps):
        # logger.add_scalar('agent%i/mean_episode_steps' % a_i, a_ep_step, ep_i)

        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)

        max_step += np.max(steps)
        max_time += np.max(time_cost)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')

    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads

    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Code Example #14
File: main.py Project: wsg1873/Multi-Explore
def run(config):
    torch.set_num_threads(1)
    env_descr = 'map%i_%iagents_task%i' % (config.map_ind, config.num_agents,
                                           config.task_config)
    model_dir = Path('./models') / config.env_type / env_descr / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config, run_num)
    if config.nonlinearity == 'relu':
        nonlin = torch.nn.functional.relu
    elif config.nonlinearity == 'leaky_relu':
        nonlin = torch.nn.functional.leaky_relu
    if config.intrinsic_reward == 0:
        n_intr_rew_types = 0
        sep_extr_head = True
    else:
        n_intr_rew_types = len(config.explr_types)
        sep_extr_head = False
    n_rew_heads = n_intr_rew_types + int(sep_extr_head)

    model = SAC.init_from_env(env,
                              nagents=config.num_agents,
                              tau=config.tau,
                              hard_update_interval=config.hard_update,
                              pi_lr=config.pi_lr,
                              q_lr=config.q_lr,
                              phi_lr=config.phi_lr,
                              adam_eps=config.adam_eps,
                              q_decay=config.q_decay,
                              phi_decay=config.phi_decay,
                              gamma_e=config.gamma_e,
                              gamma_i=config.gamma_i,
                              pol_hidden_dim=config.pol_hidden_dim,
                              critic_hidden_dim=config.critic_hidden_dim,
                              nonlin=nonlin,
                              reward_scale=config.reward_scale,
                              head_reward_scale=config.head_reward_scale,
                              beta=config.beta,
                              n_intr_rew_types=n_intr_rew_types,
                              sep_extr_head=sep_extr_head)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 env.state_space,
                                 env.observation_space,
                                 env.action_space)
    intr_rew_rms = [[RunningMeanStd()
                     for i in range(config.num_agents)]
                    for j in range(n_intr_rew_types)]
    eps_this_turn = 0  # episodes so far this turn
    active_envs = np.ones(config.n_rollout_threads)  # binary indicator of whether env is active
    env_times = np.zeros(config.n_rollout_threads, dtype=int)
    env_ep_extr_rews = np.zeros(config.n_rollout_threads)
    env_extr_rets = np.zeros(config.n_rollout_threads)
    env_ep_intr_rews = [[np.zeros(config.n_rollout_threads) for i in range(config.num_agents)]
                        for j in range(n_intr_rew_types)]
    recent_ep_extr_rews = deque(maxlen=100)
    recent_ep_intr_rews = [[deque(maxlen=100) for i in range(config.num_agents)]
                           for j in range(n_intr_rew_types)]
    recent_ep_lens = deque(maxlen=100)
    recent_found_treasures = [deque(maxlen=100) for i in range(config.num_agents)]
    meta_turn_rets = []
    extr_ret_rms = [RunningMeanStd() for i in range(n_rew_heads)]
    t = 0
    steps_since_update = 0

    state, obs = env.reset()

    while t < config.train_time:
        model.prep_rollouts(device='cuda' if config.gpu_rollout else 'cpu')
        # convert to torch tensor
        torch_obs = apply_to_all_elements(obs, lambda x: torch.tensor(x, dtype=torch.float32, device='cuda' if config.gpu_rollout else 'cpu'))
        # get actions as torch tensors
        torch_agent_actions = model.step(torch_obs, explore=True)
        # convert actions to numpy arrays
        agent_actions = apply_to_all_elements(torch_agent_actions, lambda x: x.cpu().data.numpy())
        # rearrange actions to be per environment
        actions = [[ac[i] for ac in agent_actions] for i in range(int(active_envs.sum()))]
        try:
            with timeout(seconds=1):
                next_state, next_obs, rewards, dones, infos = env.step(actions, env_mask=active_envs)
        # either environment got stuck or vizdoom crashed (vizdoom is unstable w/ multi-agent scenarios)
        except (TimeoutError, ViZDoomErrorException, ViZDoomIsNotRunningException, ViZDoomUnexpectedExitException) as e:
            print("Environments are broken...")
            env.close(force=True)
            print("Closed environments, starting new...")
            env = make_parallel_env(config, run_num)
            state, obs = env.reset()
            env_ep_extr_rews[active_envs.astype(bool)] = 0.0
            env_extr_rets[active_envs.astype(bool)] = 0.0
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] = 0.0
            env_times = np.zeros(config.n_rollout_threads, dtype=int)
            state = apply_to_all_elements(state, lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs, lambda x: x[active_envs.astype(bool)])
            continue

        steps_since_update += int(active_envs.sum())
        if config.intrinsic_reward == 1:
            # if using state-visit counts, store state indices
            # shape = (n_envs, n_agents, n_inds)
            state_inds = np.array([i['visit_count_lookup'] for i in infos],
                                  dtype=int)
            state_inds_t = state_inds.transpose(1, 0, 2)
            novelties = get_count_based_novelties(env, state_inds_t, device='cpu')
            intr_rews = get_intrinsic_rewards(novelties, config, intr_rew_rms,
                                              update_irrms=True, active_envs=active_envs,
                                              device='cpu')
            intr_rews = apply_to_all_elements(intr_rews, lambda x: x.numpy().flatten())
        else:
            intr_rews = None
            state_inds = None
            state_inds_t = None

        replay_buffer.push(state, obs, agent_actions, rewards, next_state, next_obs, dones,
                           state_inds=state_inds)
        env_ep_extr_rews[active_envs.astype(bool)] += np.array(rewards)
        env_extr_rets[active_envs.astype(bool)] += np.array(rewards) * config.gamma_e**(env_times[active_envs.astype(bool)])
        env_times += active_envs.astype(int)
        if intr_rews is not None:
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] += intr_rews[i][j]
        over_time = env_times >= config.max_episode_length
        full_dones = np.zeros(config.n_rollout_threads)
        for i, env_i in enumerate(np.where(active_envs)[0]):
            full_dones[env_i] = dones[i]
        need_reset = np.logical_or(full_dones, over_time)
        # create masks ONLY for active envs
        active_over_time = env_times[active_envs.astype(bool)] >= config.max_episode_length
        active_need_reset = np.logical_or(dones, active_over_time)
        if any(need_reset):
            try:
                with timeout(seconds=1):
                    # reset any environments that are past the max number of time steps or done
                    state, obs = env.reset(need_reset=need_reset)
            # either environment got stuck or vizdoom crashed (vizdoom is unstable w/ multi-agent scenarios)
            except (TimeoutError, ViZDoomErrorException, ViZDoomIsNotRunningException, ViZDoomUnexpectedExitException) as e:
                print("Environments are broken...")
                env.close(force=True)
                print("Closed environments, starting new...")
                env = make_parallel_env(config, run_num)
                state, obs = env.reset()
                # other envs that were force reset (rest taken care of in subsequent code)
                other_reset = np.logical_not(need_reset)
                env_ep_extr_rews[other_reset.astype(bool)] = 0.0
                env_extr_rets[other_reset.astype(bool)] = 0.0
                for i in range(n_intr_rew_types):
                    for j in range(config.num_agents):
                        env_ep_intr_rews[i][j][other_reset.astype(bool)] = 0.0
                env_times = np.zeros(config.n_rollout_threads, dtype=int)
        else:
            state, obs = next_state, next_obs
        for env_i in np.where(need_reset)[0]:
            recent_ep_extr_rews.append(env_ep_extr_rews[env_i])
            meta_turn_rets.append(env_extr_rets[env_i])
            if intr_rews is not None:
                for j in range(n_intr_rew_types):
                    for k in range(config.num_agents):
                        # record intrinsic rewards per step (so we don't confuse shorter episodes with less intrinsic rewards)
                        recent_ep_intr_rews[j][k].append(env_ep_intr_rews[j][k][env_i] / env_times[env_i])
                        env_ep_intr_rews[j][k][env_i] = 0

            recent_ep_lens.append(env_times[env_i])
            env_times[env_i] = 0
            env_ep_extr_rews[env_i] = 0
            env_extr_rets[env_i] = 0
            eps_this_turn += 1

            if eps_this_turn + active_envs.sum() - 1 >= config.metapol_episodes:
                active_envs[env_i] = 0

        for i in np.where(active_need_reset)[0]:
            for j in range(config.num_agents):
                # len(infos) = number of active envs
                recent_found_treasures[j].append(infos[i]['n_found_treasures'][j])

        if eps_this_turn >= config.metapol_episodes:
            if not config.uniform_heads and n_rew_heads > 1:
                meta_turn_rets = np.array(meta_turn_rets)
                if all(errms.count < 1 for errms in extr_ret_rms):
                    for errms in extr_ret_rms:
                        errms.mean = meta_turn_rets.mean()
                extr_ret_rms[model.curr_pol_heads[0]].update(meta_turn_rets)
                for i in range(config.metapol_updates):
                    model.update_heads_onpol(meta_turn_rets, extr_ret_rms, logger=logger)
            pol_heads = model.sample_pol_heads(uniform=config.uniform_heads)
            model.set_pol_heads(pol_heads)
            eps_this_turn = 0
            meta_turn_rets = []
            active_envs = np.ones(config.n_rollout_threads)

        if any(need_reset):  # reset returns state and obs for all envs, so make sure we're only looking at active
            state = apply_to_all_elements(state, lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs, lambda x: x[active_envs.astype(bool)])

        if (len(replay_buffer) >= max(config.batch_size,
                                      config.steps_before_update) and
                (steps_since_update >= config.steps_per_update)):
            steps_since_update = 0
            print('Updating at time step %i' % t)
            model.prep_training(device='cuda' if config.use_gpu else 'cpu')

            for u_i in range(config.num_updates):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=config.use_gpu,
                                              state_inds=(config.intrinsic_reward == 1))

                if config.intrinsic_reward == 0:  # no intrinsic reward
                    intr_rews = None
                    state_inds = None
                else:
                    sample, state_inds = sample
                    novelties = get_count_based_novelties(
                        env, state_inds,
                        device='cuda' if config.use_gpu else 'cpu')
                    intr_rews = get_intrinsic_rewards(novelties, config, intr_rew_rms,
                                                      update_irrms=False,
                                                      device='cuda' if config.use_gpu else 'cpu')

                model.update_critic(sample, logger=logger, intr_rews=intr_rews)
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
            if len(recent_ep_extr_rews) > 10:
                logger.add_scalar('episode_rewards/extrinsic/mean',
                                  np.mean(recent_ep_extr_rews), t)
                logger.add_scalar('episode_lengths/mean',
                                  np.mean(recent_ep_lens), t)
                if config.intrinsic_reward == 1:
                    for i in range(n_intr_rew_types):
                        for j in range(config.num_agents):
                            logger.add_scalar('episode_rewards/intrinsic%i_agent%i/mean' % (i, j),
                                              np.mean(recent_ep_intr_rews[i][j]), t)
                for i in range(config.num_agents):
                    logger.add_scalar('agent%i/n_found_treasures' % i, np.mean(recent_found_treasures[i]), t)
                logger.add_scalar('total_n_found_treasures', sum(np.array(recent_found_treasures[i]) for i in range(config.num_agents)).mean(), t)

        if t % config.save_interval < config.n_rollout_threads:
            model.prep_training(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_%isteps.pt' % (t + 1)))
            model.save(run_dir / 'model.pt')

        t += active_envs.sum()
    model.prep_training(device='cpu')
    model.save(run_dir / 'model.pt')
    logger.close()
    env.close(force=(config.env_type == 'vizdoom'))
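
RunningMeanStd normalizes both the intrinsic rewards and the extrinsic returns fed to the meta-policy head selection. It is the standard running-statistics helper (the same parallel-variance update used in OpenAI Baselines), kept here as a self-contained sketch:

import numpy as np

class RunningMeanStdSketch:
    """Tracks a running mean and variance over incoming batches of values."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        # Chan et al. parallel-variance combination of the two sample variances
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count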
Code Example #15
File: main.py Project: yuanleirl/coordination-marl
def train(config, dir_manager=None, logger=None, pbar="default_pbar"):
    # A few safety checks

    check_training_args(config)

    # Creates a directory manager that encapsulates our directory-tree structure

    if dir_manager is None:
        dir_manager = DirectoryManager(agent_alg=config.agent_alg,
                                       env_name=config.env_name,
                                       desc=config.desc,
                                       seed=config.seed)
        dir_manager.create_directories()

    # Creates logger and prints config

    if logger is None:
        logger = create_logger('MASTER', config.log_level,
                               dir_manager.seed_dir / 'logger.out')
    logger.debug(config_to_str(config))

    # Creates a progress-bar

    if type(pbar) is str:
        if pbar == "default_pbar":
            pbar = tqdm()

    if pbar is not None:
        pbar.n = 0
        pbar.desc += f'{dir_manager.storage_dir.name}/{dir_manager.experiment_dir.name}/{dir_manager.seed_dir.name}'
        pbar.total = config.n_episodes

    # Encapsulates in a dict all user-defined params that concern the world (scenario.make_world())

    world_params = {}
    world_params['use_dense_rewards'] = config.use_dense_rewards
    if config.env_name == 'chase':
        if config.n_preys is not None: world_params['n_preys'] = config.n_preys
        if config.n_preds is not None: world_params['n_preds'] = config.n_preds
        if config.prey_variance is not None:
            world_params['prey_variance'] = config.prey_variance
        if config.individual_reward is not None:
            world_params['individual_reward'] = config.individual_reward

    elif config.env_name == 'gather':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents

    elif config.env_name == 'intersection':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents

    elif config.env_name == 'bounce':
        world_params['episode_length'] = config.episode_length
        if config.line_length is not None:
            world_params['line_length'] = config.line_length

    elif config.env_name == 'compromise':
        if config.line_length is not None:
            world_params['line_length'] = config.line_length
        if config.show_all_landmarks is not None:
            world_params['show_all_landmarks'] = config.show_all_landmarks

    elif config.env_name == 'imitation':
        if config.staged is not None: world_params['staged'] = config.staged
        if config.set_trap is not None:
            world_params['set_trap'] = config.set_trap

    elif config.env_name == 'intersection':
        if config.by_stander is not None:
            world_params['by_stander'] = config.by_stander

    elif config.env_name == 'spread':
        if config.n_agents is not None:
            world_params['n_agents'] = config.n_agents
        if config.shuffle_landmarks is not None:
            world_params['shuffle_landmarks'] = config.shuffle_landmarks
        if config.color_objects is not None:
            world_params['color_objects'] = config.color_objects
        if config.small_agents is not None:
            world_params['small_agents'] = config.small_agents

    save_dict_to_json(world_params,
                      str(dir_manager.seed_dir / 'world_params.json'))

    # Encapsulates in a dict all user-defined params that concern the environment (multiagent.environment.MultiAgentEnv)

    env_params = {}
    env_params['env_name'] = config.env_name
    if 'football' not in config.env_name:
        env_params['use_max_speed'] = config.use_max_speed

    save_dict_to_json(env_params,
                      str(dir_manager.seed_dir / 'env_params.json'))

    # Sets the random seeds (for reproducibility)

    set_seeds(config.seed)

    # Initializes environments

    # TODO: Check reproductibility and that different envs are seeded differently
    if '3v2football' == config.env_name:

        obs_rep = config.representation

        if config.feature_extractor == 'identity':
            assert obs_rep in ['simple115', 'simple37']
        elif config.feature_extractor == 'convNet':
            assert obs_rep == 'extracted'
        else:
            raise NotImplementedError(
                f"config.feature_extractor={config.feature_extractor} not recognized."
            )

        env = make_parallel_football_env(
            seed_dir=dir_manager.seed_dir,
            seed=config.seed,
            dump_freq=config.dump_freq,
            representation=obs_rep,
            render=False,
            n_rollout_threads=config.n_rollout_threads
        )  # no rendering during training
    else:
        env = make_parallel_particle_env(
            scenario_name=config.env_name,
            n_rollout_threads=config.n_rollout_threads,
            seed=config.seed,
            use_discrete_action=config.use_discrete_action,
            use_max_speed=config.use_max_speed,
            world_params=world_params)

    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # Initialize the algo

    algorithm = init_from_config(env, config, logger)

    # Creates recorders and stores basic info regarding agent types

    os.makedirs(dir_manager.recorders_dir, exist_ok=True)
    train_recorder = algorithm.create_train_recorder()
    train_recorder.tape['agent_colors'] = env.agent_colors

    if 'football' in config.env_name:

        if config.feature_extractor == "convNet":
            n_stack = 4
        elif config.feature_extractor == "identity":
            n_stack = 1
        else:
            raise NotImplementedError

        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=n_stack)
        replay_buffer = StackingReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ],
            n_stack=n_stack)

    else:
        # defines observation buffer for multi-step
        obs_buffers = ObsBufferCollection(n_env=config.n_rollout_threads,
                                          n_stack=1)

        replay_buffer = ReplayBuffer(
            max_steps=config.buffer_length,
            num_agents=algorithm.nagents,
            obs_dims=[obsp.shape for obsp in env.observation_space],
            ac_dims=[
                acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                for acsp in env.action_space
            ])

    # Saves initial models

    current_model = "model_ep0.pt"

    best_eval_reward_exploit = -100000.
    best_model_exploit = "model_ep0_exploit_best.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    algorithm.save(dir_manager.seed_dir / best_model_exploit)

    best_eval_reward_explore = -100000.
    best_model_explore = "model_ep0_explore_best.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    algorithm.save(dir_manager.seed_dir / best_model_explore)

    # Initializes step and episode counters

    step_i = 0
    ep_steps = np.zeros(shape=(config.n_rollout_threads,), dtype=int)  # np.int was removed in NumPy >= 1.24
    ep_dones = 0
    ep_recorders = [
        EpisodeRecorder(stuff_to_record=['reward'])
        for _ in range(config.n_rollout_threads)
    ]
    obs = env.reset()
    obs_buffers.fill(obs)

    algorithm.set_exploration(
        begin_decay_proportion=config.begin_exploration_decay,
        n_episodes=config.n_episodes,
        end_decay_proportion=config.end_exploration_decay,
        initial_scale=config.init_noise_scale,
        final_scale=config.final_noise_scale,
        current_episode=ep_dones)

    # EPISODES LOOP

    while ep_dones < config.n_episodes:

        start_time = time.time()

        # ENVIRONMENT STEP

        # convert observations to torch Variable

        torch_obs = [
            Variable(torch.Tensor(obs_buffers.read()[:, i]),
                     requires_grad=False) for i in range(algorithm.nagents)
        ]

        # get actions as torch Variables

        torch_agent_actions = algorithm.select_action(torch_obs,
                                                      is_exploring=True)

        # convert actions to numpy arrays

        agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

        # rearrange actions to be per environment

        actions = [[ac[i] for ac in agent_actions]
                   for i in range(config.n_rollout_threads)]

        # makes one step in the environment

        next_obs, rewards, dones, infos = env.step(actions)

        # put transitions in the memory buffer

        replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)

        # saves relevant info in episode recorders

        for i in range(config.n_rollout_threads):
            ep_recorders[i].add_step(obs[i], actions[i], rewards[i],
                                     next_obs[i])

        # ending step

        obs = next_obs
        obs_buffers.append(obs)

        step_i += config.n_rollout_threads
        step_time = time.time() - start_time

        ep_steps += 1

        # LEARNING STEP

        if (len(replay_buffer) >= config.batch_size * config.warmup) \
                and (step_i % config.steps_per_update) < config.n_rollout_threads:

            # Prepares models to training

            if config.use_cuda:
                algorithm.prep_training(device='gpu')
            else:
                algorithm.prep_training(device='cpu')

            # Performs one algorithm update

            sample = replay_buffer.sample(config.batch_size,
                                          to_gpu=config.use_cuda,
                                          normalize_rewards=False)
            algorithm.update(sample, train_recorder)

            # Update target networks

            algorithm.update_all_targets()

            # Prepares models to go back in rollout phase

            algorithm.prep_rollouts(device='cpu')

        # EPISODE ENDINGS

        episodes_over = dones | (ep_steps >= config.episode_length)

        if any(episodes_over):

            if pbar is not None:
                pbar.update(sum(episodes_over))

            for env_i, is_over in enumerate(episodes_over):
                if is_over:
                    ep_dones += 1
                    ep_steps[env_i] = 0

                    # Reset environments

                    obs[env_i] = env.reset(env_i=env_i)

                    obs_buffers[env_i].flush()
                    obs_buffers[env_i].fill(obs[env_i])

                    # Summarizes episode metrics

                    train_recorder.append(
                        'total_reward', ep_recorders[env_i].get_total_reward())

                    # Reinitialise episode recorder

                    ep_recorders[env_i] = EpisodeRecorder(
                        stuff_to_record=['reward'])

                    # Printing if one third of training is completed

                    if (ep_dones - 1) % (config.n_episodes // 3) == 0 \
                            and ep_dones != config.n_episodes:
                        step_time = time.time() - start_time
                        logger.info(
                            f"Episode {ep_dones}/{config.n_episodes}, "
                            f"speed={round_to_two(float(config.n_rollout_threads) / step_time)}steps/s"
                        )

            # Sets exploration noise

            current_noise_scale = algorithm.set_exploration(
                begin_decay_proportion=config.begin_exploration_decay,
                n_episodes=config.n_episodes,
                end_decay_proportion=config.end_exploration_decay,
                initial_scale=config.init_noise_scale,
                final_scale=config.final_noise_scale,
                current_episode=ep_dones)

            # BOOK-KEEPING

            if ep_dones % config.episodes_per_save < config.n_rollout_threads:

                # Model checkpoints

                if config.save_incrementals:
                    os.makedirs(dir_manager.incrementals_dir, exist_ok=True)
                    algorithm.save(dir_manager.incrementals_dir /
                                   ('model_ep%i.pt' % (ep_dones + 1)))
                os.remove(dir_manager.seed_dir / current_model)
                current_model = f"model_ep{ep_dones}.pt"
                algorithm.save(dir_manager.seed_dir / current_model)
                logger.debug('Saving model checkpoint')

                # Current model evaluation (run episodes without exploration)

                if config.n_evaluation_episodes > 0:
                    logger.debug(
                        f'Evaluating model for {config.n_evaluation_episodes} episodes'
                    )
                    set_seeds(
                        config.evaluation_seed)  # fixed seed for evaluation
                    env.seed(config.evaluation_seed)

                    eval_config = get_evaluation_args(overwritten_args="")
                    eval_config.storage_name = dir_manager.storage_dir.name
                    eval_config.experiment_num = int(
                        dir_manager.experiment_dir.stem.strip('experiment'))
                    eval_config.seed_num = int(
                        dir_manager.seed_dir.stem.strip('seed'))
                    eval_config.render = False
                    eval_config.n_episodes = config.n_evaluation_episodes
                    eval_config.last_model = True
                    eval_config.noise_scale = None
                    eval_config.episode_length = config.episode_length
                    eval_config.representation = config.representation

                    # Evaluate with exploit (without exploration)
                    eval_reward_exploit = np.vstack(evaluate(eval_config))

                    train_recorder.append('eval_episodes', ep_dones)
                    train_recorder.append('eval_total_reward_exploit',
                                          eval_reward_exploit)
                    if eval_reward_exploit.mean() > best_eval_reward_exploit:
                        logger.debug("New best exploit model")
                        os.remove(dir_manager.seed_dir / best_model_exploit)
                        best_model_exploit = f"model_ep{ep_dones}_exploit_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_exploit)
                        best_eval_reward_exploit = eval_reward_exploit.mean()

                    # Evaluate with exploration
                    eval_config.noise_scale = current_noise_scale

                    eval_reward_explore = np.vstack(evaluate(eval_config))

                    train_recorder.append('eval_total_reward_explore',
                                          eval_reward_explore)
                    if eval_reward_explore.mean() > best_eval_reward_explore:
                        logger.debug("New best explore model")
                        os.remove(dir_manager.seed_dir / best_model_explore)
                        best_model_explore = f"model_ep{ep_dones}_explore_best.pt"
                        algorithm.save(dir_manager.seed_dir /
                                       best_model_explore)
                        best_eval_reward_explore = eval_reward_explore.mean()

                set_seeds(config.seed + ep_dones)
                env.seed(config.seed + ep_dones)

                # Graphs checkpoints

                logger.debug('Saving recorder checkpoints and graphs')
                train_recorder.save(dir_manager.recorders_dir /
                                    'train_recorder.pkl')

                # Saving graphs

                if len(train_recorder.tape['actor_loss']) > 0:
                    algorithm.save_training_graphs(
                        train_recorder=train_recorder,
                        save_dir=dir_manager.seed_dir)

    # Save the model one last time and close the environment

    os.remove(dir_manager.seed_dir / current_model)
    current_model = f"model_ep{ep_dones}.pt"
    algorithm.save(dir_manager.seed_dir / current_model)
    env.close()
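
The checkpoint logic above keeps disk usage flat by deleting the previous periodic checkpoint before writing a new one, while the best exploit/explore models are kept under separate names. A minimal sketch of that rotation pattern (the helper name rotate_checkpoint and the save_model/ckpt_dir arguments are placeholders for illustration, not names from the project):

import os
from pathlib import Path

def rotate_checkpoint(save_model, ckpt_dir, episode, previous_name=None):
    """Delete the previous periodic checkpoint and save a new one.

    Returns the new file name so the caller can pass it back on the next call.
    """
    ckpt_dir = Path(ckpt_dir)
    if previous_name is not None and (ckpt_dir / previous_name).exists():
        os.remove(ckpt_dir / previous_name)
    new_name = f"model_ep{episode}.pt"
    save_model(ckpt_dir / new_name)   # e.g. algorithm.save(path)
    return new_name
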
Code Example #16
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    print(
        '#########################################################################'
    )
    print('Adversary using: ', config.adversary_alg, 'Good agent using: ',
          config.agent_alg, '\n')
    print('Noisy SNR is: ', config.noisy_SNR)
    print(
        '#########################################################################'
    )
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')
        if ep_i % 5000 == 0:
            maddpg.lr *= 0.5
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()

                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are: \n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
                   config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # *** perform validation every 1000 episodes. i.e. run N=10 times without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assume only one env is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(0, config.n_exploration_eps -
                                          ep_i) / config.n_exploration_eps
                maddpg.scale_noise(
                    config.final_noise_scale +
                    (config.init_noise_scale - config.final_noise_scale) *
                    explr_pct_remaining)
                maddpg.reset_noise()

                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [
                        Variable(torch.Tensor(np.vstack(obs[:, i])),
                                 requires_grad=False)
                        for i in range(maddpg.nagents)
                    ]
                    # get actions as torch Variables
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    # convert actions to numpy arrays
                    agent_actions = [
                        ac.data.numpy() for ac in torch_agent_actions
                    ]
                    # rearrange actions to be per environment
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)

                    info_for_one_env_among_timesteps.append(infos[0]['n'])

                    curr_episode_stats.append(infos[0]['n'])

                    obs = next_obs
                episodes_stats.append(curr_episode_stats)

            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                # avg_collisions = sum(map(sum,info_for_one_env_among_timesteps))/config.run_n_eps_in_validation
                episodes_stats = np.array(episodes_stats)
                # print(episodes_stats.shape)
                # validation logging
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(
                    episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')

            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)

        # *** END of VALIDATION ***

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
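
Most of the training scripts collected here pick their output folder by scanning ./models/<env_id>/<model_name> for existing runN directories and incrementing the largest index. The same logic, isolated into a small sketch (the helper name next_run_dir is made up for the example):

from pathlib import Path

def next_run_dir(model_dir):
    """Return model_dir / 'run<N+1>', where N is the highest existing run index."""
    model_dir = Path(model_dir)
    existing = ([int(p.name.split('run')[1]) for p in model_dir.iterdir()
                 if p.name.startswith('run')]
                if model_dir.exists() else [])
    run_num = max(existing) + 1 if existing else 1
    return model_dir / ('run%i' % run_num)

# usage (hypothetical): run_dir = next_run_dir(Path('./models') / 'simple_tag' / 'maddpg')
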
Code Example #17
File: main_test.py Project: RakshithaArun/MAAC
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # Model used to test with adversarial agent 
    # model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")

    # Model used to test without adversarial agent 
    model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")

    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    row_list = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print (rewards)
            # print (dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    #print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            if (dones[0][0]):
                print("Breakin the epsiodeeeee at timestep", et_i)
                break
        
        et_i += 1   

        row_list.append((ep_i+1,et_i))   

        ep_rews = replay_buffer.get_average_rewards(
            et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
         writer = csv.writer(file)
         writer.writerow(["Ep No", "Number of Timesteps"])
         for row in row_list:
            writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
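
A layout conversion repeated in every rollout loop in these examples: the vectorized env returns observations indexed (thread, agent), the policies want one batch per agent, and env.step wants one action list per thread. A stand-alone illustration with dummy data (the sizes below are arbitrary choices for the example):

import numpy as np

n_threads, n_agents, obs_dim, act_dim = 2, 3, 4, 5
# per-environment layout, as returned by a vectorized env reset/step
obs = np.random.randn(n_threads, n_agents, obs_dim)

# per-agent layout: one (n_threads, obs_dim) batch per agent, fed to each policy
per_agent_obs = [np.vstack(obs[:, i]) for i in range(n_agents)]

# suppose each policy returns an (n_threads, act_dim) action batch
agent_actions = [np.random.randn(n_threads, act_dim) for _ in range(n_agents)]

# back to per-environment layout: one list of n_agents actions per thread
actions = [[ac[i] for ac in agent_actions] for i in range(n_threads)]
assert len(actions) == n_threads and len(actions[0]) == n_agents
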
Code Example #18
File: main.py Project: laukikm/maddpg-pytorch
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    #logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)

    if config.env_id == 'simple_reference':
        agent_init_params = []
        for i in range(2):
            agent_init_params.append({'num_in_pol': num_in_pol,
                                          'num_out_pol': num_out_pol,
                                          'num_in_critic': num_in_critic})
            
            init_dict = {'gamma': gamma, 'tau': tau, 'lr': lr,
                         'hidden_dim': hidden_dim,
                         'alg_types': alg_types,
                         'agent_init_params': agent_init_params,
                         'discrete_action': discrete_action}

    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)

    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    episode_average_rewards=[]
    hundred_episode_average_rewards=[]

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):

        if (ep_i%100==0 and ep_i>0):
            hundred_episode_average_rewards.append(np.mean(episode_average_rewards))
            print('Rewards till',ep_i,'=',hundred_episode_average_rewards[-1])
            print('Agent Actions=',torch_agent_actions)
            episode_average_rewards=[]
        '''
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        '''
        obs = env.reset()

        rewards_for_this_episode=[]
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            rewards_for_this_episode.append(np.mean(rewards))

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i)#, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            
            if ep_i>10000:
                print('Goal Color=',torch_obs[0])
                print('Communication=',agent_actions[0])
            
                env.render()
                time.sleep(0.01)


        if ep_i>100000:
            import ipdb
            ipdb.set_trace()

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        
        episode_average_rewards.append(np.sum(rewards_for_this_episode))
        #for a_i, a_ep_rew in enumerate(ep_rews):
            #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    plt.plot(100*np.array(range(1,config.n_episodes//100)),hundred_episode_average_rewards)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for 100 episodes')
    plt.title('Speaker Discrete and Mover Continuous')
    plt.savefig('plot.png')
    plt.show()

    maddpg.save(run_dir / 'model.pt')
    env.close()
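
The exploration noise in the MADDPG scripts above decays linearly from init_noise_scale to final_noise_scale over the first n_exploration_eps episodes and then stays at the final value. The schedule on its own, as a small sketch:

def noise_scale(ep_i, init_noise_scale, final_noise_scale, n_exploration_eps):
    """Linear decay from init to final over n_exploration_eps episodes, constant afterwards."""
    explr_pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_noise_scale + (init_noise_scale - final_noise_scale) * explr_pct_remaining

# e.g. noise_scale(0, 0.3, 0.0, 25000) -> 0.3 and noise_scale(25000, 0.3, 0.0, 25000) -> 0.0
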
Code Example #19
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     run_num = 1
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         run_num = 1
    #     else:
    #         run_num = max(exst_run_nums) + 1
    run_num = 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir, exist_ok=True)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(env,
                                       tau=config.tau,
                                       pi_lr=config.pi_lr,
                                       q_lr=config.q_lr,
                                       gamma=config.gamma,
                                       pol_hidden_dim=config.pol_hidden_dim,
                                       critic_hidden_dim=config.critic_hidden_dim,
                                       attend_heads=config.attend_heads,
                                       reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config.episode_length, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
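
The update trigger (t % steps_per_update) < n_rollout_threads used above looks odd, but because t advances by n_rollout_threads per vectorized step it fires roughly once every steps_per_update environment transitions. A quick check of that behaviour with assumed numbers:

n_rollout_threads, steps_per_update = 4, 100
t, updates = 0, 0
for _ in range(1000):           # 1000 vectorized env steps = 4000 transitions
    t += n_rollout_threads
    if (t % steps_per_update) < n_rollout_threads:
        updates += 1
print(updates)                  # -> 40, i.e. one update block per 100 transitions
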
Code Example #20
File: main_walker.py Project: ShawnLue/MA_Control
def run(args, **args_dict):
    reward_flag, pos_flag = None, None
    save_data = {'reward': -1000., 'pos': 0.}
    # model_dir = Path('./models') / config.env_id / config.model_name
    # if not model_dir.exists():
    #     curr_run = 'run1'
    # else:
    #     exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
    #                      model_dir.iterdir() if
    #                      str(folder.name).startswith('run')]
    #     if len(exst_run_nums) == 0:
    #         curr_run = 'run1'
    #     else:
    #         curr_run = 'run%i' % (max(exst_run_nums) + 1)
    # run_dir = model_dir / curr_run
    # log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)

    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    if not args.use_cuda or not th.cuda.is_available():
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    else:
        FloatTensor = th.cuda.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        ttt = time.time()
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        if args.use_cuda and th.cuda.is_available():
            maddpg.prep_rollouts(device='gpu')
        else:
            maddpg.prep_rollouts(device='cpu')
        # maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale -
            args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()

        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))

        for et_i in range(args.max_steps):
            ttt = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            #
            # ttt2 = time.time()
            # print('1', ttt2 - ttt)
            #
            if (len(replay_buffer) >= args.batch_size
                    and (t % args.steps_per_update) < args.n_rollout_threads):
                ttt = time.time()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                # for u_i in range(args.n_rollout_threads):
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(args.batch_size,
                                                  to_gpu=args.use_cuda
                                                  and th.cuda.is_available(),
                                                  norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                if args.use_cuda and th.cuda.is_available():
                    maddpg.prep_rollouts(device='gpu')
                else:
                    maddpg.prep_rollouts(device='cpu')
                # maddpg.prep_rollouts(device='cpu')
                #
                # ttt2 = time.time()
                # print('2', ttt2 - ttt)
                #
        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % args.test_interval < args.n_rollout_threads:
            ttt = time.time()
            obs = env.reset()
            if args.use_cuda and th.cuda.is_available():
                maddpg.prep_rollouts(device='gpu')
            else:
                maddpg.prep_rollouts(device='cpu')
            # maddpg.prep_rollouts(device='cpu')
            with th.no_grad():
                pos_total = 0.
                finish_ep = np.zeros(args.n_rollout_threads)
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy() for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1
                                          and finish_ep[d_i] == 0.):
                            # if eval_i == args.max_steps - 1 and finish_ep[d_i] == 0.:
                            #     print(d_i)
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = [0., 0.]
                            finish_ep[d_i] += 1
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()

                # ttt2 = time.time()
                # print('3', ttt2 - ttt)
                #

                new_path = model_path + '/' + str(ep_i) + '.pt'
                has_saved = False
                if record_r.sum() > save_data['reward']:
                    save_data['reward'] = record_r.sum()
                    if save_data['reward'] > 0 and pos_total > 10.:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                if pos_total > save_data['pos']:
                    save_data['pos'] = pos_total
                    if record_r.sum(
                    ) > 0 and pos_total > 10. and not has_saved:
                        # pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                        maddpg.save(new_path)
                if pos_total > 17.0:
                    maddpg.save(new_path)

                if reward_flag is None:
                    reward_flag = vis.line(
                        X=np.arange(ep_i, ep_i + 1),
                        Y=np.array([np.append(record_r, record_r.sum())]),
                        opts=dict(ylabel='Test Reward',
                                  xlabel='Episode',
                                  title='Reward',
                                  legend=[
                                      'Agent-%d' % i
                                      for i in range(args.n_agents)
                                  ] + ['Total']))
                else:
                    vis.line(X=np.array(
                        [np.array(ep_i).repeat(args.n_agents + 1)]),
                             Y=np.array([np.append(record_r, record_r.sum())]),
                             win=reward_flag,
                             update='append')

                if pos_flag is None:
                    pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                        Y=np.array([pos_total]),
                                        opts=dict(ylabel='Length',
                                                  xlabel='Episode',
                                                  title='How far ?',
                                                  legend=['position']))
                else:
                    vis.line(X=np.array([ep_i]),
                             Y=np.array([pos_total]),
                             win=pos_flag,
                             update='append')
        # if ep_i % config.save_interval < config.n_rollout_threads:
        #     os.makedirs(run_dir / 'incremental', exist_ok=True)
        #     maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
        #     maddpg.save(run_dir / 'model.pt')

    # maddpg.save(run_dir / 'model.pt')
    env.close()
Code Example #21
File: main.py Project: xuezzee/-
def run(config):
    scores_window = deque(maxlen=100)

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # map width
        'board_height': 11,  # map height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 500,  # maximum number of steps
        'game_name': name,  # game name
        'K': 5,  # depot materials are refreshed every K rounds
        'map_path': 'env/map.txt',  # path of the initial map
        'cell_range': 6,  # value range of each dimension in a cell (tuple; a single int is converted to a tuple)
        'ob_board_width': None,  # grid width observed by each agent (tuple); None means same as the actual grid
        'ob_board_height': None,  # grid height observed by each agent (tuple); None means same as the actual grid
        'ob_cell_range': None,  # per-agent observed value range of each cell dimension (2-D tuple); None means same as the actual grid
    }

    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))

        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # print('step', et_i)
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)
            # print(maddpg.nagents)
            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack the per-thread observations vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: 前两个action
            joint_action = []

            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)

            #################################
            agents_action = actions[0]
            #################################

            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/completion_window', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var, ep_i)
        print(
            '\r Episode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f} \t '.
            format(ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Code Example #22
class DDPG(Chain):
    def __init__(self):
        super(DDPG, self).__init__(
            actor=Actor(),
            critic=Critic(),
        )
        self.target_actor = deepcopy(self.actor)
        self.target_critic = deepcopy(self.critic)
        disable_train(self.target_actor)
        disable_train(self.target_critic)

        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(A_DIM))
        self.buffer = ReplayBuffer(BUFFER_SIZE)
        self.time = 0

    def reset(self, s):
        self.prev_s = s
        self.noise.reset()

    def step(self, s, r, done, trainable):
        self.time += 1
        self.buffer.add(self.prev_s, self.prev_a, r, done, s, self.prev_noise)
        self.prev_s = s
        if trainable and self.time % TRAIN_INTERVAL == 0:
            if len(self.buffer) > NUM_WARMUP_STEP:
                return self._update()

    def get_action(self):
        S, = make_batch(self.prev_s)
        a = self.actor(S)[0]  # (A_DIM, )
        noise = self.noise().astype(np.float32)
        self.prev_a = a
        self.prev_noise = noise
        return (a + noise).data.reshape(-1)

    def _update(self):
        S, A, R, D, S2, N = self.buffer.sample_batch(
            BATCH_SIZE)  # (6, BATCH_SIZE)
        S = np.array(S, dtype=np.float32)  # (BATCH_SIZE, O_DIM)
        S2 = np.array(S2, dtype=np.float32)
        A = F.stack(A)  # (BATCH_SIZE, A_DIM)
        R = np.array(R, dtype=np.float32).reshape(-1, 1)
        N = np.array(N)

        # update critic
        A_ = self.target_actor(S2)
        Y = R + GAMMA * self.target_critic(S2, A_.data)
        Q_batch = self.critic(S, (A + N).data)
        critic_loss = F.mean_squared_error(Y.data, Q_batch)
        self.critic.update(critic_loss)

        # update actor
        A = self.actor(S)  # recompute actions so the actor-loss gradient flows through the actor parameters
        Q = self.critic(S, A)
        actor_loss = -F.sum(Q) / BATCH_SIZE
        #from chainer import computational_graph as c
        #g = c.build_computational_graph([actor_loss])
        #with open('graph_actorloss.dot', 'w') as o:
        #    o.write(g.dump())
        #exit()
        self.actor.update(actor_loss)

        # update target
        soft_copy_param(self.target_critic, self.critic, TAU)
        soft_copy_param(self.target_actor, self.actor, TAU)

        return actor_loss.data, critic_loss.data
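
The Chainer _update above follows the usual DDPG recipe: the critic regresses toward y = r + γ·Q'(s', μ'(s')), the actor is trained to maximize the critic's value of its own actions, and both target networks are moved by Polyak averaging (soft_copy_param with TAU). A framework-agnostic sketch of the Polyak step, using a plain parameter dictionary as a stand-in for real weight tensors:

def soft_update(target_params, source_params, tau):
    """In-place Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for name, src in source_params.items():
        target_params[name] = tau * src + (1.0 - tau) * target_params[name]

# usage with plain floats standing in for weight tensors:
tgt, src = {'w': 0.0}, {'w': 1.0}
soft_update(tgt, src, tau=0.01)
print(tgt['w'])   # 0.01
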
Code Example #23
File: main.py Project: yathartha3/DPP
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    ##################### INITIALIZE FROM SAVED? ###########################
    if init_from_saved:
        if model_path is not None:
            maddpg = MADDPG.init_from_save(model_path)
            print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # This is just to store the global rewards and not for updating the policies
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)
            '''
            Reward Shaping using D++, D.
            The rewards now contain global as well as shaped rewards
            Keep the global for logging, and use the shaped rewards for updates
            '''
            # Choose which reward to use
            use_dpp = True

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = [d_rewards]
            d_rewards = np.array(d_rewards)

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = [g_rewards]
            g_rewards = np.array(g_rewards)

            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #
            # Buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_replay_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs,
                                  dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # Take out global reward from g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)

        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
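
In the D++/D example above, env.step returns a (global, difference) reward pair per agent and the script splits them with explicit loops. A vectorized equivalent of that split, assuming the same (n_rollout_threads, n_agents, 2) nesting:

import numpy as np

# rewards as returned by this environment: one (global, difference) pair per agent
rewards = np.array([[[1.0, 0.2], [0.5, 0.1], [0.8, 0.3]]])  # (1 thread, 3 agents, 2)

g_rewards = rewards[..., :1]   # global rewards,     shape (1, 3, 1)
d_rewards = rewards[..., 1:]   # difference rewards, shape (1, 3, 1)

use_dpp = True
shaped = d_rewards if use_dpp else g_rewards
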
Code Example #24
File: main_gpu.py Project: leehe228/TIL
def run(config):
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # (** EDITED **) Set Replay Buffer
    # configure the buffer by iterating over the shapes of env.action_space and env.observation_space
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(model.nagents)],
                                 [19 for _ in range(model.nagents)])
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))

        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Reform Actions list to fit on Football Env
            # the Google Football environment takes integer action ids, not one-hot encoded action lists
            actions_list = [[np.argmax(b) for b in a] for a in actions]

            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)

            # Prevention of divergence
            # without this, training diverges (NaN)
            rewards = rewards - 0.000001

            # Reform Done Flag list
            # reshape the done list to match the replay buffer layout
            dones = (np.array([dones for _ in range(model.nagents)])).T

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            if (len(replay_buffer) >= config["batch_size"]
                    and (t % config["steps_per_update"]) <
                    config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)

        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
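
The Google Research Football wrapper in the previous example expects integer action ids rather than one-hot vectors, hence the argmax conversion before env.step. The conversion on its own, with made-up one-hot actions:

import numpy as np

n_threads, n_agents, n_actions = 2, 3, 19
actions = [[np.eye(n_actions)[np.random.randint(n_actions)]
            for _ in range(n_agents)] for _ in range(n_threads)]

actions_list = [[int(np.argmax(a)) for a in per_env] for per_env in actions]
print(actions_list)   # e.g. [[4, 17, 0], [12, 3, 8]]
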
Code Example #25
File: __fortrail.py Project: Joshua-Ren/maddpg-again
torch.manual_seed(1024)
np.random.seed(1024)

env = make_parallel_env(env_id, n_rollout_threads, 1024, True)
maddpg = MADDPG.init_from_env(env,
                              agent_alg='MADDPG',
                              adversary_alg='MADDPG',
                              tau=0.01,
                              lr=0.01,
                              hidden_dim=64,
                              est_ac=True,
                              game_id='simple_speaker_listener')

replay_buffer = ReplayBuffer(
    buffer_length, maddpg.nagents,
    [obsp.shape[0] for obsp in env.observation_space], [
        acsp.shape[0] if isinstance(acsp, Box) else acsp.n
        for acsp in env.action_space
    ])

t = 0
#for ep_i in range(0, n_episodes, n_rollout_threads):
for ep_i in range(0, 10, 1):
    #print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + n_rollout_threads, n_episodes))
    obs = env.reset()
    maddpg.prep_rollouts(device='cpu')

    explr_pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    maddpg.scale_noise(final_noise_scale +
                       (init_noise_scale - final_noise_scale) *
                       explr_pct_remaining)
    maddpg.reset_noise()
Code Example #26
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])

    agent_init_params = {
        'num_in_pol': env.observation_space[0].shape[0],
        'num_out_pol': num_out_pol,
        'num_vars': len(env.agent_types)
    }
    maddpg = MADDPG(agent_init_params,
                    nagents=len(env.agent_types),
                    tau=config.tau,
                    lr=config.lr,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ], config.hidden_dim * (maddpg.nagents - 1))
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        rnn_hidden = (torch.zeros(
            1,
            config.n_rollout_threads * (maddpg.nagents) * (maddpg.nagents - 1),
            config.hidden_dim),
                      torch.zeros(
                          1,
                          config.n_rollout_threads * (maddpg.nagents) *
                          (maddpg.nagents - 1), config.hidden_dim))

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions, new_rnn_hidden = maddpg.step(torch_obs,
                                                              rnn_hidden,
                                                              explore=True)
            hid_to_store = (rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents,
                -1), rnn_hidden[1].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1))
            next_hid_to_store = (new_rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents,
                -1), new_rnn_hidden[1].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1))

            # convert actions to numpy arrays
            agent_actions = [
                ac.cpu().data.numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, hid_to_store, agent_actions, rewards,
                               next_obs, next_hid_to_store, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=USE_CUDA)
                maddpg.update(sample, ep_i)
                maddpg.update_all_targets()
            rnn_hidden = new_rnn_hidden
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            print("Episode %i, reward for %i is " % (ep_i + 1, a_i), a_ep_rew)

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
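The recurrent variant above threads an LSTM state of shape (1, n_rollout_threads * nagents * (nagents - 1), hidden_dim) through maddpg.step and flattens it to one row per (thread, agent) before pushing it to the buffer, which is why the buffer was created with an extra slot of size hidden_dim * (nagents - 1) per agent. A small sketch of that bookkeeping with hypothetical sizes:

import torch

# Hypothetical sizes, only to illustrate the hidden-state reshaping used above.
n_rollout_threads, nagents, hidden_dim = 4, 3, 64

# One hidden vector per (agent, other-agent) slot in every rollout thread:
h = torch.zeros(1, n_rollout_threads * nagents * (nagents - 1), hidden_dim)

# Stored layout: one row per (thread, agent), with the (nagents - 1) slots
# concatenated along the feature axis.
h_to_store = h.detach().contiguous().view(n_rollout_threads, nagents, -1)
print(h_to_store.shape)  # torch.Size([4, 3, 128]); 128 == hidden_dim * (nagents - 1)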
コード例 #27
0
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None

    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)

        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    # 	DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []

    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)
            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # recover the discrete action index from each agent's sampled one-hot action
            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []

            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                # shape each agent's reward with its own observation before storing it
                all_rewards[i] += augment_reward(agent_obs[i])
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            # update only once the buffer holds at least one full batch
            if steps % num_steps == 0 and len(replay_buffer) >= batch_size:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                #print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
コード例 #28
0
def run(config):
    model_dir = Path('./models') / config.env_name / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # number of CPU threads used for training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)

    # parallel environment worker processes for rollout sampling

    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    #'''
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    #'''
    #maddpg = MADDPG.init_from_save(model_dir/'run1'/'model.pt')

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    t = 0
    a_loss = []
    c_loss = []
    rewss = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))

        obs = env.reset()

        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')  # show for the first time

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        #if config.display:
        #    for env_show in env.envs:
        #        env_show.render('human', close=False)

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            #actions = [np.array([i.tolist().index(1.0) for i in action]) for action in actions_one_hot]

            # scale the second action component of each agent to radians
            for i in actions:
                for j in i:
                    j[1] *= np.pi
            #print(actions[0])

            next_obs, rewards, dones, infos = env.step(actions)

            #print(len(agent_actions),len(next_obs))
            #if config.display:
            #    for env_show in env.envs:
            #        env_show.render('human', close=False)

            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                #print(t)
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample,
                                      a_i,
                                      logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)
            # print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))
    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
コード例 #29
0
ファイル: transport_main.py プロジェクト: xuezzee/-
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed,
                            config.discrete_action)
    maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
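All of these training loops gate updates with (t % config.steps_per_update) < config.n_rollout_threads rather than an exact == 0 test. Because t advances in jumps of n_rollout_threads per vectorized step, an == 0 test would only fire every lcm(n_rollout_threads, steps_per_update) steps, while the < n_rollout_threads comparison fires once per window. A minimal sketch with hypothetical values:

# Hypothetical values chosen so that steps_per_update is not a multiple of the thread count.
n_rollout_threads = 12
steps_per_update = 100

t = 0
update_points = []
for _ in range(50):               # 50 vectorized environment steps
    t += n_rollout_threads        # t jumps by n_rollout_threads each step
    if (t % steps_per_update) < n_rollout_threads:
        update_points.append(t)   # triggers roughly once every steps_per_update env steps

print(update_points)  # [108, 204, 300, 408, 504, 600]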
コード例 #30
0
def run(config):
    cover_ratio = []

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    # os.makedirs(log_dir)
    # logger = SummaryWriter(str(log_dir))

    #    torch.manual_seed(run_num)
    #    np.random.seed(run_num)
    #env = make_parallel_env(, config.n_rollout_threads, run_num)
    env = make_env(config.env_id,
                   benchmark=BENCHMARK,
                   discrete_action=True,
                   use_handcraft_policy=config.use_handcraft_policy)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)

    model.init_from_save_self('./models/swift_scenario/model/run8/model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    update_count = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [
                ac.data.numpy().squeeze() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # agent_actions[0][5]=1
            # agent_actions[1][5]=1
            # agent_actions[2][5]=1
            next_obs, rewards, dones, infos = env.step(
                agent_actions,
                use_handcraft_policy=config.use_handcraft_policy)
            env.render()
            time.sleep(0.1)

            # # # get actions as torch Variables
            # torch_agent_actions = model.step(torch_obs, explore=True)
            # # convert actions to numpy arrays
            # agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # # rearrange actions to be per environment
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            # next_obs, rewards, dones, infos = env.step(actions)
            # env.render()

            #if et_i == config.episode_length - 1:
            #print(infos)
            #print(type(infos['cover_ratio']))
            #cover_ratio.append(float(infos[0]['n'][0]['cover_ratio']))
            #print(infos)

            #            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            '''
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):

                    update_count += 1
                    print("episode:", ep_i, ", total steps:", t, " update_count:", update_count)

                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

        logger.export_scalars_to_json(str(log_dir / 'summary.json'))

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print(cover_ratio)
    '''
    env.close()
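Every ReplayBuffer in these listings derives its per-agent dimensions from the environment's gym spaces with the same idiom, acsp.shape[0] if isinstance(acsp, Box) else acsp.n, so continuous actions use their vector length and discrete actions use a one-hot of size n. A short sketch of that idiom in isolation (the example spaces are hypothetical):

from gym.spaces import Box, Discrete

def space_dim(space):
    """Flat dimension used for buffer allocation: vector length for Box, n for a one-hot Discrete."""
    return space.shape[0] if isinstance(space, Box) else space.n

# Hypothetical two-agent setup: one continuous-action agent, one discrete-action agent.
observation_space = [Box(low=-1.0, high=1.0, shape=(8,)), Box(low=-1.0, high=1.0, shape=(10,))]
action_space = [Box(low=-1.0, high=1.0, shape=(2,)), Discrete(5)]

print([space_dim(s) for s in observation_space])  # [8, 10]
print([space_dim(s) for s in action_space])       # [2, 5]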