Example #1
def mp_explore(args, pipe2_exp, worker_id):
    args.init_before_training(if_main=False)
    '''basic arguments'''
    env = args.env
    agent = args.agent
    rollout_num = args.rollout_num
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    gamma = args.gamma
    reward_scale = args.reward_scale

    random_seed = args.random_seed
    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)
    del args  # everything needed from args has been unpacked into the locals above
    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    '''init: Agent, ReplayBuffer'''
    agent.init(net_dim, state_dim, action_dim)
    agent.state = env.reset()

    if_on_policy = getattr(agent, 'if_on_policy', False)
    buffer = ReplayBuffer(max_len=max_memo // rollout_num + max_step,
                          if_on_policy=if_on_policy,
                          state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim,
                          if_gpu=False)
    '''start exploring'''
    exp_step = target_step // rollout_num
    with torch.no_grad():
        if not if_on_policy:
            explore_before_training(env, buffer, exp_step, reward_scale, gamma)

            buffer.update_now_len_before_sample()

            pipe2_exp.send((buffer.buf_state[:buffer.now_len],
                            buffer.buf_other[:buffer.now_len]))
            # buf_state, buf_other = pipe1_exp.recv()

            buffer.empty_buffer_before_explore()

        while True:
            agent.explore_env(env, buffer, exp_step, reward_scale, gamma)

            buffer.update_now_len_before_sample()

            pipe2_exp.send((buffer.buf_state[:buffer.now_len],
                            buffer.buf_other[:buffer.now_len]))
            # buf_state, buf_other = pipe1_exp.recv()

            buffer.empty_buffer_before_explore()

            # pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()
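Below is a minimal sketch, not part of the original snippet, of how a worker like mp_explore could be launched from the learner side; the pipe pairing and process wiring are assumptions inferred from the arguments the function reads (rollout_num, pipe2_exp) and the commented-out pipe1_exp calls.

import multiprocessing as mp

def launch_explore_workers(args):
    # one (learner end, explorer end) pipe pair and one process per rollout worker
    learner_pipes = []
    workers = []
    for worker_id in range(args.rollout_num):
        pipe1_exp, pipe2_exp = mp.Pipe()  # pipe1_exp stays with the learner
        proc = mp.Process(target=mp_explore, args=(args, pipe2_exp, worker_id))
        proc.start()
        learner_pipes.append(pipe1_exp)
        workers.append(proc)
    return learner_pipes, workers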
Example #2
    def __init__(self, num_agents, state_size, action_size):

        self.agents = []

        for i in range(num_agents):
            self.agents.append(Agent(state_size, action_size))

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
Example #3
    def __init__(self, num_agents, state_size, action_size, random_seed):
        self.num_agents = num_agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.agents = [
            Agent(state_size, action_size, self.memory, BATCH_SIZE,
                  random_seed) for agent_posit in range(num_agents)
        ]
Example #4
def mp_explore_in_env(args, pipe2_exp, worker_id):
    env = args.env
    reward_scale = args.reward_scale
    gamma = args.gamma
    random_seed = args.random_seed

    agent_rl = args.agent_rl
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    rollout_num = args.rollout_num
    del args

    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)
    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step
    '''build agent'''
    agent = agent_rl(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    # agent.device = torch.device('cpu')  # a CPU actor with a CPU env is slightly faster than a GPU actor, at the cost of higher CPU utilization
    '''build replay buffer, init: total_step, reward_avg'''
    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO', 'AgentInterPPO'}
    buffer = ReplayBuffer(max_memo // rollout_num + max_step,
                          state_dim,
                          if_on_policy=if_on_policy,
                          action_dim=1 if if_discrete else
                          action_dim)  # build experience replay buffer

    exp_step = target_step // rollout_num
    with torch.no_grad():
        while True:
            # pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()

            agent.update_buffer(env, buffer, exp_step, reward_scale, gamma)

            buffer.update__now_len__before_sample()
            pipe2_exp.send((buffer.buf_state[:buffer.now_len],
                            buffer.buf_other[:buffer.now_len]))
Example #5
    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.
        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.agents = [
            ddpg(state_size, action_size, i + 1, random_seed=0)
            for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   BUFFER_SIZE,
                                   BATCH_SIZE,
                                   seed=0)
Example #6
class MultiAgent():
    def __init__(self, num_agents, state_size, action_size):

        self.agents = []

        for i in range(num_agents):
            self.agents.append(Agent(state_size, action_size))

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

    def step(self, states, actions, rewards, next_states, done):

        for i in range(len(states)):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            done)

        if len(self.memory) > BATCH_SIZE:
            experiences, indexes = self.memory.sample()

            for agent in self.agents:
                error = agent.learn(experiences, GAMMA)

                #update priority replay memory
                self.memory.update(indexes, abs(error))

    def act(self, states, add_noise=True, noise_weight=1.0):

        actions = []

        for i in range(len(self.agents)):
            actions.append(self.agents[i].act(states[i], add_noise,
                                              noise_weight))

        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()
Example #7
    def __init__(self, action_size, state_size, shared_replay_buffer,
                 num_agents):
        self.shared_replay_buffer = shared_replay_buffer
        memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE,
                                         SEED, DEVICE)

        memory = None
        if shared_replay_buffer:
            self.memory = memory_fn()
            memory = self.memory

        self.ddpg_agents = [
            DDPGAgent(action_size, state_size, shared_replay_buffer, memory)
            for _ in range(num_agents)
        ]
        self.t_step = 0
Example #8
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 random_seed=0,
                 discount_factor=0.95,
                 tau=0.02):
        super(MADDPG, self).__init__()

        #         self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.memories = [
            ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
            for _ in range(num_agents)
        ]
        self.agents = [
            Agent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #9
def train_and_evaluate(args):
    args.init_before_training()
    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # passed to Evaluator as agent_id
    env_eval = args.env_eval
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    gamma = args.gamma
    reward_scale = args.reward_scale
    '''evaluating arguments'''
    show_gap = args.show_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # everything needed from args has been unpacked into the locals above
    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step,
                          if_on_policy=if_on_policy,
                          if_gpu=True,
                          state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim)

    evaluator = Evaluator(cwd=cwd,
                          agent_id=gpu_id,
                          device=agent.device,
                          env=env_eval,
                          eval_times1=eval_times1,
                          eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator
    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step,
                                            reward_scale, gamma)

        agent.update_net(buffer, target_step, batch_size,
                         repeat_times)  # pre-training and hard update
        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps
    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal) or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.explore_env(env, buffer, target_step, reward_scale,
                                      gamma)

        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size,
                                        repeat_times)

        with torch.no_grad():  # speed up running
            if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a,
                                                    obj_c)
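For reference, here is a minimal sketch, not from the source, of the attributes that train_and_evaluate() above reads from args; a real run would pass the project's own Arguments object, and env and agent stand for a pre-built environment wrapper and agent instance.

from types import SimpleNamespace

demo_args = SimpleNamespace(
    init_before_training=lambda: None,  # stand-in for the real method
    cwd='./train_demo', env=env, agent=agent,  # pre-built env wrapper and agent
    gpu_id=0, env_eval=None,
    net_dim=256, max_memo=2 ** 17, break_step=2 ** 20, batch_size=256,
    target_step=1024, repeat_times=1, if_allow_break=True,
    gamma=0.99, reward_scale=1.0,
    show_gap=2 ** 8, eval_times1=2, eval_times2=4,
)
train_and_evaluate(demo_args)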
Example #10
def train_and_evaluate(args):
    args.init_before_training()

    cwd = args.cwd
    env = args.env
    env_eval = args.env_eval
    agent_id = args.gpu_id
    agent_rl = args.agent_rl  # basic arguments

    gamma = args.gamma  # training arguments
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    reward_scale = args.reward_scale
    if_per = args.if_per

    show_gap = args.show_gap  # evaluate arguments
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    break_step = args.break_step
    if_break_early = args.if_break_early
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # everything needed from args has been unpacked into the locals above
    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step
    '''init: Agent, Evaluator, ReplayBuffer'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # build AgentRL
    agent.state = env.reset()
    evaluator = Evaluator(cwd=cwd,
                          agent_id=agent_id,
                          device=agent.device,
                          env=env_eval,
                          eval_times1=eval_times1,
                          eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator

    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO'}
    buffer = ReplayBuffer(max_memo + max_step,
                          state_dim,
                          if_on_policy=if_on_policy,
                          if_per=if_per,
                          action_dim=1 if if_discrete else
                          action_dim)  # build experience replay buffer
    if if_on_policy:
        steps = 0
    else:
        with torch.no_grad():  # update replay buffer
            steps = _explore_before_train(env, buffer, target_step,
                                          reward_scale, gamma)

        agent.update_net(buffer, target_step, batch_size,
                         repeat_times)  # pre-training and hard update
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())
    total_step = steps

    if_solve = False
    while not ((if_break_early and if_solve) or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.update_buffer(env, buffer, target_step, reward_scale,
                                        gamma)

        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size,
                                        repeat_times)

        with torch.no_grad():  # speed up running
            if_solve = evaluator.evaluate_act__save_checkpoint(
                agent.act, steps, obj_a, obj_c)
Example #11
    args = parse_arguments()

    # Setup RNG
    if args.seed:
        print('torch RNG using seed:', args.seed)
        torch.manual_seed(args.seed)

    # Setup Environment
    env = UnityEnvironment(file_name="Banana.app", no_graphics=args.no_render)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Setup Agent and Experience Replay Buffer
    state_size = brain.vector_observation_space_size
    action_size = brain.vector_action_space_size
    memory = ReplayBuffer(action_size, args.memory, args.batch_size)
    agent = BananAgent(state_size,
                       action_size,
                       memory=memory,
                       checkpoint_filename=args.checkpoint if args.checkpoint
                       and os.path.exists(args.checkpoint) else None)
    print('Number of actions:', action_size)
    print('State Features: ', state_size)

    # Retrieve hyperparameters
    epsilon = args.eps
    epsilon_decay = args.eps_decay
    epsilon_mininum = args.eps_min
    evaluate = args.evaluate
    if evaluate:
        print('Running in evaluation mode!')
Example #12
def train_and_evaluate(args):
    args.init_before_training()
    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # passed to Evaluator as agent_id
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    if_per = args.if_per
    gamma = args.gamma
    reward_scale = args.reward_scale
    '''evaluating arguments'''
    eval_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if args.env_eval is not None:
        env_eval = args.env_eval
    elif env.env_name in set(gym.envs.registry.env_specs.keys()):
        env_eval = PreprocessEnv(gym.make(env.env_name))
    else:
        env_eval = deepcopy(env)

    del args  # everything needed from args has been unpacked into the locals above
    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, if_per)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step,
                          state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim,
                          if_on_policy=if_on_policy,
                          if_per=if_per,
                          if_gpu=True)

    evaluator = Evaluator(
        cwd=cwd,
        agent_id=gpu_id,
        device=agent.device,
        env=env_eval,
        eval_gap=eval_gap,
        eval_times1=eval_times1,
        eval_times2=eval_times2,
    )
    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step,
                                            reward_scale, gamma)

        agent.update_net(buffer, target_step, batch_size,
                         repeat_times)  # pre-training and hard update
        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps
    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal) or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        steps = agent.explore_env(env, buffer, target_step, reward_scale,
                                  gamma)
        total_step += steps

        train_record = agent.update_net(buffer, target_step, batch_size,
                                        repeat_times)

        if_reach_goal = evaluator.evaluate_save(agent.act, steps, train_record)
        evaluator.draw_plot()

    print(
        f'| SavedDir: {cwd}\n| UsedTime: {time.time() - evaluator.start_time:.0f}'
    )
Example #13
class maddpg:
    """Wrapper class managing different agents in the environment."""
    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.
        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.agents = [
            ddpg(state_size, action_size, i + 1, random_seed=0)
            for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   BUFFER_SIZE,
                                   BATCH_SIZE,
                                   seed=0)

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False):
        """Picks an action for each agent given."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)

        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)

    def learn(self, experiences, agent_number):
        """Update one agent. Each critic takes the combined observations and
        actions of all agents as input, so collect the actions predicted by
        every agent for these experiences before delegating to that agent."""
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences

        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)

            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1,
                                                  agent_id_tensor).squeeze(1)

            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)

    def save_weights(self, save_dir):
        """Save each agent's actor and critic weights to `save_dir`."""
        for i in range(self.num_agents):
            torch.save(self.agents[i].actor_local.state_dict(),
                       os.path.join(save_dir, 'checkpoint_actor_{}.pth'.format(i)))
            torch.save(self.agents[i].critic_local.state_dict(),
                       os.path.join(save_dir, 'checkpoint_critic_{}.pth'.format(i)))
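A minimal usage sketch of the maddpg wrapper above; env is a hypothetical two-agent environment handle whose reset() and step() return per-agent observations, rewards, and done flags in the shapes the wrapper expects, and the episode/step limits are arbitrary.

wrapper = maddpg(num_agents=2, state_size=24, action_size=2)
for episode in range(1, 501):
    states = env.reset()                    # shape: (num_agents, state_size)
    wrapper.reset()                         # re-initialize each agent's OU noise
    for t in range(1000):
        actions = wrapper.act(states, add_noise=True)
        next_states, rewards, dones = env.step(actions)
        wrapper.step(states, actions, rewards, next_states, dones, t)
        states = next_states
        if any(dones):
            break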