Example #1
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs + action = 24 + 2 = 26
        self.maddpg_agent = [
            DDPGAgent(
                24,
                2,
                (8, 16, 32),
                (8, 4, 2),
                (2, 1, 1),
                (32, 16, 8),  # actor settings
                26,
                (8, 16, 32),
                (8, 4, 2),
                (2, 1, 1),
                (32, 16, 8)),  # critic settings
            DDPGAgent(
                24,
                2,
                (8, 16, 32),
                (8, 4, 2),
                (2, 1, 1),
                (32, 16, 8),  # actor settings
                26,
                (8, 16, 32),
                (8, 4, 2),
                (2, 1, 1),
                (32, 16, 8))  # critic settings
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
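
Across these examples, tau is the soft-update coefficient that blends the local network weights into the target networks. A minimal sketch of that update, assuming PyTorch modules (the function and argument names are illustrative, not taken from any repository above):

import torch


def soft_update(target_net: torch.nn.Module, local_net: torch.nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)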
Example #2
    def learn(self):
        agent = DDPGAgent(
            env=self.env,
            replay_memory_size=REPLAY_MEMORY_SIZE,
            learning_rate=LEARNING_RATE,
            batch_size=MINIBATCH_SIZE,
            gamma=GAMMA,
            tau=TAU
        )

        stats = {'scores': [], 'avg': [], 'min': [], 'max': []}
        for ep in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):

            print(self.epsilon)
            action_stats = [0, 0]
            current_state = self.env.reset()
            current_state = self.convert_gray(current_state)

            done = False
            score = 0
            steps = 0

            while not done:
                steps += 1

                # Epsilon-greedy: exploit the learned policy with probability
                # (1 - epsilon), otherwise sample a random action.
                if np.random.random() > self.epsilon:
                    action_stats[0] += 1
                    action = agent.get_action(current_state)
                else:
                    action_stats[1] += 1
                    action = self.env.action_space.sample()
                    action[2] = min(action[2], 0.2)  # cap the third action component
                    action[1] = action[1] * 2        # scale up the second component

                new_state, reward, done, _ = self.env.step(action)
                if ep % self.results_every_n_episodes == 0:
                    self.env.render()

                score += reward

                new_state = self.convert_gray(new_state)

                agent.memory.push(current_state, action, reward, new_state)

                if steps % 64 == 0:
                    agent.update()

                current_state = new_state

                if self.epsilon > 0.1:
                    self.epsilon -= self.epsilon_decay_value

                if score < 0:
                    break

            print(action_stats)
            print(score)
            stats['scores'].append(score)
        self.env.close()
        return agent.actor
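
convert_gray is not shown in this example; a plausible minimal version, assuming the environment returns RGB frames as NumPy arrays (the luminance weights and normalization are assumptions, not taken from the repository):

import numpy as np


def convert_gray(frame: np.ndarray) -> np.ndarray:
    # Collapse an (H, W, 3) RGB frame to grayscale and scale to [0, 1].
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])
    return gray.astype(np.float32) / 255.0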
Example #3
    def __init__(self, seed, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 14+2+2+2=20
        self.maddpg_agent = [DDPGAgent(24, 2, seed),
                             DDPGAgent(24, 2, seed)]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #4
    def __init__(self, discount_factor=0.99, tau=0.001):
        super(MADDPG, self).__init__()

        # critic input = obs + action = 24 + 2 = 26
        self.maddpg_agent = [DDPGAgent(24, 256, 128, 2, 26, 256, 128), 
                             DDPGAgent(24, 256, 128, 2, 26, 256, 128)]
        
        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #5
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 24*2+2+2=52
        self.maddpg_agent = [
            DDPGAgent(24, 256, 256, 2, 52, 256, 256),
            DDPGAgent(24, 256, 256, 2, 52, 256, 256)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
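
The arithmetic in the comment (24*2 + 2 + 2 = 52) reflects how a centralized MADDPG critic usually sees the concatenation of both agents' observations and both agents' actions. A minimal sketch of assembling that input, assuming PyTorch tensors (the tensor names are illustrative):

import torch

batch_size = 64
obs_a, obs_b = torch.zeros(batch_size, 24), torch.zeros(batch_size, 24)  # per-agent observations
act_a, act_b = torch.zeros(batch_size, 2), torch.zeros(batch_size, 2)    # per-agent actions

# Centralized critic input: 24*2 + 2 + 2 = 52 features per sample
critic_input = torch.cat([obs_a, obs_b, act_a, act_b], dim=1)
assert critic_input.shape == (batch_size, 52)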
Example #6
def test(device, args):

    env = create_env(args.env, args)
    ram = MemoryBuffer(1)
    player = DDPGAgent(env.observation_space, env.action_space, ram, None,
                       device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)
    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:
        episode_rewards = []
        episode_lengths = []
        for _ep in range(1, args.eval_eps):
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            episode_action = []
            for steps in range(1000):
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)

                action, action_rescale = player.get_exploitation_action(state)
                episode_action.append(action)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1

                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    if reward > 1:
                        count_success += 1.0
                    break
            # check memory consumption and clear memory
            gc.collect()

        reward_ave = np.array(episode_rewards).mean()
        length_ave = np.array(episode_lengths).mean()
        print(
            'Test, episode %d, steps: %d, Success_rate: %.3f ave_reward: %.3f, ave_length: %.3f'
            % (count_eps, steps_done, count_success / count_eps, reward_ave,
               length_ave))

    env.close()
Example #7
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 8+2 = 10
        self.maddpg_agent = [
            DDPGAgent(8, 16, 8, 2, 10, 32, 16),
            DDPGAgent(8, 16, 8, 2, 10, 32, 16)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
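
Every constructor above stores a discount_factor (gamma); in DDPG-style training it enters the bootstrapped TD target for the critic. A minimal sketch under the usual assumptions (PyTorch tensors and a target_actor/target_critic pair; the names are illustrative, not taken from these repositories):

import torch


def td_target(reward, next_obs, done, target_actor, target_critic, discount_factor=0.95):
    # y = r + gamma * (1 - done) * Q_target(s', mu_target(s'))
    with torch.no_grad():
        next_action = target_actor(next_obs)
        next_q = target_critic(torch.cat([next_obs, next_action], dim=1))
    return reward + discount_factor * (1.0 - done) * next_q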
Example #8
 def __init__(self, discount_factor=0.95, tau=0.02):
     super(MADDPG, self).__init__()
     # each agent has its own critic and actor network
     # each actor gets only its own agent's state
     # but each agent's critic receives the same full (joint) input
     # critic input = obs_full + actions = 14+2+2+2=20
     self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16), 
                          DDPGAgent(14, 16, 8, 2, 20, 32, 16), 
                          DDPGAgent(14, 16, 8, 2, 20, 32, 16)]
     
     self.discount_factor = discount_factor
     self.tau = tau
     self.iter = 0
Example #9
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 14+2+2+2=20
        # in_actor=14, hidden_in_actor=16, hidden_out_actor=8, out_actor=2,
        # in_critic=20, hidden_in_critic=32, hidden_out_critic=16,
        self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                             DDPGAgent(14, 16, 8, 2, 20, 32, 16),
                             DDPGAgent(14, 16, 8, 2, 20, 32, 16)]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #10
    def __init__(self, discount_factor=0.95, tau=0.02):
        super().__init__()

        # critic input = obs_full + actions = 14+2+2+2=20
        self.maddpg_agent = [
            DDPGAgent(14, 16, 8, 2, 20, 32, 16),
            DDPGAgent(14, 16, 8, 2, 20, 32, 16),
            DDPGAgent(14, 16, 8, 2, 20, 32, 16)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #11
    def __init__(self, discount_factor=0.95, tau=0.01):
        super(MADDPG, self).__init__()

        # args = in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic
        # critic input = obs_full + actions = 2*24+2+2=52
        self.maddpg_agent = [
            DDPGAgent(24, 400, 300, 2, 52, 400, 300),
            DDPGAgent(24, 400, 300, 2, 52, 400, 300)
        ]
        # DDPGAgent(24, 16, 8, 2, 52, 32, 16)]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #12
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 24 + 24 + 2 + 2 = 52
        # in_actor=24, hidden_in_actor=256, hidden_out_actor=128, out_actor=2,
        # in_critic=52, hidden_in_critic=256, hidden_out_critic=128,
        self.maddpg_agent = [
            DDPGAgent(24, 256, 128, 2, 52, 256, 128),
            DDPGAgent(24, 256, 128, 2, 52, 256, 128)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #13
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-4, lr_critic=1.0e-5, discount_factor=0.99, tau=1.0e-2):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 14+2+2+2=20
        agent1 = DDPGAgent(in_actor, hidden_in_actor, hidden_out_actor, out_actor, 
                           in_critic, hidden_in_critic, hidden_out_critic, 
                           lr_actor, lr_critic)
        agent2 = DDPGAgent(in_actor, hidden_in_actor, hidden_out_actor, out_actor, 
                           in_critic, hidden_in_critic, hidden_out_critic,  
                           lr_actor, lr_critic)
        self.maddpg_agent = [agent1, agent2]
        
        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #14
    def __init__(self, discount_factor=0.99, tau=0.01, random_seed=0):
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(24, 52, 2, random_seed),
            DDPGAgent(24, 52, 2, random_seed)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
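
The t_step counter and the ReplayBuffer are typically combined so that learning happens only every UPDATE_EVERY environment steps, once the buffer holds a full batch. A sketch of that gating as a method on this class, assuming the buffer exposes add/sample/__len__ and a learn method exists (all assumptions, not taken from this example):

UPDATE_EVERY = 4    # assumed value
BATCH_SIZE = 128    # assumed value


def step(self, state, action, reward, next_state, done):
    # Record the transition, then learn only every UPDATE_EVERY calls.
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) >= BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, self.discount_factor)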
Example #15
    def __init__(self, actor_layer_sizes=[24, 128, 128, 2],
                 critic_layer_sizes=[52, 128, 128, 1],
                 discount_factor=0.95, tau=0.02, logger=None,
                 lr_actor=1e-3, lr_critic=1e-3,
                 gradient_clipping=None, clamp_actions=True,
                 log_layers=False, log_weights=False, log_losses=True):
        super(MADDPG, self).__init__()

        # INITIALIZE EACH AGENT AS A DDPG MODEL
        self.agents = [
            DDPGAgent(actor_layer_sizes=actor_layer_sizes,
                      critic_layer_sizes=critic_layer_sizes,
                      lr_actor=lr_actor, lr_critic=lr_critic,
                      clamp_actions=clamp_actions,
                      logger=logger, log_layers=log_layers),
            DDPGAgent(actor_layer_sizes=actor_layer_sizes,
                      critic_layer_sizes=critic_layer_sizes,
                      lr_actor=lr_actor, lr_critic=lr_critic,
                      clamp_actions=clamp_actions,
                      logger=logger, log_layers=log_layers),
        ]

        self.discount_factor = discount_factor      # for discounted returns
        self.tau = tau                              # soft-update factor
        self.iter = 0                               # number of update iterations so far
        self.gradient_clipping = gradient_clipping  # upper limit on gradient norms
        self.logger = logger                        # Tensorboard logger object
        self.log_weights = log_weights              # monitor weights in Tensorboard?
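
gradient_clipping is described above as an upper limit on gradients; it is normally applied between the backward pass and the optimizer step. A minimal sketch of that pattern in PyTorch (function and argument names are illustrative, not taken from this example):

import torch


def clipped_step(loss, network, optimizer, max_norm=None):
    # Backpropagate, optionally clip the global gradient norm, then update.
    optimizer.zero_grad()
    loss.backward()
    if max_norm is not None:
        torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm)
    optimizer.step()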
Example #16
    def __init__(self,
                 num_agents,
                 num_states,
                 num_actions,
                 discount_factor=0.99,
                 tau=1e-3):
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(num_states, num_actions, num_states * 2),
            DDPGAgent(num_states, num_actions, num_states * 2)
        ]
        self.num_agents = num_agents
        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #17
    def __init__(self,
                 num_agents,
                 x_dim,
                 o_dim,
                 a_dim,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 batch_size=16,
                 gamma=0.99,
                 tau=0.001,
                 buffer_size=int(1e5),
                 seed=1234):

        self.num_agents = num_agents
        self.x_dim = x_dim
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.seed = seed

        self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
        self.agents = [
            DDPGAgent(num_agents, agent_id, x_dim, o_dim, a_dim,
                      lr_actor, lr_critic, gamma, seed)
            for agent_id in range(num_agents)
        ]
Example #18
 def __init__(self, state_size, action_size, num_agents, random_seed):
     self.agents = [
         DDPGAgent(state_size, action_size, random_seed)
         for _ in range(num_agents)
     ]
     self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                device, random_seed)
     self.t_step = 0
Example #19
    def run(self):
        ### create TORCS environment
        env = TorcsEnv(vision=False, throttle=True)   

        ### start run according to supplied arguments
        if self.algorithm == "dqn" and self.modus == "train":
            agent = DQNAgent(env, self.track, self.numOfEpisodes)
            agent.trainAgent()
        elif self.algorithm == "dqn" and self.modus == "test":
            agent = DQNAgent(env, self.track, self.numOfEpisodes)
            agent.testAgent()
        elif self.algorithm == "ddpg" and self.modus == "train":
            agent = DDPGAgent(env, self.track, self.numOfEpisodes)
            agent.trainAgent()
        elif self.algorithm == "ddpg" and self.modus == "test":
            agent = DDPGAgent(env, self.track, self.numOfEpisodes)
            agent.testAgent()
Example #20
    def __init__(self,
                 cfg: Config,
                 discount_factor=0.95,
                 tau=0.02,
                 checkpoint_path: Optional[str] = None):
        self.logger = logging.getLogger(__name__)
        self.maddpg_agent = [
            DDPGAgent(in_actor=24,
                      hidden_in_actor=cfg.actor_hidden[0],
                      hidden_out_actor=cfg.actor_hidden[1],
                      out_actor=2,
                      in_critic=52,
                      hidden_in_critic=cfg.critic_hidden[0],
                      hidden_out_critic=cfg.critic_hidden[1],
                      lr_actor=cfg.actor_lr,
                      lr_critic=cfg.critic_lr,
                      noise_dist=cfg.noise_distribution),
            DDPGAgent(in_actor=24,
                      hidden_in_actor=cfg.actor_hidden[0],
                      hidden_out_actor=cfg.actor_hidden[1],
                      out_actor=2,
                      in_critic=52,
                      hidden_in_critic=cfg.critic_hidden[0],
                      hidden_out_critic=cfg.critic_hidden[1],
                      lr_actor=cfg.actor_lr,
                      lr_critic=cfg.critic_lr,
                      noise_dist=cfg.noise_distribution)
        ]
        if checkpoint_path:
            checkpoint = torch.load(checkpoint_path)
            for i, agent in enumerate(self.maddpg_agent):
                agent.actor.load_state_dict(checkpoint[i]['actor_params'])
                agent.target_actor.load_state_dict(
                    checkpoint[i]['actor_params'])
                agent.critic.load_state_dict(checkpoint[i]['critic_params'])
                agent.target_critic.load_state_dict(
                    checkpoint[i]['critic_params'])

                # agent.actor_optimizer.load_state_dict(checkpoint[i]['actor_optim_params'])
                # agent.critic_optimizer.load_state_dict(checkpoint[i]['critic_optim_params'])

        self.tau = tau
        self.discount_factor = discount_factor
        self.iter = 0
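
The loading code above implies the checkpoint is an indexable collection with one dict per agent, keyed by 'actor_params' and 'critic_params'. A sketch of the matching save side under that assumption (the function name is illustrative):

import torch


def save_checkpoint(maddpg_agent, checkpoint_path):
    # One entry per agent, mirroring the keys read back in __init__ above.
    checkpoint = [{'actor_params': agent.actor.state_dict(),
                   'critic_params': agent.critic.state_dict()}
                  for agent in maddpg_agent]
    torch.save(checkpoint, checkpoint_path)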
Example #21
    def __init__(self, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 14+2+2+2=20
        #self.maddpg_agent = [DDPGAgent(14, 16, 8, 2, 20, 32, 16),
        #                     DDPGAgent(14, 16, 8, 2, 20, 32, 16),
        #                     DDPGAgent(14, 16, 8, 2, 20, 32, 16)]
        # DDPGAgent parameters are :
        #   in_actor, hidden_in_actor, hidden_out_actor, out_actor,
        #   in_critic, hidden_in_critic, hidden_out_critic
        #   lr_actor=1.0e-2, lr_critic=1.0e-2
        self.maddpg_agent = [
            DDPGAgent(14,
                      128,
                      128,
                      2,
                      20,
                      128,
                      128,
                      lr_actor=5.0e-3,
                      lr_critic=5.0e-3),
            DDPGAgent(14,
                      128,
                      128,
                      2,
                      20,
                      128,
                      128,
                      lr_actor=5.0e-3,
                      lr_critic=5.0e-3),
            DDPGAgent(14,
                      128,
                      128,
                      2,
                      20,
                      128,
                      128,
                      lr_actor=5.0e-3,
                      lr_critic=5.0e-3)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #22
    def __init__(self,
                 env,
                 state_dim: int,
                 action_dim: int,
                 config: Dict,
                 device=None,
                 writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)
        create_agent = lambda: DDPGAgent(state_dim,
                                         action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = config.get('discount', 0.99)
        self.gradient_clip = config.get('gradient_clip', 1.0)

        self.warm_up = config.get('warm_up', 1e3)
        self.buffer_size = config.get('buffer_size', int(1e6))
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size',
                                       int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size',
                                       int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()
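
A sketch of how warm_up, update_every_iterations, and number_updates are commonly combined in the step loop (the method name and the buffer interface are assumptions, not taken from this example):

def maybe_learn(self, iteration):
    # Skip learning until the warm-up period has passed and the buffer can fill a batch.
    if iteration < self.warm_up or len(self.buffer) < self.batch_size:
        return
    # Learn only every few iterations, but run several updates when we do.
    if iteration % self.update_every_iterations == 0:
        for _ in range(self.number_updates):
            samples = self.buffer.sample()
            self.learn(samples)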
Example #23
    def __init__(self, state_size, obs_size, action_size, num_agents):
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(state_size, obs_size, action_size, num_agents)
            for _ in range(num_agents)
        ]
        self.discount_factor = DISCOUNT_FACTOR
        self.tau = TAU
        self.iter = 0
Example #24
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 batchsize,
                 discount_factor=0.95,
                 tau=0.02):
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(state_size, action_size, num_agents),
            DDPGAgent(state_size, action_size, num_agents)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.num_agents = num_agents
        self.batchsize = batchsize
Example #25
 def __init__(self, config):
     
     self.config = config
     self.gamma = config.gamma
     self.memory = config.memory()
     self.batch_size = config.batch_size
     self.update_every = config.update_every
     self.num_updates = config.num_updates
     self.t_step = 0
     
     self.maddpg_agents = [DDPGAgent(config) for _ in range(config.num_agents)]
Example #26
    def __init__(self, discount_factor, tau, batch_size):
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(24, 128, 128, 2, 52, 64, 64) for _ in range(2)
        ]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.batch_size = batch_size
Example #27
    def __init__(self):
        """Initialize a MADDPG Agent object."""
        super(MADDPGAgent, self).__init__()
        self.config = Config.getInstance()
        self.action_num = self.config.action_size * self.config.num_agents
        self.t_step = 0

        self.maddpg_agent = [
            DDPGAgent() for _ in range(self.config.num_agents)
        ]

        self.memory = ReplayBuffer()
Example #28
    def __init__(self,
                 episodes_before_train,
                 batch_size,
                 replay_buffer,
                 discount_factor=0.95,
                 tau=0.02):
        super(MADDPG, self).__init__()

        # critic input = obs_full + actions = 24+24+2+2=52
        self.maddpg_agent = [
            DDPGAgent(24, 2, 400, 300, 48, 4, 400, 300),
            DDPGAgent(24, 2, 400, 300, 48, 4, 400, 300)
        ]

        self.num_agents = 2
        self.action_size = 2
        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.episodes_before_train = episodes_before_train
        self.batch_size = batch_size
        self.buffer = replay_buffer
Example #29
    def __init__(self,
                 state_size,
                 action_size,
                 discount_factor=0.95,
                 tau=0.05,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 num_agents=2):
        super(MADDPG, self).__init__()

        hidden_in_dim = 512
        hidden_out_dim = 256
        # critic input = obs_full + actions = 48+2+2=52
        # the hidden-layer sizes may need further tuning

        # the number of agents is two because there are only two players
        self.maddpg_agent = [
            DDPGAgent(state_size,
                      action_size,
                      hidden_in_dim,
                      hidden_out_dim,
                      num_agents=num_agents,
                      lr_actor=lr_actor,
                      lr_critic=lr_critic),
            DDPGAgent(state_size,
                      action_size,
                      hidden_in_dim,
                      hidden_out_dim,
                      num_agents=num_agents,
                      lr_actor=lr_actor,
                      lr_critic=lr_critic)
        ]

        self.num_agents = num_agents
        self.action_vector = 2

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
Example #30
def train_ddpg():
    args = DDPGArgs()
    env = gym.make(args.env_name)
    agent = DDPGAgent(env, DDPGQNet, DDPGActor, SimpleNormalizer, args)
    for ep in range(args.max_ep):
        agent.train_one_episode()
        if ep % args.test_interval == 0:
            agent.test_model()
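
DDPGArgs is not shown; the loop only relies on env_name, max_ep, and test_interval, so a minimal placeholder could look like this (field values are illustrative, not taken from the repository):

from dataclasses import dataclass


@dataclass
class DDPGArgs:
    env_name: str = "Pendulum-v1"
    max_ep: int = 1000
    test_interval: int = 10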