def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)  # random.seed() returns None, so keep the seed value separately
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)


        # Noise process
        self.noise = OUNoise(action_size, random_seed)
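The constructor above calls `hard_update(target, source)` without defining it. As a hedged sketch (the helper itself is not shown in this example), a typical PyTorch implementation simply copies every source parameter into the target network:

def hard_update(target, source):
    """Copy all parameters from source into target (equivalent to a soft update with tau = 1)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)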
Example #2
    def __init__(self, action_size, discount_factor=0.95, tau=0.02):
        super(MADDPG, self).__init__()

        # Create the multi-agent setup as a list of DDPG agents
        self.maddpg_agents = [AgentDDPG(24, 2, 0), AgentDDPG(24, 2, 0)]

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0
        self.count = 0
        self.update_every = 1
        self.batch_size = 128
        self.agent_number = len(self.maddpg_agents)
        self.t_step = 0
        # Initialize the Replay Memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.action_size = action_size
        self.total_reward = np.zeros((1, 2))

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
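This example builds its exploration noise as `OUNoise(size, mu, theta, sigma)` but does not show the class. A minimal sketch of an Ornstein-Uhlenbeck noise process with that constructor signature (an assumption based only on how it is called here) could look like this:

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for continuous-action exploration."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state with OU dynamics and return it as the noise sample."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state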
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
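The agent above hard-copies target weights with `set_weights(get_weights())` and keeps `tau` for soft target updates, but the update helper itself is not shown. A plausible layer-by-layer soft update (an assumption, written against the standard Keras `get_weights`/`set_weights` API) is:

def soft_update(local_model, target_model, tau):
    """Blend weights: theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    new_weights = [tau * local_w + (1.0 - tau) * target_w
                   for local_w, target_w in zip(local_model.get_weights(),
                                                target_model.get_weights())]
    target_model.set_weights(new_weights)

With the wrappers above it would be called as, for example, `soft_update(self.actor_local.model, self.actor_target.model, self.tau)`.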
    def __init__(
            self,
            task,
            actor_params={},
            critic_params={},
            noise_params={},
            replay_memory_params={},
            algo_params={}
            ):

        # Default Params
        default_actor_params = {'lr': .001}
        default_critic_params = {'lr': .001}
        default_noise_params = {'mu': 0, 'theta': .15, 'sigma': .2}
        default_replay_memory_params = {'buffer_size': 100000, 'batch_size': 64}
        default_algo_params = {'gamma': .99, 'tau': .1}

        # Final Params
        final_actor_params = {**default_actor_params, **actor_params}
        final_critic_params = {**default_critic_params, **critic_params}
        final_noise_params = {**default_noise_params, **noise_params}
        final_replay_memory_params = {**default_replay_memory_params, **replay_memory_params}
        final_algo_params = {**default_algo_params, **algo_params}

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, final_critic_params)
        self.critic_target = Critic(self.state_size, self.action_size, final_critic_params)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(
                self.action_size,
                final_noise_params['mu'],
                final_noise_params['theta'],
                final_noise_params['sigma']
                )

        # Replay memory
        self.batch_size = final_replay_memory_params['batch_size']
        self.memory = ReplayBuffer(
                final_replay_memory_params['buffer_size'],
                final_replay_memory_params['batch_size']
                )

        # Algorithm parameters
        self.gamma = final_algo_params['gamma']  # discount factor
        self.tau = final_algo_params['tau']      # for soft update of target parameters
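The `{**defaults, **overrides}` merges above let callers override individual hyperparameters while keeping the rest at their defaults; keys from the right-hand dict win on conflicts. For example (hypothetical override values):

default_algo_params = {'gamma': .99, 'tau': .1}
final_algo_params = {**default_algo_params, **{'tau': .01}}
print(final_algo_params)  # {'gamma': 0.99, 'tau': 0.01} -- the override replaces only 'tau'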
Example #5
    def __init__(self, in_actor, in_critic, action_size, num_agents,
                 random_seed):
        """Initialize the agent."""
        super(DDPG_agent, self).__init__()

        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise(action_size, random_seed)
Example #6
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
Example #7
    def __init__(self, state_size=24, action_size=2, random_seed=0):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        #Actor network
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        #Noise process
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process

        #Replay memory
        self.memory = ReplayBuffer(
            self.action_size, BUFFER_SIZE, MINI_BATCH,
            random_seed)  #define experience replay buffer object
Example #8
    def __init__(self, state_size, action_size, random_seed):
        """
        Initializes Agent object.
        @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        #Actor network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        #Perform hard copy
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        #Noise process
        self.noise = OUNoise(action_size,
                             random_seed)  #define Ornstein-Uhlenbeck process
Example #9
    def __init__(self, state_size, action_size, random_seed):
        """
        Args:
        ======
            state_size (int): state dim
            action_size (int): action dim
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        # actor net initialization
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # critic net initialization
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck Exploration Noise Process
        self.noise = OUNoise(action_space=action_size, seed=random_seed)

        # Replay memory init
        self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
Example #10
    def __init__(self, state_size: int, action_size: int, agent_no: int,
                 params: dict):
        """Initialize an Agent object.

        Args:
            state_size: dimension of each state
            action_size: dimension of each action
            agent_no: agent id
            params: architecture and hyperparameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = params['agent_seed']
        self.batch_size = params['batch_size']
        self.lr_actor = params['lr_actor']
        self.lr_critic = params['lr_critic']
        self.critic_weight_decay = params['critic_weight_decay']
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.update_step = params['update_step']
        self.num_agents = params['num_agents']

        random.seed(self.seed)
        self.t_step = 0
        self.agent_no = agent_no

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 params['first_hidden_units'],
                                 params['second_hidden_units'],
                                 self.seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  params['first_hidden_units'],
                                  params['second_hidden_units'],
                                  self.seed).to(device)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * self.num_agents,
                                   action_size * self.num_agents,
                                   params['first_hidden_units'],
                                   params['second_hidden_units'],
                                   self.seed).to(device)
        self.critic_target = Critic(state_size * self.num_agents,
                                    action_size * self.num_agents,
                                    params['first_hidden_units'],
                                    params['second_hidden_units'],
                                    self.seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.lr_critic,
            weight_decay=self.critic_weight_decay)

        # Noise process
        self.noise = OUNoise(action_size,
                             self.seed,
                             sigma=params['noise_sigma'])
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)


        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)
    
    def target_act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_target.eval()
        with torch.no_grad():
            action = self.actor_target(state).cpu().data.numpy()
        self.actor_target.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)
    

    def reset(self):
        self.noise.reset()
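As a hedged usage sketch of the `act`/`reset` interface above (the environment object and the episode length are placeholders, not part of this example), an interaction loop would look roughly like:

agent = Agent(state_size=24, action_size=2, random_seed=0)
state = env.reset()            # `env` is an assumed Gym-style environment
agent.reset()                  # reset the OU noise at the start of each episode
for t in range(1000):          # episode length chosen arbitrarily
    action = agent.act(state, add_noise=True)      # noisy action, clipped to [-1, 1]
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break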
    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 epsilon, random_seed: int):
        """ Initialize a DDPG Agent Object

        :param state_size: dimension of state (input)
        :param action_size: dimension of action (output)
        :param num_agents: number of concurrent agents in the environment
        :param epsilon: initial value of epsilon for exploration
        :param random_seed: random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)
        self.seed = random_seed
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.t_step = 0

        # Hyperparameters
        self.buffer_size = 1000000
        self.batch_size = 128
        self.update_every = 10
        self.num_updates = 10
        self.gamma = 0.99
        self.tau = 0.001
        self.lr_actor = 0.0001
        self.lr_critic = 0.001
        self.weight_decay = 0
        self.epsilon = epsilon
        self.epsilon_decay = 0.97
        self.epsilon_min = 0.005

        # Networks (Actor: State -> Action, Critic: (State,Action) -> Value)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)
        # Initialize actor and critic networks to start with same parameters
        self.soft_update(self.actor_local, self.actor_target, tau=1)
        self.soft_update(self.critic_local, self.critic_target, tau=1)

        # Noise Setup
        self.noise = OUNoise(self.action_size, random_seed)

        # Replay Buffer Setup
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
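This agent hard-copies its targets by calling `self.soft_update(..., tau=1)`, but the method body is not included. A typical PyTorch `soft_update` matching that call signature (a sketch, assumed rather than taken from this example) is:

    def soft_update(self, local_model, target_model, tau):
        """theta_target <- tau * theta_local + (1 - tau) * theta_target."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)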
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1 will multiply state and action
                            space sizes for critic. Used for usage with MADDPG.
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        random.seed(cfg.random_seed)
        self.seed = cfg.random_seed
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)
    def __init__(self, env):
        """

        :param task: (class instance) Instructions about the goal and reward
        """

        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Actor (policy) and critic (value) model instances, local and target

        # Actor local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        # Save actor model for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model with local model
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64  # original 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.01  # Soft update rate for target parameters
Example #15
    def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim_actor=120,
                 hidden_dim_critic=64, lr_actor=0.01, lr_critic=0.01, batch_size=64,
                 max_episode_len=100, tau=0.02, gamma=0.99, agent_name='one',
                 discrete_action=False):
        """
        Inputs:
            num_in_pol (int): number of dimensions for policy input
            num_out_pol (int): number of dimensions for policy output
            num_in_critic (int): number of dimensions for critic input
        """
        self.policy = Actor(num_in_pol, num_out_pol,
                                 hidden_dim=hidden_dim_actor,
                                 discrete_action=discrete_action)
        self.critic = Critic(num_in_pol, 1,num_out_pol,
                                 hidden_dim=hidden_dim_critic)
        self.target_policy = Actor(num_in_pol, num_out_pol,
                                        hidden_dim=hidden_dim_actor,
                                        discrete_action=discrete_action)
        self.target_critic = Critic(num_in_pol, 1,num_out_pol,
                                        hidden_dim=hidden_dim_critic)
        hard_update(self.target_policy, self.policy)
        hard_update(self.target_critic, self.critic)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,weight_decay=0)
        
        self.policy = self.policy.float()
        self.critic = self.critic.float()
        self.target_policy = self.target_policy.float()
        self.target_critic = self.target_critic.float()

        self.agent_name = agent_name
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        #self.replay_buffer = ReplayBuffer(1e7)
        self.replay_buffer = ReplayBufferOption(500000,self.batch_size,12)
        self.max_replay_buffer_len = batch_size * max_episode_len
        self.replay_sample_index = None
        self.niter = 0
        self.eps = 5.0
        self.eps_decay = 1/(250*5)

        self.exploration = OUNoise(num_out_pol)
        self.discrete_action = discrete_action

        self.num_history = 2
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
Example #16
    def __init__(self, index, config, filenames=None):
        random.seed(config.general.seed)
        np.random.seed(config.general.seed)

        self.noise = OUNoise(config)
        self.index = index
        self.action_size = config.environment.action_size
        self.tau = config.hyperparameters.tau

        self.actor_local = Network(config.actor, config.general.seed)
        self.actor_target = Network(config.actor, config.general.seed)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=config.actor.lr)
        self.critic_local = Network(config.critic, config.general.seed)
        self.critic_target = Network(config.critic, config.general.seed)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=config.critic.lr, weight_decay=config.hyperparameters.weight_decay)
Example #17
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents,
                 random_seed):
        """Initialize the agent."""
        super(DDPG_agent, self).__init__()

        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network
        self.critic_local = Critic(in_critic, num_agents * self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        action = self.actor_target(state)
        return action

    def reset(self):
        """ Resets noise """
        self.noise.reset()
Example #18
    def __init__(self,
                 state_shape,
                 action_shape,
                 batch_size=128,
                 gamma=0.995,
                 tau=0.005,
                 actor_lr=0.0001,
                 critic_lr=0.001,
                 use_layer_norm=True):

        self.state_shape = state_shape
        self.action_shape = action_shape
        self.num_actions = np.prod(self.action_shape)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.action_shape,
                                   self.state_shape)

        # Noise process
        self.noise = OUNoise(self.num_actions)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  #soft update
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        #initialize
        self.models = Models(self.state_shape,
                             self.action_shape,
                             actor_lr=self.actor_lr,
                             critic_lr=self.critic_lr,
                             gamma=self.gamma,
                             use_layer_norm=use_layer_norm)
        self.initialize()
        self.saver = tf.train.Saver()
        self.current_path = os.getcwd()

        #initial episode vars
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num = 0
Example #19
    def __init__(self,
                 in_actor,
                 hidden_in_actor,
                 hidden_out_actor,
                 out_actor,
                 in_critic,
                 hidden_in_critic,
                 hidden_out_critic,
                 lr_actor=1.0e-3,
                 lr_critic=1.0e-3,
                 noise_dist: str = 'normal',
                 checkpoint_path=None) -> None:
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor,
                             hidden_in_actor,
                             hidden_out_actor,
                             out_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic,
                              1).to(device)
        self.target_actor = Network(in_actor,
                                    hidden_in_actor,
                                    hidden_out_actor,
                                    out_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic,
                                     hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0, noise_dist=noise_dist)
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=1.e-5)
        if checkpoint_path:
            checkpoint = torch.load(checkpoint_path)
            self.actor.load_state_dict(checkpoint[0]['actor_params'])
            self.target_actor.load_state_dict(checkpoint[0]['actor_params'])
            self.critic.load_state_dict(checkpoint[0]['critic_params'])
            self.target_critic.load_state_dict(checkpoint[0]['critic_params'])
    def __init__(self, state_size, action_size, num_agents, cfg):
        """Initialize a MADDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): Number of agents in environment
            cfg (config object): main configuration with other settings
        """
        print("Initializing MADDPG agent with {:d} agents!".format(num_agents))

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(cfg.random_seed)
        self.seed = cfg.random_seed

        self.cfg = cfg

        # initializing list of single agents (2 for tennis)
        self.agents = []
        for aid in range(num_agents):
            agent = SingleDDPGAgent(state_size,
                                    action_size,
                                    cfg,
                                    num_agents=num_agents,
                                    agent_id=aid)
            self.agents.append(agent)

        self.t_step = 0

        # Noise process
        self.noise_scale = self.cfg.noise_scale
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # While prefetching is active, fill the replay buffer with random experiences without learning
        self.prefetching = True

        # Replay memory for shared experiences (all agents)
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)
Example #21
    def __init__(self, params, name, task):
        super(Twin_DDPG, self).__init__(params, name, task)
        self.aPars = params['actPars']
        self.aTrain = params['actTrain']

        if self.trainMode:
            self.values = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            self.policyNet = TD3Network(self.aPars, self.aTrain)
            self.tarPolicy = TD3Network(self.aPars, self.aTrain)

            if self.load:
                self.load_nets()

            self.tarPolicy.load_state_dict(self.policyNet.state_dict())
            self.tar = [
                Network(self.vPars, self.vTrain),
                Network(self.vPars, self.vTrain)
            ]
            for i in range(len(self.values)):
                self.tar[i].load_state_dict(self.values[i].state_dict())
        else:
            self.policyNet = Network(self.aPars, self.aTrain)
            self.policyNet.load_state_dict(
                torch.load(
                    "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"
                ))

        self.base = self.vTrain['baseExplore']
        self.step = self.vTrain['decay']
        self.expSize = self.vTrain['buffer']
        self.exp = Replay(self.expSize)
        self.a = self.vTrain['a']
        self.tau = self.vPars['tau']
        self.smooth = self.vTrain['smooth']
        self.clip = self.vTrain['clip']
        self.delay = self.vTrain['policy_delay']
        self.mean_range = self.aPars['mean_range']
        self.noise = OUNoise(self.out_n,
                             mu=0,
                             theta=.15,
                             max_sigma=self.explore,
                             min_sigma=self.base,
                             decay=self.step)
        self.valueLoss = []
        self.actorLoss = []
        self.avgLoss = 0
        self.avgActLoss = 0

        task.initAgent(self)

        # Busy-wait until the task signals completion by setting self.stop
        while not self.stop:
            pass
        task.postTraining()
Example #22
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """ Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        # for MADDPG
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)
        self.eps = EPS_START
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example #23
    def build_agent(self):
        # build the actor-critic network and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.beta)
        self.target_critic = copy.deepcopy(self.critic)

        # build the replaybuffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)
        # build the OUNoise for action selection 
        self.noise = OUNoise(self.action_dim)
Example #24
class DDPGAgent():

    def __init__(self, index, config, filenames=None):
        random.seed(config.general.seed)
        np.random.seed(config.general.seed)

        self.noise = OUNoise(config)
        self.index = index
        self.action_size = config.environment.action_size
        self.tau = config.hyperparameters.tau

        self.actor_local = Network(config.actor, config.general.seed)
        self.actor_target = Network(config.actor, config.general.seed)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=config.actor.lr)
        self.critic_local = Network(config.critic, config.general.seed)
        self.critic_target = Network(config.critic, config.general.seed)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=config.critic.lr, weight_decay=config.hyperparameters.weight_decay)

    def act(self, state, noise, random):
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(torch.from_numpy(state).float().to(device)).cpu().data.numpy()
        self.actor_local.train()
        if noise is not None:
            action += self.noise.sample() * noise
        if random is not None:
            action = (1 - random) * action + random * (np.random.rand(self.action_size) - 0.5) * 2.0
        return np.clip(action, -1, 1)

    def learn(self, index, experiences, gamma, all_next_actions, all_actions):
        states, actions, rewards, next_states, dones = experiences

        self.critic_optimizer.zero_grad()

        index = torch.tensor([index]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_next = self.critic_target(critic_input(next_states, actions_next))
        q_exp = self.critic_local(critic_input(states, actions))
        q_t = rewards.index_select(1, index) + (gamma * q_next * (1 - dones.index_select(1, index)))
        F.mse_loss(q_exp, q_t.detach()).backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()

        actions_pred = [actions if i == self.index else actions.detach() for i, actions in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(critic_input(states, actions_pred)).mean()
        actor_loss.backward()

        self.actor_optimizer.step()

        self.actor_target.soft_update(self.actor_local, self.tau)
        self.critic_target.soft_update(self.critic_local, self.tau)
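The `learn` method above relies on a `critic_input` helper that is not shown; in centralized-critic MADDPG code this is commonly just a concatenation of the joint states and joint actions along the feature dimension (an assumption here, not confirmed by the snippet):

import torch

def critic_input(states, actions):
    """Concatenate joint observations and joint actions for the centralized critic (assumed helper)."""
    return torch.cat((states, actions), dim=1)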
Example #25
    def __init__(self, 
        state_size, action_size, replay_memory, random_seed=0, nb_agent = 20, bs = 128,
        gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4, wd_actor=0, wd_critic=0,
        clip_actor = None, clip_critic=None, update_interval = 20, update_times = 10): 

        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.nb_agent = nb_agent
        self.bs = bs
        self.update_interval = update_interval
        self.update_times = update_times
        self.timestep = 0

        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.wd_critic = wd_critic
        self.wd_actor = wd_actor
        self.clip_critic=clip_critic
        self.clip_actor = clip_actor
        self.actor_losses = []
        self.critic_losses = []

        # Actor #0
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor,weight_decay=self.wd_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,weight_decay=self.wd_critic)

        # Noise process
        self.noise = OUNoise((self.nb_agent, action_size), random_seed)

        # Replay memory
        self.memory = replay_memory
Example #26
    def __init__(self, params, name, task):
        self.name           = name
        self.task           = task

        self.vPars          = params['valPars']
        self.vTrain         = params['valTrain']
        self.mPars          = params['mPars']
        self.mTrain         = params['mTrain']
        self.wPars          = params['actPars']
        self.wTrain         = params['actTrain']
        self.w_vPars        = params['w_vPars']
        self.w_vTrain       = params['w_vTrain']

        self.agents         = params['agents']
        self.pubs = {}
        for key in self.agents.keys():
            bot             = self.agents[key]
            self.pubs[key]  = rospy.Publisher(bot['pub'], Vector3, queue_size = 1)
        rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size = 1)

        self.valueLoss      = []

        self.manager        = Network(self.mPars, self.mTrain)
        self.m_critic       = Network(self.vPars, self.vTrain) 
        self.m_critic_target= Network(self.vPars, self.vTrain)
        self.worker         = Network(self.wPars, self.wTrain)
        self.w_critic       = Network(self.w_vPars, self.w_vTrain)
        self.w_critic_target= Network(self.w_vPars, self.w_vTrain)

        self.m_discount     = self.vTrain['m_gamma']
        self.w_discount     = self.vTrain['w_gamma']
        self.lr             = self.vTrain['lr']
        self.trainMode      = self.vPars['trainMode']
        self.step           = self.vTrain['step']
        self.stop           = False
        self.c              = self.mTrain['c']
        self.tau            = .005
        self.noise          = Noise(self.manager.neurons[-1], theta = .4, max_sigma = .2, min_sigma = 0, decay = 1)

        self.exp            = Memory()
        self.temp           = []
        self.totalSteps     = 0
        self.soft           = nn.Softmax(dim=1)

        self.reset()

        task.initAgent(self)

        # Busy-wait until the task signals completion by setting self.stop
        while not self.stop:
            pass

        task.postTraining()
Example #27
    def __init__(self, model_name, state_size, action_size, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.model_name = model_name
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.rewards = list()
        self.losses = deque(maxlen=100)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example #28
    def __init__(self, env):
        """Class initialization."""
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low[0]
        self.action_high = env.action_space.high[0]

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
    def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed
        self.gamma = gamma
        self.checkpoint_path = checkpoint_path

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # If pretrained, load weights
        if pretrained:
            actor_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_actor.pth'))
            critic_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_critic.pth'))
            self.actor_local.load_state_dict(actor_dict)
            self.actor_target.load_state_dict(actor_dict)
            self.critic_local.load_state_dict(critic_dict)
            self.critic_target.load_state_dict(critic_dict)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
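The constructor above restores weights from `checkpoint_actor.pth` and `checkpoint_critic.pth` when `pretrained=True`. A matching save routine (a sketch; the `save_checkpoints` helper name is hypothetical) would write the local networks' state dicts in that layout:

import os
import torch

def save_checkpoints(agent, checkpoint_path='./checkpoints/'):
    """Persist local network weights in the format the constructor above expects."""
    os.makedirs(checkpoint_path, exist_ok=True)
    torch.save(agent.actor_local.state_dict(),
               os.path.join(checkpoint_path, 'checkpoint_actor.pth'))
    torch.save(agent.critic_local.state_dict(),
               os.path.join(checkpoint_path, 'checkpoint_critic.pth'))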
    def __init__(self,
                 num_agents=2,
                 obs_size=24,
                 act_size=2,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3,
                 weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4,
                 clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Write parameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size,
                                           act_size).to(device)

        # Copy initial network parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)
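This constructor stores `clip_grad` but the learning step is not shown; in PyTorch implementations the value is typically passed to `torch.nn.utils.clip_grad_norm_` between `backward()` and `optimizer.step()`. A minimal, self-contained illustration (the tiny network and loss below are placeholders, not part of this example):

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                                  # stand-in for the critic network
loss = net(torch.randn(8, 4)).pow(2).mean()            # placeholder loss
loss.backward()
torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)  # clip gradients before the optimizer step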