Example #1
    def __init__(self, game, num_agents, state_size, action_size, name, random_seed=0,
                    lr_critic=1e-3, lr_actor=1e-3,
                    fc1_units=400, fc2_units=300,
                    buffer_size=int(1e6), batch_size=128,
                    gamma=0.99, tau=1e-3,
                    max_norm=1.0,
                    epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
                    exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
        
        """Initialize an Agent object.
        Args:
            game (class Game): meidator in chain-of-responsibility design pattern. (Broker chain)
            random_seed (int): random seed.
            
            max_norm (float): value of clip_grad_norm for critic optimizer
        """
        super().__init__()
        
        self.index_agent = None
        
        self.game = game
        self.num_agents = num_agents
            
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.seed = random.seed(random_seed)
        
        self.max_norm = max_norm
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        # Actor Network (w/ Target Network)
        self.actor_local = MADDPGActorVersion3(state_size, action_size, 
                                               fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = MADDPGActorVersion3(state_size, action_size, 
                                                fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        
        # Critic Network (w/ Target Network)
        self.critic_local = MADDPGCriticVersion4(num_agents, state_size, action_size, 
                                                 fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = MADDPGCriticVersion4(num_agents, state_size, action_size, 
                                                  fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # Noise process for action exploration
        self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

        # parameter of discounted reward
        self.gamma = gamma
        
        # soft update parameter
        self.tau = tau
        
        self.batch_size = batch_size
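
All five examples construct an OUNoise object, but none of the listings include the class itself. The following is a minimal sketch of an Ornstein-Uhlenbeck noise process, assuming the (size, mu, theta, sigma) constructor used in Examples #1, #3, and #4 (Example #2 uses a (size, seed) variant); the implementations in the original repositories may differ.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for exploration."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean mu."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state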
Example #2
    def __init__(self,
                 agent_count,
                 observation_size,
                 action_size,
                 actor_optim_params,
                 critic_optim_params,
                 soft_update_tau,
                 discount_gamma,
                 use_batch_norm,
                 seed,
                 actor_network_states,
                 critic_network_states,
                 device):

        self._soft_update_tau = soft_update_tau
        self._gamma = discount_gamma

        # actor networks
        self._actor_local = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        self._actor_target = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        # critic networks
        self._critic_local = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        self._critic_target = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        # optimizers
        self._actor_optimizer = optim.Adam(
            self._actor_local.parameters(),
            **actor_optim_params
        )

        self._critic_optimizer = optim.Adam(
            self._critic_local.parameters(),
            **critic_optim_params
        )

        if actor_network_states is not None:
            self._actor_local.load_state_dict(actor_network_states[0])
            self._actor_target.load_state_dict(actor_network_states[1])

        if critic_network_states is not None:
            self._critic_local.load_state_dict(critic_network_states[0])
            self._critic_target.load_state_dict(critic_network_states[1])

        self.noise = OUNoise(action_size, seed)
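
Both agents above keep local and target networks together with a soft-update parameter tau, which implies the usual Polyak-averaging step after each learning pass. A minimal sketch of that step for PyTorch modules follows; the helper name and its exact placement are assumptions, not part of the original listings.

def soft_update(local_model, target_model, tau):
    """Soft-update: theta_target = tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)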
Example #3
    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout, exploration_theta,
                 exploration_sigma, actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_dropout, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                              self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        self.best_score = -np.inf
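
Example #3 only shows __init__, so how the noise process and the action bounds come into play is not visible. Below is a hedged sketch of the acting step such a Keras-based agent typically uses; the attribute and method names (model.predict in particular) are assumptions inferred from the constructor arguments, not taken from the original source.

    def act(self, state):
        """Return an action for the given state, adding OU noise for exploration."""
        # np refers to numpy, already imported by the surrounding agent code.
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Keep the noisy action inside the task's action bounds.
        return np.clip(action + self.noise.sample(),
                       self.action_low, self.action_high)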
Example #4
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0
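
Example #4 constructs ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) without showing the class. A minimal uniform-sampling buffer matching that call signature could look like the sketch below; it illustrates the assumed interface rather than reproducing the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples uniform mini-batches."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random mini-batch and convert it to tensors on the configured device."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)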
Example #5
def main(gpu_num, exp_num, env=None):
    dir_name = 'Data/checkpoint/'
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    with open('log.txt', 'a') as text_file:
        text_file.write('gpu %i exp %i started.\n' % (gpu_num, exp_num))

    with tf.device('/gpu:%i' % (gpu_num)):
        pms = Paras_base().pms
        pms.save_model = True
        pms.save_dir = dir_name
        env = CartPoleEnv() if env is None else env
        action_size = env.action_space.shape[0]
        observation_size = env.observation_space.shape[0]
        max_action = env.action_space.high[0]
        pms.obs_shape = observation_size
        pms.max_iter = 1000000
        pms.action_shape = action_size
        pms.max_action = max_action
        pms.num_of_paths = 100
        pms.name_scope = 'ddpg'
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.20
        sess = tf.Session(config=config)

        state_ph = tf.placeholder(tf.float32, [None, pms.obs_shape])
        action_ph = tf.placeholder(tf.float32, [None, pms.action_shape])
        critic_input_ph = tf.concat([state_ph, action_ph], axis=1)
        actor_net = Fcnn(sess,
                         pms.obs_shape,
                         pms.action_shape, [400, 300],
                         name=pms.name_scope + '_actor_r',
                         if_bias=[False],
                         activation=['relu', 'relu', 'None'],
                         input_tf=state_ph)
        actor_target_net = Fcnn(sess,
                                pms.obs_shape,
                                pms.action_shape, [400, 300],
                                name=pms.name_scope + '_actor_t',
                                if_bias=[False],
                                activation=['relu', 'relu', 'None'],
                                input_tf=state_ph)
        critic_net = Fcnn(sess,
                          pms.obs_shape + pms.action_shape,
                          1, [400, 300],
                          name=pms.name_scope + '_critic_r',
                          if_bias=[False],
                          activation=['relu', 'relu', 'None'],
                          input_tf=critic_input_ph)
        critic_target_net = Fcnn(sess,
                                 pms.obs_shape + pms.action_shape,
                                 1, [400, 300],
                                 name=pms.name_scope + '_critic_t',
                                 if_bias=[False],
                                 activation=['relu', 'relu', 'None'],
                                 input_tf=critic_input_ph)
        critic_net.state_ph = state_ph
        critic_net.action_ph = action_ph
        actor = DeterministicActor(actor_net, sess, pms)
        # Wrap the target network (not the online network) in the target actor.
        actor_target = DeterministicActor(actor_target_net, sess, pms)

        replay_buffer = ReplayBuffer(buffer_size=pms.buffer_size)
        ounoise = OUNoise(pms.action_shape)
        learn_agent = DDPGagent(env, actor, critic_net, actor_target,
                                critic_target_net, replay_buffer, ounoise,
                                sess, pms, [None])

    saver = tf.train.Saver()
    learn_agent.saver = saver
    sess.run(tf.global_variables_initializer())
    saving_result = learn_agent.learn()
    sess.close()
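
The listing ends without an entry point; a hypothetical invocation (the GPU index and experiment number are placeholders) might look like:

if __name__ == '__main__':
    # Hypothetical call: experiment 0 on GPU 0 with the default CartPoleEnv.
    main(gpu_num=0, exp_num=0)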