Example no. 1
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score
        self.score = -np.inf
        self.best_score = -np.inf
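Note: every snippet in this listing constructs an OUNoise(action_size, mu, theta, sigma) helper that is not included here. A minimal Ornstein-Uhlenbeck noise process matching that call signature could look like the sketch below (class layout and method names are assumptions, not taken from the examples):

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)   # long-run mean the process decays towards
        self.theta = theta             # speed of mean reversion
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        # Start each episode from the mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); return the updated state as noise.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state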
Example no. 2
    def __init__(self, task, exp_mu, exp_theta, exp_sigma, gamma, tau):
        self.task = task

        self.s_size = task.s_size
        self.a_size = task.a_size

        self.a_low = task.a_low
        self.a_high = task.a_high

        # Actor Model
        self.actor_local = Actor(self.s_size, self.a_size, self.a_low,
                                 self.a_high)
        self.actor_target = Actor(self.s_size, self.a_size, self.a_low,
                                  self.a_high)

        # Critic Model
        self.critic_local = Critic(self.s_size, self.a_size)
        self.critic_target = Critic(self.s_size, self.a_size)

        # Initialize target model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # initialize noise
        self.exp_mu = exp_mu
        self.exp_theta = exp_theta
        self.exp_sigma = exp_sigma
        self.noise = OUNoise(self.a_size, self.exp_mu, self.exp_theta,
                             self.exp_sigma)

        # For Replay buffer
        self.buff_size = 1024 * 1024
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buff_size, self.batch_size)

        # discount factor
        self.gamma = gamma

        # for soft update of target parameters
        self.tau = tau
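Note: the ReplayBuffer(buffer_size, batch_size) class used above is also not shown. A minimal sketch matching the two-argument form (field names and method names are assumptions; a couple of the later snippets pass only buffer_size, so their buffer class presumably differs):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # oldest experiences are evicted automatically
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        # Store one transition.
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a training batch.
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)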
Example no. 3
    def __init__(self, task, lra, lrc, db):
        self.task = task
        self.s_sz = task.state_size
        self.a_sz = task.action_size
        self.a_max = task.max_action

        # Actor (Policy) Model
        self.actor_local = Actor(self.s_sz, self.a_sz, lra)
        self.actor_target = Actor(self.s_sz, self.a_sz, lra)

        # First Critic (Value) Model
        self.critic_local_1 = Critic(self.s_sz, self.a_sz, lrc)
        self.critic_target_1 = Critic(self.s_sz, self.a_sz, lrc)

        # Second Critic (Value) Model
        self.critic_local_2 = Critic(self.s_sz, self.a_sz, lrc)
        self.critic_target_2 = Critic(self.s_sz, self.a_sz, lrc)

        # Initialize target model parameters with local model parameters
        self.critic_target_1.model.set_weights(
            self.critic_local_1.model.get_weights())
        self.critic_target_2.model.set_weights(
            self.critic_local_2.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = GaussianNoise(self.a_sz)

        # Replay memory
        self.num_exp = 0
        self.batch = 32
        self.buffer = 10000
        labels = ["state", "action", "reward", "next_state", "done"]
        self.experience = namedtuple("Experience", field_names=labels)
        self.memory = PrioritizedReplayBuffer(self.buffer, self.batch, db)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.005  # for soft update of target parameters
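Note: this twin-critic (TD3-style) variant replaces the Ornstein-Uhlenbeck process with a GaussianNoise(action_size) helper and a PrioritizedReplayBuffer, neither of which is shown. A plausible minimal sketch of the noise class (the default sigma and the sample() method are assumptions):

import numpy as np


class GaussianNoise:
    """Zero-mean, uncorrelated Gaussian exploration noise (as commonly used with TD3)."""

    def __init__(self, size, sigma=0.1):
        self.size = size
        self.sigma = sigma

    def sample(self):
        # One noise value drawn from N(0, sigma^2) per action dimension.
        return self.sigma * np.random.randn(self.size)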
Example no. 4
    def __init__(self, task):
        """Initialize models"""
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initialize Actor (policy) models
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Initialize Critic (value) models
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 0.001  # for soft update of target parameters
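Note: every Keras-based example keeps a local and a target copy of each network and stores tau "for soft update of target parameters", but the update itself lives outside these constructors. A typical sketch of that step (function name assumed, operating on the underlying Keras models):

def soft_update(local_model, target_model, tau):
    """Blend local weights into the target: theta_target <- tau*theta_local + (1-tau)*theta_target."""
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1. - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)

In the agents above this would be called after each learning step, e.g. soft_update(self.actor_local.model, self.actor_target.model, self.tau).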
Example no. 5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        self.gamma = 0.99
        self.tau = 0.1
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.learning_rate)
Example no. 6
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example no. 7
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99
        self.tau = 0.001
Example no. 8
    def create_models(self,
                      hidden_sizes_actor=(512, 256),
                      hidden_sizes_critic=(512, 256, 256)):
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 hidden_sizes=hidden_sizes_actor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  hidden_sizes=hidden_sizes_actor)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   hidden_sizes=hidden_sizes_critic)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    hidden_sizes=hidden_sizes_critic)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
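Usage note: create_models lets the network widths be changed after construction. A hypothetical call (the agent class itself is not shown in the snippet):

# 'agent' is an already constructed instance of the agent class above
agent.create_models(hidden_sizes_actor=(256, 128),
                    hidden_sizes_critic=(256, 128, 128))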
Example no. 9
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001
Example no. 10
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model params with local params
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Initialize noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory initialization
        self.buffer_size, self.batch_size = 2000000, 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Initialize algorithm parameters
        self.gamma, self.tau = 0.95, 0.001

        # Initialize scores
        self.score, self.best_score = -np.inf, -np.inf
Example no. 11
    def __init__(self,
                 task,
                 single_rotor_control=False,
                 prioritised_replay=False):
        tf.reset_default_graph()

        self.task = task
        self.state_size = self.task.state_size
        self.action_size = self.task.action_size
        self.action_low = self.task.action_low
        self.action_high = self.task.action_high
        self.action_range = self.action_high - self.action_low
        self.prioritised_replay = prioritised_replay

        with tf.variable_scope("local"):
            self.actor_local = Actor(self.state_size,
                                     self.action_size,
                                     self.action_low,
                                     self.action_high,
                                     single_rotor_control,
                                     is_training=False)
            self.critic_local = Critic(self.state_size,
                                       self.action_size,
                                       is_training=False)

        with tf.variable_scope("target"):
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      single_rotor_control)
            self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        #self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        #self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.tau = tf.placeholder(tf.float32, name='tau')
        self.target_update_ops = self.soft_update()

        with tf.name_scope('summary'):
            self.reward_log = tf.Variable(0.,
                                          False,
                                          name='reward_log',
                                          dtype=tf.float32)
            self.eps_length_log = tf.Variable(0.,
                                              False,
                                              name='eps_length_log',
                                              dtype=tf.float32)
            tf.summary.scalar('reward_log', self.reward_log)
            tf.summary.scalar('eps_length_log', self.eps_length_log)
            self.summary_op = tf.summary.merge_all()

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = .3  #(self.action_high - self.action_low)*.05
        self.exploration_sigma = .4  #(self.action_high - self.action_low)*.05
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
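Note: this TensorFlow 1.x variant builds the target update as graph ops (self.target_update_ops = self.soft_update()) instead of copying Keras weights; soft_update() is not shown. A minimal sketch as a method of the same agent class, assuming the variables of the "local" and "target" scopes line up one-to-one (TF1 API):

    def soft_update(self):
        # Blend each 'local' variable into the matching 'target' variable by a factor tau.
        local_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="local")
        target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target")
        update_ops = [
            tf.assign(target_var,
                      self.tau * local_var + (1. - self.tau) * target_var)
            for local_var, target_var in zip(local_vars, target_vars)
        ]
        return tf.group(*update_ops)

Running these ops once with tau fed as 1.0 would reproduce the hard copy that the commented-out set_weights lines perform in the Keras versions.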
Example no. 12
    def __init__(self, task, basename):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # learning rates
        self.actor_learning_rate = 0.0001
        self.critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # keep track of the best run
        self.nEpisode = 0
        self.bestEpisode = []
        self.bestEpisodeAt = -1

        # logging business
        self.state_labels = self.task.get_state_labels()
        self.action_labels = [
            'ac{}'.format(i) for i in range(self.action_size)
        ]
        self.df_columns = [
            't'
        ] + self.state_labels.tolist() + self.action_labels + ['R']
        self.basename = os.path.join('log', basename)
        self.currentEpisode = []
        self.bestCumReward = -np.inf
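Note: the df_columns, basename and currentEpisode attributes suggest per-episode logging to CSV, but no logging method is included in the snippet. A hedged sketch of what such a helper might look like (pandas-based; method name and file naming are assumptions):

    def save_episode(self):
        # Assumes 'import pandas as pd' at module level.
        # Write the current episode's trace (t, states, actions, reward) to <basename>_<n>.csv.
        df = pd.DataFrame(self.currentEpisode, columns=self.df_columns)
        df.to_csv('{}_{:04d}.csv'.format(self.basename, self.nEpisode), index=False)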
Example no. 13
    def __init__(self, task):
        '''
        Params
        ======
        task (object) : environment

        References
        ==========
        Continuous Control With Deep Reinforcement Learning (2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras

        Hyperparameters
        ===============
        gamma : 0.99
        tau   : 0.001
        buffer_size (ReplayBuffer) : 1e6
        batch_size (ReplayBuffer)  : 64
        theta (Ornstein-Uhlenbeck process) : 0.15
        sigma (Ornstein-Uhlenbeck process) : 0.2
        '''

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - use two copies of model for updating model and producing target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - use two copies of model for updating model and producing target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0
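Note: this example tracks accumulated_reward, count and best_avg_score, but the update logic lives outside the constructor. A hedged sketch of how the running average might be maintained (method name and placement are assumptions):

    def update_score(self, reward, done):
        # Accumulate per-step reward; refresh the best average score at episode end.
        self.accumulated_reward += reward
        self.count += 1
        if done:
            avg_score = self.accumulated_reward / max(self.count, 1)
            self.best_avg_score = max(self.best_avg_score, avg_score)
            self.accumulated_reward = 0
            self.count = 0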
Example no. 14
    def __init__(self,
                 task,
                 mu=0.02,
                 theta=0.16,
                 sigma=0.21,
                 buffer=500000,
                 batch=64,
                 gamma=0.98,
                 tau=0.02,
                 learning=0.001,
                 dropout=0.2):

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, learning,
                                 dropout)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, learning,
                                  dropout)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, learning,
                                   dropout)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning, dropout)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = mu
        self.exploration_theta = theta
        self.exploration_sigma = sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer
        self.batch_size = batch
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        # Score tracker and learning parameters
        # self.best_w = None
        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Episode variables
        self.reset_episode()
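Note: the constructor ends by calling self.reset_episode(), which is not part of the snippet. Based on the attributes set above, a plausible sketch (assumes the task object exposes reset(); last_state is an assumed attribute):

    def reset_episode(self):
        # Reset the exploration noise and the environment, and remember the initial state.
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state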